Move C runtime source directories into runtime/src/iree. (#8950)

* Pries apart the global include directory situation on both the Bazel and CMake side. On Bazel, we use a new iree_runtime_cc_(library|binary) macro that adds an implicit dep for include propagation and (in the future) can set copts. 
* On the CMake side, we use a path-based implicit dep to similar effect. I tried a couple of other ways and this was the least intrusive.
* Reworks bazel_to_cmake target rewriting to account for the new split root.
* Removes the CMake DATA include::this:file.png style of data includes (used in one place) in favor of a path, since package names are no longer reversible to a location. This seems to be the only place we made that assumption.
* Will do a couple more followups to completely retire the iree/iree directory (in favor of top-level compiler/ and tools/ directories).

Progress on #8955
diff --git a/runtime/BUILD.bazel b/runtime/BUILD.bazel
new file mode 100644
index 0000000..9da145a
--- /dev/null
+++ b/runtime/BUILD.bazel
@@ -0,0 +1,13 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+exports_files(["lit.cfg.py"])
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 8a3ce7a..de8151c 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -4,6 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+add_subdirectory(src)
+
 if(IREE_BUILD_PYTHON_BINDINGS)
   # Copy Python packaging files to the build dir so that we can install from
   # there.
diff --git a/runtime/lit.cfg.py b/runtime/lit.cfg.py
new file mode 100644
index 0000000..77a0498
--- /dev/null
+++ b/runtime/lit.cfg.py
@@ -0,0 +1,32 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Lit config for IREE."""
+
+# Lint for undefined variables is disabled as config is not defined inside this
+# file, instead config is injected by way of evaluating runlit.cfg.py from
+# runlit.site.cfg.py which in turn is evaluated by lit.py.
+# pylint: disable=undefined-variable
+
+import os
+import tempfile
+
+import lit.formats
+
+config.name = "IREE"
+config.suffixes = [".mlir", ".txt"]
+config.test_format = lit.formats.ShTest(execute_external=True)
+# Forward all IREE environment variables
+passthrough_env_vars = ["VK_ICD_FILENAMES"]
+config.environment.update({
+    k: v
+    for k, v in os.environ.items()
+    if k.startswith("IREE_") or k in passthrough_env_vars
+})
+
+# Use the most preferred temp directory.
+config.test_exec_root = (os.environ.get("TEST_UNDECLARED_OUTPUTS_DIR") or
+                         os.environ.get("TEST_TMPDIR") or
+                         os.path.join(tempfile.gettempdir(), "lit"))
diff --git a/runtime/src/BUILD b/runtime/src/BUILD
new file mode 100644
index 0000000..ad62c80
--- /dev/null
+++ b/runtime/src/BUILD
@@ -0,0 +1,18 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+cc_library(
+    name = "runtime_defines",
+    includes = [
+        ".",
+    ],
+)
diff --git a/runtime/src/CMakeLists.txt b/runtime/src/CMakeLists.txt
new file mode 100644
index 0000000..e19c964
--- /dev/null
+++ b/runtime/src/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Configures all iree_cc_* targets to take this implicit dep,
+# which provides common includes and copts for the tree.
+set(IREE_IMPLICIT_DEFS_CC_DEPS iree_defs_runtime)
+
+add_library(iree_defs_runtime INTERFACE)
+target_include_directories(
+  iree_defs_runtime INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
+)
+
+add_subdirectory(iree)
diff --git a/runtime/src/iree/CMakeLists.txt b/runtime/src/iree/CMakeLists.txt
new file mode 100644
index 0000000..3b1f024
--- /dev/null
+++ b/runtime/src/iree/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+add_subdirectory(base)
+add_subdirectory(builtins)
+add_subdirectory(hal)
+add_subdirectory(modules)
+add_subdirectory(runtime)
+add_subdirectory(schemas)
+add_subdirectory(task)
+add_subdirectory(testing)
+add_subdirectory(vm)
diff --git a/runtime/src/iree/base/BUILD b/runtime/src/iree/base/BUILD
new file mode 100644
index 0000000..b85f964
--- /dev/null
+++ b/runtime/src/iree/base/BUILD
@@ -0,0 +1,211 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Common types and utilities used in the IREE codebase.
+
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library", "iree_runtime_cc_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+#===------------------------------------------------------------------------===#
+# Public API
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "base",
+    srcs = [
+        "allocator.c",
+        "allocator.h",
+        "api.c",
+        "assert.h",
+        "bitfield.c",
+        "bitfield.h",
+        "loop.c",
+        "loop.h",
+        "loop_inline.c",
+        "loop_inline.h",
+        "status.c",
+        "status.h",
+        "string_builder.c",
+        "string_builder.h",
+        "string_view.c",
+        "string_view.h",
+        "time.c",
+        "time.h",
+        "wait_source.c",
+        "wait_source.h",
+    ],
+    hdrs = ["api.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":core_headers",
+        ":tracing",
+    ],
+)
+
+# TODO(benvanik): make these srcs and only expose an api_cc.h.
+iree_runtime_cc_library(
+    name = "cc",
+    srcs = [
+        "status_cc.cc",
+    ],
+    hdrs = [
+        "status_cc.h",
+    ],
+    deps = [
+        ":base",
+        ":core_headers",
+        ":logging",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "bitfield_test",
+    srcs = ["bitfield_test.cc"],
+    deps = [
+        ":base",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "loop_inline_test",
+    srcs = [
+        "loop_inline_test.cc",
+    ],
+    deps = [
+        ":base",
+        ":cc",
+        ":loop_test_hdrs",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "loop_test_hdrs",
+    testonly = 1,
+    hdrs = [
+        "loop_test.h",
+    ],
+    deps = [
+        ":base",
+        ":tracing",
+        "//runtime/src/iree/base/internal:wait_handle",
+        "//runtime/src/iree/testing:gtest",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "status_test",
+    srcs = ["status_test.cc"],
+    deps = [
+        ":base",
+        ":cc",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "string_builder_test",
+    srcs = ["string_builder_test.cc"],
+    deps = [
+        ":base",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "string_view_test",
+    srcs = ["string_view_test.cc"],
+    deps = [
+        ":base",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+#===------------------------------------------------------------------------===#
+# Core headers (platform detection, compiler compat, etc)
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "core_headers",
+    hdrs = [
+        "alignment.h",
+        "attributes.h",
+        "config.h",
+        "target_platform.h",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "target_platform",
+    hdrs = ["target_platform.h"],
+)
+
+#===------------------------------------------------------------------------===#
+# Internal IREE C++ wrappers and utilities
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "logging",
+    srcs = ["logging.cc"],
+    hdrs = ["logging.h"],
+    linkopts = select({
+        "//iree:iree_is_android": [
+            "-llog",
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":core_headers",
+        ":tracing",
+        "//runtime/src/iree/base/internal:flags",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "loop_sync",
+    srcs = ["loop_sync.c"],
+    hdrs = ["loop_sync.h"],
+    deps = [
+        ":base",
+        ":tracing",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:wait_handle",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "loop_sync_test",
+    srcs = [
+        "loop_sync_test.cc",
+    ],
+    deps = [
+        ":base",
+        ":cc",
+        ":loop_sync",
+        ":loop_test_hdrs",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "tracing",
+    hdrs = ["tracing.h"],
+    deps = [
+        ":core_headers",
+    ],
+)
diff --git a/runtime/src/iree/base/CMakeLists.txt b/runtime/src/iree/base/CMakeLists.txt
new file mode 100644
index 0000000..1121f63
--- /dev/null
+++ b/runtime/src/iree/base/CMakeLists.txt
@@ -0,0 +1,223 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    base
+  HDRS
+    "api.h"
+  SRCS
+    "allocator.c"
+    "allocator.h"
+    "api.c"
+    "assert.h"
+    "bitfield.c"
+    "bitfield.h"
+    "loop.c"
+    "loop.h"
+    "loop_inline.c"
+    "loop_inline.h"
+    "status.c"
+    "status.h"
+    "string_builder.c"
+    "string_builder.h"
+    "string_view.c"
+    "string_view.h"
+    "time.c"
+    "time.h"
+    "wait_source.c"
+    "wait_source.h"
+  DEPS
+    ::core_headers
+    ::tracing
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    core_headers
+  HDRS
+    "alignment.h"
+    "attributes.h"
+    "config.h"
+    "target_platform.h"
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    cc
+  HDRS
+    "status_cc.h"
+  SRCS
+    "status_cc.cc"
+  DEPS
+    ::base
+    ::core_headers
+    ::logging
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    bitfield_test
+  SRCS
+    "bitfield_test.cc"
+  DEPS
+    ::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    loop_inline_test
+  SRCS
+    "loop_inline_test.cc"
+  DEPS
+    ::base
+    ::cc
+    ::loop_test_hdrs
+    ::tracing
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    loop_test_hdrs
+  HDRS
+    "loop_test.h"
+  DEPS
+    ::base
+    ::cc
+    ::tracing
+    iree::base::internal::wait_handle
+    iree::testing::gtest
+  TESTONLY
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    status_test
+  SRCS
+    "status_test.cc"
+  DEPS
+    ::base
+    ::cc
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    string_builder_test
+  SRCS
+    "string_builder_test.cc"
+  DEPS
+    ::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    string_view_test
+  SRCS
+    "string_view_test.cc"
+  DEPS
+    ::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    target_platform
+  HDRS
+    "target_platform.h"
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    logging
+  HDRS
+    "logging.h"
+  SRCS
+    "logging.cc"
+  DEPS
+    ::core_headers
+    ::tracing
+    iree::base::internal::flags
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    loop_sync
+  HDRS
+    "loop_sync.h"
+  SRCS
+    "loop_sync.c"
+  DEPS
+    ::base
+    ::tracing
+    iree::base::internal
+    iree::base::internal::wait_handle
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    loop_sync_test
+  SRCS
+    "loop_sync_test.cc"
+  DEPS
+    ::base
+    ::cc
+    ::loop_sync
+    ::loop_test_hdrs
+    ::tracing
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+# TODO(benvanik): evaluate if we want this as part of the API. Could restrict it
+# to exclusively static linkage scenarios and note that it's unstable. It's just
+# really really useful and the only way for applications to interleave with our
+# tracing (today).
+if(${IREE_ENABLE_RUNTIME_TRACING})
+  iree_cc_library(
+    NAME
+      tracing
+    HDRS
+      "tracing.h"
+      "${IREE_ROOT_DIR}/third_party/tracy/Tracy.hpp"
+      "${IREE_ROOT_DIR}/third_party/tracy/TracyC.h"
+    SRCS
+      "tracing.cc"
+    DEPS
+      ${CMAKE_DL_LIBS}
+      ::core_headers
+    DEFINES
+      "IREE_TRACING_MODE=2"
+    PUBLIC
+  )
+else()
+  iree_cc_library(
+    NAME
+      tracing
+    HDRS
+      "tracing.h"
+    DEPS
+      ::core_headers
+    PUBLIC
+  )
+endif()
diff --git a/runtime/src/iree/base/alignment.h b/runtime/src/iree/base/alignment.h
new file mode 100644
index 0000000..1fd0356
--- /dev/null
+++ b/runtime/src/iree/base/alignment.h
@@ -0,0 +1,249 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Implementation of the primitives from stdalign.h used for cross-target
+// value alignment specification and queries.
+
+#ifndef IREE_BASE_ALIGNMENT_H_
+#define IREE_BASE_ALIGNMENT_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/config.h"
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===----------------------------------------------------------------------===//
+// Alignment utilities
+//===----------------------------------------------------------------------===//
+
+// https://en.cppreference.com/w/c/types/max_align_t
+#if defined(IREE_PLATFORM_WINDOWS)
+// NOTE: 16 is a specified Microsoft API requirement for some functions.
+#define iree_max_align_t 16
+#else
+#define iree_max_align_t sizeof(long double)
+#endif  // IREE_PLATFORM_*
+
+// https://en.cppreference.com/w/c/language/_Alignas
+// https://en.cppreference.com/w/c/language/_Alignof
+#if defined(IREE_COMPILER_MSVC)
+#define iree_alignas(x) __declspec(align(x))
+#define iree_alignof(x) __alignof(x)
+#else
+#define iree_alignas(x) __attribute__((__aligned__(x)))
+#define iree_alignof(x) __alignof__(x)
+#endif  // IREE_COMPILER_*
+
+// Aligns |value| up to the given power-of-two |alignment| if required.
+// https://en.wikipedia.org/wiki/Data_structure_alignment#Computing_padding
+static inline iree_host_size_t iree_host_align(iree_host_size_t value,
+                                               iree_host_size_t alignment) {
+  return (value + (alignment - 1)) & ~(alignment - 1);
+}
+
+// Returns true if |value| matches the given minimum |alignment|.
+static inline bool iree_host_size_has_alignment(iree_host_size_t value,
+                                                iree_host_size_t alignment) {
+  return iree_host_align(value, alignment) == value;
+}
+
+// Aligns |value| up to the given power-of-two |alignment| if required.
+// https://en.wikipedia.org/wiki/Data_structure_alignment#Computing_padding
+static inline iree_device_size_t iree_device_align(
+    iree_device_size_t value, iree_device_size_t alignment) {
+  return (value + (alignment - 1)) & ~(alignment - 1);
+}
+
+// Returns true if |value| matches the given minimum |alignment|.
+static inline bool iree_device_size_has_alignment(
+    iree_device_size_t value, iree_device_size_t alignment) {
+  return iree_device_align(value, alignment) == value;
+}
+
+// Returns the size of a struct padded out to iree_max_align_t.
+// This must be used when performing manual trailing allocation packing to
+// ensure the alignment requirements of the trailing data are satisfied.
+//
+// NOTE: do not use this if using VLAs (`struct { int trailing[]; }`) - those
+// must precisely follow the normal sizeof(t) as the compiler does the padding
+// for you.
+//
+// Example:
+//  some_buffer_ptr_t* p = NULL;
+//  iree_host_size_t total_size = iree_sizeof_struct(*buffer) + extra_data_size;
+//  IREE_CHECK_OK(iree_allocator_malloc(allocator, total_size, (void**)&p));
+#define iree_sizeof_struct(t) iree_host_align(sizeof(t), iree_max_align_t)
+
+//===----------------------------------------------------------------------===//
+// Alignment-safe memory accesses
+//===----------------------------------------------------------------------===//
+
+// Map little-endian byte indices in memory to the host memory order indices.
+#if defined(IREE_ENDIANNESS_LITTLE)
+#define IREE_LE_IDX_1(i) (i)
+#define IREE_LE_IDX_2(i) (i)
+#define IREE_LE_IDX_4(i) (i)
+#define IREE_LE_IDX_8(i) (i)
+#else
+#define IREE_LE_IDX_1(i) (i)
+#define IREE_LE_IDX_2(i) (1 - (i))
+#define IREE_LE_IDX_4(i) (3 - (i))
+#define IREE_LE_IDX_8(i) (7 - (i))
+#endif  // IREE_ENDIANNESS_*
+
+#if IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED
+
+static inline uint8_t iree_unaligned_load_le_u8(const uint8_t* ptr) {
+  return *ptr;
+}
+static inline uint16_t iree_unaligned_load_le_u16(const uint16_t* ptr) {
+  const uint8_t* p = (const uint8_t*)ptr;
+  return ((uint16_t)p[IREE_LE_IDX_2(0)]) | ((uint16_t)p[IREE_LE_IDX_2(1)] << 8);
+}
+static inline uint32_t iree_unaligned_load_le_u32(const uint32_t* ptr) {
+  const uint8_t* p = (const uint8_t*)ptr;
+  return ((uint32_t)p[IREE_LE_IDX_4(0)]) |
+         ((uint32_t)p[IREE_LE_IDX_4(1)] << 8) |
+         ((uint32_t)p[IREE_LE_IDX_4(2)] << 16) |
+         ((uint32_t)p[IREE_LE_IDX_4(3)] << 24);
+}
+static inline uint64_t iree_unaligned_load_le_u64(const uint64_t* ptr) {
+  const uint8_t* p = (const uint8_t*)ptr;
+  return ((uint64_t)p[IREE_LE_IDX_8(0)]) |
+         ((uint64_t)p[IREE_LE_IDX_8(1)] << 8) |
+         ((uint64_t)p[IREE_LE_IDX_8(2)] << 16) |
+         ((uint64_t)p[IREE_LE_IDX_8(3)] << 24) |
+         ((uint64_t)p[IREE_LE_IDX_8(4)] << 32) |
+         ((uint64_t)p[IREE_LE_IDX_8(5)] << 40) |
+         ((uint64_t)p[IREE_LE_IDX_8(6)] << 48) |
+         ((uint64_t)p[IREE_LE_IDX_8(7)] << 56);
+}
+static inline float iree_unaligned_load_le_f32(const float* ptr) {
+  uint32_t uint_value = iree_unaligned_load_le_u32((const uint32_t*)ptr);
+  float value;
+  memcpy(&value, &uint_value, sizeof(value));
+  return value;
+}
+static inline double iree_unaligned_load_le_f64(const double* ptr) {
+  uint64_t uint_value = iree_unaligned_load_le_u64((const uint64_t*)ptr);
+  double value;
+  memcpy(&value, &uint_value, sizeof(value));
+  return value;
+}
+
+static inline void iree_unaligned_store_le_u8(uint8_t* ptr, uint8_t value) {
+  *ptr = value;
+}
+static inline void iree_unaligned_store_le_u16(uint16_t* ptr, uint16_t value) {
+  uint8_t* p = (uint8_t*)ptr;
+  p[IREE_LE_IDX_2(0)] = value;
+  p[IREE_LE_IDX_2(1)] = value >> 8;
+}
+static inline void iree_unaligned_store_le_u32(uint32_t* ptr, uint32_t value) {
+  uint8_t* p = (uint8_t*)ptr;
+  p[IREE_LE_IDX_4(0)] = value;
+  p[IREE_LE_IDX_4(1)] = value >> 8;
+  p[IREE_LE_IDX_4(2)] = value >> 16;
+  p[IREE_LE_IDX_4(3)] = value >> 24;
+}
+static inline void iree_unaligned_store_le_u64(uint64_t* ptr, uint64_t value) {
+  uint8_t* p = (uint8_t*)ptr;
+  p[IREE_LE_IDX_8(0)] = value;
+  p[IREE_LE_IDX_8(1)] = value >> 8;
+  p[IREE_LE_IDX_8(2)] = value >> 16;
+  p[IREE_LE_IDX_8(3)] = value >> 24;
+  p[IREE_LE_IDX_8(4)] = value >> 32;
+  p[IREE_LE_IDX_8(5)] = value >> 40;
+  p[IREE_LE_IDX_8(6)] = value >> 48;
+  p[IREE_LE_IDX_8(7)] = value >> 56;
+}
+static inline void iree_unaligned_store_le_f32(float* ptr, float value) {
+  uint32_t uint_value;
+  memcpy(&uint_value, &value, sizeof(value));
+  iree_unaligned_store_le_u32((uint32_t*)ptr, uint_value);
+}
+static inline void iree_unaligned_store_le_f64(double* ptr, double value) {
+  uint64_t uint_value;
+  memcpy(&uint_value, &value, sizeof(value));
+  iree_unaligned_store_le_u64((uint64_t*)ptr, uint_value);
+}
+
+#else
+
+#if defined(IREE_ENDIANNESS_LITTLE)
+
+#define iree_unaligned_load_le_u8(ptr) *(ptr)
+#define iree_unaligned_load_le_u16(ptr) *(ptr)
+#define iree_unaligned_load_le_u32(ptr) *(ptr)
+#define iree_unaligned_load_le_u64(ptr) *(ptr)
+#define iree_unaligned_load_le_f32(ptr) *(ptr)
+#define iree_unaligned_load_le_f64(ptr) *(ptr)
+
+#define iree_unaligned_store_le_u8(ptr, value) *(ptr) = (value)
+#define iree_unaligned_store_le_u16(ptr, value) *(ptr) = (value)
+#define iree_unaligned_store_le_u32(ptr, value) *(ptr) = (value)
+#define iree_unaligned_store_le_u64(ptr, value) *(ptr) = (value)
+#define iree_unaligned_store_le_f32(ptr, value) *(ptr) = (value)
+#define iree_unaligned_store_le_f64(ptr, value) *(ptr) = (value)
+
+#else
+
+#error "TODO(benvanik): little-endian load/store for big-endian archs"
+
+#endif  // IREE_ENDIANNESS_*
+
+#endif  // IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED
+
+// clang-format off
+
+// Dereferences |ptr| and returns the value.
+// Automatically handles unaligned accesses on architectures that may not
+// support them natively (or efficiently). Memory is treated as little-endian.
+#define iree_unaligned_load_le(ptr)                                            \
+  _Generic((ptr),                                                              \
+        int8_t*: iree_unaligned_load_le_u8((const uint8_t*)(ptr)),             \
+       uint8_t*: iree_unaligned_load_le_u8((const uint8_t*)(ptr)),             \
+       int16_t*: iree_unaligned_load_le_u16((const uint16_t*)(ptr)),           \
+      uint16_t*: iree_unaligned_load_le_u16((const uint16_t*)(ptr)),           \
+       int32_t*: iree_unaligned_load_le_u32((const uint32_t*)(ptr)),           \
+      uint32_t*: iree_unaligned_load_le_u32((const uint32_t*)(ptr)),           \
+       int64_t*: iree_unaligned_load_le_u64((const uint64_t*)(ptr)),           \
+      uint64_t*: iree_unaligned_load_le_u64((const uint64_t*)(ptr)),           \
+         float*: iree_unaligned_load_le_f32((const float*)(ptr)),              \
+        double*: iree_unaligned_load_le_f64((const double*)(ptr))              \
+  )
+
+// Dereferences |ptr| and writes the given |value|.
+// Automatically handles unaligned accesses on architectures that may not
+// support them natively (or efficiently). Memory is treated as little-endian.
+#define iree_unaligned_store(ptr, value)                                       \
+  _Generic((ptr),                                                              \
+        int8_t*: iree_unaligned_store_le_u8((uint8_t*)(ptr), value),           \
+       uint8_t*: iree_unaligned_store_le_u8((uint8_t*)(ptr), value),           \
+       int16_t*: iree_unaligned_store_le_u16((uint16_t*)(ptr), value),         \
+      uint16_t*: iree_unaligned_store_le_u16((uint16_t*)(ptr), value),         \
+       int32_t*: iree_unaligned_store_le_u32((uint32_t*)(ptr), value),         \
+      uint32_t*: iree_unaligned_store_le_u32((uint32_t*)(ptr), value),         \
+       int64_t*: iree_unaligned_store_le_u64((uint64_t*)(ptr), value),         \
+      uint64_t*: iree_unaligned_store_le_u64((uint64_t*)(ptr), value),         \
+         float*: iree_unaligned_store_le_f32((float*)(ptr), value),            \
+        double*: iree_unaligned_store_le_f64((double*)(ptr), value)            \
+  )
+
+// clang-format on
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_ALIGNMENT_H_
diff --git a/runtime/src/iree/base/allocator.c b/runtime/src/iree/base/allocator.c
new file mode 100644
index 0000000..d409370
--- /dev/null
+++ b/runtime/src/iree/base/allocator.c
@@ -0,0 +1,261 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_allocator_t (std::allocator-like interface)
+//===----------------------------------------------------------------------===//
+
+static iree_status_t iree_allocator_issue_alloc(
+    iree_allocator_t allocator, iree_allocator_command_t command,
+    iree_host_size_t byte_length, void** inout_ptr) {
+  if (IREE_UNLIKELY(!allocator.ctl)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "allocator has no control routine");
+  }
+  iree_allocator_alloc_params_t params = {
+      .byte_length = byte_length,
+  };
+  return allocator.ctl(allocator.self, command, &params, inout_ptr);
+}
+
+IREE_API_EXPORT iree_status_t iree_allocator_malloc(
+    iree_allocator_t allocator, iree_host_size_t byte_length, void** out_ptr) {
+  return iree_allocator_issue_alloc(allocator, IREE_ALLOCATOR_COMMAND_CALLOC,
+                                    byte_length, out_ptr);
+}
+
+IREE_API_EXPORT iree_status_t iree_allocator_malloc_uninitialized(
+    iree_allocator_t allocator, iree_host_size_t byte_length, void** out_ptr) {
+  return iree_allocator_issue_alloc(allocator, IREE_ALLOCATOR_COMMAND_MALLOC,
+                                    byte_length, out_ptr);
+}
+
+IREE_API_EXPORT iree_status_t
+iree_allocator_realloc(iree_allocator_t allocator, iree_host_size_t byte_length,
+                       void** inout_ptr) {
+  return iree_allocator_issue_alloc(allocator, IREE_ALLOCATOR_COMMAND_REALLOC,
+                                    byte_length, inout_ptr);
+}
+
+IREE_API_EXPORT iree_status_t
+iree_allocator_clone(iree_allocator_t allocator,
+                     iree_const_byte_span_t source_bytes, void** out_ptr) {
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc_uninitialized(
+      allocator, source_bytes.data_length, out_ptr));
+  memcpy(*out_ptr, source_bytes.data, source_bytes.data_length);
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT void iree_allocator_free(iree_allocator_t allocator,
+                                         void* ptr) {
+  if (ptr && allocator.ctl) {
+    iree_status_ignore(allocator.ctl(
+        allocator.self, IREE_ALLOCATOR_COMMAND_FREE, /*params=*/NULL, &ptr));
+  }
+}
+
+static iree_status_t iree_allocator_system_alloc(
+    iree_allocator_command_t command,
+    const iree_allocator_alloc_params_t* params, void** inout_ptr) {
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(inout_ptr);
+  iree_host_size_t byte_length = params->byte_length;
+  if (IREE_UNLIKELY(byte_length == 0)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "allocations must be >0 bytes");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  void* existing_ptr = *inout_ptr;
+  void* new_ptr = NULL;
+  if (existing_ptr && command == IREE_ALLOCATOR_COMMAND_REALLOC) {
+    new_ptr = realloc(existing_ptr, byte_length);
+  } else {
+    existing_ptr = NULL;
+    if (command == IREE_ALLOCATOR_COMMAND_CALLOC) {
+      new_ptr = calloc(1, byte_length);
+    } else {
+      new_ptr = malloc(byte_length);
+    }
+  }
+  if (!new_ptr) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "system allocator failed the request");
+  }
+
+  if (existing_ptr) {
+    IREE_TRACE_FREE(existing_ptr);
+  }
+  IREE_TRACE_ALLOC(new_ptr, byte_length);
+
+  *inout_ptr = new_ptr;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+static iree_status_t iree_allocator_system_free(void** inout_ptr) {
+  IREE_ASSERT_ARGUMENT(inout_ptr);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  void* ptr = *inout_ptr;
+  if (IREE_LIKELY(ptr != NULL)) {
+    IREE_TRACE_FREE(ptr);
+    free(ptr);
+    *inout_ptr = NULL;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_status_t
+iree_allocator_system_ctl(void* self, iree_allocator_command_t command,
+                          const void* params, void** inout_ptr) {
+  switch (command) {
+    case IREE_ALLOCATOR_COMMAND_MALLOC:
+    case IREE_ALLOCATOR_COMMAND_CALLOC:
+    case IREE_ALLOCATOR_COMMAND_REALLOC:
+      return iree_allocator_system_alloc(
+          command, (const iree_allocator_alloc_params_t*)params, inout_ptr);
+    case IREE_ALLOCATOR_COMMAND_FREE:
+      return iree_allocator_system_free(inout_ptr);
+    default:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "unsupported system allocator command");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Aligned allocations via iree_allocator_t
+//===----------------------------------------------------------------------===//
+
+// Returns true if |alignment| is a power of two (or 0).
+static inline iree_host_size_t iree_alignment_is_pot(
+    iree_host_size_t alignment) {
+  return (alignment & (alignment - 1)) == 0;
+}
+
+// Returns a pointer into |unaligned_ptr| where |offset| matches |alignment|.
+static inline void* iree_aligned_ptr(void* unaligned_ptr,
+                                     iree_host_size_t alignment,
+                                     iree_host_size_t offset) {
+  return (void*)((((uintptr_t)unaligned_ptr + (alignment + sizeof(void*)) +
+                   offset) &
+                  ~(uintptr_t)(alignment - 1)) -
+                 offset);
+}
+
+// Returns the base unaligned pointer for |aligned_ptr|.
+static inline void* iree_aligned_ptr_get_base(void* aligned_ptr) {
+  void** ptr_ref =
+      (void**)((uintptr_t)aligned_ptr & ~(uintptr_t)(sizeof(void*) - 1));
+  return ptr_ref[-1];
+}
+
+// Sets the base unaligned pointer in |aligned_ptr|.
+static inline void iree_aligned_ptr_set_base(void* aligned_ptr,
+                                             void* base_ptr) {
+  void** ptr_ref =
+      (void**)((uintptr_t)aligned_ptr & ~(uintptr_t)(sizeof(void*) - 1));
+  ptr_ref[-1] = base_ptr;
+}
+
+IREE_API_EXPORT iree_status_t iree_allocator_malloc_aligned(
+    iree_allocator_t allocator, iree_host_size_t byte_length,
+    iree_host_size_t min_alignment, iree_host_size_t offset, void** out_ptr) {
+  IREE_ASSERT_ARGUMENT(out_ptr);
+  if (IREE_UNLIKELY(byte_length == 0)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "allocations must be >0 bytes");
+  }
+  const iree_host_size_t alignment = iree_max(min_alignment, iree_max_align_t);
+  if (IREE_UNLIKELY(!iree_alignment_is_pot(alignment))) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "alignments must be powers of two (got %" PRIhsz ")", min_alignment);
+  }
+
+  // [base ptr] [padding...] [aligned data] [padding...]
+  const iree_host_size_t total_length =
+      sizeof(uintptr_t) + byte_length + alignment;
+  void* unaligned_ptr = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, total_length, (void**)&unaligned_ptr));
+  void* aligned_ptr = iree_aligned_ptr(unaligned_ptr, alignment, offset);
+
+  iree_aligned_ptr_set_base(aligned_ptr, unaligned_ptr);
+  *out_ptr = aligned_ptr;
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_status_t iree_allocator_realloc_aligned(
+    iree_allocator_t allocator, iree_host_size_t byte_length,
+    iree_host_size_t min_alignment, iree_host_size_t offset, void** inout_ptr) {
+  IREE_ASSERT_ARGUMENT(inout_ptr);
+  if (!*inout_ptr) {
+    return iree_allocator_malloc_aligned(allocator, byte_length, min_alignment,
+                                         offset, inout_ptr);
+  }
+  if (IREE_UNLIKELY(byte_length == 0)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "allocations must be >0 bytes");
+  }
+  const iree_host_size_t alignment = iree_max(min_alignment, iree_max_align_t);
+  if (IREE_UNLIKELY(!iree_alignment_is_pot(alignment))) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "alignments must be powers of two (got %" PRIhsz ")", min_alignment);
+  }
+  void* aligned_ptr = *inout_ptr;
+  void* unaligned_ptr = iree_aligned_ptr_get_base(aligned_ptr);
+  if (IREE_UNLIKELY(aligned_ptr !=
+                    iree_aligned_ptr(unaligned_ptr, alignment, offset))) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "reallocation must have the same alignment as the "
+                            "original allocation (got %" PRIhsz ")",
+                            min_alignment);
+  }
+
+  // Since the reallocated memory block may have a different unaligned base to
+  // aligned offset we may need to move the data. Capture the original offset
+  // into the unaligned base where the valid data resides.
+  uintptr_t old_offset = (uintptr_t)aligned_ptr - (uintptr_t)unaligned_ptr;
+
+  // [base ptr] [padding...] [aligned data] [padding...]
+  const iree_host_size_t total_length =
+      sizeof(uintptr_t) + byte_length + alignment;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_realloc(allocator, total_length, (void**)&unaligned_ptr));
+  aligned_ptr = iree_aligned_ptr(unaligned_ptr, alignment, offset);
+
+  const uint8_t* old_data = (uint8_t*)unaligned_ptr + old_offset;
+  uint8_t* new_data = (uint8_t*)aligned_ptr;
+  if (old_data != new_data) {
+    // Alignment at offset changed; copy data to the new aligned offset.
+    // NOTE: this is copying up to the *new* byte length, as we don't store the
+    // old length and don't know how much to copy. Since we've already
+    // reallocated we know this will always be in-bounds, but it's inefficient.
+    // NOTE: memmove instead of memcpy as the regions may overlap.
+    memmove(new_data, old_data, byte_length);
+  }
+
+  iree_aligned_ptr_set_base(aligned_ptr, unaligned_ptr);
+  *inout_ptr = aligned_ptr;
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT void iree_allocator_free_aligned(iree_allocator_t allocator,
+                                                 void* ptr) {
+  if (ptr) {
+    void* unaligned_ptr = iree_aligned_ptr_get_base(ptr);
+    iree_allocator_free(allocator, unaligned_ptr);
+  }
+}
diff --git a/runtime/src/iree/base/allocator.h b/runtime/src/iree/base/allocator.h
new file mode 100644
index 0000000..9ac26f4
--- /dev/null
+++ b/runtime/src/iree/base/allocator.h
@@ -0,0 +1,286 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_ALLOCATOR_H_
+#define IREE_BASE_ALLOCATOR_H_
+
+#include <memory.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/alignment.h"
+#include "iree/base/attributes.h"
+#include "iree/base/config.h"
+#include "iree/base/status.h"
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// Returns the number of elements in an array as a compile-time constant, which
+// can be used in defining new arrays. Fails at compile-time if |arr| is not a
+// static array (such as if used on a pointer type). Similar to `countof()`.
+//
+// Example:
+//  uint8_t kConstantArray[512];
+//  assert(IREE_ARRAYSIZE(kConstantArray) == 512);
+#define IREE_ARRAYSIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+
+// Returns the minimum/maximum of two values.
+// NOTE: these are macros and evaluate their winning argument twice; do not
+// pass expressions with side effects (e.g. iree_min(i++, j)).
+#define iree_min(lhs, rhs) ((lhs) <= (rhs) ? (lhs) : (rhs))
+#define iree_max(lhs, rhs) ((lhs) <= (rhs) ? (rhs) : (lhs))
+
+#if IREE_STATISTICS_ENABLE
+// Evalutes the expression code only if statistics are enabled.
+//
+// Example:
+//  struct {
+//    IREE_STATISTICS(uint32_t stats_only_value);
+//  } my_object;
+//  IREE_STATISTICS(my_object.stats_only_value = 5);
+//  IREE_STATISTICS({
+//    my_object.stats_only_value = 5;
+//  });
+#define IREE_STATISTICS(expr) expr
+#else
+#define IREE_STATISTICS(expr)
+#endif  // IREE_STATISTICS_ENABLE
+
+//===----------------------------------------------------------------------===//
+// Byte buffers and memory utilities
+//===----------------------------------------------------------------------===//
+
+// A span of mutable bytes (ala std::span of uint8_t).
+typedef struct iree_byte_span_t {
+  uint8_t* data;
+  iree_host_size_t data_length;
+} iree_byte_span_t;
+
+// Wraps |data| of |data_length| bytes in a mutable byte span without copying.
+static inline iree_byte_span_t iree_make_byte_span(
+    void* data, iree_host_size_t data_length) {
+  iree_byte_span_t v = {(uint8_t*)data, data_length};
+  return v;
+}
+
+// Returns an empty (NULL, 0) mutable byte span.
+static inline iree_byte_span_t iree_byte_span_empty() {
+  iree_byte_span_t v = {NULL, 0};
+  return v;
+}
+
+// Returns true if |span| references no bytes (NULL data or zero length).
+// NOTE: `inline` added to match the sibling helpers above; a plain `static`
+// function defined in a header produces unused-function warnings and a
+// private copy in every translation unit that includes it.
+static inline bool iree_byte_span_is_empty(iree_byte_span_t span) {
+  return span.data == NULL || span.data_length == 0;
+}
+
+// A span of constant bytes (ala std::span of const uint8_t).
+typedef struct iree_const_byte_span_t {
+  const uint8_t* data;
+  iree_host_size_t data_length;
+} iree_const_byte_span_t;
+
+// Wraps |data| of |data_length| bytes in a constant byte span without copying.
+static inline iree_const_byte_span_t iree_make_const_byte_span(
+    const void* data, iree_host_size_t data_length) {
+  iree_const_byte_span_t v = {(const uint8_t*)data, data_length};
+  return v;
+}
+
+// Returns an empty (NULL, 0) constant byte span.
+static inline iree_const_byte_span_t iree_const_byte_span_empty() {
+  iree_const_byte_span_t v = {NULL, 0};
+  return v;
+}
+
+// Returns true if |span| references no bytes (NULL data or zero length).
+static inline bool iree_const_byte_span_is_empty(iree_const_byte_span_t span) {
+  return span.data == NULL || span.data_length == 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Totally shady stack allocation
+//===----------------------------------------------------------------------===//
+// TODO(benvanik): remove our uses of this or make them more explicit.
+
+#if defined(IREE_COMPILER_MSVC)
+// The safe malloca that may fall back to heap in the case of stack overflows:
+// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/malloca?view=vs-2019
+// Because that gets really annoying to deal with during error handling we just
+// go for _alloca which may generate SEH exceptions if we blow the stack.
+#include <malloc.h>
+#define iree_alloca(sz) _alloca(sz)
+#else
+#include <alloca.h>
+#define iree_alloca(sz) alloca(sz)
+#endif  // IREE_COMPILER_MSVC
+
+//===----------------------------------------------------------------------===//
+// iree_allocator_t (std::allocator-like interface)
+//===----------------------------------------------------------------------===//
+
+// Controls the behavior of an iree_allocator_ctl_fn_t callback function.
+typedef enum iree_allocator_command_e {
+  // Allocates |byte_length| of memory and stores the pointer in |inout_ptr|.
+  // Systems should align to 16 byte boundaries (or otherwise their natural
+  // SIMD alignment). The runtime pools internally and small allocations
+  // (usually) won't be made through this interface.
+  //
+  // iree_allocator_ctl_fn_t:
+  //   params: iree_allocator_alloc_params_t
+  //   inout_ptr: set to allocated pointer
+  IREE_ALLOCATOR_COMMAND_MALLOC = 0,
+
+  // As with IREE_ALLOCATOR_COMMAND_MALLOC but zeros the memory.
+  //
+  // The contents of the allocation *must* be zeroed by the allocator prior to
+  // returning. Allocators may be able to elide the zeroing if they allocate
+  // fresh pages from the system. It is always safe to zero contents if the
+  // behavior of the allocator is not under our control.
+  //
+  // iree_allocator_ctl_fn_t:
+  //   params: iree_allocator_alloc_params_t
+  //   inout_ptr: set to allocated pointer
+  IREE_ALLOCATOR_COMMAND_CALLOC,
+
+  // Tries to resize an allocation provided via |inout_ptr|, if possible.
+  // If the existing allocation is not reused then it is freed as if a call to
+  // iree_allocator_free had been called on it. If the allocation fails then
+  // the provided existing allocation is unmodified. Only pointers previously
+  // received from the iree_allocator_t are valid.
+  //
+  // iree_allocator_ctl_fn_t:
+  //   params: iree_allocator_alloc_params_t
+  //   inout_ptr: pointer of existing allocation; updated to realloced pointer
+  IREE_ALLOCATOR_COMMAND_REALLOC,
+
+  // Frees the memory pointed to by |inout_ptr|.
+  //
+  // iree_allocator_ctl_fn_t:
+  //   params: unused
+  //   inout_ptr: pointer to free
+  IREE_ALLOCATOR_COMMAND_FREE,
+} iree_allocator_command_t;
+
+// Parameters for various allocation commands.
+typedef struct iree_allocator_alloc_params_t {
+  // Minimum size, in bytes, of the allocation. The underlying allocator may
+  // pad the length out if needed.
+  iree_host_size_t byte_length;
+} iree_allocator_alloc_params_t;
+
+// Function pointer for an iree_allocator_t control function.
+// |command| provides the operation to perform. Optionally some commands may use
+// |params| to pass additional operation-specific parameters. |inout_ptr| usage
+// is defined by each operation but is generally a pointer to the pointer to
+// set to the newly allocated memory or a pointer to the pointer to free.
+typedef iree_status_t(IREE_API_PTR* iree_allocator_ctl_fn_t)(
+    void* self, iree_allocator_command_t command, const void* params,
+    void** inout_ptr);
+
+// An allocator for host-memory allocations.
+// IREE will attempt to use this in place of the system malloc and free.
+// Pass the iree_allocator_system() macro to use the system allocator.
+typedef struct iree_allocator_t {
+  // Control function data.
+  void* self;
+  // ioctl-style control function servicing all allocator-related commands.
+  // See iree_allocator_command_t for more information.
+  iree_allocator_ctl_fn_t ctl;
+} iree_allocator_t;
+
+// Allocates a block of |byte_length| bytes from the given allocator.
+// The contents of the returned memory is guaranteed to be zeroed.
+IREE_API_EXPORT iree_status_t iree_allocator_malloc(
+    iree_allocator_t allocator, iree_host_size_t byte_length, void** out_ptr);
+
+// Allocates a block of |byte_length| bytes from the given allocator.
+// The content of the buffer returned is undefined: it may be zeros, a
+// debug-fill pattern, or random memory from elsewhere in the process.
+// Only use this when immediately overwriting all memory.
+IREE_API_EXPORT iree_status_t iree_allocator_malloc_uninitialized(
+    iree_allocator_t allocator, iree_host_size_t byte_length, void** out_ptr);
+
+// Reallocates |inout_ptr| to |byte_length| bytes with the given allocator.
+// If the reallocation fails then the original |inout_ptr| is unmodified.
+//
+// WARNING: when extending the newly allocated bytes are undefined.
+// TODO(benvanik): make them zeros; we should have an _uninitialized if needed.
+IREE_API_EXPORT iree_status_t iree_allocator_realloc(
+    iree_allocator_t allocator, iree_host_size_t byte_length, void** inout_ptr);
+
+// Duplicates the given byte block by allocating memory and copying it in.
+IREE_API_EXPORT iree_status_t
+iree_allocator_clone(iree_allocator_t allocator,
+                     iree_const_byte_span_t source_bytes, void** out_ptr);
+
+// Frees a previously-allocated block of memory to the given allocator.
+IREE_API_EXPORT void iree_allocator_free(iree_allocator_t allocator, void* ptr);
+
+// Default C allocator controller using malloc/free.
+IREE_API_EXPORT iree_status_t
+iree_allocator_system_ctl(void* self, iree_allocator_command_t command,
+                          const void* params, void** inout_ptr);
+
+// Allocates using the iree_allocator_malloc and iree_allocator_free methods.
+// These will usually be backed by malloc and free.
+static inline iree_allocator_t iree_allocator_system(void) {
+  iree_allocator_t allocator;
+  allocator.self = NULL;
+  allocator.ctl = iree_allocator_system_ctl;
+  return allocator;
+}
+
+// Does not perform any allocation or deallocation; used to wrap objects that
+// are owned by external code/live in read-only memory/etc.
+static inline iree_allocator_t iree_allocator_null(void) {
+  iree_allocator_t allocator;
+  allocator.self = NULL;
+  allocator.ctl = NULL;
+  return allocator;
+}
+
+// Returns true if the allocator is `iree_allocator_null()`.
+static inline bool iree_allocator_is_null(iree_allocator_t allocator) {
+  return !allocator.ctl;
+}
+
+//===----------------------------------------------------------------------===//
+// Aligned allocations via iree_allocator_t
+//===----------------------------------------------------------------------===//
+
+// Allocates memory of size |byte_length| where the byte starting at |offset|
+// has a minimum alignment of |min_alignment|. In many cases |offset| can be 0.
+//
+// The |offset| can be used to ensure the alignment-sensitive portion of a
+// combined allocation is aligned while any prefix metadata has system
+// alignment. For example:
+//   typedef struct {
+//     uint32_t some_metadata;
+//     uint8_t data[];
+//   } buffer_t;
+//   buffer_t* buffer = NULL;
+//   iree_allocator_malloc_aligned(allocator, sizeof(buffer_t) + length,
+//                                 4096, offsetof(buffer_t, data), &buffer);
+//   // `buffer` has system alignment, but the `data` will be aligned on at
+//   // least a 4096 boundary.
+//
+// The contents of the returned memory is guaranteed to be zeroed.
+IREE_API_EXPORT iree_status_t iree_allocator_malloc_aligned(
+    iree_allocator_t allocator, iree_host_size_t byte_length,
+    iree_host_size_t min_alignment, iree_host_size_t offset, void** out_ptr);
+
+// Reallocates memory to |byte_length|, growing or shrinking as needed.
+// Only valid on memory allocated with iree_allocator_malloc_aligned.
+// The newly reallocated memory will have the byte at |offset| aligned to at
+// least |min_alignment|.
+IREE_API_EXPORT iree_status_t iree_allocator_realloc_aligned(
+    iree_allocator_t allocator, iree_host_size_t byte_length,
+    iree_host_size_t min_alignment, iree_host_size_t offset, void** inout_ptr);
+
+// Frees a |ptr| previously returned from iree_allocator_malloc_aligned.
+IREE_API_EXPORT void iree_allocator_free_aligned(iree_allocator_t allocator,
+                                                 void* ptr);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_ALLOCATOR_H_
diff --git a/runtime/src/iree/base/api.c b/runtime/src/iree/base/api.c
new file mode 100644
index 0000000..6ba51a3
--- /dev/null
+++ b/runtime/src/iree/base/api.c
@@ -0,0 +1,27 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/api.h"
+
+//===----------------------------------------------------------------------===//
+// IREE Core API
+//===----------------------------------------------------------------------===//
+
+// Verifies that the caller's |expected_version| matches the compiled-in API
+// version, reporting the actual version through |out_actual_version|.
+IREE_API_EXPORT iree_status_t
+iree_api_version_check(iree_api_version_t expected_version,
+                       iree_api_version_t* out_actual_version) {
+  // Guard: the out parameter is required so callers can always observe the
+  // actual version, even on mismatch.
+  if (out_actual_version == NULL) {
+    return iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+  }
+  const iree_api_version_t actual_version = IREE_API_VERSION_0;
+  *out_actual_version = actual_version;
+  if (expected_version != actual_version) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "IREE version mismatch; application expected "
+                            "%d but IREE is compiled as %d",
+                            expected_version, actual_version);
+  }
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree/base/api.h b/runtime/src/iree/base/api.h
new file mode 100644
index 0000000..682c063
--- /dev/null
+++ b/runtime/src/iree/base/api.h
@@ -0,0 +1,143 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// API Versioning
+// -----------------------------------------------------------------------------
+//
+// The C API is designed to be versioned such that breaking changes either in
+// ABI (data types, struct sizes, etc) or signatures (function arguments change)
+// will result in a bump of the IREE_API_VERSION_LATEST value.
+//
+// When linked in statically the runtime should never have a version conflict,
+// however dynamic linking where the runtime is a shared object loaded at
+// runtime (via dlopen/etc) must always verify the version is as expected.
+//
+// In the current experimental state of the runtime the API may break frequently
+// and the version is pinned at 0.
+//
+// Example:
+//   void* library = dlopen("iree_rt.so", RTLD_LAZY | RTLD_LOCAL);
+//   iree_api_version_t actual_version;
+//   iree_status_t status = \
+//       ((PFN_iree_api_version_check)dlsym(library, "iree_api_version_check"))(
+//       IREE_API_VERSION_LATEST, &actual_version);
+//   IREE_CHECK_OK(status);
+//   dlclose(library);
+//
+// Object Ownership and Lifetime
+// -----------------------------------------------------------------------------
+//
+// The API follows the CoreFoundation ownership policies:
+// https://developer.apple.com/library/archive/documentation/CoreFoundation/Conceptual/CFMemoryMgmt/Concepts/Ownership.html
+//
+// These boil down to:
+// * Objects returned from *_create or *_copy functions are owned by the caller
+//   and must be released when the caller no longer needs them.
+// * Objects returned from accessors are not owned by the caller and must be
+//   retained by the caller if the object lifetime needs to be extended.
+// * Objects passed to functions by argument may be retained by the callee if
+//   required.
+//
+// Example:
+//   iree_file_mapping_t* file_mapping;
+//   s = iree_file_mapping_open_read(..., &file_mapping);
+//   // file_mapping is now owned by this function.
+//   s = iree_file_mapping_some_call(file_mapping, ...);
+//   // Must release ownership when no longer required.
+//   s = iree_file_mapping_release(file_mapping);
+//
+// String Formatting
+// -----------------------------------------------------------------------------
+//
+// Functions that produce variable-length strings follow a standard usage
+// pattern with the arguments:
+//   `iree_host_size_t buffer_capacity`: total bytes including \0 available.
+//   `char* buffer`: optional buffer to write into.
+//   `iree_host_size_t* out_buffer_length`: required/actual length excluding \0.
+//
+// To query the size required for the output and allocate storage:
+//   iree_host_size_t required_length = 0;
+//   iree_format_xyz(/*buffer_capacity=*/0, /*buffer=*/NULL, &required_length);
+//   iree_host_size_t buffer_capacity = required_length + 1;
+//   char* buffer = iree_allocator_malloc(buffer_capacity);
+//   iree_host_size_t actual_length = 0;
+//   iree_format_xyz(buffer_capacity, buffer, &actual_length);
+//   ASSERT(required_length == actual_length);
+//
+// To handle fixed-length maximum strings (common):
+//   // Fails if the string is longer than 127 characters (127 + \0 >= 128).
+//   char buffer[128];
+//   IREE_RETURN_IF_ERROR(iree_format_xyz(sizeof(buffer), buffer, NULL));
+//
+// Try fixed-length and fallback to a dynamic allocation:
+//   char inline_buffer[128];
+//   iree_host_size_t required_length = 0;
+//   iree_status_t inline_status = iree_format_xyz(sizeof(inline_buffer),
+//                                                 inline_buffer,
+//                                                 &required_length);
+//   if (iree_status_is_out_of_range(inline_status)) {
+//     // Spilled inline_buffer, need to allocate required_length bytes and
+//     // try again.
+//     // ... see above for example ...
+//   } else if (iree_status_is_ok(inline_status)) {
+//     // Fit inside inline_buffer, required_length contains actual length.
+//   } else {
+//     return inline_status;
+//   }
+
+#ifndef IREE_BASE_API_H_
+#define IREE_BASE_API_H_
+
+#include "iree/base/alignment.h"       // IWYU pragma: export
+#include "iree/base/allocator.h"       // IWYU pragma: export
+#include "iree/base/assert.h"          // IWYU pragma: export
+#include "iree/base/attributes.h"      // IWYU pragma: export
+#include "iree/base/bitfield.h"        // IWYU pragma: export
+#include "iree/base/config.h"          // IWYU pragma: export
+#include "iree/base/loop.h"            // IWYU pragma: export
+#include "iree/base/loop_inline.h"     // IWYU pragma: export
+#include "iree/base/status.h"          // IWYU pragma: export
+#include "iree/base/string_builder.h"  // IWYU pragma: export
+#include "iree/base/string_view.h"     // IWYU pragma: export
+#include "iree/base/time.h"            // IWYU pragma: export
+#include "iree/base/wait_source.h"     // IWYU pragma: export
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// IREE Core API
+//===----------------------------------------------------------------------===//
+
+// Sprinkle this wherever to make it easier to find structs/functions that are
+// not yet stable.
+#define IREE_API_UNSTABLE
+
+// Known versions of the API that can be referenced in code.
+// Out-of-bounds values are possible in forward-versioned changes.
+typedef enum iree_api_version_e {
+  IREE_API_VERSION_0 = 0,
+  // Always set to the latest version of the library from source.
+  IREE_API_VERSION_LATEST = IREE_API_VERSION_0,
+} iree_api_version_t;
+
+// Checks whether the |expected_version| of the caller matches the implemented
+// version of |out_actual_version|. Forward compatibility of the API is
+// supported but backward compatibility is not: newer binaries using older
+// shared libraries of the runtime will fail.
+//
+// Returns IREE_STATUS_OUT_OF_RANGE if the actual version is not compatible with
+// the expected version.
+IREE_API_EXPORT iree_status_t
+iree_api_version_check(iree_api_version_t expected_version,
+                       iree_api_version_t* out_actual_version);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_API_H_
diff --git a/runtime/src/iree/base/assert.h b/runtime/src/iree/base/assert.h
new file mode 100644
index 0000000..930baab
--- /dev/null
+++ b/runtime/src/iree/base/assert.h
@@ -0,0 +1,77 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_ASSERT_H_
+#define IREE_BASE_ASSERT_H_
+
+#include <assert.h>
+
+#include "iree/base/config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// IREE_ASSERT macros
+//===----------------------------------------------------------------------===//
+// These are no-oped in builds with NDEBUG defined (by default anything but
+// `-c dbg`/`-DCMAKE_BUILD_TYPE=Debug`). They differ from assert in that
+// they avoid unused variable warnings when NDEBUG is defined. As with normal
+// assert() ensure that side-effecting behavior is avoided as the expression
+// will not be evaluated when the asserts are removed!
+
+#if defined(NDEBUG)  // N(o) DEBUG
+
+// Assertions disabled:
+
+#define IREE_ASSERT(condition, ...) \
+  while (false && (condition)) {    \
+  }
+
+// TODO(benvanik): replace the status_matchers version with a test macro.
+// #define IREE_ASSERT_OK(status) IREE_ASSERT(iree_status_is_ok(status))
+
+// However, we still want the compiler to parse x and y because
+// we don't want to lose potentially useful errors and warnings
+// (and want to hide unused variable warnings when asserts are disabled).
+// _IREE_ASSERT_CMP is a helper and should not be used outside of this file.
+#define _IREE_ASSERT_CMP(x, op, y, ...)        \
+  while (false && ((void)(x), (void)(y), 0)) { \
+  }
+
+#else
+
+// Assertions enabled:
+
+#define IREE_ASSERT(condition, ...) assert(condition)
+
+// TODO(#2843): better logging of status assertions.
+// #define IREE_ASSERT_OK(status) IREE_ASSERT(iree_status_is_ok(status))
+
+#define _IREE_ASSERT_CMP(x, op, y, ...) IREE_ASSERT(((x)op(y)), __VA_ARGS__)
+
+#endif  // NDEBUG
+
+// Asserts that a required function argument is truthy (e.g. non-NULL).
+#define IREE_ASSERT_ARGUMENT(name) IREE_ASSERT(name)
+
+// Asserts that |expr| evaluates to true (resp. false).
+#define IREE_ASSERT_TRUE(expr, ...) IREE_ASSERT(!!(expr), __VA_ARGS__)
+#define IREE_ASSERT_FALSE(expr, ...) IREE_ASSERT(!(expr), __VA_ARGS__)
+
+// Asserts if control flow ever reaches this statement.
+#define IREE_ASSERT_UNREACHABLE(...) IREE_ASSERT(false, __VA_ARGS__)
+
+// Comparison asserts; when assertions are disabled the operands are still
+// parsed (but not evaluated) so unused-variable warnings are avoided.
+#define IREE_ASSERT_EQ(x, y, ...) _IREE_ASSERT_CMP(x, ==, y, __VA_ARGS__)
+#define IREE_ASSERT_NE(x, y, ...) _IREE_ASSERT_CMP(x, !=, y, __VA_ARGS__)
+#define IREE_ASSERT_LE(x, y, ...) _IREE_ASSERT_CMP(x, <=, y, __VA_ARGS__)
+#define IREE_ASSERT_LT(x, y, ...) _IREE_ASSERT_CMP(x, <, y, __VA_ARGS__)
+#define IREE_ASSERT_GE(x, y, ...) _IREE_ASSERT_CMP(x, >=, y, __VA_ARGS__)
+#define IREE_ASSERT_GT(x, y, ...) _IREE_ASSERT_CMP(x, >, y, __VA_ARGS__)
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_ASSERT_H_
diff --git a/runtime/src/iree/base/attributes.h b/runtime/src/iree/base/attributes.h
new file mode 100644
index 0000000..bd396a9
--- /dev/null
+++ b/runtime/src/iree/base/attributes.h
@@ -0,0 +1,194 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_ATTRIBUTES_H_
+#define IREE_BASE_ATTRIBUTES_H_
+
+#include "iree/base/target_platform.h"
+
+//===----------------------------------------------------------------------===//
+// API/ABI interop
+//===----------------------------------------------------------------------===//
+
+// Denotes a method exported by the IREE API.
+// Any call annotated with this will be relatively stable.
+// Calls without this are considered private to the IREE implementation and
+// should not be relied upon.
+#ifdef __cplusplus
+#define IREE_API_EXPORT extern "C"
+#else
+#define IREE_API_EXPORT
+#endif  // __cplusplus
+
+// Denotes a function pointer that is exposed as part of the IREE API.
+// Example:
+//   iree_status_t(IREE_API_PTR* some_callback)(int value);
+#define IREE_API_PTR
+
+//===----------------------------------------------------------------------===//
+// IREE_HAVE_ATTRIBUTE
+//===----------------------------------------------------------------------===//
+
+// Queries for [[attribute]] identifiers in modern compilers.
+#ifdef __has_attribute
+#define IREE_HAVE_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define IREE_HAVE_ATTRIBUTE(x) 0
+#endif  // __has_attribute
+
+//===----------------------------------------------------------------------===//
+// IREE_PRINTF_ATTRIBUTE
+//===----------------------------------------------------------------------===//
+
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc-4.7.0/gcc/Function-Attributes.html>.
+#if IREE_HAVE_ATTRIBUTE(format) || (defined(__GNUC__) && !defined(__clang__))
+#define IREE_PRINTF_ATTRIBUTE(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#else
+// TODO(benvanik): use _Printf_format_string_ in SAL for MSVC.
+#define IREE_PRINTF_ATTRIBUTE(string_index, first_to_check)
+#endif  // IREE_HAVE_ATTRIBUTE
+
+//===----------------------------------------------------------------------===//
+// IREE_ATTRIBUTE_NORETURN
+//===----------------------------------------------------------------------===//
+
+// Tells the compiler that a given function never returns.
+#if IREE_HAVE_ATTRIBUTE(noreturn) || (defined(__GNUC__) && !defined(__clang__))
+#define IREE_ATTRIBUTE_NORETURN __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define IREE_ATTRIBUTE_NORETURN __declspec(noreturn)
+#else
+#define IREE_ATTRIBUTE_NORETURN
+#endif  // IREE_HAVE_ATTRIBUTE(noreturn)
+
+//===----------------------------------------------------------------------===//
+// IREE_MUST_USE_RESULT
+//===----------------------------------------------------------------------===//
+
+// Annotation for function return values that ensures that they are used by the
+// caller.
+#if IREE_HAVE_ATTRIBUTE(nodiscard)
+#define IREE_MUST_USE_RESULT [[nodiscard]]
+#elif (defined(__clang__) && IREE_HAVE_ATTRIBUTE(warn_unused_result)) || \
+    (defined(__GNUC__) && (__GNUC__ >= 4))
+#define IREE_MUST_USE_RESULT __attribute__((warn_unused_result))
+#elif defined(_MSC_VER) && (_MSC_VER >= 1700)
+#define IREE_MUST_USE_RESULT _Check_return_
+#else
+#define IREE_MUST_USE_RESULT
+#endif  // IREE_HAVE_ATTRIBUTE(nodiscard)
+
+//===----------------------------------------------------------------------===//
+// IREE_RESTRICT
+//===----------------------------------------------------------------------===//
+
+// `restrict` keyword, not supported by some older compilers.
+// We define our own macro in case dependencies use `restrict` differently.
+#if defined(_MSC_VER) && _MSC_VER >= 1900
+#define IREE_RESTRICT __restrict
+#elif defined(_MSC_VER)
+#define IREE_RESTRICT
+#elif defined(__cplusplus)
+#define IREE_RESTRICT __restrict__
+#else
+#define IREE_RESTRICT restrict
+#endif  // _MSC_VER
+
+//===----------------------------------------------------------------------===//
+// IREE_ATTRIBUTE_ALWAYS_INLINE / IREE_ATTRIBUTE_NOINLINE
+//===----------------------------------------------------------------------===//
+
+// Forces functions to either inline or not inline. Introduced in gcc 3.1.
+#if IREE_HAVE_ATTRIBUTE(always_inline) || \
+    (defined(__GNUC__) && !defined(__clang__))
+#define IREE_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
+#else
+#define IREE_ATTRIBUTE_ALWAYS_INLINE
+#endif  // IREE_HAVE_ATTRIBUTE(always_inline)
+
+#if IREE_HAVE_ATTRIBUTE(noinline) || (defined(__GNUC__) && !defined(__clang__))
+#define IREE_ATTRIBUTE_NOINLINE __attribute__((noinline))
+#else
+#define IREE_ATTRIBUTE_NOINLINE
+#endif  // IREE_HAVE_ATTRIBUTE(noinline)
+
+//===----------------------------------------------------------------------===//
+// IREE_ATTRIBUTE_HOT / IREE_ATTRIBUTE_COLD
+//===----------------------------------------------------------------------===//
+
+// Tells GCC that a function is hot or cold. GCC can use this information to
+// improve static analysis, i.e. a conditional branch to a cold function
+// is likely to be not-taken.
+// This annotation is used for function declarations.
+//
+// Example:
+//   int foo() IREE_ATTRIBUTE_HOT;
+#if IREE_HAVE_ATTRIBUTE(hot) || (defined(__GNUC__) && !defined(__clang__))
+#define IREE_ATTRIBUTE_HOT __attribute__((hot))
+#else
+#define IREE_ATTRIBUTE_HOT
+#endif  // IREE_HAVE_ATTRIBUTE(hot)
+
+#if IREE_HAVE_ATTRIBUTE(cold) || (defined(__GNUC__) && !defined(__clang__))
+#define IREE_ATTRIBUTE_COLD __attribute__((cold))
+#else
+#define IREE_ATTRIBUTE_COLD
+#endif  // IREE_HAVE_ATTRIBUTE(cold)
+
+//===----------------------------------------------------------------------===//
+// IREE_LIKELY / IREE_UNLIKELY
+//===----------------------------------------------------------------------===//
+
+// Compiler hint that can be used to indicate conditions that are very very very
+// likely or unlikely. This is most useful for ensuring that unlikely cases such
+// as error handling are moved off the mainline code path such that the code is
+// only paged in when an error occurs.
+//
+// Example:
+//   if (IREE_UNLIKELY(something_failed)) {
+//     return do_expensive_error_logging();
+//   }
+#if defined(__GNUC__) || defined(__clang__)
+#define IREE_LIKELY(x) (__builtin_expect(!!(x), 1))
+#define IREE_UNLIKELY(x) (__builtin_expect(!!(x), 0))
+#else
+#define IREE_LIKELY(x) (x)
+#define IREE_UNLIKELY(x) (x)
+#endif  // __GNUC__ / __clang__
+
+//===----------------------------------------------------------------------===//
+// IREE_ATTRIBUTE_PACKED
+//===----------------------------------------------------------------------===//
+
+#if IREE_HAVE_ATTRIBUTE(packed) || (defined(__GNUC__) && !defined(__clang__))
+#define IREE_ATTRIBUTE_PACKED __attribute__((__packed__))
+#else
+#define IREE_ATTRIBUTE_PACKED
+#endif  // IREE_HAVE_ATTRIBUTE(packed)
+
+//===----------------------------------------------------------------------===//
+// IREE_ATTRIBUTE_UNUSED
+//===----------------------------------------------------------------------===//
+
+// Hints that a variable is _maybe_ unused. This is primarily to quiet
+// diagnostic messages about unused variables that crop up around variables
+// passed to assert/logging/etc that gets stripped in certain configurations.
+//
+// Example:
+//   int some_info IREE_ATTRIBUTE_UNUSED = compute_debug_info();
+//   assert(some_info > 0);  // stripped in NDEBUG
+#if IREE_HAVE_ATTRIBUTE(maybe_unused) && defined(__clang__)
+#define IREE_ATTRIBUTE_UNUSED __attribute__((maybe_unused))
+#elif IREE_HAVE_ATTRIBUTE(unused) || (defined(__GNUC__) && !defined(__clang__))
+#define IREE_ATTRIBUTE_UNUSED __attribute__((unused))
+#else
+#define IREE_ATTRIBUTE_UNUSED
+#endif  // IREE_HAVE_ATTRIBUTE(maybe_unused / unused)
+
+#endif  // IREE_BASE_ATTRIBUTES_H_
diff --git a/runtime/src/iree/base/bitfield.c b/runtime/src/iree/base/bitfield.c
new file mode 100644
index 0000000..15a46b0
--- /dev/null
+++ b/runtime/src/iree/base/bitfield.c
@@ -0,0 +1,55 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/bitfield.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+IREE_API_EXPORT iree_status_t iree_bitfield_format(
+    uint32_t value, const iree_bitfield_string_mapping_t* mappings,
+    iree_host_size_t mapping_count, iree_string_builder_t* string_builder) {
+  uint32_t remaining_bits = value;
+  int i = 0;
+  for (iree_host_size_t mapping_index = 0; mapping_index < mapping_count;
+       ++mapping_index) {
+    const iree_bitfield_string_mapping_t mapping = mappings[mapping_index];
+    if ((remaining_bits & mapping.bits) == mapping.bits) {
+      if (i > 0) {
+        IREE_RETURN_IF_ERROR(
+            iree_string_builder_append_string(string_builder, IREE_SV("|")));
+      }
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_string(string_builder, mapping.string));
+      remaining_bits &= ~mapping.bits;
+      ++i;
+    }
+  }
+  if (remaining_bits != 0u) {
+    if (i > 0) {
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_string(string_builder, IREE_SV("|")));
+    }
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+        string_builder, "%Xh", remaining_bits));
+  }
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_string_view_t iree_bitfield_format_inline(
+    uint32_t value, const iree_bitfield_string_mapping_t* mappings,
+    iree_host_size_t mapping_count, iree_bitfield_string_temp_t* out_temp) {
+  iree_string_builder_t string_builder;
+  iree_string_builder_initialize_with_storage(
+      out_temp->buffer, IREE_ARRAYSIZE(out_temp->buffer), &string_builder);
+  iree_status_t status =
+      iree_bitfield_format(value, mappings, mapping_count, &string_builder);
+  if (iree_status_is_ok(status)) {
+    return iree_string_builder_view(&string_builder);
+  }
+  iree_status_ignore(status);
+  return IREE_SV("(error)");
+}
diff --git a/runtime/src/iree/base/bitfield.h b/runtime/src/iree/base/bitfield.h
new file mode 100644
index 0000000..e67fce5
--- /dev/null
+++ b/runtime/src/iree/base/bitfield.h
@@ -0,0 +1,85 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_BITFIELD_H_
+#define IREE_BASE_BITFIELD_H_
+
+#include "iree/base/attributes.h"
+#include "iree/base/string_builder.h"
+#include "iree/base/string_view.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Bitfield utilities
+//===----------------------------------------------------------------------===//
+
+// Returns true if any bit from |rhs| is set in |lhs|.
+#define iree_any_bit_set(lhs, rhs) (((lhs) & (rhs)) != 0)
+// Returns true iff all bits from |rhs| are set in |lhs|.
+#define iree_all_bits_set(lhs, rhs) (((lhs) & (rhs)) == (rhs))
+
+// Maps bits within a bitfield to a string literal.
+typedef struct iree_bitfield_string_mapping_t {
+  uint32_t bits;
+  iree_string_view_t string;
+} iree_bitfield_string_mapping_t;
+
+// Appends the formatted contents of the given bitfield value.
+// Processes values in the order of the mapping table provided and will only
+// use each bit once. Use this to prioritize combined flags over split ones.
+//
+// Usage:
+//  // Static mapping table:
+//  static const iree_bitfield_string_mapping_t my_bitfield_mappings[] = {
+//    {MY_BITFIELD_ALL, IREE_SVL("ALL")},  // combined flags first
+//    {MY_BITFIELD_A,   IREE_SVL("A")},
+//    {MY_BITFIELD_B,   IREE_SVL("B")},
+//    {MY_BITFIELD_C,   IREE_SVL("C")},
+//  };
+//
+//  // Produces the string "A|B":
+//  IREE_RETURN_IF_ERROR(iree_bitfield_format(
+//      MY_BITFIELD_A | MY_BITFIELD_B,
+//      my_bitfield_mappings, IREE_ARRAYSIZE(my_bitfield_mappings),
+//      &string_builder));
+//
+//  // Produces the string "ALL":
+//  IREE_RETURN_IF_ERROR(iree_bitfield_format(
+//      MY_BITFIELD_A | MY_BITFIELD_B | MY_BITFIELD_C,
+//      my_bitfield_mappings, IREE_ARRAYSIZE(my_bitfield_mappings),
+//      &string_builder));
+IREE_API_EXPORT iree_status_t iree_bitfield_format(
+    uint32_t value, const iree_bitfield_string_mapping_t* mappings,
+    iree_host_size_t mapping_count, iree_string_builder_t* string_builder);
+
+// Stack storage for iree_bitfield_format_inline temporary strings.
+typedef struct iree_bitfield_string_temp_t {
+  char buffer[128];
+} iree_bitfield_string_temp_t;
+
+// Appends the formatted contents of the given bitfield value.
+// As with iree_bitfield_format only the storage for the formatted string is
+// allocated inline on the stack.
+//
+// Usage:
+//  // Produces the string "A|B":
+//  iree_bitfield_string_temp_t temp;
+//  iree_string_view_t my_str = iree_bitfield_format_inline(
+//      MY_BITFIELD_A | MY_BITFIELD_B,
+//      my_bitfield_mappings, IREE_ARRAYSIZE(my_bitfield_mappings),
+//      &temp);
+IREE_API_EXPORT iree_string_view_t iree_bitfield_format_inline(
+    uint32_t value, const iree_bitfield_string_mapping_t* mappings,
+    iree_host_size_t mapping_count, iree_bitfield_string_temp_t* out_temp);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_BITFIELD_H_
diff --git a/runtime/src/iree/base/bitfield_test.cc b/runtime/src/iree/base/bitfield_test.cc
new file mode 100644
index 0000000..c6e9356
--- /dev/null
+++ b/runtime/src/iree/base/bitfield_test.cc
@@ -0,0 +1,83 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <string>
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace {
+
+enum my_bitfield_e {
+  MY_BITFIELD_NONE = 0,
+  MY_BITFIELD_A = 1 << 0,
+  MY_BITFIELD_B = 1 << 1,
+  MY_BITFIELD_ALL = MY_BITFIELD_A | MY_BITFIELD_B,
+};
+typedef uint32_t my_bitfield_t;
+
+template <size_t mapping_count>
+std::string FormatBitfieldValue(
+    uint32_t value,
+    const iree_bitfield_string_mapping_t (&mappings)[mapping_count]) {
+  iree_bitfield_string_temp_t temp;
+  auto sv = iree_bitfield_format_inline(value, mappings, mapping_count, &temp);
+  return std::string(sv.data, sv.size);
+}
+
+// Tests general usage.
+TEST(BitfieldTest, FormatBitfieldValue) {
+  static const iree_bitfield_string_mapping_t mappings[] = {
+      {MY_BITFIELD_A, IREE_SV("A")},
+      {MY_BITFIELD_B, IREE_SV("B")},
+  };
+  EXPECT_EQ("", FormatBitfieldValue(MY_BITFIELD_NONE, mappings));
+  EXPECT_EQ("A", FormatBitfieldValue(MY_BITFIELD_A, mappings));
+  EXPECT_EQ("A|B",
+            FormatBitfieldValue(MY_BITFIELD_A | MY_BITFIELD_B, mappings));
+}
+
+// Tests that empty mapping tables are fine.
+TEST(BitfieldTest, FormatBitfieldValueEmpty) {
+  static const iree_bitfield_string_mapping_t mappings[1] = {
+      {0, IREE_SV("UNUSED")},
+  };
+  iree_bitfield_string_temp_t temp;
+  auto sv = iree_bitfield_format_inline(MY_BITFIELD_NONE, mappings, 0, &temp);
+  EXPECT_TRUE(iree_string_view_is_empty(sv));
+}
+
+// Tests that values not found in the mappings are still displayed.
+TEST(BitfieldTest, FormatBitfieldValueUnhandledValues) {
+  EXPECT_EQ("A|2h", FormatBitfieldValue(MY_BITFIELD_A | MY_BITFIELD_B,
+                                        {
+                                            {MY_BITFIELD_A, IREE_SV("A")},
+                                        }));
+}
+
+// Tests priority order in the mapping table.
+TEST(BitfieldTest, FormatBitfieldValuePriority) {
+  // No priority, will do separate.
+  EXPECT_EQ("A|B", FormatBitfieldValue(MY_BITFIELD_A | MY_BITFIELD_B,
+                                       {
+                                           {MY_BITFIELD_A, IREE_SV("A")},
+                                           {MY_BITFIELD_B, IREE_SV("B")},
+                                           {MY_BITFIELD_ALL, IREE_SV("ALL")},
+                                       }));
+
+  // Priority on the combined flag, use that instead.
+  EXPECT_EQ("ALL", FormatBitfieldValue(MY_BITFIELD_A | MY_BITFIELD_B,
+                                       {
+                                           {MY_BITFIELD_ALL, IREE_SV("ALL")},
+                                           {MY_BITFIELD_A, IREE_SV("A")},
+                                           {MY_BITFIELD_B, IREE_SV("B")},
+                                       }));
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/base/config.h b/runtime/src/iree/base/config.h
new file mode 100644
index 0000000..49a7a93
--- /dev/null
+++ b/runtime/src/iree/base/config.h
@@ -0,0 +1,254 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//===----------------------------------------------------------------------===//
+//
+//         ██     ██  █████  ██████  ███    ██ ██ ███    ██  ██████
+//         ██     ██ ██   ██ ██   ██ ████   ██ ██ ████   ██ ██
+//         ██  █  ██ ███████ ██████  ██ ██  ██ ██ ██ ██  ██ ██   ███
+//         ██ ███ ██ ██   ██ ██   ██ ██  ██ ██ ██ ██  ██ ██ ██    ██
+//          ███ ███  ██   ██ ██   ██ ██   ████ ██ ██   ████  ██████
+//
+//===----------------------------------------------------------------------===//
+//
+// This file controls global configuration parameters used throughout IREE.
+// Each option added here should be considered something worth enabling an
+// entirely new testing configuration to test and may involve fanning out many
+// configurations depending on which flags are mutually non-exclusive.
+// Err on the side of using runtime flags for options that have minimal impact
+// to code size or toolchain requirements of our more constrained targets.
+//
+// Examples of good configuration settings:
+// - remote HAL device pointer size (cannot be inferred from local config)
+// - no-op override on synchronization primitives (unsafe, untested)
+//
+// Examples of bad configuration settings:
+// - which HAL backend to use (better as build configuration; link what you use)
+
+#ifndef IREE_BASE_CONFIG_H_
+#define IREE_BASE_CONFIG_H_
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#include "iree/base/target_platform.h"
+
+//===----------------------------------------------------------------------===//
+// User configuration overrides
+//===----------------------------------------------------------------------===//
+// A user include file always included prior to any IREE configuration. This is
+// used to override the default configuration in this file without needing to
+// modify the IREE code.
+//
+// Specify a custom file with `-DIREE_USER_CONFIG_H="my_config.h"`.
+
+#if defined(IREE_USER_CONFIG_H)
+#include IREE_USER_CONFIG_H
+#endif  // IREE_USER_CONFIG_H
+
+//===----------------------------------------------------------------------===//
+// Pointer size specification
+//===----------------------------------------------------------------------===//
+// IREE uses two pointer classes throughout its code:
+//
+//  `iree_host_size_t`:
+//    The native pointer size of the local "host" code. This is always C's
+//    size_t but is aliased to make it easier to differentiate from
+//    "unspecified" size_t and iree_device_size_t. Always prefer using this for
+//    sizes of pointers that never leave the host.
+//
+//  `iree_device_size_t`:
+//    The pointer size - possibly larger than needed - for remote "device" code.
+//    As the host and device may be running on entirely different machines it is
+//    often best to use a conservative value for this: a 32-bit host may be
+//    submitting work for a 64-bit device, and using a 32-bit size_t for device
+//    pointers would truncate bits and prevent round-tripping.
+//
+// The specific values for these can be overridden with configuration settings:
+
+#if !defined(IREE_HOST_SIZE_T)
+#define IREE_HOST_SIZE_T size_t
+#define PRIhsz "zu"
+#endif  // !IREE_HOST_SIZE_T
+
+// Size, in bytes, of a buffer on the local host.
+typedef IREE_HOST_SIZE_T iree_host_size_t;
+
+// Maximum representable value in iree_host_size_t.
+#define IREE_HOST_SIZE_MAX \
+  (sizeof(iree_host_size_t) == 4 ? UINT32_MAX : UINT64_MAX)
+
+#if !defined(IREE_DEVICE_SIZE_T)
+#define IREE_DEVICE_SIZE_T uint64_t
+#define PRIdsz PRIu64
+#endif  // !IREE_DEVICE_SIZE_T
+
+// Size, in bytes, of a buffer on remote devices.
+typedef IREE_DEVICE_SIZE_T iree_device_size_t;
+
+// Maximum representable value in iree_device_size_t.
+#define IREE_DEVICE_SIZE_MAX \
+  (sizeof(iree_device_size_t) == 4 ? UINT32_MAX : UINT64_MAX)
+
+//===----------------------------------------------------------------------===//
+// iree_status_t configuration
+//===----------------------------------------------------------------------===//
+// Controls how much information an iree_status_t carries. When set to 0 all of
+// iree_status_t will be turned into just integer results that will never
+// allocate and all string messages will be stripped. Of course, this isn't
+// very useful and the higher modes should be preferred unless binary size is
+// a major concern.
+//
+// IREE_STATUS_MODE = 0: statuses are just integers
+// IREE_STATUS_MODE = 1: statuses have source location of error
+// IREE_STATUS_MODE = 2: statuses also have custom annotations
+// IREE_STATUS_MODE = 3: statuses also have stack traces of the error site
+
+// If no status mode override is provided we'll change the behavior based on
+// build configuration.
+#if !defined(IREE_STATUS_MODE)
+#ifdef NDEBUG
+// Release mode: source location and annotations.
+#define IREE_STATUS_MODE 2
+#else
+// Debug mode: annotations and stack traces.
+#define IREE_STATUS_MODE 3
+#endif  // NDEBUG
+#endif  // !IREE_STATUS_MODE
+
+//===----------------------------------------------------------------------===//
+// Synchronization and threading
+//===----------------------------------------------------------------------===//
+// On ultra-tiny systems where there may only be a single core - or a single
+// core that is guaranteed to ever call an IREE API - all synchronization
+// primitives used throughout IREE can be turned into no-ops. Note that behavior
+// is undefined if there is use of any `iree_*` API call or memory that is
+// owned by IREE from multiple threads concurrently or across threads without
+// proper barriers in place. Unless your target system is in a similar class to
+// an Arduino this is definitely not what you want.
+
+#if !defined(IREE_SYNCHRONIZATION_DISABLE_UNSAFE)
+#define IREE_SYNCHRONIZATION_DISABLE_UNSAFE 0
+#endif  // !IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+//===----------------------------------------------------------------------===//
+// File I/O
+//===----------------------------------------------------------------------===//
+// On platforms without file systems or in applications where no file I/O
+// utilities are used, all file I/O operations can be stripped out. Functions
+// relying on file I/O will still be defined, but they will return errors.
+
+#if !defined(IREE_FILE_IO_ENABLE)
+#define IREE_FILE_IO_ENABLE 1
+#endif  // !IREE_FILE_IO_ENABLE
+
+//===----------------------------------------------------------------------===//
+// Statistics/reporting
+//===----------------------------------------------------------------------===//
+// Conditionally enables programmatic access to aggregate statistics. When
+// enabled statistics requires additional per-operation logic and per-resource
+// state that can bloat otherwise minimal structures. Shared resources may also
+// require synchronization where there otherwise would not be any.
+
+#if !defined(IREE_STATISTICS_ENABLE)
+#define IREE_STATISTICS_ENABLE 1
+#endif  // !IREE_STATISTICS_ENABLE
+
+//===----------------------------------------------------------------------===//
+// IREE HAL configuration
+//===----------------------------------------------------------------------===//
+// Enables optional HAL features. Each of these may add several KB to the final
+// binary when linked dynamically.
+
+#if !defined(IREE_HAL_HEAP_BUFFER_ALIGNMENT)
+// Power of two byte alignment required on all host heap buffers.
+// Executables are compiled with alignment expectations and the runtime
+// alignment must be greater than or equal to the alignment set in the compiler.
+// External buffers wrapped by HAL buffers must meet this alignment requirement.
+#define IREE_HAL_HEAP_BUFFER_ALIGNMENT 64
+#endif  // IREE_HAL_HEAP_BUFFER_ALIGNMENT
+
+#if !defined(IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE)
+// Enables additional validation of commands issued against command buffers.
+// This adds small amounts of per-command overhead but in all but the most
+// constrained environments it's recommended to keep it enabled in order to get
+// the really nice error messages.
+#define IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE 1
+#endif  // IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+
+#if !defined(IREE_HAL_MODULE_STRING_UTIL_ENABLE)
+// Enables HAL module methods that perform string printing/parsing.
+// This functionality pulls in a large amount of string manipulation code that
+// can be elided if these ops will not be used at runtime. When disabled
+// applications can still call the parse/print routines directly but compiled
+// modules can not.
+#define IREE_HAL_MODULE_STRING_UTIL_ENABLE 1
+#endif  // IREE_HAL_MODULE_STRING_UTIL_ENABLE
+
+//===----------------------------------------------------------------------===//
+// IREE VM configuration
+//===----------------------------------------------------------------------===//
+// Enables optional VM features. Each of these adds a few KB to the final binary
+// when using the IREE VM. The compiler must be configured to the same set of
+// available extensions in order to ensure that the compiled modules only use
+// features available on the target they are to run on.
+//
+// See the `-iree-vm-target-extension-*` compiler options for more information.
+
+#if !defined(IREE_VM_BACKTRACE_ENABLE)
+// Enables backtraces in VM failures when debugging information is available.
+#define IREE_VM_BACKTRACE_ENABLE 1
+#endif  // !IREE_VM_BACKTRACE_ENABLE
+
+#if !defined(IREE_VM_EXECUTION_TRACING_ENABLE)
+// Enables disassembly of vm bytecode functions and stderr dumping of execution.
+// Increases code size quite a bit, lowers VM performance, and is generally
+// include only when debugging or running on trusted inputs.
+#ifdef NDEBUG
+#define IREE_VM_EXECUTION_TRACING_ENABLE 0
+#else
+#define IREE_VM_EXECUTION_TRACING_ENABLE 1
+#endif  // NDEBUG
+#endif  // !IREE_VM_EXECUTION_TRACING_ENABLE
+
+#if !defined(IREE_VM_EXECUTION_TRACING_FORCE_ENABLE)
+// Forces tracing of VM execution by default ignoring runtime flags that may
+// otherwise control the behavior. This can be used to enable tracing in tools
+// that do not have flag parsing or plumbing for per-invocation flags.
+#define IREE_VM_EXECUTION_TRACING_FORCE_ENABLE 0
+#endif  // !IREE_VM_EXECUTION_TRACING_FORCE_ENABLE
+#if IREE_VM_EXECUTION_TRACING_FORCE_ENABLE
+#define IREE_VM_EXECUTION_TRACING_ENABLE 1
+#endif  // IREE_VM_EXECUTION_TRACING_FORCE_ENABLE
+
+#if !defined(IREE_VM_EXECUTION_TRACING_SRC_LOC_ENABLE)
+// Enables printing of the source location of an op when tracing its execution.
+// This may be messy depending on the origin of the locations in the program;
+// for example today the python locs are entire stack traces. Improvements to
+// printing of more complex source locations (or a way to prune them in the
+// compiler) would let this be turned on by default.
+#define IREE_VM_EXECUTION_TRACING_SRC_LOC_ENABLE 0
+#endif  // !IREE_VM_EXECUTION_TRACING_SRC_LOC_ENABLE
+
+#if !defined(IREE_VM_EXT_I64_ENABLE)
+// Enables the 64-bit integer instruction extension.
+// Targeted from the compiler with `-iree-vm-target-extension-i64`.
+#define IREE_VM_EXT_I64_ENABLE 1
+#endif  // !IREE_VM_EXT_I64_ENABLE
+
+#if !defined(IREE_VM_EXT_F32_ENABLE)
+// Enables the 32-bit floating-point instruction extension.
+// Targeted from the compiler with `-iree-vm-target-extension-f32`.
+#define IREE_VM_EXT_F32_ENABLE 1
+#endif  // !IREE_VM_EXT_F32_ENABLE
+
+#if !defined(IREE_VM_EXT_F64_ENABLE)
+// Enables the 64-bit floating-point instruction extension.
+// Targeted from the compiler with `-iree-vm-target-extension-f64`.
+#define IREE_VM_EXT_F64_ENABLE 0
+#endif  // !IREE_VM_EXT_F64_ENABLE
+
+#endif  // IREE_BASE_CONFIG_H_
diff --git a/runtime/src/iree/base/internal/BUILD b/runtime/src/iree/base/internal/BUILD
new file mode 100644
index 0000000..725bc0d
--- /dev/null
+++ b/runtime/src/iree/base/internal/BUILD
@@ -0,0 +1,414 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Implementations for iree/base/.
+# These are not part of the IREE API. Though they may be used by external
+# projects their API may change at any time.
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library", "iree_runtime_cc_test")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
+load("//build_tools/bazel:cc_binary_benchmark.bzl", "cc_binary_benchmark")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+#===------------------------------------------------------------------------===#
+# Core headers (internal)
+#===------------------------------------------------------------------------===#
+# Put files here that large percentages of the code include only; adding
+# infrequently used files here will cause extraneous large rebuilds.
+
+iree_runtime_cc_library(
+    name = "internal",
+    srcs = [
+        "atomics_clang.h",
+        "atomics_disabled.h",
+        "atomics_gcc.h",
+        "atomics_msvc.h",
+    ],
+    hdrs = [
+        "atomics.h",
+        "debugging.h",
+        "inline_array.h",
+        "math.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base:core_headers",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "atomics_test",
+    srcs = ["atomics_test.cc"],
+    deps = [
+        ":internal",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "math_test",
+    srcs = ["math_test.cc"],
+    deps = [
+        ":internal",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+#===------------------------------------------------------------------------===#
+# Utilities
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "arena",
+    srcs = ["arena.c"],
+    hdrs = ["arena.h"],
+    deps = [
+        ":atomic_slist",
+        ":synchronization",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "atomic_slist",
+    srcs = ["atomic_slist.c"],
+    hdrs = ["atomic_slist.h"],
+    deps = [
+        ":internal",
+        ":synchronization",
+        "//runtime/src/iree/base:core_headers",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "atomic_slist_test",
+    srcs = ["atomic_slist_test.cc"],
+    deps = [
+        ":atomic_slist",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "cpu",
+    srcs = ["cpu.c"],
+    hdrs = ["cpu.h"],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "dynamic_library",
+    srcs = [
+        "dynamic_library_posix.c",
+        "dynamic_library_win32.c",
+    ],
+    hdrs = ["dynamic_library.h"],
+    deps = [
+        ":file_path",
+        ":internal",
+        ":synchronization",
+        "//build_tools:default_linkopts",
+        "//build_tools:dl",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "file_io",
+    srcs = ["file_io.c"],
+    hdrs = ["file_io.h"],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "file_io_test",
+    srcs = ["file_io_test.cc"],
+    deps = [
+        ":file_io",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:logging",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "file_path",
+    srcs = ["file_path.c"],
+    hdrs = ["file_path.h"],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "file_path_test",
+    srcs = [
+        "file_path_test.cc",
+    ],
+    deps = [
+        ":file_path",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "flags",
+    srcs = ["flags.c"],
+    hdrs = ["flags.h"],
+    deps = [
+        ":file_io",
+        ":internal",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+cc_binary(
+    name = "flags_demo",
+    srcs = ["flags_demo.c"],
+    deps = [
+        ":flags",
+        "//runtime/src/iree/base",
+    ],
+)
+
+iree_lit_test_suite(
+    name = "flags_test",
+    srcs = ["flags_test.txt"],
+    cfg = "//runtime:lit.cfg.py",
+    tags = ["hostonly"],
+    tools = [
+        ":flags_demo",
+        "@llvm-project//llvm:FileCheck",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "fpu_state",
+    srcs = ["fpu_state.c"],
+    hdrs = ["fpu_state.h"],
+    deps = [
+        ":internal",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+    ],
+)
+
+cc_binary_benchmark(
+    name = "fpu_state_benchmark",
+    srcs = ["fpu_state_benchmark.cc"],
+    deps = [
+        ":fpu_state",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/testing:benchmark_main",
+        "@com_google_benchmark//:benchmark",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "fpu_state_test",
+    srcs = ["fpu_state_test.cc"],
+    deps = [
+        ":fpu_state",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "main",
+    srcs = [
+        "main_posix.c",
+        "main_win32.c",
+    ],
+    hdrs = ["main.h"],
+    deps = [
+        "//runtime/src/iree/base:core_headers",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "prng",
+    hdrs = ["prng.h"],
+    deps = [
+        ":internal",
+        "//runtime/src/iree/base:core_headers",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "prng_test",
+    srcs = ["prng_test.cc"],
+    deps = [
+        ":prng",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "span",
+    hdrs = ["span.h"],
+)
+
+iree_runtime_cc_library(
+    name = "synchronization",
+    srcs = [
+        "synchronization.c",
+    ],
+    hdrs = [
+        "call_once.h",
+        "synchronization.h",
+    ],
+    deps = [
+        ":internal",
+        "//build_tools:default_linkopts",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+cc_binary_benchmark(
+    name = "synchronization_benchmark",
+    testonly = True,
+    srcs = ["synchronization_benchmark.cc"],
+    deps = [
+        ":synchronization",
+        "//runtime/src/iree/testing:benchmark_main",
+        "@com_google_benchmark//:benchmark",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "synchronization_test",
+    srcs = ["synchronization_test.cc"],
+    deps = [
+        ":synchronization",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "wait_handle",
+    srcs = [
+        "wait_handle.c",
+        "wait_handle_epoll.c",
+        "wait_handle_impl.h",
+        "wait_handle_inproc.c",
+        "wait_handle_kqueue.c",
+        "wait_handle_null.c",
+        "wait_handle_poll.c",
+        "wait_handle_posix.c",
+        "wait_handle_posix.h",
+        "wait_handle_win32.c",
+    ],
+    hdrs = ["wait_handle.h"],
+    deps = [
+        ":synchronization",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "wait_handle_test",
+    srcs = ["wait_handle_test.cc"],
+    deps = [
+        ":wait_handle",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+#===------------------------------------------------------------------------===#
+# Utilities with thread dependencies
+#===------------------------------------------------------------------------===#
+
+iree_cmake_extra_content(
+    content = """
+if(NOT ${IREE_ENABLE_THREADING})
+  return()
+endif()
+""",
+    inline = True,
+)
+
+iree_runtime_cc_library(
+    name = "event_pool",
+    srcs = ["event_pool.c"],
+    hdrs = ["event_pool.h"],
+    deps = [
+        ":internal",
+        ":synchronization",
+        ":wait_handle",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "threading",
+    srcs = [
+        "threading.c",
+        "threading_darwin.c",
+        "threading_impl.h",
+        "threading_pthreads.c",
+        "threading_win32.c",
+    ],
+    hdrs = ["threading.h"],
+    deps = [
+        ":internal",
+        ":synchronization",
+        "//build_tools:default_linkopts",
+        "//build_tools:dl",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "threading_test",
+    srcs = [
+        "threading_impl.h",
+        "threading_test.cc",
+    ],
+    deps = [
+        ":internal",
+        ":synchronization",
+        ":threading",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
diff --git a/runtime/src/iree/base/internal/CMakeLists.txt b/runtime/src/iree/base/internal/CMakeLists.txt
new file mode 100644
index 0000000..d0e59ca
--- /dev/null
+++ b/runtime/src/iree/base/internal/CMakeLists.txt
@@ -0,0 +1,430 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/base/internal/BUILD                                         #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    internal
+  HDRS
+    "atomics.h"
+    "debugging.h"
+    "inline_array.h"
+    "math.h"
+  SRCS
+    "atomics_clang.h"
+    "atomics_disabled.h"
+    "atomics_gcc.h"
+    "atomics_msvc.h"
+  DEPS
+    iree::base::core_headers
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    atomics_test
+  SRCS
+    "atomics_test.cc"
+  DEPS
+    ::internal
+    iree::base::core_headers
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    math_test
+  SRCS
+    "math_test.cc"
+  DEPS
+    ::internal
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    arena
+  HDRS
+    "arena.h"
+  SRCS
+    "arena.c"
+  DEPS
+    ::atomic_slist
+    ::synchronization
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    atomic_slist
+  HDRS
+    "atomic_slist.h"
+  SRCS
+    "atomic_slist.c"
+  DEPS
+    ::internal
+    ::synchronization
+    iree::base::core_headers
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    atomic_slist_test
+  SRCS
+    "atomic_slist_test.cc"
+  DEPS
+    ::atomic_slist
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    cpu
+  HDRS
+    "cpu.h"
+  SRCS
+    "cpu.c"
+  DEPS
+    iree::base
+    iree::base::core_headers
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    dynamic_library
+  HDRS
+    "dynamic_library.h"
+  SRCS
+    "dynamic_library_posix.c"
+    "dynamic_library_win32.c"
+  DEPS
+    ${CMAKE_DL_LIBS}
+    ::file_path
+    ::internal
+    ::synchronization
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    file_io
+  HDRS
+    "file_io.h"
+  SRCS
+    "file_io.c"
+  DEPS
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    file_io_test
+  SRCS
+    "file_io_test.cc"
+  DEPS
+    ::file_io
+    iree::base::cc
+    iree::base::core_headers
+    iree::base::logging
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    file_path
+  HDRS
+    "file_path.h"
+  SRCS
+    "file_path.c"
+  DEPS
+    iree::base
+    iree::base::core_headers
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    file_path_test
+  SRCS
+    "file_path_test.cc"
+  DEPS
+    ::file_path
+    iree::base::core_headers
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    flags
+  HDRS
+    "flags.h"
+  SRCS
+    "flags.c"
+  DEPS
+    ::file_io
+    ::internal
+    iree::base
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_binary(
+  NAME
+    flags_demo
+  SRCS
+    "flags_demo.c"
+  DEPS
+    ::flags
+    iree::base
+)
+
+iree_lit_test_suite(
+  NAME
+    flags_test
+  SRCS
+    "flags_test.txt"
+  TOOLS
+    ::flags_demo
+    FileCheck
+  LABELS
+    "hostonly"
+)
+
+iree_cc_library(
+  NAME
+    fpu_state
+  HDRS
+    "fpu_state.h"
+  SRCS
+    "fpu_state.c"
+  DEPS
+    ::internal
+    iree::base
+    iree::base::core_headers
+  PUBLIC
+)
+
+iree_cc_binary_benchmark(
+  NAME
+    fpu_state_benchmark
+  SRCS
+    "fpu_state_benchmark.cc"
+  DEPS
+    ::fpu_state
+    benchmark
+    iree::base
+    iree::testing::benchmark_main
+  TESTONLY
+)
+
+iree_cc_test(
+  NAME
+    fpu_state_test
+  SRCS
+    "fpu_state_test.cc"
+  DEPS
+    ::fpu_state
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    main
+  HDRS
+    "main.h"
+  SRCS
+    "main_posix.c"
+    "main_win32.c"
+  DEPS
+    iree::base::core_headers
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    prng
+  HDRS
+    "prng.h"
+  DEPS
+    ::internal
+    iree::base::core_headers
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    prng_test
+  SRCS
+    "prng_test.cc"
+  DEPS
+    ::prng
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    span
+  HDRS
+    "span.h"
+  DEPS
+
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    synchronization
+  HDRS
+    "call_once.h"
+    "synchronization.h"
+  SRCS
+    "synchronization.c"
+  DEPS
+    ::internal
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_binary_benchmark(
+  NAME
+    synchronization_benchmark
+  SRCS
+    "synchronization_benchmark.cc"
+  DEPS
+    ::synchronization
+    benchmark
+    iree::testing::benchmark_main
+  TESTONLY
+)
+
+iree_cc_test(
+  NAME
+    synchronization_test
+  SRCS
+    "synchronization_test.cc"
+  DEPS
+    ::synchronization
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    wait_handle
+  HDRS
+    "wait_handle.h"
+  SRCS
+    "wait_handle.c"
+    "wait_handle_epoll.c"
+    "wait_handle_impl.h"
+    "wait_handle_inproc.c"
+    "wait_handle_kqueue.c"
+    "wait_handle_null.c"
+    "wait_handle_poll.c"
+    "wait_handle_posix.c"
+    "wait_handle_posix.h"
+    "wait_handle_win32.c"
+  DEPS
+    ::synchronization
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    wait_handle_test
+  SRCS
+    "wait_handle_test.cc"
+  DEPS
+    ::wait_handle
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+if(NOT ${IREE_ENABLE_THREADING})
+  return()
+endif()
+
+iree_cc_library(
+  NAME
+    event_pool
+  HDRS
+    "event_pool.h"
+  SRCS
+    "event_pool.c"
+  DEPS
+    ::internal
+    ::synchronization
+    ::wait_handle
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    threading
+  HDRS
+    "threading.h"
+  SRCS
+    "threading.c"
+    "threading_darwin.c"
+    "threading_impl.h"
+    "threading_pthreads.c"
+    "threading_win32.c"
+  DEPS
+    ${CMAKE_DL_LIBS}
+    ::internal
+    ::synchronization
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    threading_test
+  SRCS
+    "threading_impl.h"
+    "threading_test.cc"
+  DEPS
+    ::internal
+    ::synchronization
+    ::threading
+    iree::base::cc
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/base/internal/arena.c b/runtime/src/iree/base/internal/arena.c
new file mode 100644
index 0000000..81853d4
--- /dev/null
+++ b/runtime/src/iree/base/internal/arena.c
@@ -0,0 +1,227 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/arena.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_arena_block_pool_t
+//===----------------------------------------------------------------------===//
+
+void iree_arena_block_pool_initialize(iree_host_size_t total_block_size,
+                                      iree_allocator_t block_allocator,
+                                      iree_arena_block_pool_t* out_block_pool) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  memset(out_block_pool, 0, sizeof(*out_block_pool));
+  out_block_pool->total_block_size = total_block_size;
+  out_block_pool->usable_block_size =
+      total_block_size - sizeof(iree_arena_block_t);
+  out_block_pool->block_allocator = block_allocator;
+  iree_atomic_arena_block_slist_initialize(&out_block_pool->available_slist);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
// Deinitializes |block_pool|, freeing all pooled blocks back to the allocator.
// All acquired blocks must have been released back to the pool prior to this.
void iree_arena_block_pool_deinitialize(iree_arena_block_pool_t* block_pool) {
  IREE_TRACE_ZONE_BEGIN(z0);

  // Since all blocks must have been released we can just reuse trim (today) as
  // it doesn't retain any blocks.
  iree_arena_block_pool_trim(block_pool);
  iree_atomic_arena_block_slist_deinitialize(&block_pool->available_slist);

  IREE_TRACE_ZONE_END(z0);
}
+
+void iree_arena_block_pool_trim(iree_arena_block_pool_t* block_pool) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_arena_block_t* head = NULL;
+  iree_atomic_arena_block_slist_flush(
+      &block_pool->available_slist,
+      IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &head, NULL);
+  while (head) {
+    void* ptr = (uint8_t*)head - block_pool->usable_block_size;
+    head = head->next;
+    iree_allocator_free(block_pool->block_allocator, ptr);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
// Acquires a single block from |block_pool| into |out_block|, preferring a
// recycled block from the free list and allocating a fresh one only when the
// list is empty. Contents of the returned block are undefined.
iree_status_t iree_arena_block_pool_acquire(iree_arena_block_pool_t* block_pool,
                                            iree_arena_block_t** out_block) {
  IREE_TRACE_ZONE_BEGIN(z0);

  // Fast path: pop a previously-allocated block off the free list.
  iree_arena_block_t* block =
      iree_atomic_arena_block_slist_pop(&block_pool->available_slist);

  if (!block) {
    // No blocks available; allocate one now.
    // Note that it's possible for there to be a race here where one thread
    // releases a block to the pool while we are trying to acquire one - in that
    // case we may end up allocating a block when perhaps we didn't need to but
    // that's fine - it's just one block and the contention means there's likely
    // to be a need for more anyway.
    uint8_t* block_base = NULL;
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0, iree_allocator_malloc_uninitialized(block_pool->block_allocator,
                                                block_pool->total_block_size,
                                                (void**)&block_base));
    // The iree_arena_block_t footer sits at the end of the allocation so that
    // byte 0 of the block keeps the allocator's alignment.
    block = (iree_arena_block_t*)(block_base + block_pool->usable_block_size);
  }

  block->next = NULL;
  *out_block = block;

  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
+
// Releases the chain of blocks [block_head, block_tail] back to |block_pool|
// in a single list splice so callers with pre-linked chains pay O(1).
void iree_arena_block_pool_release(iree_arena_block_pool_t* block_pool,
                                   iree_arena_block_t* block_head,
                                   iree_arena_block_t* block_tail) {
  IREE_TRACE_ZONE_BEGIN(z0);
  iree_atomic_arena_block_slist_concat(&block_pool->available_slist, block_head,
                                       block_tail);
  IREE_TRACE_ZONE_END(z0);
}
+
+//===----------------------------------------------------------------------===//
+// iree_arena_allocator_t
+//===----------------------------------------------------------------------===//
+
+void iree_arena_initialize(iree_arena_block_pool_t* block_pool,
+                           iree_arena_allocator_t* out_arena) {
+  memset(out_arena, 0, sizeof(*out_arena));
+  out_arena->block_pool = block_pool;
+}
+
// Deinitializes |arena|; equivalent to a final reset that returns all blocks
// to the pool and frees any oversized allocations.
void iree_arena_deinitialize(iree_arena_allocator_t* arena) {
  iree_arena_reset(arena);
}
+
+void iree_arena_reset(iree_arena_allocator_t* arena) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  if (arena->allocation_head != NULL) {
+    iree_arena_oversized_allocation_t* head = arena->allocation_head;
+    do {
+      void* ptr = (void*)head;
+      head = head->next;
+      iree_allocator_free(arena->block_pool->block_allocator, ptr);
+    } while (head);
+    arena->allocation_head = NULL;
+  }
+  if (arena->block_head != NULL) {
+    iree_arena_block_pool_release(arena->block_pool, arena->block_head,
+                                  arena->block_tail);
+    arena->block_head = NULL;
+    arena->block_tail = NULL;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
// Allocates |byte_length| bytes from |arena| into |out_ptr|, bump-allocating
// from the current block when it fits and falling back to a dedicated system
// allocation for requests larger than the pool's usable block size. Returned
// memory is uninitialized and lives until iree_arena_reset/deinitialize.
iree_status_t iree_arena_allocate(iree_arena_allocator_t* arena,
                                  iree_host_size_t byte_length,
                                  void** out_ptr) {
  *out_ptr = NULL;

  iree_arena_block_pool_t* block_pool = arena->block_pool;

  if (byte_length > block_pool->usable_block_size) {
    // Oversized allocation that can't be handled by the block pool. We'll
    // allocate directly from the system allocator and track it ourselves for
    // freeing during reset.
    IREE_TRACE_ZONE_BEGIN(z0);
    // Header and payload are a single allocation; the header is the link in
    // the arena's oversized-allocation list walked by iree_arena_reset.
    iree_host_size_t allocation_size =
        sizeof(iree_arena_oversized_allocation_t) + byte_length;
    iree_arena_oversized_allocation_t* allocation = NULL;
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0,
        iree_allocator_malloc_uninitialized(
            block_pool->block_allocator, allocation_size, (void**)&allocation));
    allocation->next = arena->allocation_head;
    arena->allocation_head = allocation;
    arena->total_allocation_size += allocation_size;
    arena->used_allocation_size += byte_length;
    // Caller gets the bytes immediately after the tracking header.
    *out_ptr = (uint8_t*)allocation + sizeof(iree_arena_oversized_allocation_t);
    IREE_TRACE_ZONE_END(z0);
    return iree_ok_status();
  }

  // Pad length allocated so that each pointer bump is always ending at an
  // aligned address and the next allocation will start aligned.
  iree_host_size_t aligned_length =
      iree_host_align(byte_length, iree_max_align_t);

  // Check to see if the current block (if any) has space - if not, get another.
  if (arena->block_head == NULL ||
      arena->block_bytes_remaining < aligned_length) {
    IREE_TRACE_ZONE_BEGIN(z0);
    iree_arena_block_t* block = NULL;
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0, iree_arena_block_pool_acquire(arena->block_pool, &block));
    // Blocks are kept in a LIFO chain; remember the first ever acquired block
    // as the tail so reset can splice the whole chain back in O(1).
    block->next = arena->block_head;
    arena->block_head = block;
    if (!arena->block_tail) arena->block_tail = block;
    arena->total_allocation_size += block_pool->total_block_size;
    arena->block_bytes_remaining = block_pool->usable_block_size;
    IREE_TRACE_ZONE_END(z0);
  }

  // Slice out the allocation from the current block.
  // |block_head| points at the footer at the *end* of the block, so the next
  // free byte is |block_bytes_remaining| bytes before it.
  void* ptr = (uint8_t*)arena->block_head - arena->block_bytes_remaining;
  arena->block_bytes_remaining -= aligned_length;
  arena->used_allocation_size += aligned_length;
  *out_ptr = ptr;
  return iree_ok_status();
}
+
+static iree_status_t iree_arena_allocator_ctl(void* self,
+                                              iree_allocator_command_t command,
+                                              const void* params,
+                                              void** inout_ptr) {
+  iree_arena_allocator_t* arena = (iree_arena_allocator_t*)self;
+  switch (command) {
+    case IREE_ALLOCATOR_COMMAND_MALLOC:
+    case IREE_ALLOCATOR_COMMAND_CALLOC: {
+      const iree_allocator_alloc_params_t* alloc_params =
+          (const iree_allocator_alloc_params_t*)params;
+      IREE_RETURN_IF_ERROR(
+          iree_arena_allocate(arena, alloc_params->byte_length, inout_ptr));
+      if (command == IREE_ALLOCATOR_COMMAND_CALLOC) {
+        memset(*inout_ptr, 0, alloc_params->byte_length);
+      }
+      return iree_ok_status();
+    }
+    case IREE_ALLOCATOR_COMMAND_FREE: {
+      // Do nothing: can't free from an arena.
+      return iree_ok_status();
+    }
+    default:
+      // NOTE: we could try to support IREE_ALLOCATOR_COMMAND_REALLOC, but
+      // it requires the original size to be able to do properly (without
+      // copying memory we shouldn't have access to). For this and other reasons
+      // we very rarely realloc in IREE so having this limitation isn't too bad.
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "unsupported iree_arena_t allocator command");
+  }
+}
+
+iree_allocator_t iree_arena_allocator(iree_arena_allocator_t* arena) {
+  iree_allocator_t v = {
+      .self = arena,
+      .ctl = iree_arena_allocator_ctl,
+  };
+  return v;
+}
diff --git a/runtime/src/iree/base/internal/arena.h b/runtime/src/iree/base/internal/arena.h
new file mode 100644
index 0000000..1d0afae
--- /dev/null
+++ b/runtime/src/iree/base/internal/arena.h
@@ -0,0 +1,153 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
#ifndef IREE_BASE_INTERNAL_ARENA_H_
#define IREE_BASE_INTERNAL_ARENA_H_

#include <stddef.h>

#include "iree/base/api.h"
#include "iree/base/internal/atomic_slist.h"

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

//===----------------------------------------------------------------------===//
// iree_arena_block_pool_t
//===----------------------------------------------------------------------===//

// Forward declaration so the self-referential typedef below can name itself.
struct iree_arena_block_t;

// NOTE: this struct is at the *end* of allocated blocks such that we don't mess
// with alignment - byte 0 of a block is always byte 0 of the allocation from
// the system. We can do this as all blocks have the same size so computing the
// footer offset from a pointer is easy.
typedef struct iree_arena_block_t {
  struct iree_arena_block_t* next;
} iree_arena_block_t;

// An atomic approximately LIFO singly-linked list.
IREE_TYPED_ATOMIC_SLIST_WRAPPER(iree_atomic_arena_block, iree_arena_block_t,
                                offsetof(iree_arena_block_t, next));

// A simple atomic fixed-size block pool.
// Blocks are allocated from the system as required and kept in the pool to
// satisfy future requests. Blocks are all of a uniform size specified when the
// pool is created. It's recommended that power-of-two sizes are used for the
// blocks so that the underlying allocator is more likely to bucket them
// appropriately.
//
// Thread-safe; multiple threads may acquire and release blocks from the pool.
// The underlying allocator must also be thread-safe.
typedef struct iree_arena_block_pool_t {
  // Block size, in bytes. All blocks in the available_slist will have this
  // byte size which includes the iree_arena_block_t footer.
  iree_host_size_t total_block_size;
  // Block size, in bytes, of the usable bytes within a block.
  iree_host_size_t usable_block_size;
  // Allocator used for allocating/freeing each allocation block.
  iree_allocator_t block_allocator;
  // Linked list of free blocks (LIFO).
  iree_atomic_arena_block_slist_t available_slist;
} iree_arena_block_pool_t;

// Initializes a new block pool in |out_block_pool|.
// |block_allocator| will be used to allocate and free blocks for the pool.
// Each block allocated will be |total_block_size| but have a slightly smaller
// usable size due to the tracking overhead. Prefer powers of two.
void iree_arena_block_pool_initialize(iree_host_size_t total_block_size,
                                      iree_allocator_t block_allocator,
                                      iree_arena_block_pool_t* out_block_pool);

// Deinitializes a block pool and frees all allocations.
// All blocks that were acquired from the pool must have already been released
// back to it.
void iree_arena_block_pool_deinitialize(iree_arena_block_pool_t* block_pool);

// Trims the pool by freeing unused blocks back to the allocator.
// Acquired blocks are not freed and remain valid.
void iree_arena_block_pool_trim(iree_arena_block_pool_t* block_pool);

// Acquires a single block from the pool and returns it in |out_block|.
// The block may be either a new allocation with undefined contents or a reused
// prior allocation with undefined contents.
iree_status_t iree_arena_block_pool_acquire(iree_arena_block_pool_t* block_pool,
                                            iree_arena_block_t** out_block);

// Releases one or more blocks back to the block pool.
// Any blocks chained in |block_head| will also be released allowing for
// low-overhead resets when the blocks are already tracked in linked lists.
void iree_arena_block_pool_release(iree_arena_block_pool_t* block_pool,
                                   iree_arena_block_t* block_head,
                                   iree_arena_block_t* block_tail);

//===----------------------------------------------------------------------===//
// iree_arena_allocator_t
//===----------------------------------------------------------------------===//

// Intrusive header prepended to each oversized allocation (requests larger
// than the pool's usable block size) so that iree_arena_reset can walk the
// list and free them individually.
typedef struct iree_arena_oversized_allocation_t {
  struct iree_arena_oversized_allocation_t* next;
} iree_arena_oversized_allocation_t;

// A lightweight bump-pointer arena allocator using a shared block pool.
// As allocations are made from the arena and block capacity is exhausted new
// blocks will be acquired from the pool. Upon being reset all blocks will be
// released back to the pool for reuse by either the same arena in the future or
// other arenas sharing the same pool.
//
// The size of each allocated block used by the arena is inherited from the
// block pool. Allocations from the arena may exceed the block size but will
// incur additional allocation overhead as the block pool is bypassed and the
// system allocator is directly used to service the request.
//
// Thread-compatible; the shared block pool is thread-safe and may be used by
// arenas on multiple threads but each arena must only be used by a single
// thread.
typedef struct iree_arena_allocator_t {
  // Fixed-size block pool used to acquire new blocks for the arena.
  iree_arena_block_pool_t* block_pool;
  // Total bytes allocated to the arena from the block pool or system allocator.
  iree_host_size_t total_allocation_size;
  // Total bytes allocated from the arena; the utilization of the arena can be
  // checked with `used_allocation_size / total_allocation_size`.
  iree_host_size_t used_allocation_size;
  // Linked list of oversized allocations made directly from the system
  // allocator used by the block pool.
  iree_arena_oversized_allocation_t* allocation_head;
  // Linked list of allocated blocks maintained so that reset can release them.
  iree_arena_block_t* block_head;
  iree_arena_block_t* block_tail;
  // The number of bytes remaining in the block pointed to by block_head.
  iree_host_size_t block_bytes_remaining;
} iree_arena_allocator_t;

// Initializes an arena that will use |block_pool| for allocating blocks as
// needed.
void iree_arena_initialize(iree_arena_block_pool_t* block_pool,
                           iree_arena_allocator_t* out_arena);

// Deinitializes the arena and returns allocated blocks to the parent pool.
void iree_arena_deinitialize(iree_arena_allocator_t* arena);

// Resets the entire arena and returns allocated blocks to the parent pool.
void iree_arena_reset(iree_arena_allocator_t* arena);

// Allocates |byte_length| contiguous bytes from the arena.
// The returned bytes will have undefined contents and must be initialized by
// the caller.
iree_status_t iree_arena_allocate(iree_arena_allocator_t* arena,
                                  iree_host_size_t byte_length, void** out_ptr);

// Returns an iree_allocator_t that allocates from the given |arena|.
// Frees are ignored as arenas can only be reset as a whole.
iree_allocator_t iree_arena_allocator(iree_arena_allocator_t* arena);

#ifdef __cplusplus
}  // extern "C"
#endif  // __cplusplus

#endif  // IREE_BASE_INTERNAL_ARENA_H_
diff --git a/runtime/src/iree/base/internal/atomic_slist.c b/runtime/src/iree/base/internal/atomic_slist.c
new file mode 100644
index 0000000..3f4a27b
--- /dev/null
+++ b/runtime/src/iree/base/internal/atomic_slist.c
@@ -0,0 +1,111 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/atomic_slist.h"
+
+#include <string.h>
+
+#include "iree/base/attributes.h"
+
+// TODO(benvanik): add TSAN annotations when switched to atomics:
+// https://github.com/gcc-mirror/gcc/blob/master/libsanitizer/include/sanitizer/tsan_interface_atomic.h
+// https://reviews.llvm.org/D18500
+
// Initializes |out_list| to an empty list with its protecting mutex ready.
void iree_atomic_slist_initialize(iree_atomic_slist_t* out_list) {
  memset(out_list, 0, sizeof(*out_list));
  iree_slim_mutex_initialize(&out_list->mutex);
}
+
// Deinitializes |list|. Entries are not owned by the list and are not freed;
// the caller is expected to have drained it first.
void iree_atomic_slist_deinitialize(iree_atomic_slist_t* list) {
  // TODO(benvanik): assert empty.
  iree_slim_mutex_deinitialize(&list->mutex);
  memset(list, 0, sizeof(*list));
}
+
// Splices the pre-linked chain [head, tail] onto the front of |list| in O(1).
// A NULL |head| is a no-op and avoids taking the lock entirely.
void iree_atomic_slist_concat(iree_atomic_slist_t* list,
                              iree_atomic_slist_entry_t* head,
                              iree_atomic_slist_entry_t* tail) {
  if (IREE_UNLIKELY(!head)) return;
  iree_slim_mutex_lock(&list->mutex);
  // Chain's tail points at the old list head; the chain's head becomes the new
  // list head.
  tail->next = list->head;
  list->head = head;
  iree_slim_mutex_unlock(&list->mutex);
}
+
+void iree_atomic_slist_push(iree_atomic_slist_t* list,
+                            iree_atomic_slist_entry_t* entry) {
+  iree_slim_mutex_lock(&list->mutex);
+  iree_atomic_slist_push_unsafe(list, entry);
+  iree_slim_mutex_unlock(&list->mutex);
+}
+
// Pushes |entry| onto |list| without synchronization; callers must guarantee
// exclusive access to the list for the duration of the call.
void iree_atomic_slist_push_unsafe(iree_atomic_slist_t* list,
                                   iree_atomic_slist_entry_t* entry) {
  // NOTE: no lock is held here and no atomic operation will be used when this
  // is actually made atomic.
  entry->next = list->head;
  list->head = entry;
}
+
+iree_atomic_slist_entry_t* iree_atomic_slist_pop(iree_atomic_slist_t* list) {
+  iree_slim_mutex_lock(&list->mutex);
+  iree_atomic_slist_entry_t* entry = list->head;
+  if (entry != NULL) {
+    list->head = entry->next;
+    entry->next = NULL;
+  }
+  iree_slim_mutex_unlock(&list->mutex);
+  return entry;
+}
+
// Steals the entire contents of |list| in one shot and returns them through
// |out_head| (and optionally |out_tail|) in the requested |flush_order|.
// Returns false if the list was empty or the order is unrecognized.
bool iree_atomic_slist_flush(iree_atomic_slist_t* list,
                             iree_atomic_slist_flush_order_t flush_order,
                             iree_atomic_slist_entry_t** out_head,
                             iree_atomic_slist_entry_t** out_tail) {
  // Exchange list head with NULL to steal the entire list. The list will be in
  // the native LIFO order of the slist.
  iree_slim_mutex_lock(&list->mutex);
  iree_atomic_slist_entry_t* head = list->head;
  list->head = NULL;
  iree_slim_mutex_unlock(&list->mutex);
  if (!head) return false;

  switch (flush_order) {
    case IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO: {
      // List is already in native LIFO order. If the user wants a tail we have
      // to scan for it, though, which we really only want to do when required
      // as it's a linked list pointer walk.
      *out_head = head;
      if (out_tail) {
        iree_atomic_slist_entry_t* p = head;
        while (p->next) p = p->next;
        *out_tail = p;
      }
      break;
    }
    case IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO: {
      // Reverse the list in a single scan. list_head is our tail, so scan
      // forward to find our head. Since we have to walk the whole list anyway
      // we can cheaply give both the head and tail to the caller.
      //
      // NOTE(subtle): |head| starts at the old list head rather than NULL, so
      // the first iteration links the eventual tail back to itself; the
      // trailing `tail->next = NULL` breaks that temporary self-loop and
      // terminates the reversed list.
      iree_atomic_slist_entry_t* tail = head;
      if (out_tail) *out_tail = tail;
      iree_atomic_slist_entry_t* p = head;
      do {
        iree_atomic_slist_entry_t* next = p->next;
        p->next = head;
        head = p;
        p = next;
      } while (p != NULL);
      tail->next = NULL;
      *out_head = head;
      break;
    }
    default:
      // Unknown flush order: entries have already been stolen from the list
      // but are not returned; callers must pass a valid order.
      return false;
  }

  return true;
}
diff --git a/runtime/src/iree/base/internal/atomic_slist.h b/runtime/src/iree/base/internal/atomic_slist.h
new file mode 100644
index 0000000..eaf852c
--- /dev/null
+++ b/runtime/src/iree/base/internal/atomic_slist.h
@@ -0,0 +1,257 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: the best kind of synchronization is no synchronization; always try to
+// design your algorithm so that you don't need anything from this file :)
+// See https://travisdowns.github.io/blog/2020/07/06/concurrency-costs.html
+
+#ifndef IREE_BASE_INTERNAL_ATOMIC_SLIST_H_
+#define IREE_BASE_INTERNAL_ATOMIC_SLIST_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/alignment.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/synchronization.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The embedded pointer to the next entry in the slist. This points to the
+// internal iree_atomic_slist_entry_t, *not* the user-provided pointer.
+typedef void* iree_atomic_slist_intrusive_ptr_t;
+
+// DO NOT USE: implementation detail.
+typedef struct iree_atomic_slist_entry_t {
+  struct iree_atomic_slist_entry_t* next;
+} iree_atomic_slist_entry_t;
+
+// Lightweight contention-avoiding singly linked list.
+// This models optimistically-ordered LIFO behavior (stack push/pop) using
+// atomic primitives.
+//
+//           ***************************************************
+//           ******** ONLY APPROXIMATE ORDER GUARANTEES ********
+//           ***************************************************
+//
+// This makes it extremely efficient for when only eventual consistency across
+// producers and consumers is required. The most common example is free lists
+// where all that matters is that entries make it into the list and not that
+// they have any particular order between them. Work queues where all tasks
+// within the queue are able to execute in any order like with wavefront-style
+// scheduling can also benefit from this relaxed behavior.
+//
+// If a strict ordering is required this can be used as a primitive to construct
+// a flat-combining data structure where data structure change requests are
+// published to this list and a combiner is chosen to land the published data in
+// an appropriate order:
+// http://people.csail.mit.edu/shanir/publications/Flat%20Combining%20SPAA%2010.pdf
+//
+// There's often still a benefit in unordered scenarios of having LIFO behavior
+// as it promotes cache-friendly small linked lists when there is a small number
+// of producers and consumers (1:1 is the best case), though as the producer and
+// consumer count increases the LIFO behavior can pessimize performance as there
+// is more contention for the list head pointer. Prefer to shard across multiple
+// per-core/thread lists and use techniques like flat-combining for the
+// cross-core/thread aggregation/sequencing.
+//
+// This API modeled roughly on the Windows SList type:
+// https://docs.microsoft.com/en-us/windows/win32/sync/interlocked-singly-linked-lists
+// which is roughly compatible with the Apple OSAtomic queue:
+// https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/OSAtomicEnqueue.3.html
+// https://opensource.apple.com/source/libplatform/libplatform-125/include/libkern/OSAtomicQueue.h.auto.html
+//
+// Usage:
+// https://docs.microsoft.com/en-us/windows/win32/sync/using-singly-linked-lists
+//
+// WARNING: this is an extremely sharp pufferfish-esque API. Don't use it. 🐡
+//
+// TODO(benvanik): verify behavior (and worthwhileness) of supporting platform
+// primitives. The benefit of something like OSAtomicEnqueue/Dequeue is that it
+// may have better tooling (TSAN), special intrinsic handling in the compiler,
+// etc. That said, the Windows Interlocked* variants don't seem to. Having a
+// single heavily tested implementation seems more worthwhile than several.
+typedef iree_alignas(iree_max_align_t) struct {
+  // TODO(benvanik): spend some time golfing this. Unblocking myself for now :)
+  // Guards |head|: the current implementation is mutex-based rather than a
+  // lock-free CAS loop (see TODO above).
+  iree_slim_mutex_t mutex;
+  iree_atomic_slist_entry_t* head;
+} iree_atomic_slist_t;
+
+// Initializes an slist handle to an empty list.
+// Lists must be flushed to empty and deinitialized when no longer needed with
+// iree_atomic_slist_deinitialize.
+//
+// NOTE: not thread-safe; existing |out_list| contents are discarded.
+void iree_atomic_slist_initialize(iree_atomic_slist_t* out_list);
+
+// Deinitializes an slist.
+// The list must be empty; callers are expected to flush the list from the same
+// thread making this call when it is guaranteed no other thread may be trying
+// to use the list.
+//
+// NOTE: not thread-safe; |list| must not be used by any other thread.
+void iree_atomic_slist_deinitialize(iree_atomic_slist_t* list);
+
+// Concatenates a span of entries into the list in the order they are provided.
+//
+// Example:
+//   existing slist: C B A
+//    provided span: 1 2 3
+//  resulting slist: 1 2 3 C B A
+void iree_atomic_slist_concat(iree_atomic_slist_t* list,
+                              iree_atomic_slist_entry_t* head,
+                              iree_atomic_slist_entry_t* tail);
+
+// Pushes an entry into the list.
+//
+//   existing slist: C B A
+//   provided entry: 1
+//  resulting slist: 1 C B A
+void iree_atomic_slist_push(iree_atomic_slist_t* list,
+                            iree_atomic_slist_entry_t* entry);
+
+// Pushes an entry into the list without using an atomic update.
+// This is useful for when |list| is known to be inaccessible to any other
+// thread, such as when populating a stack-local list prior to sharing it.
+void iree_atomic_slist_push_unsafe(iree_atomic_slist_t* list,
+                                   iree_atomic_slist_entry_t* entry);
+
+// Pops the most recently pushed entry from the list and returns it.
+// Returns NULL if the list was empty at the time it was queried.
+//
+//   existing slist: C B A
+//  resulting slist: B A
+//   returned entry: C
+iree_atomic_slist_entry_t* iree_atomic_slist_pop(iree_atomic_slist_t* list);
+
+// Defines the approximate order in which a span of flushed entries is returned.
+typedef enum iree_atomic_slist_flush_order_e {
+  // |out_head| and |out_tail| will be set to a span of the entries roughly in
+  // the order they were pushed to the list in LIFO (stack) order.
+  //
+  // Example:
+  //    slist: C B A
+  //   result: C B A (or when contended possibly C A B)
+  IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO = 0,
+  // |out_head| and |out_tail| will be set to the first and last entries
+  // pushed respectively, turning this LIFO slist into a FIFO queue.
+  //
+  // Example:
+  //    slist: C B A
+  //   result: A B C (or when contended possibly B A C)
+  IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO,
+} iree_atomic_slist_flush_order_t;
+
+// Removes all items from the list and returns them in **APPROXIMATELY** the
+// |flush_order| requested. As there are no order guarantees there may be slight
+// transpositions of entries that were pushed from multiple processors or even
+// interleaved entries within spans of entries pushed with
+// iree_atomic_slist_concat.
+//
+// If |out_tail| is not required it can be omitted and this may avoid the
+// need for the flush to walk the list and touch each entry.
+//
+// Returns true if any items were present and false if the output list is empty.
+// Note that because atomic data structures can race it's possible for there to
+// both be something in the list prior to this call and something in the list
+// after the call and yet the return can still be false.
+bool iree_atomic_slist_flush(iree_atomic_slist_t* list,
+                             iree_atomic_slist_flush_order_t flush_order,
+                             iree_atomic_slist_entry_t** out_head,
+                             iree_atomic_slist_entry_t** out_tail);
+
+//==============================================================================
+// Typed wrapper generator for iree_atomic_slist_t
+//==============================================================================
+
+// Typed and named wrappers for making atomic slists easier to work with.
+//
+// Usage:
+//  typedef struct {
+//    int some_fields;
+//    iree_atomic_slist_intrusive_ptr_t slist_next;
+//    int more_fields;
+//  } my_type_t;
+//  IREE_TYPED_ATOMIC_SLIST_WRAPPER(my_type, my_type_t,
+//                                  offsetof(my_type_t, slist_next));
+//
+//  my_type_slist_t list;
+//  my_type_slist_initialize(&list);
+//  my_type_t* entry = allocate_my_type(123);
+//  my_type_slist_push(&list, entry);
+//  entry = my_type_slist_pop(&list);
+// NOTE: |next_offset| must be the offsetof() of an
+// iree_atomic_slist_intrusive_ptr_t field embedded within |type|. The
+// name##_slist_entry_from_ptr/name##_slist_entry_to_ptr converters are
+// NULL-safe; set_next requires a non-NULL |entry|.
+#define IREE_TYPED_ATOMIC_SLIST_WRAPPER(name, type, next_offset)               \
+  static inline iree_atomic_slist_entry_t* name##_slist_entry_from_ptr(        \
+      type* entry) {                                                           \
+    return entry                                                               \
+               ? ((iree_atomic_slist_entry_t*)((uint8_t*)entry + next_offset)) \
+               : NULL;                                                         \
+  }                                                                            \
+  static inline type* name##_slist_entry_to_ptr(                               \
+      iree_atomic_slist_entry_t* entry) {                                      \
+    return entry ? (type*)(((uint8_t*)entry) - next_offset) : NULL;            \
+  }                                                                            \
+                                                                               \
+  static inline type* name##_slist_get_next(type* entry) {                     \
+    if (!entry) return NULL;                                                   \
+    return name##_slist_entry_to_ptr(                                          \
+        ((iree_atomic_slist_entry_t*)((uint8_t*)entry + next_offset))->next);  \
+  }                                                                            \
+  static inline void name##_slist_set_next(type* entry, type* next) {          \
+    name##_slist_entry_from_ptr(entry)->next =                                 \
+        name##_slist_entry_from_ptr(next);                                     \
+  }                                                                            \
+                                                                               \
+  typedef iree_alignas(iree_max_align_t) struct {                              \
+    iree_atomic_slist_t impl;                                                  \
+  } name##_slist_t;                                                            \
+                                                                               \
+  static inline void name##_slist_initialize(name##_slist_t* out_list) {       \
+    iree_atomic_slist_initialize(&out_list->impl);                             \
+  }                                                                            \
+  static inline void name##_slist_deinitialize(name##_slist_t* list) {         \
+    iree_atomic_slist_deinitialize(&list->impl);                               \
+  }                                                                            \
+                                                                               \
+  static inline void name##_slist_push(name##_slist_t* list, type* entry) {    \
+    iree_atomic_slist_push(&list->impl, name##_slist_entry_from_ptr(entry));   \
+  }                                                                            \
+  static inline void name##_slist_push_unsafe(name##_slist_t* list,            \
+                                              type* entry) {                   \
+    iree_atomic_slist_push_unsafe(&list->impl,                                 \
+                                  name##_slist_entry_from_ptr(entry));         \
+  }                                                                            \
+  static inline void name##_slist_concat(name##_slist_t* list, type* head,     \
+                                         type* tail) {                         \
+    iree_atomic_slist_concat(&list->impl, name##_slist_entry_from_ptr(head),   \
+                             name##_slist_entry_from_ptr(tail));               \
+  }                                                                            \
+  static inline type* name##_slist_pop(name##_slist_t* list) {                 \
+    return name##_slist_entry_to_ptr(iree_atomic_slist_pop(&list->impl));      \
+  }                                                                            \
+                                                                               \
+  static inline bool name##_slist_flush(                                       \
+      name##_slist_t* list, iree_atomic_slist_flush_order_t flush_order,       \
+      type** out_head, type** out_tail) {                                      \
+    iree_atomic_slist_entry_t* head = NULL;                                    \
+    iree_atomic_slist_entry_t* tail = NULL;                                    \
+    if (!iree_atomic_slist_flush(&list->impl, flush_order, &head,              \
+                                 out_tail ? &tail : NULL)) {                   \
+      return false; /* empty list */                                           \
+    }                                                                          \
+    *out_head = name##_slist_entry_to_ptr(head);                               \
+    if (out_tail) *out_tail = name##_slist_entry_to_ptr(tail);                 \
+    return true;                                                               \
+  }
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_ATOMIC_SLIST_H_
diff --git a/runtime/src/iree/base/internal/atomic_slist_test.cc b/runtime/src/iree/base/internal/atomic_slist_test.cc
new file mode 100644
index 0000000..120838c
--- /dev/null
+++ b/runtime/src/iree/base/internal/atomic_slist_test.cc
@@ -0,0 +1,185 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/atomic_slist.h"
+
+#include <vector>
+
+#include "iree/testing/gtest.h"
+
+namespace {
+
+struct dummy_entry_t {
+  // NOTE: we purposefully offset the entry pointer
+  // (|slist_next| is not the first field) so the non-zero next_offset
+  // pointer math in IREE_TYPED_ATOMIC_SLIST_WRAPPER is exercised.
+  size_t value = 0;
+  iree_atomic_slist_intrusive_ptr_t slist_next = NULL;
+};
+IREE_TYPED_ATOMIC_SLIST_WRAPPER(dummy, dummy_entry_t,
+                                offsetof(dummy_entry_t, slist_next));
+
+// Returns |count| unlinked entries valued base_index..base_index+count-1.
+std::vector<dummy_entry_t> MakeDummySListItems(size_t base_index,
+                                               size_t count) {
+  std::vector<dummy_entry_t> items(count);
+  for (size_t i = 0; i < count; ++i) {
+    items[i].value = base_index + i;
+  }
+  return items;
+}
+
+// Initializing then deinitializing an empty list must be a valid no-op pair.
+TEST(AtomicSList, Lifetime) {
+  iree_atomic_slist_t list;  // NOTE: intentionally uninitialized.
+  iree_atomic_slist_initialize(&list);
+  iree_atomic_slist_deinitialize(&list);
+}
+
+// Single-threaded push/pop: entries come back out in reverse (LIFO) order.
+TEST(AtomicSList, BasicUsage) {
+  dummy_slist_t list;
+  dummy_slist_initialize(&list);
+
+  // List starts empty.
+  EXPECT_EQ(NULL, dummy_slist_pop(&list));
+
+  // Push some items into the list (LIFO order).
+  // New contents: 5 4 3 2 1 0
+  auto item_storage = MakeDummySListItems(0, 6);
+  for (size_t i = 0; i < item_storage.size(); ++i) {
+    dummy_slist_push(&list, &item_storage[i]);
+  }
+
+  // Now pop them out - they should be in reverse order.
+  // New contents: e
+  for (size_t i = 0; i < item_storage.size(); ++i) {
+    dummy_entry_t* p = dummy_slist_pop(&list);
+    ASSERT_TRUE(p);
+    EXPECT_EQ(item_storage.size() - i - 1, p->value);
+  }
+
+  // List ends empty.
+  EXPECT_EQ(NULL, dummy_slist_pop(&list));
+
+  dummy_slist_deinitialize(&list);
+}
+
+// Concat splices a caller-linked span onto the front of the list in the
+// provided order without disturbing existing entries.
+TEST(AtomicSList, Concat) {
+  dummy_slist_t list;
+  dummy_slist_initialize(&list);
+
+  // Push some initial items into the list (LIFO order).
+  // New contents: 1 0
+  auto initial_item_storage = MakeDummySListItems(0, 2);
+  for (size_t i = 0; i < initial_item_storage.size(); ++i) {
+    dummy_slist_push(&list, &initial_item_storage[i]);
+  }
+
+  // Stitch items together modeling what a user may do when building the list
+  // themselves.
+  // Items: 2 3 4
+  auto span_item_storage = MakeDummySListItems(2, 3);
+  for (size_t i = 0; i < span_item_storage.size() - 1; ++i) {
+    dummy_slist_set_next(&span_item_storage[i], &span_item_storage[i + 1]);
+  }
+
+  // Push all of the items to the list at once.
+  // New contents: 2 3 4 1 0
+  dummy_slist_concat(&list, &span_item_storage.front(),
+                     &span_item_storage.back());
+
+  // Pop the span items and verify they are in the correct order: we effectively
+  // pushed them such that popping is FIFO (2->4).
+  // New contents: 1 0
+  for (size_t i = 0; i < span_item_storage.size(); ++i) {
+    dummy_entry_t* p = dummy_slist_pop(&list);
+    ASSERT_TRUE(p);
+    EXPECT_EQ(/*base_index=*/2 + i, p->value);
+  }
+
+  // Pop the initial items and ensure they survived.
+  // New contents: e
+  for (size_t i = 0; i < initial_item_storage.size(); ++i) {
+    dummy_entry_t* p = dummy_slist_pop(&list);
+    ASSERT_TRUE(p);
+    EXPECT_EQ(initial_item_storage.size() - i - 1, p->value);
+  }
+
+  dummy_slist_deinitialize(&list);
+}
+
+// LIFO flush: head/tail span the entries in reverse push order and the list
+// is left empty; an empty flush returns false.
+TEST(AtomicSList, FlushLIFO) {
+  dummy_slist_t list;
+  dummy_slist_initialize(&list);
+
+  // Flushing when empty is ok.
+  dummy_entry_t* head = NULL;
+  dummy_entry_t* tail = NULL;
+  EXPECT_FALSE(dummy_slist_flush(
+      &list, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &head, &tail));
+
+  // Push items into the list (LIFO order).
+  // New contents: 3 2 1 0
+  auto item_storage = MakeDummySListItems(0, 4);
+  for (size_t i = 0; i < item_storage.size(); ++i) {
+    dummy_slist_push(&list, &item_storage[i]);
+  }
+
+  // Flush in LIFO order and verify empty.
+  // New contents: e
+  EXPECT_TRUE(dummy_slist_flush(
+      &list, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &head, &tail));
+  EXPECT_EQ(NULL, dummy_slist_pop(&list));
+
+  // Verify LIFO order and list pointer walking.
+  // Note that head and tail are reverse of item storage!
+  EXPECT_EQ(&item_storage.back(), head);
+  EXPECT_EQ(&item_storage.front(), tail);
+  dummy_entry_t* p = head;
+  for (size_t i = 0; i < item_storage.size(); ++i) {
+    ASSERT_TRUE(p);
+    EXPECT_EQ(item_storage.size() - i - 1, p->value);
+    p = dummy_slist_get_next(p);
+  }
+  EXPECT_EQ(NULL, p);
+
+  dummy_slist_deinitialize(&list);
+}
+
+// FIFO flush: head/tail span the entries in original push order (list is
+// reversed during the flush) and the list is left empty.
+TEST(AtomicSList, FlushFIFO) {
+  dummy_slist_t list;
+  dummy_slist_initialize(&list);
+
+  // Flushing when empty is ok.
+  dummy_entry_t* head = NULL;
+  dummy_entry_t* tail = NULL;
+  EXPECT_FALSE(dummy_slist_flush(
+      &list, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO, &head, &tail));
+
+  // Push items into the list (LIFO order).
+  // New contents: 3 2 1 0
+  auto item_storage = MakeDummySListItems(0, 4);
+  for (size_t i = 0; i < item_storage.size(); ++i) {
+    dummy_slist_push(&list, &item_storage[i]);
+  }
+
+  // Flush in FIFO order and verify empty.
+  // New contents: e
+  EXPECT_TRUE(dummy_slist_flush(
+      &list, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO, &head, &tail));
+  EXPECT_EQ(NULL, dummy_slist_pop(&list));
+
+  // Verify FIFO order and list pointer walking.
+  EXPECT_EQ(&item_storage.front(), head);
+  EXPECT_EQ(&item_storage.back(), tail);
+  dummy_entry_t* p = head;
+  for (size_t i = 0; i < item_storage.size(); ++i) {
+    ASSERT_TRUE(p);
+    EXPECT_EQ(i, p->value);
+    p = dummy_slist_get_next(p);
+  }
+  EXPECT_EQ(NULL, p);
+
+  dummy_slist_deinitialize(&list);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/atomics.h b/runtime/src/iree/base/internal/atomics.h
new file mode 100644
index 0000000..31eb64c
--- /dev/null
+++ b/runtime/src/iree/base/internal/atomics.h
@@ -0,0 +1,171 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// An implementation of the C11 stdatomics.h utilities we use (which is limited
+// to a subset of types for now). We need this for non-C11-compliant platforms
+// (MSVC), but it has the added benefit of not conflicting with <atomic>
+// (stdatomic.h and atomic cannot be included in the same compilation unit...
+// great design). There shouldn't be any difference between what we do here and
+// what any implementation would do with the platform atomic functions so it's
+// used everywhere.
+//
+// https://en.cppreference.com/w/c/atomic
+
+#ifndef IREE_BASE_INTERNAL_ATOMICS_H_
+#define IREE_BASE_INTERNAL_ATOMICS_H_
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "iree/base/assert.h"
+#include "iree/base/config.h"
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//==============================================================================
+// Hardware concurrency information
+//==============================================================================
+
+// https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0154r1.html
+// https://norrischiu.github.io/2018/09/08/Cpp-jargon-1.html
+
+// TODO(benvanik): test 128 on x64 (to thwart hardware prefetcher).
+
+// Minimum offset between two objects to avoid false sharing.
+// If two members are aligned to this value they will (likely) not share the
+// same L1 cache line.
+#define iree_hardware_destructive_interference_size 64
+
+// Maximum size of contiguous memory to promote true sharing.
+// If two members are within a span of this value they will (likely) share the
+// same L1 cache line.
+#define iree_hardware_constructive_interference_size 64
+
+//==============================================================================
+// C11-compatible atomic operations
+//==============================================================================
+// We expose support for int32_t, int64_t, and intptr_t (which aliases one of
+// int32_t or int64_t). This limits what we need to port and it's really all
+// that's needed anyway.
+
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+// Atomics are disabled as we've forced ourselves into a fully thread-hostile
+// configuration. Used on bare-metal systems with single cores.
+#include "iree/base/internal/atomics_disabled.h"  // IWYU pragma: export
+
+#elif defined(IREE_COMPILER_MSVC)
+
+// Atomics using the Win32 Interlocked* APIs.
+#include "iree/base/internal/atomics_msvc.h"  // IWYU pragma: export
+
+#elif defined(IREE_COMPILER_CLANG)
+
+// C11 atomics using Clang builtins.
+#include "iree/base/internal/atomics_clang.h"  // IWYU pragma: export
+
+#elif defined(IREE_COMPILER_GCC)
+
+// Atomics for GCC (compatible with both C and C++).
+#include "iree/base/internal/atomics_gcc.h"  // IWYU pragma: export
+
+#else
+
+// Unsupported architecture.
+#error Compiler does not have supported C11-style atomics
+
+#endif  // IREE_COMPILER_*
+
+// If the compiler can automatically determine the types:
+#ifdef iree_atomic_load_auto
+
+#define iree_atomic_load_int32 iree_atomic_load_auto
+#define iree_atomic_store_int32 iree_atomic_store_auto
+#define iree_atomic_fetch_add_int32 iree_atomic_fetch_add_auto
+#define iree_atomic_fetch_sub_int32 iree_atomic_fetch_sub_auto
+#define iree_atomic_fetch_and_int32 iree_atomic_fetch_and_auto
+#define iree_atomic_fetch_or_int32 iree_atomic_fetch_or_auto
+#define iree_atomic_fetch_xor_int32 iree_atomic_fetch_xor_auto
+#define iree_atomic_exchange_int32 iree_atomic_exchange_auto
+#define iree_atomic_compare_exchange_strong_int32 \
+  iree_atomic_compare_exchange_strong_auto
+#define iree_atomic_compare_exchange_weak_int32 \
+  iree_atomic_compare_exchange_weak_auto
+
+#define iree_atomic_load_int64 iree_atomic_load_auto
+#define iree_atomic_store_int64 iree_atomic_store_auto
+#define iree_atomic_fetch_add_int64 iree_atomic_fetch_add_auto
+#define iree_atomic_fetch_sub_int64 iree_atomic_fetch_sub_auto
+#define iree_atomic_fetch_and_int64 iree_atomic_fetch_and_auto
+#define iree_atomic_fetch_or_int64 iree_atomic_fetch_or_auto
+#define iree_atomic_fetch_xor_int64 iree_atomic_fetch_xor_auto
+#define iree_atomic_exchange_int64 iree_atomic_exchange_auto
+#define iree_atomic_compare_exchange_strong_int64 \
+  iree_atomic_compare_exchange_strong_auto
+#define iree_atomic_compare_exchange_weak_int64 \
+  iree_atomic_compare_exchange_weak_auto
+
+#define iree_atomic_load_intptr iree_atomic_load_auto
+#define iree_atomic_store_intptr iree_atomic_store_auto
+#define iree_atomic_fetch_add_intptr iree_atomic_fetch_add_auto
+#define iree_atomic_fetch_sub_intptr iree_atomic_fetch_sub_auto
+#define iree_atomic_exchange_intptr iree_atomic_exchange_auto
+#define iree_atomic_compare_exchange_strong_intptr \
+  iree_atomic_compare_exchange_strong_auto
+#define iree_atomic_compare_exchange_weak_intptr \
+  iree_atomic_compare_exchange_weak_auto
+
+#endif  // iree_atomic_load_auto
+
+//==============================================================================
+// Reference count atomics
+//==============================================================================
+// These are just aliases that allow use to have nicely readable ref counting
+// operands without caring about the exact bit sizes at each site.
+
+typedef iree_atomic_int32_t iree_atomic_ref_count_t;
+#define iree_atomic_ref_count_init(count_ptr) \
+  iree_atomic_store_int32(count_ptr, 1, iree_memory_order_relaxed)
+// Callers of iree_atomic_ref_count_inc typically don't need it to return a
+// value (unlike iree_atomic_ref_count_dec), so we make sure that it does not,
+// which allows the implementation to use faster atomic instructions where
+// available, e.g. STADD on ARMv8.1-a.
+#define iree_atomic_ref_count_inc(count_ptr)                              \
+  do {                                                                    \
+    iree_atomic_fetch_add_int32(count_ptr, 1, iree_memory_order_relaxed); \
+  } while (0)
+// NOTE: per C11 fetch_sub semantics this evaluates to the count value *prior*
+// to the decrement.
+#define iree_atomic_ref_count_dec(count_ptr) \
+  iree_atomic_fetch_sub_int32(count_ptr, 1, iree_memory_order_acq_rel)
+
+// Aborts the program if the given reference count value is not 1.
+// This should be avoided in all situations but those where continuing execution
+// would be invalid. If a reference object is allocated on the stack and the
+// parent function is about to return it *must* have a ref count of 1: anything
+// else that may be retaining the object will hold a pointer to (effectively)
+// uninitialized stack memory.
+#define iree_atomic_ref_count_abort_if_uses(count_ptr)                         \
+  if (IREE_UNLIKELY(iree_atomic_load_int32(count_ptr,                          \
+                                           iree_memory_order_seq_cst) != 1)) { \
+    abort();                                                                   \
+  }
+
+// Asserts that the given reference count value is zero.
+#define IREE_ASSERT_REF_COUNT_ZERO(count_ptr)                                  \
+  IREE_ASSERT_EQ(iree_atomic_load_int32(count_ptr, iree_memory_order_seq_cst), \
+                 0, "ref counted object still has uses")
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_ATOMICS_H_
diff --git a/runtime/src/iree/base/internal/atomics_clang.h b/runtime/src/iree/base/internal/atomics_clang.h
new file mode 100644
index 0000000..44514e0
--- /dev/null
+++ b/runtime/src/iree/base/internal/atomics_clang.h
@@ -0,0 +1,73 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_ATOMICS_CLANG_H_
+#define IREE_BASE_INTERNAL_ATOMICS_CLANG_H_
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/target_platform.h"
+
+#if defined(IREE_COMPILER_CLANG)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Direct mappings onto clang's __c11_* builtins; each iree_memory_order_*
+// value is the corresponding __ATOMIC_* constant so orders pass through
+// untranslated.
+typedef enum iree_memory_order_e {
+  iree_memory_order_relaxed = __ATOMIC_RELAXED,
+  iree_memory_order_consume = __ATOMIC_CONSUME,
+  iree_memory_order_acquire = __ATOMIC_ACQUIRE,
+  iree_memory_order_release = __ATOMIC_RELEASE,
+  iree_memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  iree_memory_order_seq_cst = __ATOMIC_SEQ_CST,
+} iree_memory_order_t;
+
+// No special initializer required for _Atomic types here; the value is used
+// directly.
+#define IREE_ATOMIC_VAR_INIT(value) (value)
+
+typedef _Atomic int32_t iree_atomic_int32_t;
+typedef _Atomic int64_t iree_atomic_int64_t;
+// TODO(#3453): check for __int128 support before using
+// typedef _Atomic __int128 iree_atomic_int128_t;
+typedef _Atomic intptr_t iree_atomic_intptr_t;
+
+#define iree_atomic_load_auto(object, order) \
+  __c11_atomic_load((object), (order))
+#define iree_atomic_store_auto(object, desired, order) \
+  __c11_atomic_store((object), (desired), (order))
+#define iree_atomic_fetch_add_auto(object, operand, order) \
+  __c11_atomic_fetch_add((object), (operand), (order))
+#define iree_atomic_fetch_sub_auto(object, operand, order) \
+  __c11_atomic_fetch_sub((object), (operand), (order))
+#define iree_atomic_fetch_and_auto(object, operand, order) \
+  __c11_atomic_fetch_and((object), (operand), (order))
+#define iree_atomic_fetch_or_auto(object, operand, order) \
+  __c11_atomic_fetch_or((object), (operand), (order))
+#define iree_atomic_fetch_xor_auto(object, operand, order) \
+  __c11_atomic_fetch_xor((object), (operand), (order))
+#define iree_atomic_exchange_auto(object, operand, order) \
+  __c11_atomic_exchange((object), (operand), (order))
+#define iree_atomic_compare_exchange_strong_auto(object, expected, desired, \
+                                                 order_succ, order_fail)    \
+  __c11_atomic_compare_exchange_strong((object), (expected), (desired),     \
+                                       (order_succ), (order_fail))
+#define iree_atomic_compare_exchange_weak_auto(object, expected, desired, \
+                                               order_succ, order_fail)    \
+  __c11_atomic_compare_exchange_weak((object), (expected), (desired),     \
+                                     (order_succ), (order_fail))
+
+#define iree_atomic_thread_fence(order) __c11_atomic_thread_fence(order)
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_COMPILER_CLANG
+
+#endif  // IREE_BASE_INTERNAL_ATOMICS_CLANG_H_
diff --git a/runtime/src/iree/base/internal/atomics_disabled.h b/runtime/src/iree/base/internal/atomics_disabled.h
new file mode 100644
index 0000000..ce9e17e
--- /dev/null
+++ b/runtime/src/iree/base/internal/atomics_disabled.h
@@ -0,0 +1,244 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_ATOMICS_DISABLED_H_
+#define IREE_BASE_INTERNAL_ATOMICS_DISABLED_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/config.h"
+#include "iree/base/target_platform.h"
+
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Memory orders are accepted for signature compatibility with the real
+// atomics implementations but are otherwise ignored here: with
+// synchronization disabled all accesses are plain loads/stores.
+typedef enum iree_memory_order_e {
+  iree_memory_order_relaxed,
+  iree_memory_order_consume,
+  iree_memory_order_acquire,
+  iree_memory_order_release,
+  iree_memory_order_acq_rel,
+  iree_memory_order_seq_cst,
+} iree_memory_order_t;
+
+#define IREE_ATOMIC_VAR_INIT(value) (value)
+
+// With no atomicity required the "atomic" types are just plain integers.
+typedef int32_t iree_atomic_int32_t;
+typedef int64_t iree_atomic_int64_t;
+// TODO(#3453): check for __int128 support before using
+// typedef __int128 iree_atomic_int128_t;
+typedef intptr_t iree_atomic_intptr_t;
+
+// Plain (non-atomic) loads/stores; (order) is ignored. The load/store bodies
+// are parenthesized so expansions compose with surrounding expressions — in
+// particular the (intptr_t) casts applied by iree_atomic_store_intptr below,
+// which would otherwise expand to `(intptr_t)*(obj) = ...` (cast as assignment
+// target: not an lvalue, a compile error).
+#define iree_atomic_load_int32(object, order) (*(object))
+#define iree_atomic_store_int32(object, desired, order) (*(object) = (desired))
+#define iree_atomic_fetch_add_int32(object, operand, order)                 \
+  iree_atomic_fetch_add_int32_impl((volatile iree_atomic_int32_t*)(object), \
+                                   (int32_t)(operand))
+#define iree_atomic_fetch_sub_int32(object, operand, order)                 \
+  iree_atomic_fetch_add_int32_impl((volatile iree_atomic_int32_t*)(object), \
+                                   -(int32_t)(operand))
+#define iree_atomic_fetch_and_int32(object, operand, order)                 \
+  iree_atomic_fetch_and_int32_impl((volatile iree_atomic_int32_t*)(object), \
+                                   (int32_t)(operand))
+#define iree_atomic_fetch_or_int32(object, operand, order)                 \
+  iree_atomic_fetch_or_int32_impl((volatile iree_atomic_int32_t*)(object), \
+                                  (int32_t)(operand))
+#define iree_atomic_fetch_xor_int32(object, operand, order)                 \
+  iree_atomic_fetch_xor_int32_impl((volatile iree_atomic_int32_t*)(object), \
+                                   (int32_t)(operand))
+#define iree_atomic_exchange_int32(object, desired, order) \
+  iree_atomic_fetch_exchange_int32_impl(                   \
+      (volatile iree_atomic_int32_t*)(object), (int32_t)(desired))
+#define iree_atomic_compare_exchange_strong_int32(object, expected, desired, \
+                                                  order_succ, order_fail)    \
+  iree_atomic_compare_exchange_int32_impl(                                   \
+      (volatile iree_atomic_int32_t*)(object), (int32_t*)(expected),         \
+      (int32_t)(desired))
+#define iree_atomic_compare_exchange_weak_int32 \
+  iree_atomic_compare_exchange_strong_int32
+
+#define iree_atomic_load_int64(object, order) (*(object))
+#define iree_atomic_store_int64(object, desired, order) (*(object) = (desired))
+#define iree_atomic_fetch_add_int64(object, operand, order)                 \
+  iree_atomic_fetch_add_int64_impl((volatile iree_atomic_int64_t*)(object), \
+                                   (int64_t)(operand))
+#define iree_atomic_fetch_sub_int64(object, operand, order)                 \
+  iree_atomic_fetch_add_int64_impl((volatile iree_atomic_int64_t*)(object), \
+                                   -(int64_t)(operand))
+#define iree_atomic_fetch_and_int64(object, operand, order)                 \
+  iree_atomic_fetch_and_int64_impl((volatile iree_atomic_int64_t*)(object), \
+                                   (int64_t)(operand))
+#define iree_atomic_fetch_or_int64(object, operand, order)                 \
+  iree_atomic_fetch_or_int64_impl((volatile iree_atomic_int64_t*)(object), \
+                                  (int64_t)(operand))
+#define iree_atomic_fetch_xor_int64(object, operand, order)                 \
+  iree_atomic_fetch_xor_int64_impl((volatile iree_atomic_int64_t*)(object), \
+                                   (int64_t)(operand))
+#define iree_atomic_exchange_int64(object, desired, order) \
+  iree_atomic_fetch_exchange_int64_impl(                   \
+      (volatile iree_atomic_int64_t*)(object), (int64_t)(desired))
+#define iree_atomic_compare_exchange_strong_int64(object, expected, desired, \
+                                                  order_succ, order_fail)    \
+  iree_atomic_compare_exchange_int64_impl(                                   \
+      (volatile iree_atomic_int64_t*)(object), (int64_t*)(expected),         \
+      (int64_t)(desired))
+#define iree_atomic_compare_exchange_weak_int64 \
+  iree_atomic_compare_exchange_strong_int64
+
+static inline int32_t iree_atomic_fetch_add_int32_impl(
+    volatile iree_atomic_int32_t* object, int32_t operand) {
+  int32_t original = *object;
+  *object += operand;
+  return original;
+}
+
+static inline int32_t iree_atomic_fetch_and_int32_impl(
+    volatile iree_atomic_int32_t* object, int32_t operand) {
+  int32_t original = *object;
+  *object &= operand;
+  return original;
+}
+
+static inline int32_t iree_atomic_fetch_or_int32_impl(
+    volatile iree_atomic_int32_t* object, int32_t operand) {
+  int32_t original = *object;
+  *object |= operand;
+  return original;
+}
+
+static inline int32_t iree_atomic_fetch_xor_int32_impl(
+    volatile iree_atomic_int32_t* object, int32_t operand) {
+  int32_t original = *object;
+  *object ^= operand;
+  return original;
+}
+
+static inline int32_t iree_atomic_fetch_exchange_int32_impl(
+    volatile iree_atomic_int32_t* object, int32_t desired) {
+  int32_t original = *object;
+  *object = desired;
+  return original;
+}
+
+static inline bool iree_atomic_compare_exchange_int32_impl(
+    volatile iree_atomic_int32_t* object, int32_t* expected, int32_t desired) {
+  if (*object == *expected) {
+    *object = desired;
+    return true;
+  } else {
+    *expected = *object;
+    return false;
+  }
+}
+
+static inline int64_t iree_atomic_fetch_add_int64_impl(
+    volatile iree_atomic_int64_t* object, int64_t operand) {
+  int64_t original = *object;
+  *object += operand;
+  return original;
+}
+
+static inline int64_t iree_atomic_fetch_and_int64_impl(
+    volatile iree_atomic_int64_t* object, int64_t operand) {
+  int64_t original = *object;
+  *object &= operand;
+  return original;
+}
+
+static inline int64_t iree_atomic_fetch_or_int64_impl(
+    volatile iree_atomic_int64_t* object, int64_t operand) {
+  int64_t original = *object;
+  *object |= operand;
+  return original;
+}
+
+static inline int64_t iree_atomic_fetch_xor_int64_impl(
+    volatile iree_atomic_int64_t* object, int64_t operand) {
+  int64_t original = *object;
+  *object ^= operand;
+  return original;
+}
+
+static inline int64_t iree_atomic_fetch_exchange_int64_impl(
+    volatile iree_atomic_int64_t* object, int64_t desired) {
+  int64_t original = *object;
+  *object = desired;
+  return original;
+}
+
+static inline bool iree_atomic_compare_exchange_int64_impl(
+    volatile iree_atomic_int64_t* object, int64_t* expected, int64_t desired) {
+  if (*object == *expected) {
+    *object = desired;
+    return true;
+  } else {
+    *expected = *object;
+    return false;
+  }
+}
+
+// Pointer-width atomics are routed to the 32- or 64-bit variants depending on
+// the target pointer size.
+#if defined(IREE_PTR_SIZE_32)
+#define iree_atomic_load_intptr(object, order) \
+  (intptr_t) iree_atomic_load_int32((iree_atomic_int32_t*)(object), (order))
+#define iree_atomic_store_intptr(object, desired, order)             \
+  (intptr_t) iree_atomic_store_int32((iree_atomic_int32_t*)(object), \
+                                     (int32_t)(desired), (order))
+#define iree_atomic_fetch_add_intptr(object, operand, order)             \
+  (intptr_t) iree_atomic_fetch_add_int32((iree_atomic_int32_t*)(object), \
+                                         (int32_t)(operand), (order))
+#define iree_atomic_fetch_sub_intptr(object, operand, order)             \
+  (intptr_t) iree_atomic_fetch_sub_int32((iree_atomic_int32_t*)(object), \
+                                         (int32_t)(operand), (order))
+#define iree_atomic_exchange_intptr(object, desired, order)             \
+  (intptr_t) iree_atomic_exchange_int32((iree_atomic_int32_t*)(object), \
+                                        (int32_t)(desired), (order))
+#define iree_atomic_compare_exchange_strong_intptr(object, expected, desired, \
+                                                   order_succ, order_fail)    \
+  iree_atomic_compare_exchange_strong_int32(                                  \
+      (iree_atomic_int32_t*)(object), (int32_t*)(expected),                   \
+      (int32_t)(desired), (order_succ), (order_fail))
+#define iree_atomic_compare_exchange_weak_intptr \
+  iree_atomic_compare_exchange_strong_intptr
+#else
+#define iree_atomic_load_intptr(object, order) \
+  (intptr_t) iree_atomic_load_int64((iree_atomic_int64_t*)(object), (order))
+#define iree_atomic_store_intptr(object, desired, order)             \
+  (intptr_t) iree_atomic_store_int64((iree_atomic_int64_t*)(object), \
+                                     (int64_t)(desired), (order))
+#define iree_atomic_fetch_add_intptr(object, operand, order)             \
+  (intptr_t) iree_atomic_fetch_add_int64((iree_atomic_int64_t*)(object), \
+                                         (int64_t)(operand), (order))
+#define iree_atomic_fetch_sub_intptr(object, operand, order)             \
+  (intptr_t) iree_atomic_fetch_sub_int64((iree_atomic_int64_t*)(object), \
+                                         (int64_t)(operand), (order))
+#define iree_atomic_exchange_intptr(object, desired, order)             \
+  (intptr_t) iree_atomic_exchange_int64((iree_atomic_int64_t*)(object), \
+                                        (int64_t)(desired), (order))
+#define iree_atomic_compare_exchange_strong_intptr(object, expected, desired, \
+                                                   order_succ, order_fail)    \
+  iree_atomic_compare_exchange_strong_int64(                                  \
+      (iree_atomic_int64_t*)(object), (int64_t*)(expected),                   \
+      (int64_t)(desired), (order_succ), (order_fail))
+#define iree_atomic_compare_exchange_weak_intptr \
+  iree_atomic_compare_exchange_strong_intptr
+#endif  // IREE_PTR_SIZE_32
+
+// No-op fence: with synchronization disabled there is nothing to order.
+#define iree_atomic_thread_fence(order)
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+#endif  // IREE_BASE_INTERNAL_ATOMICS_DISABLED_H_
diff --git a/runtime/src/iree/base/internal/atomics_gcc.h b/runtime/src/iree/base/internal/atomics_gcc.h
new file mode 100644
index 0000000..1eb7170
--- /dev/null
+++ b/runtime/src/iree/base/internal/atomics_gcc.h
@@ -0,0 +1,89 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_ATOMICS_GCC_H_
+#define IREE_BASE_INTERNAL_ATOMICS_GCC_H_
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/target_platform.h"
+
+#if defined(IREE_COMPILER_GCC)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Memory orders map 1:1 onto GCC's __ATOMIC_* constants so they can be passed
+// straight through to the __atomic_* builtins below.
+typedef enum iree_memory_order_e {
+  iree_memory_order_relaxed = __ATOMIC_RELAXED,
+  iree_memory_order_consume = __ATOMIC_CONSUME,
+  iree_memory_order_acquire = __ATOMIC_ACQUIRE,
+  iree_memory_order_release = __ATOMIC_RELEASE,
+  iree_memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  iree_memory_order_seq_cst = __ATOMIC_SEQ_CST,
+} iree_memory_order_t;
+
+#define IREE_ATOMIC_VAR_INIT(value) (value)
+
+typedef int32_t iree_atomic_int32_t;
+typedef int64_t iree_atomic_int64_t;
+// typedef __int128 iree_atomic_int128_t;
+typedef intptr_t iree_atomic_intptr_t;
+
+#ifdef __cplusplus
+// Equiv to C++ auto keyword in C++ mode.
+#define __iree_auto_type auto
+#else
+// Only defined in C mode.
+#define __iree_auto_type __auto_type
+#endif
+
+// Load/store use the generic __atomic_load/__atomic_store builtins inside a
+// GNU statement expression so the macros stay type-generic; the statement
+// expression yields the loaded value.
+#define iree_atomic_load_auto(object, order)                       \
+  __extension__({                                                  \
+    __iree_auto_type __atomic_load_ptr = (object);                 \
+    __typeof__(*__atomic_load_ptr) __atomic_load_tmp;              \
+    __atomic_load(__atomic_load_ptr, &__atomic_load_tmp, (order)); \
+    __atomic_load_tmp;                                             \
+  })
+#define iree_atomic_store_auto(object, desired, order)                \
+  __extension__({                                                     \
+    __iree_auto_type __atomic_store_ptr = (object);                   \
+    __typeof__(*__atomic_store_ptr) __atomic_store_tmp = (desired);   \
+    __atomic_store(__atomic_store_ptr, &__atomic_store_tmp, (order)); \
+  })
+#define iree_atomic_fetch_add_auto(object, operand, order) \
+  __atomic_fetch_add((object), (operand), (order))
+#define iree_atomic_fetch_sub_auto(object, operand, order) \
+  __atomic_fetch_sub((object), (operand), (order))
+#define iree_atomic_fetch_and_auto(object, operand, order) \
+  __atomic_fetch_and((object), (operand), (order))
+#define iree_atomic_fetch_or_auto(object, operand, order) \
+  __atomic_fetch_or((object), (operand), (order))
+#define iree_atomic_fetch_xor_auto(object, operand, order) \
+  __atomic_fetch_xor((object), (operand), (order))
+#define iree_atomic_exchange_auto(object, operand, order) \
+  __atomic_exchange_n((object), (operand), (order))
+// C11 compare-exchange semantics: true on success; on failure the observed
+// value is written back into |expected|.
+#define iree_atomic_compare_exchange_strong_auto(object, expected, desired, \
+                                                 order_succ, order_fail)    \
+  __atomic_compare_exchange_n(object, expected, desired, /*weak=*/false,    \
+                              (order_succ), (order_fail))
+#define iree_atomic_compare_exchange_weak_auto(object, expected, desired, \
+                                               order_succ, order_fail)    \
+  __atomic_compare_exchange_n(object, expected, desired, /*weak=*/true,   \
+                              (order_succ), (order_fail))
+
+#define iree_atomic_thread_fence(order) __atomic_thread_fence(order)
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_COMPILER_GCC
+
+#endif  // IREE_BASE_INTERNAL_ATOMICS_GCC_H_
diff --git a/runtime/src/iree/base/internal/atomics_msvc.h b/runtime/src/iree/base/internal/atomics_msvc.h
new file mode 100644
index 0000000..5cfbf43
--- /dev/null
+++ b/runtime/src/iree/base/internal/atomics_msvc.h
@@ -0,0 +1,182 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_ATOMICS_MSVC_H_
+#define IREE_BASE_INTERNAL_ATOMICS_MSVC_H_
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/target_platform.h"
+
+#if defined(IREE_COMPILER_MSVC)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Memory orders are accepted for API compatibility but effectively ignored:
+// the Interlocked* intrinsics used below impose a full barrier regardless.
+typedef enum iree_memory_order_e {
+  iree_memory_order_relaxed,
+  iree_memory_order_consume,
+  iree_memory_order_acquire,
+  iree_memory_order_release,
+  iree_memory_order_acq_rel,
+  iree_memory_order_seq_cst,
+} iree_memory_order_t;
+
+// Braced initializer because the atomic types below are structs.
+#define IREE_ATOMIC_VAR_INIT(value) \
+  { (value) }
+
+// Struct wrappers around the raw storage; the macros below access the storage
+// by casting the struct pointer to the matching Interlocked* operand type.
+typedef struct {
+  int32_t __val;
+} iree_atomic_int32_t;
+typedef struct {
+  int64_t __val;
+} iree_atomic_int64_t;
+// typedef __declspec(align(16)) struct {
+//   uint64_t __val[2];
+// } iree_atomic_int128_t;
+typedef struct {
+  intptr_t __val;
+} iree_atomic_intptr_t;
+
+// Loads are implemented as an interlocked add of 0: an atomic read with a
+// full barrier.
+#define iree_atomic_load_int32(object, order) \
+  InterlockedExchangeAdd((volatile LONG*)object, 0)
+#define iree_atomic_store_int32(object, desired, order) \
+  InterlockedExchange((volatile LONG*)object, desired)
+#define iree_atomic_fetch_add_int32(object, operand, order) \
+  InterlockedExchangeAdd((volatile LONG*)object, operand)
+#define iree_atomic_fetch_sub_int32(object, operand, order) \
+  InterlockedExchangeAdd((volatile LONG*)object, -((int32_t)(operand)))
+#define iree_atomic_fetch_and_int32(object, operand, order) \
+  InterlockedAnd((volatile LONG*)object, operand)
+#define iree_atomic_fetch_or_int32(object, operand, order) \
+  InterlockedOr((volatile LONG*)object, operand)
+#define iree_atomic_fetch_xor_int32(object, operand, order) \
+  InterlockedXor((volatile LONG*)object, operand)
+#define iree_atomic_exchange_int32(object, desired, order) \
+  InterlockedExchange((volatile LONG*)object, desired)
+// C11 compare-exchange semantics: true on success; on failure the observed
+// value is written back into |expected|.
+#define iree_atomic_compare_exchange_strong_int32(object, expected, desired, \
+                                                  order_succ, order_fail)    \
+  iree_atomic_compare_exchange_strong_int32_impl(                            \
+      (volatile iree_atomic_int32_t*)(object), (int32_t*)(expected),         \
+      (int32_t)(desired), (order_succ), (order_fail))
+#define iree_atomic_compare_exchange_weak_int32 \
+  iree_atomic_compare_exchange_strong_int32
+
+// 64-bit variants of the Interlocked-based ops above; loads are an
+// interlocked add of 0 and (order) arguments are ignored (full barrier).
+#define iree_atomic_load_int64(object, order) \
+  InterlockedExchangeAdd64((volatile LONG64*)object, 0)
+#define iree_atomic_store_int64(object, desired, order) \
+  InterlockedExchange64((volatile LONG64*)object, (LONG64)desired)
+#define iree_atomic_fetch_add_int64(object, operand, order) \
+  InterlockedExchangeAdd64((volatile LONG64*)object, (LONG64)operand)
+// Widen to int64 *before* negating so narrow/unsigned operands subtract
+// correctly (mirrors the int32 variant above).
+#define iree_atomic_fetch_sub_int64(object, operand, order) \
+  InterlockedExchangeAdd64((volatile LONG64*)object, -((int64_t)(operand)))
+#define iree_atomic_fetch_and_int64(object, operand, order) \
+  InterlockedAnd64((volatile LONG64*)object, operand)
+#define iree_atomic_fetch_or_int64(object, operand, order) \
+  InterlockedOr64((volatile LONG64*)object, operand)
+#define iree_atomic_fetch_xor_int64(object, operand, order) \
+  InterlockedXor64((volatile LONG64*)object, operand)
+#define iree_atomic_exchange_int64(object, desired, order) \
+  InterlockedExchange64((volatile LONG64*)object, desired)
+#define iree_atomic_compare_exchange_strong_int64(object, expected, desired, \
+                                                  order_succ, order_fail)    \
+  iree_atomic_compare_exchange_strong_int64_impl(                            \
+      (volatile iree_atomic_int64_t*)(object), (int64_t*)(expected),         \
+      (int64_t)(desired), (order_succ), (order_fail))
+#define iree_atomic_compare_exchange_weak_int64 \
+  iree_atomic_compare_exchange_strong_int64
+
+#define iree_atomic_thread_fence(order) MemoryBarrier()
+
+static inline bool iree_atomic_compare_exchange_strong_int32_impl(
+    volatile iree_atomic_int32_t* object, int32_t* expected, int32_t desired,
+    iree_memory_order_t order_succ, iree_memory_order_t order_fail) {
+  int32_t expected_value = *expected;
+  int32_t old_value = InterlockedCompareExchange((volatile LONG*)object,
+                                                 desired, expected_value);
+  if (old_value == expected_value) {
+    return true;
+  } else {
+    *expected = old_value;
+    return false;
+  }
+}
+
+static inline bool iree_atomic_compare_exchange_strong_int64_impl(
+    volatile iree_atomic_int64_t* object, int64_t* expected, int64_t desired,
+    iree_memory_order_t order_succ, iree_memory_order_t order_fail) {
+  int64_t expected_value = *expected;
+  int64_t old_value = InterlockedCompareExchange64((volatile LONG64*)object,
+                                                   desired, expected_value);
+  if (old_value == expected_value) {
+    return true;
+  } else {
+    *expected = old_value;
+    return false;
+  }
+}
+
+// There are no pointer-width atomic ops in MSVC so we need to specialize based
+// on the pointer size.
+#if defined(IREE_PTR_SIZE_32)
+#define iree_atomic_load_intptr(object, order) \
+  (intptr_t) iree_atomic_load_int32((iree_atomic_int32_t*)(object), (order))
+#define iree_atomic_store_intptr(object, desired, order)             \
+  (intptr_t) iree_atomic_store_int32((iree_atomic_int32_t*)(object), \
+                                     (int32_t)(desired), (order))
+#define iree_atomic_fetch_add_intptr(object, operand, order)             \
+  (intptr_t) iree_atomic_fetch_add_int32((iree_atomic_int32_t*)(object), \
+                                         (int32_t)(operand), (order))
+#define iree_atomic_fetch_sub_intptr(object, operand, order)             \
+  (intptr_t) iree_atomic_fetch_sub_int32((iree_atomic_int32_t*)(object), \
+                                         (int32_t)(operand), (order))
+#define iree_atomic_exchange_intptr(object, desired, order)             \
+  (intptr_t) iree_atomic_exchange_int32((iree_atomic_int32_t*)(object), \
+                                        (int32_t)(desired), (order))
+#define iree_atomic_compare_exchange_strong_intptr(object, expected, desired, \
+                                                   order_succ, order_fail)    \
+  iree_atomic_compare_exchange_strong_int32(                                  \
+      (iree_atomic_int32_t*)(object), (int32_t*)(expected),                   \
+      (int32_t)(desired), (order_succ), (order_fail))
+#define iree_atomic_compare_exchange_weak_intptr \
+  iree_atomic_compare_exchange_strong_intptr
+#else
+#define iree_atomic_load_intptr(object, order) \
+  (intptr_t) iree_atomic_load_int64((iree_atomic_int64_t*)(object), (order))
+#define iree_atomic_store_intptr(object, desired, order)             \
+  (intptr_t) iree_atomic_store_int64((iree_atomic_int64_t*)(object), \
+                                     (int64_t)(desired), (order))
+#define iree_atomic_fetch_add_intptr(object, operand, order)             \
+  (intptr_t) iree_atomic_fetch_add_int64((iree_atomic_int64_t*)(object), \
+                                         (int64_t)(operand), (order))
+#define iree_atomic_fetch_sub_intptr(object, operand, order)             \
+  (intptr_t) iree_atomic_fetch_sub_int64((iree_atomic_int64_t*)(object), \
+                                         (int64_t)(operand), (order))
+#define iree_atomic_exchange_intptr(object, desired, order)             \
+  (intptr_t) iree_atomic_exchange_int64((iree_atomic_int64_t*)(object), \
+                                        (int64_t)(desired), (order))
+#define iree_atomic_compare_exchange_strong_intptr(object, expected, desired, \
+                                                   order_succ, order_fail)    \
+  iree_atomic_compare_exchange_strong_int64(                                  \
+      (iree_atomic_int64_t*)(object), (int64_t*)(expected),                   \
+      (int64_t)(desired), (order_succ), (order_fail))
+#define iree_atomic_compare_exchange_weak_intptr \
+  iree_atomic_compare_exchange_strong_intptr
+#endif  // IREE_PTR_SIZE_32
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_COMPILER_MSVC
+
+#endif  // IREE_BASE_INTERNAL_ATOMICS_MSVC_H_
diff --git a/runtime/src/iree/base/internal/atomics_test.cc b/runtime/src/iree/base/internal/atomics_test.cc
new file mode 100644
index 0000000..a9fce2f
--- /dev/null
+++ b/runtime/src/iree/base/internal/atomics_test.cc
@@ -0,0 +1,102 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/atomics.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include "iree/testing/gtest.h"
+
+namespace {
+
+// NOTE: these tests are just to ensure we correctly compile the macros across
+// our supported toolchains: they don't verify that the memory semantics are
+// correct (as that would be difficult and is really the toolchain's job).
+
+// Basic load/store round-trip through the intptr-width ops.
+TEST(AtomicPtr, LoadStore) {
+  intptr_t ptr_0 = 0x0;
+  intptr_t ptr_1 = 0x1;
+  iree_atomic_intptr_t value = IREE_ATOMIC_VAR_INIT(ptr_0);
+  EXPECT_EQ(ptr_0, iree_atomic_load_intptr(&value, iree_memory_order_seq_cst));
+  iree_atomic_store_intptr(&value, ptr_1, iree_memory_order_seq_cst);
+  EXPECT_EQ(ptr_1, iree_atomic_load_intptr(&value, iree_memory_order_seq_cst));
+}
+
+// fetch_add/fetch_sub must return the pre-op value.
+TEST(AtomicPtr, AddSub) {
+  intptr_t ptr_0 = 0x0;
+  intptr_t ptr_1 = 0x1;
+  intptr_t ptr_2 = 0x2;
+  iree_atomic_intptr_t value = IREE_ATOMIC_VAR_INIT(ptr_0);
+  EXPECT_EQ(ptr_0, iree_atomic_fetch_add_intptr(&value, ptr_1,
+                                                iree_memory_order_seq_cst));
+  EXPECT_EQ(ptr_1, iree_atomic_fetch_add_intptr(&value, ptr_1,
+                                                iree_memory_order_seq_cst));
+  EXPECT_EQ(ptr_2, iree_atomic_fetch_sub_intptr(&value, ptr_1,
+                                                iree_memory_order_seq_cst));
+  EXPECT_EQ(ptr_1, iree_atomic_fetch_sub_intptr(&value, ptr_1,
+                                                iree_memory_order_seq_cst));
+  EXPECT_EQ(ptr_0, iree_atomic_load_intptr(&value, iree_memory_order_seq_cst));
+}
+
+// exchange must return the previously stored value.
+TEST(AtomicPtr, Exchange) {
+  intptr_t ptr_0 = 0x0;
+  intptr_t ptr_1 = 0x1;
+  intptr_t ptr_2 = 0x2;
+  iree_atomic_intptr_t value = IREE_ATOMIC_VAR_INIT(ptr_0);
+  EXPECT_EQ(ptr_0, iree_atomic_exchange_intptr(&value, ptr_1,
+                                               iree_memory_order_seq_cst));
+  EXPECT_EQ(ptr_1, iree_atomic_exchange_intptr(&value, ptr_2,
+                                               iree_memory_order_seq_cst));
+  EXPECT_EQ(ptr_2, iree_atomic_load_intptr(&value, iree_memory_order_seq_cst));
+}
+
+// CAS: success replaces the value and leaves |expected| untouched; failure
+// leaves the value untouched and writes the observed value into |expected|.
+TEST(AtomicPtr, CompareExchange) {
+  intptr_t ptr_0 = 0x0;
+  intptr_t ptr_1 = 0x1;
+  intptr_t ptr_2 = 0x2;
+  iree_atomic_intptr_t value = IREE_ATOMIC_VAR_INIT(ptr_0);
+  intptr_t ptr_expected = 0;
+
+  // OK: value == ptr_0, CAS(ptr_0 -> ptr_1)
+  iree_atomic_store_intptr(&value, ptr_0, iree_memory_order_seq_cst);
+  ptr_expected = ptr_0;
+  EXPECT_TRUE(iree_atomic_compare_exchange_strong_intptr(
+      &value, &ptr_expected, ptr_1, iree_memory_order_seq_cst,
+      iree_memory_order_seq_cst));
+  EXPECT_EQ(ptr_0, ptr_expected);
+  EXPECT_EQ(ptr_1, iree_atomic_load_intptr(&value, iree_memory_order_seq_cst));
+
+  // OK: value == ptr_1, CAS(ptr_1 -> ptr_2)
+  iree_atomic_store_intptr(&value, ptr_1, iree_memory_order_seq_cst);
+  ptr_expected = ptr_1;
+  EXPECT_TRUE(iree_atomic_compare_exchange_strong_intptr(
+      &value, &ptr_expected, ptr_2, iree_memory_order_seq_cst,
+      iree_memory_order_seq_cst));
+  EXPECT_EQ(ptr_1, ptr_expected);
+  EXPECT_EQ(ptr_2, iree_atomic_load_intptr(&value, iree_memory_order_seq_cst));
+
+  // FAIL: value == ptr_0, CAS(ptr_1 -> ptr_2)
+  iree_atomic_store_intptr(&value, ptr_0, iree_memory_order_seq_cst);
+  ptr_expected = ptr_1;
+  EXPECT_FALSE(iree_atomic_compare_exchange_strong_intptr(
+      &value, &ptr_expected, ptr_2, iree_memory_order_seq_cst,
+      iree_memory_order_seq_cst));
+  EXPECT_EQ(ptr_0, ptr_expected);
+  EXPECT_EQ(ptr_0, iree_atomic_load_intptr(&value, iree_memory_order_seq_cst));
+}
+
+// The expectations imply init sets the count to 1 and dec returns the value
+// *before* decrementing (hence 3, 2, 1 after two increments).
+TEST(AtomicRefCount, IncDec) {
+  iree_atomic_ref_count_t count;
+  iree_atomic_ref_count_init(&count);
+  iree_atomic_ref_count_inc(&count);
+  iree_atomic_ref_count_inc(&count);
+  EXPECT_EQ(3, iree_atomic_ref_count_dec(&count));
+  EXPECT_EQ(2, iree_atomic_ref_count_dec(&count));
+  EXPECT_EQ(1, iree_atomic_ref_count_dec(&count));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/call_once.h b/runtime/src/iree/base/internal/call_once.h
new file mode 100644
index 0000000..da411dd
--- /dev/null
+++ b/runtime/src/iree/base/internal/call_once.h
@@ -0,0 +1,109 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_CALL_ONCE_H_
+#define IREE_BASE_INTERNAL_CALL_ONCE_H_
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//==============================================================================
+// iree_call_once
+//==============================================================================
+// Emulates the C11 call_once feature as few seem to have it.
+// https://en.cppreference.com/w/c/thread/call_once
+
+#if defined(__has_include)
+#if __has_include(<threads.h>)
+#define IREE_HAS_C11_THREAD_H 1
+#endif
+#endif
+
+#if defined(IREE_HAS_C11_THREAD_H)
+
+// Always prefer the C11 header if present. Note the C11 spellings: the header
+// is <threads.h> and the flag type is once_flag (C11 7.26).
+#include <threads.h>
+#define IREE_ONCE_FLAG_INIT ONCE_FLAG_INIT
+#define iree_once_flag once_flag
+#define iree_call_once call_once
+
+#elif defined(IREE_PLATFORM_WINDOWS)
+
+// Windows fallback using the native InitOnceExecuteOnce:
+// https://docs.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-initonceexecuteonce
+
+// Expands to a value that can be used to initialize an object of type
+// iree_once_flag.
+#define IREE_ONCE_FLAG_INIT INIT_ONCE_STATIC_INIT
+
+// Complete object type capable of holding a flag used by iree_call_once.
+typedef INIT_ONCE iree_once_flag;
+
+// Adapter payload carrying the user callback through the PVOID parameter.
+typedef struct {
+  void (*func)(void);
+} iree_call_once_impl_params_t;
+static BOOL CALLBACK iree_call_once_callback_impl(PINIT_ONCE InitOnce,
+                                                  PVOID Parameter,
+                                                  PVOID* Context) {
+  // https://docs.microsoft.com/en-us/windows/win32/api/synchapi/nc-synchapi-pinit_once_fn
+  iree_call_once_impl_params_t* param =
+      (iree_call_once_impl_params_t*)Parameter;
+  (param->func)();
+  ((void)InitOnce);
+  ((void)Context);  // suppress warning
+  // TRUE marks the one-time initialization as successfully completed.
+  return TRUE;
+}
+
+// Calls |func| exactly once, even if invoked from several threads.
+// The completion of the function synchronizes with all previous or subsequent
+// calls to call_once with the same flag variable.
+// NOTE: |param| may live on the stack because InitOnceExecuteOnce (without
+// INIT_ONCE_ASYNC) invokes the callback synchronously before returning.
+static inline void iree_call_once(iree_once_flag* flag, void (*func)(void)) {
+  iree_call_once_impl_params_t param;
+  param.func = func;
+  InitOnceExecuteOnce(flag, iree_call_once_callback_impl, (PVOID)&param, NULL);
+}
+
+#elif IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+// Single-threaded fallback when thread control is compiled out. |func| must
+// still run exactly once: the flag starts at 1 ("not yet run", matching
+// IREE_ONCE_FLAG_INIT) and gates the one invocation.
+#define IREE_ONCE_FLAG_INIT 1
+#define iree_once_flag uint32_t
+static inline void iree_call_once(iree_once_flag* flag, void (*func)(void)) {
+  if (*flag) {
+    *flag = 0;
+    func();
+  }
+}
+
+#else
+
+// Fallback using pthread_once:
+// https://pubs.opengroup.org/onlinepubs/007908775/xsh/pthread_once.html
+
+#include <pthread.h>
+
+// Expands to a value that can be used to initialize an object of type
+// iree_once_flag.
+#define IREE_ONCE_FLAG_INIT PTHREAD_ONCE_INIT
+
+// Complete object type capable of holding a flag used by iree_call_once.
+typedef pthread_once_t iree_once_flag;
+
+// Calls |func| exactly once, even if invoked from several threads.
+// The completion of the function synchronizes with all previous or subsequent
+// calls to call_once with the same flag variable.
+static inline void iree_call_once(iree_once_flag* flag, void (*func)(void)) {
+  // Return value deliberately ignored: per POSIX, pthread_once fails only for
+  // invalid arguments.
+  pthread_once(flag, func);
+}
+
+#endif  // IREE_HAS_C11_THREAD_H / fallbacks
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_CALL_ONCE_H_
diff --git a/runtime/src/iree/base/internal/cpu.c b/runtime/src/iree/base/internal/cpu.c
new file mode 100644
index 0000000..2a0ed83
--- /dev/null
+++ b/runtime/src/iree/base/internal/cpu.c
@@ -0,0 +1,61 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: must be first before _any_ system includes.
+#define _GNU_SOURCE
+
+#include "iree/base/internal/cpu.h"
+
+#include "iree/base/target_platform.h"
+
+//===----------------------------------------------------------------------===//
+// iree_cpu_*
+//===----------------------------------------------------------------------===//
+
+#if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_LINUX)
+
+#include <sched.h>
+
+iree_cpu_processor_id_t iree_cpu_query_processor_id(void) {
+  // This path is relatively portable and should work on linux/bsd/etc-likes.
+  // We may want to use getcpu when available so that we can get the group ID.
+  // https://man7.org/linux/man-pages/man3/sched_getcpu.3.html
+  //
+  // libc implementations can use vDSO and other fun stuff to make this really
+  // cheap: http://git.musl-libc.org/cgit/musl/tree/src/sched/sched_getcpu.c
+  int id = sched_getcpu();
+  // sched_getcpu returns -1 on failure; report processor 0 so callers always
+  // receive a valid (if approximate) ID.
+  return id != -1 ? id : 0;
+}
+
+#elif defined(IREE_PLATFORM_WINDOWS)
+
+iree_cpu_processor_id_t iree_cpu_query_processor_id(void) {
+  PROCESSOR_NUMBER pn;
+  GetCurrentProcessorNumberEx(&pn);
+  // Flatten (group, number) into a single ID; a Windows processor group holds
+  // at most 64 logical processors.
+  return 64 * pn.Group + pn.Number;
+}
+
+#else
+
+// No implementation.
+// We could allow an iree/base/config.h override to externalize this.
+iree_cpu_processor_id_t iree_cpu_query_processor_id(void) { return 0; }
+
+#endif  // IREE_PLATFORM_*
+
+void iree_cpu_requery_processor_id(iree_cpu_processor_tag_t* IREE_RESTRICT tag,
+                                   iree_cpu_processor_id_t* IREE_RESTRICT
+                                       processor_id) {
+  IREE_ASSERT_ARGUMENT(tag);
+  IREE_ASSERT_ARGUMENT(processor_id);
+
+  // TODO(benvanik): set a frequency for this and use a coarse timer
+  // (CLOCK_MONOTONIC_COARSE) to do a ~4-10Hz refresh. We can store the last
+  // query time and the last processor ID in the tag and only perform the query
+  // if it has changed.
+
+  // |tag| is currently unused beyond validation: every call performs a fresh
+  // query (the memoization described in the header is not implemented yet).
+  *processor_id = iree_cpu_query_processor_id();
+}
diff --git a/runtime/src/iree/base/internal/cpu.h b/runtime/src/iree/base/internal/cpu.h
new file mode 100644
index 0000000..914f39d
--- /dev/null
+++ b/runtime/src/iree/base/internal/cpu.h
@@ -0,0 +1,40 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_CPU_H_
+#define IREE_BASE_INTERNAL_CPU_H_
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_cpu_*
+//===----------------------------------------------------------------------===//
+
+// Identifies a logical processor (hardware thread) in the system.
+typedef uint32_t iree_cpu_processor_id_t;
+// Caller-held memoization slot for iree_cpu_requery_processor_id;
+// initialize to 0 and treat as opaque.
+typedef uint32_t iree_cpu_processor_tag_t;
+
+// Returns the ID of the logical processor executing this code.
+iree_cpu_processor_id_t iree_cpu_query_processor_id(void);
+
+// Returns the ID of the logical processor executing this code, using |tag| to
+// memoize the query in cases where it does not change frequently.
+// |tag| must be initialized to 0 on first call and may be reset to 0 by the
+// caller at any time to invalidate the cached result.
+void iree_cpu_requery_processor_id(iree_cpu_processor_tag_t* IREE_RESTRICT tag,
+                                   iree_cpu_processor_id_t* IREE_RESTRICT
+                                       processor_id);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_INTERNAL_CPU_H_
diff --git a/runtime/src/iree/base/internal/debugging.h b/runtime/src/iree/base/internal/debugging.h
new file mode 100644
index 0000000..0bf232c
--- /dev/null
+++ b/runtime/src/iree/base/internal/debugging.h
@@ -0,0 +1,109 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_DEBUGGING_H_
+#define IREE_BASE_INTERNAL_DEBUGGING_H_
+
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forces inlining even in unoptimized builds so that iree_debug_break does
+// not introduce its own stack frame (see note below).
+#if defined(IREE_COMPILER_GCC_COMPAT)
+#define IREE_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
+#elif defined(IREE_COMPILER_MSVC)
+#define IREE_ATTRIBUTE_ALWAYS_INLINE __forceinline
+#else
+#define IREE_ATTRIBUTE_ALWAYS_INLINE
+#endif  // IREE_COMPILER_*
+
+//===----------------------------------------------------------------------===//
+// Debugger interaction
+//===----------------------------------------------------------------------===//
+// NOTE: in general it's not a good idea to change program behavior when running
+// under a debugger as that then makes it harder to reproduce and successfully
+// debug issues that happen without the debugger.
+
+// Forces a break into an attached debugger.
+// May be ignored if no debugger is attached or raise a signal that gives the
+// option to attach a debugger.
+//
+// We implement this directly in the header with ALWAYS_INLINE so that the
+// stack doesn't get all messed up.
+IREE_ATTRIBUTE_ALWAYS_INLINE static inline void iree_debug_break(void) {
+#if defined(IREE_COMPILER_HAS_BUILTIN_DEBUG_TRAP)
+  __builtin_debugtrap();
+#elif defined(IREE_PLATFORM_WINDOWS)
+  __debugbreak();
+#elif defined(IREE_ARCH_ARM_32)
+  // Undefined-instruction encoding commonly used as a software breakpoint.
+  __asm__ volatile(".inst 0xe7f001f0");
+#elif defined(IREE_ARCH_ARM_64)
+  // BRK #0: the architectural breakpoint instruction.
+  __asm__ volatile(".inst 0xd4200000");
+#elif defined(IREE_ARCH_X86_32) || defined(IREE_ARCH_X86_64)
+  // int3 software breakpoint interrupt.
+  __asm__ volatile("int $0x03");
+#elif defined(IREE_PLATFORM_EMSCRIPTEN)
+  // NOTE(review): EM_ASM is declared in emscripten.h, which is not included
+  // here -- presumably available transitively; confirm.
+  EM_ASM({ debugger; });
+#else
+  // NOTE: this is unrecoverable and debugging cannot continue.
+  __builtin_trap();
+#endif  // IREE_COMPILER_HAS_BUILTIN_DEBUG_TRAP
+}
+
+//===----------------------------------------------------------------------===//
+// Sanitizer interfaces
+//===----------------------------------------------------------------------===//
+// These provide hints to the various -fsanitize= features that help us indicate
+// what our code is doing to prevent false positives and gain additional
+// coverage. By default the sanitizers try to hook platform features like
+// mutexes and threads and our own implementations of those aren't automatically
+// picked up. In addition, specific uses of memory like arenas can thwart tools
+// like ASAN that try to detect accesses to freed memory because we are never
+// actually malloc()'ing and free()'ing and need to tell ASAN when blocks of
+// memory come into/out-of the pool.
+//
+// The documentation on these interfaces is pretty sparse but it's possible to
+// find usage examples of the hooks in the compiler-provided hooks themselves.
+//
+// The headers can be viewed here:
+// https://github.com/llvm/llvm-project/tree/main/compiler-rt/include/sanitizer
+// And common interceptors here:
+// https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
+//
+// NOTE: don't assume the presence of a sanitizer implies clang+llvm+x86! GCC
+// supports all of the sanitizers and MSVC supports ASAN and almost all of them
+// can be used on non-x86 platforms.
+
+#if defined(IREE_SANITIZER_ADDRESS)
+#include <sanitizer/asan_interface.h>
+#include <sanitizer/lsan_interface.h>
+#endif  // IREE_SANITIZER_ADDRESS
+
+// For whenever we want to provide specialized msan/tsan hooks:
+//   #if defined(IREE_SANITIZER_MEMORY)
+//   #include <sanitizer/msan_interface.h>
+//   #endif  // IREE_SANITIZER_MEMORY
+//   #if defined(IREE_SANITIZER_THREAD)
+//   #include <sanitizer/tsan_interface.h>
+//   #endif  // IREE_SANITIZER_THREAD
+
+// Suppresses leak detection false-positives in a region. May be nested.
+// Do not use this for any IREE-owned code: fix your leaks! This is useful when
+// third-party libraries or system calls may create false positives or just be
+// leaky such as GPU drivers and shader compilers (which are notoriously bad).
+//
+// NOTE: despite the PUSH/POP naming these expand to __lsan_disable/enable,
+// which support nesting (hence "may be nested" above).
+#if defined(IREE_SANITIZER_ADDRESS)
+#define IREE_LEAK_CHECK_DISABLE_PUSH() __lsan_disable()
+#define IREE_LEAK_CHECK_DISABLE_POP() __lsan_enable()
+#else
+#define IREE_LEAK_CHECK_DISABLE_PUSH()
+#define IREE_LEAK_CHECK_DISABLE_POP()
+#endif  // IREE_SANITIZER_ADDRESS
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_DEBUGGING_H_
diff --git a/runtime/src/iree/base/internal/dynamic_library.h b/runtime/src/iree/base/internal/dynamic_library.h
new file mode 100644
index 0000000..9856269
--- /dev/null
+++ b/runtime/src/iree/base/internal/dynamic_library.h
@@ -0,0 +1,80 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_DYNAMIC_LIBRARY_H_
+#define IREE_BASE_INTERNAL_DYNAMIC_LIBRARY_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Defines the behavior of the dynamic library loader.
+enum iree_dynamic_library_flag_bits_t {
+  IREE_DYNAMIC_LIBRARY_FLAG_NONE = 0u,
+};
+typedef uint32_t iree_dynamic_library_flags_t;
+
+// Dynamic library (aka shared object) cross-platform wrapper.
+// Reference counted; see iree_dynamic_library_retain/release.
+typedef struct iree_dynamic_library_t iree_dynamic_library_t;
+
+// Loads a system library using both the system library load paths and the given
+// file name. The path may be absolute or relative.
+//
+// For process-wide search control the LD_LIBRARY_PATH (Linux) or PATH (Windows)
+// is used in addition to the default search path rules of the platform.
+iree_status_t iree_dynamic_library_load_from_file(
+    const char* file_path, iree_dynamic_library_flags_t flags,
+    iree_allocator_t allocator, iree_dynamic_library_t** out_library);
+
+// Loads a system library using both the system library load paths and the given
+// search path/alternative file names. The paths may be absolute or
+// relative.
+//
+// For process-wide search control the LD_LIBRARY_PATH (Linux) or PATH (Windows)
+// is used in addition to the default search path rules of the platform.
+iree_status_t iree_dynamic_library_load_from_files(
+    iree_host_size_t search_path_count, const char* const* search_paths,
+    iree_dynamic_library_flags_t flags, iree_allocator_t allocator,
+    iree_dynamic_library_t** out_library);
+
+// Opens a dynamic library from a range of bytes in memory.
+// |identifier| will be used as the module name in debugging/profiling tools.
+// |buffer| must remain live for the lifetime of the library.
+iree_status_t iree_dynamic_library_load_from_memory(
+    iree_string_view_t identifier, iree_const_byte_span_t buffer,
+    iree_dynamic_library_flags_t flags, iree_allocator_t allocator,
+    iree_dynamic_library_t** out_library);
+
+// Retains the given |library| for the caller.
+void iree_dynamic_library_retain(iree_dynamic_library_t* library);
+
+// Releases the given |library| from the caller.
+void iree_dynamic_library_release(iree_dynamic_library_t* library);
+
+// Performs a symbol lookup in the dynamic library exports.
+// On success |out_fn| receives the resolved symbol address.
+iree_status_t iree_dynamic_library_lookup_symbol(
+    iree_dynamic_library_t* library, const char* symbol_name, void** out_fn);
+
+// Loads a debug database (PDB/DWARF/etc) from the given path providing debug
+// symbols for this library and attaches it to the symbol store (if active).
+iree_status_t iree_dynamic_library_attach_symbols_from_file(
+    iree_dynamic_library_t* library, const char* file_path);
+
+// Loads a debug database (PDB/DWARF/etc) from a range of bytes in memory and
+// attaches it to the symbol store (if active). |buffer| must remain live for
+// the lifetime of the library.
+iree_status_t iree_dynamic_library_attach_symbols_from_memory(
+    iree_dynamic_library_t* library, iree_const_byte_span_t buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_DYNAMIC_LIBRARY_H_
diff --git a/runtime/src/iree/base/internal/dynamic_library_posix.c b/runtime/src/iree/base/internal/dynamic_library_posix.c
new file mode 100644
index 0000000..1e14f32
--- /dev/null
+++ b/runtime/src/iree/base/internal/dynamic_library_posix.c
@@ -0,0 +1,330 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/call_once.h"
+#include "iree/base/internal/dynamic_library.h"
+#include "iree/base/internal/file_path.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+#if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_APPLE) || \
+    defined(IREE_PLATFORM_LINUX) || defined(IREE_PLATFORM_EMSCRIPTEN)
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+struct iree_dynamic_library_t {
+  iree_atomic_ref_count_t ref_count;
+  iree_allocator_t allocator;
+
+  // dlopen shared object handle.
+  void* handle;
+};
+
+// Allocate a new string from |allocator| returned in |out_file_path| containing
+// a path to a unique file on the filesystem.
+//
+// mkstemp is used only to reserve a unique base name: the caller ultimately
+// writes a different file formed by appending "_prefix.extension".
+static iree_status_t iree_dynamic_library_make_temp_file_path(
+    const char* prefix, const char* extension, iree_allocator_t allocator,
+    const char* tmpdir, char** out_file_path) {
+  // Stamp in a unique file name (replacing XXXXXX in the string).
+  char temp_path[512];
+  if (snprintf(temp_path, sizeof(temp_path), "%s/iree_dylib_XXXXXX", tmpdir) >=
+      (int)sizeof(temp_path)) {
+    // NOTE: we could dynamically allocate things, but didn't seem worth it.
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "TMPDIR name too long (>%zu chars); keep it reasonable",
+        sizeof(temp_path));
+  }
+  int fd = mkstemp(temp_path);
+  if (fd < 0) {
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "unable to mkstemp file");
+  }
+  // Only the unique path matters; close the descriptor mkstemp opened so we
+  // don't leak one fd per library loaded.
+  close(fd);
+
+  // Allocate storage for the full file path and format it in.
+  int file_path_length =
+      snprintf(NULL, 0, "%s_%s.%s", temp_path, prefix, extension);
+  if (file_path_length < 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "unable to form temp path string");
+  }
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      allocator, file_path_length + /*NUL=*/1, (void**)out_file_path));
+  snprintf(*out_file_path, file_path_length + /*NUL=*/1, "%s_%s.%s", temp_path,
+           prefix, extension);
+
+  // Canonicalize away any double path separators.
+  iree_file_path_canonicalize(*out_file_path, file_path_length);
+
+  return iree_ok_status();
+}
+
+// Creates a temp file and writes the |source_data| into it.
+// The file path is returned in |out_file_path|; on failure the path string is
+// freed and must not be used.
+static iree_status_t iree_dynamic_library_write_temp_file(
+    iree_const_byte_span_t source_data, const char* prefix,
+    const char* extension, iree_allocator_t allocator, const char* tmpdir,
+    char** out_file_path) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Reserve a temp file path we can write to.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_dynamic_library_make_temp_file_path(prefix, extension, allocator,
+                                                   tmpdir, out_file_path));
+
+  iree_status_t status = iree_ok_status();
+
+  // Open the file for writing.
+  FILE* file_handle = fopen(*out_file_path, "wb");
+  if (file_handle == NULL) {
+    status = iree_make_status(iree_status_code_from_errno(errno),
+                              "unable to open file '%s'", *out_file_path);
+  }
+
+  // Write all file bytes.
+  // NOTE(review): a zero-length |source_data| makes fwrite return 0 (!= 1)
+  // and reports failure -- presumably callers never pass empty libraries;
+  // confirm if that can occur.
+  if (iree_status_is_ok(status)) {
+    if (fwrite((char*)source_data.data, source_data.data_length, 1,
+               file_handle) != 1) {
+      status =
+          iree_make_status(iree_status_code_from_errno(errno),
+                           "unable to write file span of %zu bytes to '%s'",
+                           source_data.data_length, *out_file_path);
+    }
+  }
+
+  if (file_handle != NULL) {
+    fclose(file_handle);
+    file_handle = NULL;
+  }
+  if (!iree_status_is_ok(status)) {
+    iree_allocator_free(allocator, *out_file_path);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Allocates an iree_dynamic_library_t with the given allocator.
+// The returned library takes ownership of |handle| (closed on deletion).
+static iree_status_t iree_dynamic_library_create(
+    void* handle, iree_allocator_t allocator,
+    iree_dynamic_library_t** out_library) {
+  *out_library = NULL;
+
+  // Allocate and zero-initialize the wrapper, then wire up its fields.
+  iree_dynamic_library_t* new_library = NULL;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(allocator, sizeof(*new_library),
+                                             (void**)&new_library));
+  memset(new_library, 0, sizeof(*new_library));
+  iree_atomic_ref_count_init(&new_library->ref_count);
+  new_library->handle = handle;
+  new_library->allocator = allocator;
+
+  *out_library = new_library;
+  return iree_ok_status();
+}
+
+// Thin wrapper: a single-path load is a single-entry search-path load.
+iree_status_t iree_dynamic_library_load_from_file(
+    const char* file_path, iree_dynamic_library_flags_t flags,
+    iree_allocator_t allocator, iree_dynamic_library_t** out_library) {
+  return iree_dynamic_library_load_from_files(1, &file_path, flags, allocator,
+                                              out_library);
+}
+
+iree_status_t iree_dynamic_library_load_from_files(
+    iree_host_size_t search_path_count, const char* const* search_paths,
+    iree_dynamic_library_flags_t flags, iree_allocator_t allocator,
+    iree_dynamic_library_t** out_library) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_library);
+  *out_library = NULL;
+
+  // Try to load the module from the set of search paths provided.
+  // RTLD_LAZY defers symbol resolution; RTLD_LOCAL keeps the library's
+  // symbols out of the global namespace.
+  void* handle = NULL;
+  iree_host_size_t i = 0;
+  for (i = 0; i < search_path_count; ++i) {
+    handle = dlopen(search_paths[i], RTLD_LAZY | RTLD_LOCAL);
+    if (handle) break;
+  }
+  if (!handle) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_NOT_FOUND,
+                            "dynamic library not found on any search path");
+  }
+
+  iree_dynamic_library_t* library = NULL;
+  iree_status_t status =
+      iree_dynamic_library_create(handle, allocator, &library);
+
+  if (iree_status_is_ok(status)) {
+    *out_library = library;
+  } else {
+    // Wrapper creation failed: close the handle we took ownership of.
+    dlclose(handle);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static iree_once_flag iree_dynamic_library_temp_dir_init_once_flag_ =
+    IREE_ONCE_FLAG_INIT;
+static const char* iree_dynamic_library_temp_dir_path_;
+static bool iree_dynamic_library_temp_dir_valid_;
+static bool iree_dynamic_library_temp_dir_preserve_;
+
+// Treats NULL and "" identically: neither names a usable directory.
+static bool iree_dynamic_library_path_is_null_or_empty(const char* path) {
+  return !path || !path[0];
+}
+
+static void iree_dynamic_library_init_temp_dir(void) {
+  // Semantics of IREE_PRESERVE_DYLIB_TEMP_FILES:
+  // * If the environment variable is not set, temp files are not preserved.
+  // * If the environment variable is set to "1", temp files are preserved to
+  //   some default temp directory. The TMPDIR environment variable is used if
+  //   set, otherwise a hardcoded default path is used. Example:
+  //     $ IREE_PRESERVE_DYLIB_TEMP_FILES=1 iree-run-module ...
+  // * If the environment variable is set to any string other than "1", temp
+  //   files are preserved, and the value of the environment variable is
+  //   interpreted as the path of the temporary directory to use. Example:
+  //     $ IREE_PRESERVE_DYLIB_TEMP_FILES=/tmp/iree-benchmarks iree-run-module
+  //     ...
+  const char* path = getenv("IREE_PRESERVE_DYLIB_TEMP_FILES");
+  bool preserve = !iree_dynamic_library_path_is_null_or_empty(path);
+  if (!path || !strcmp(path, "1")) {
+    // TMPDIR is a unix semi-standard thing. It's even defined by default on
+    // Android for the regular shell user (but not root).
+    path = getenv("TMPDIR");
+    if (iree_dynamic_library_path_is_null_or_empty(path)) {
+#ifdef __ANDROID__
+      path = "/data/local/tmp";
+#else
+      path = "/tmp";
+#endif  // __ANDROID__
+    }
+  }
+  // NOTE: the getenv pointer is stored globally; it stays valid so long as
+  // the process does not modify its own environment.
+  iree_dynamic_library_temp_dir_path_ = path;
+  iree_dynamic_library_temp_dir_preserve_ = preserve;
+  // Validate that temp_dir is the path of a directory. Could fail if it was
+  // user-provided, or on an Android device where /data/local/tmp hasn't been
+  // created yet.
+  struct stat s;
+  iree_dynamic_library_temp_dir_valid_ =
+      stat(path, &s) == 0 && (s.st_mode & S_IFMT) == S_IFDIR;
+}
+
+// TODO(#3845): use dlopen on an fd with either dlopen(/proc/self/fd/NN),
+// fdlopen, or android_dlopen_ext to avoid needing to write the file to disk.
+// Can fallback to memfd_create + dlopen where available, and fallback from
+// that to disk (maybe just windows/mac).
+iree_status_t iree_dynamic_library_load_from_memory(
+    iree_string_view_t identifier, iree_const_byte_span_t buffer,
+    iree_dynamic_library_flags_t flags, iree_allocator_t allocator,
+    iree_dynamic_library_t** out_library) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_library);
+  *out_library = NULL;
+
+  // Resolve and validate the temp directory exactly once per process.
+  iree_call_once(&iree_dynamic_library_temp_dir_init_once_flag_,
+                 iree_dynamic_library_init_temp_dir);
+
+  if (!iree_dynamic_library_temp_dir_valid_) {
+    // NOTE: the trace zone must be ended on this early-exit path too.
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "path of dylib temp files (%s) is not the path of a directory",
+        iree_dynamic_library_temp_dir_path_);
+  }
+
+  // Extract the library to a temp file.
+  char* temp_path = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_dynamic_library_write_temp_file(
+              buffer, "mem_", "so", allocator,
+              iree_dynamic_library_temp_dir_path_, &temp_path));
+
+  // Load using the normal load from file routine.
+  iree_status_t status = iree_dynamic_library_load_from_file(
+      temp_path, flags, allocator, out_library);
+
+  // Unlink the temp file - it's still open by the loader but won't be
+  // accessible to anyone else and will be deleted once the library is
+  // unloaded. Note that we don't remove the file if the user requested we keep
+  // it around for tooling to access.
+  if (!iree_dynamic_library_temp_dir_preserve_) {
+    remove(temp_path);
+  }
+  iree_allocator_free(allocator, temp_path);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees |library| and closes its dlopen handle.
+// Called only when the last reference is released; |library| must be non-NULL.
+static void iree_dynamic_library_delete(iree_dynamic_library_t* library) {
+  // Copy out the allocator before freeing the struct that holds it.
+  iree_allocator_t allocator = library->allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  // Leak the library when tracing, since the profiler may still be reading it.
+  // TODO(benvanik): move to an atexit handler instead, verify with ASAN/MSAN
+  // TODO(scotttodd): Make this compatible with testing:
+  //     two test cases, one for each function in the same executable
+  //     first test case passes, second fails to open the file (already open)
+#else
+  // Close the library first as it may be loaded from one of the temp files we
+  // are about to delete.
+  if (library->handle != NULL) {
+    dlclose(library->handle);
+  }
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+  iree_allocator_free(allocator, library);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Retains the given |library| for the caller; a NULL |library| is a no-op.
+void iree_dynamic_library_retain(iree_dynamic_library_t* library) {
+  if (!library) return;
+  iree_atomic_ref_count_inc(&library->ref_count);
+}
+
+// Releases a caller reference; deletes the library when the last reference is
+// dropped (the decrement yielding 1 indicates this was the final reference).
+void iree_dynamic_library_release(iree_dynamic_library_t* library) {
+  if (library && iree_atomic_ref_count_dec(&library->ref_count) == 1) {
+    iree_dynamic_library_delete(library);
+  }
+}
+
+// Performs a symbol lookup in the dynamic library exports via dlsym.
+iree_status_t iree_dynamic_library_lookup_symbol(
+    iree_dynamic_library_t* library, const char* symbol_name, void** out_fn) {
+  IREE_ASSERT_ARGUMENT(library);
+  IREE_ASSERT_ARGUMENT(symbol_name);
+  IREE_ASSERT_ARGUMENT(out_fn);
+  *out_fn = NULL;
+  void* resolved = dlsym(library->handle, symbol_name);
+  if (resolved != NULL) {
+    *out_fn = resolved;
+    return iree_ok_status();
+  }
+  return iree_make_status(IREE_STATUS_NOT_FOUND,
+                          "symbol '%s' not found in library", symbol_name);
+}
+
+// Stub on POSIX-likes: symbol attachment is not implemented here; returns OK
+// so callers can invoke it unconditionally.
+iree_status_t iree_dynamic_library_attach_symbols_from_file(
+    iree_dynamic_library_t* library, const char* file_path) {
+  return iree_ok_status();
+}
+
+// Stub on POSIX-likes; see above.
+iree_status_t iree_dynamic_library_attach_symbols_from_memory(
+    iree_dynamic_library_t* library, iree_const_byte_span_t buffer) {
+  return iree_ok_status();
+}
+
+#endif  // IREE_PLATFORM_*
diff --git a/runtime/src/iree/base/internal/dynamic_library_win32.c b/runtime/src/iree/base/internal/dynamic_library_win32.c
new file mode 100644
index 0000000..a55e143
--- /dev/null
+++ b/runtime/src/iree/base/internal/dynamic_library_win32.c
@@ -0,0 +1,417 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/call_once.h"
+#include "iree/base/internal/dynamic_library.h"
+#include "iree/base/internal/file_path.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+#if defined(IREE_PLATFORM_WINDOWS)
+
+// TODO(benvanik): support PDB overlays when tracy is not enabled; we'll
+// need to rearrange how the dbghelp lock is handled for that (probably moving
+// it here and having the tracy code redirect to this).
+#if defined(TRACY_ENABLE)
+#define IREE_HAVE_DYNAMIC_LIBRARY_PDB_SUPPORT 1
+#pragma warning(disable : 4091)
+#include <dbghelp.h>
+
+void IREEDbgHelpLock(void);
+void IREEDbgHelpUnlock(void);
+#endif  // TRACY_ENABLE
+
+// Windows dynamic library instance. |identifier| and |module_path| are
+// suballocated from the same block as the struct itself (see
+// iree_dynamic_library_create below) and are freed with it.
+struct iree_dynamic_library_t {
+  iree_atomic_ref_count_t ref_count;
+  iree_allocator_t allocator;
+
+  // Base module name used as an identifier. When loaded from a file this must
+  // be the basename for dbghelp to be able to find symbols.
+  // Owned and allocated as part of the struct upon creation.
+  // Has NUL terminator for compatibility with Windows APIs.
+  char* identifier;
+
+  // File path of the loaded module, if loaded from one.
+  // Owned and allocated as part of the struct upon creation.
+  // Has NUL terminator for compatibility with Windows APIs.
+  char* module_path;
+
+  // Windows module handle.
+  HMODULE module;
+
+  // 0 or more file paths that were created as part of the loading of the
+  // library or attaching of symbols from memory.
+  //
+  // Each path string is allocated using the |allocator| and freed during
+  // library deletion.
+  iree_host_size_t temp_file_count;
+  char* temp_file_paths[2];
+};
+
+static iree_once_flag iree_dynamic_library_temp_path_flag_ =
+    IREE_ONCE_FLAG_INIT;
+static char iree_dynamic_library_temp_path_base_[MAX_PATH + 1];
+// Computes the per-process temp path prefix used for all dylib temp files.
+// Run once via iree_call_once.
+static void iree_dynamic_library_init_temp_paths(void) {
+  // Query the temp path from the OS. This can be overridden with the following
+  // environment variables: [TMP, TEMP, USERPROFILE].
+  //
+  // See:
+  // https://docs.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-gettemppatha
+  char temp_path[MAX_PATH];
+  DWORD temp_path_length = GetTempPathA(IREE_ARRAYSIZE(temp_path), temp_path);
+  if (temp_path_length == 0 || temp_path_length > IREE_ARRAYSIZE(temp_path)) {
+    // Query failed (returns 0) or the result would not fit; fall back to the
+    // current directory instead of formatting from an uninitialized buffer.
+    temp_path[0] = '.';
+    temp_path[1] = '\0';
+  }
+
+  // Append the process ID to the path; this is like what _mktemp does but
+  // without all the hoops.
+  snprintf(iree_dynamic_library_temp_path_base_,
+           sizeof(iree_dynamic_library_temp_path_base_), "%s\\iree_dylib_%08X",
+           temp_path, GetCurrentProcessId());
+
+  // Canonicalize away any double path separators.
+  iree_file_path_canonicalize(iree_dynamic_library_temp_path_base_,
+                              strlen(iree_dynamic_library_temp_path_base_));
+}
+
+// Allocate a new string from |allocator| returned in |out_file_path| containing
+// a path to a unique file on the filesystem.
+static iree_status_t iree_dynamic_library_make_temp_file_path(
+    const char* prefix, const char* extension, iree_allocator_t allocator,
+    char** out_file_path) {
+  // Ensure the root temp paths are queried/initialized.
+  iree_call_once(&iree_dynamic_library_temp_path_flag_,
+                 iree_dynamic_library_init_temp_paths);
+
+  // Generate a per-file unique identifier only unique **within** the current
+  // process. We combine this with the _mktemp path that should be unique to the
+  // process itself.
+  // NOTE: wraparound of the 32-bit counter is fine; the %08X suffix only needs
+  // to be unique among concurrently live temp files.
+  static iree_atomic_int32_t next_unique_id = IREE_ATOMIC_VAR_INIT(0);
+  uint32_t unique_id = (uint32_t)iree_atomic_fetch_add_int32(
+      &next_unique_id, 1, iree_memory_order_seq_cst);
+
+  // Allocate storage for the full file path and format it in.
+  int file_path_length =
+      snprintf(NULL, 0, "%s_%s_%08X.%s", iree_dynamic_library_temp_path_base_,
+               prefix, unique_id, extension);
+  if (file_path_length < 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "unable to form temp path string");
+  }
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      allocator, file_path_length + /*NUL=*/1, (void**)out_file_path));
+  snprintf(*out_file_path, file_path_length + /*NUL=*/1, "%s_%s_%08X.%s",
+           iree_dynamic_library_temp_path_base_, prefix, unique_id, extension);
+
+  return iree_ok_status();
+}
+
+// Creates a temp file and writes the |source_data| into it.
+// The file path is returned in |out_file_path|; on failure the path string is
+// freed and must not be used.
+static iree_status_t iree_dynamic_library_write_temp_file(
+    iree_const_byte_span_t source_data, const char* prefix,
+    const char* extension, iree_allocator_t allocator, char** out_file_path) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Reserve a temp file path we can write to.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_dynamic_library_make_temp_file_path(prefix, extension, allocator,
+                                                   out_file_path));
+
+  iree_status_t status = iree_ok_status();
+
+  // Open the file for writing.
+  HANDLE file_handle = CreateFileA(
+      /*lpFileName=*/*out_file_path, /*dwDesiredAccess=*/GENERIC_WRITE,
+      /*dwShareMode=*/FILE_SHARE_DELETE, /*lpSecurityAttributes=*/NULL,
+      /*dwCreationDisposition=*/CREATE_ALWAYS,
+      /*dwFlagsAndAttributes=*/FILE_ATTRIBUTE_TEMPORARY,
+      /*hTemplateFile=*/NULL);
+  if (file_handle == INVALID_HANDLE_VALUE) {
+    // CreateFileA reports failure as INVALID_HANDLE_VALUE (not NULL);
+    // normalize to NULL so the shared cleanup below never passes an invalid
+    // handle to CloseHandle.
+    file_handle = NULL;
+    status = iree_make_status(iree_status_code_from_win32_error(GetLastError()),
+                              "unable to open file '%s'", *out_file_path);
+  }
+
+  // Write all file bytes.
+  if (iree_status_is_ok(status)) {
+    // WriteFile requires a non-NULL lpNumberOfBytesWritten whenever
+    // lpOverlapped is NULL; passing both as NULL is documented as invalid.
+    DWORD bytes_written = 0;
+    if (WriteFile(file_handle, source_data.data, (DWORD)source_data.data_length,
+                  &bytes_written, NULL) == FALSE) {
+      status =
+          iree_make_status(iree_status_code_from_win32_error(GetLastError()),
+                           "unable to write file span of %zu bytes to '%s'",
+                           source_data.data_length, *out_file_path);
+    }
+  }
+
+  if (file_handle != NULL) {
+    CloseHandle(file_handle);
+    file_handle = NULL;
+  }
+  if (!iree_status_is_ok(status)) {
+    iree_allocator_free(allocator, *out_file_path);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Allocates an iree_dynamic_library_t with the given allocator.
+// Layout: [struct][identifier + NUL][module_path + NUL] in one allocation so
+// a single free releases everything.
+static iree_status_t iree_dynamic_library_create(
+    iree_string_view_t identifier, iree_string_view_t module_path,
+    HMODULE module, iree_allocator_t allocator,
+    iree_dynamic_library_t** out_library) {
+  *out_library = NULL;
+
+  iree_dynamic_library_t* library = NULL;
+  iree_host_size_t total_size =
+      sizeof(*library) + (identifier.size + 1) + (module_path.size + 1);
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, total_size, (void**)&library));
+  memset(library, 0, total_size);
+  iree_atomic_ref_count_init(&library->ref_count);
+  library->allocator = allocator;
+  library->module = module;
+
+  // identifier is stored immediately after the struct itself.
+  library->identifier = (char*)library + sizeof(*library);
+  memcpy(library->identifier, identifier.data, identifier.size);
+  library->identifier[identifier.size] = 0;  // NUL
+
+  // module_path follows identifier's NUL terminator.
+  library->module_path = library->identifier + (identifier.size + 1);
+  memcpy(library->module_path, module_path.data, module_path.size);
+  library->module_path[module_path.size] = 0;  // NUL
+
+  *out_library = library;
+  return iree_ok_status();
+}
+
+// Loads a dynamic library from a single |file_path|.
+// Thin wrapper over the multi-path search variant with a one-entry list.
+iree_status_t iree_dynamic_library_load_from_file(
+    const char* file_path, iree_dynamic_library_flags_t flags,
+    iree_allocator_t allocator, iree_dynamic_library_t** out_library) {
+  return iree_dynamic_library_load_from_files(1, &file_path, flags, allocator,
+                                              out_library);
+}
+
+// Tries each of |search_paths| in order and wraps the first module that
+// LoadLibraryA can open. Returns IREE_STATUS_NOT_FOUND if none load.
+iree_status_t iree_dynamic_library_load_from_files(
+    iree_host_size_t search_path_count, const char* const* search_paths,
+    iree_dynamic_library_flags_t flags, iree_allocator_t allocator,
+    iree_dynamic_library_t** out_library) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_library);
+  *out_library = NULL;
+
+  // Try to load the module from the set of search paths provided.
+  // |i| is declared outside the loop because the matching index is used below.
+  HMODULE module = NULL;
+  iree_host_size_t i = 0;
+  for (i = 0; i < search_path_count; ++i) {
+    module = LoadLibraryA(search_paths[i]);
+    if (module) break;
+  }
+  if (!module) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_NOT_FOUND,
+                            "dynamic library not found on any search path");
+  }
+
+  // Use the basename of the path that matched as the library identifier.
+  iree_string_view_t file_path = iree_make_cstring_view(search_paths[i]);
+  iree_string_view_t identifier = iree_file_path_basename(file_path);
+
+  iree_dynamic_library_t* library = NULL;
+  iree_status_t status = iree_dynamic_library_create(
+      identifier, file_path, module, allocator, &library);
+
+  if (iree_status_is_ok(status)) {
+    *out_library = library;
+  } else {
+    // Wrapper creation failed; the module handle is still ours to release.
+    FreeLibrary(module);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Loads a library from an in-memory image by extracting it to a temp file on
+// disk and then loading that file through the normal path.
+iree_status_t iree_dynamic_library_load_from_memory(
+    iree_string_view_t identifier, iree_const_byte_span_t buffer,
+    iree_dynamic_library_flags_t flags, iree_allocator_t allocator,
+    iree_dynamic_library_t** out_library) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_library);
+  *out_library = NULL;
+
+  // Extract the library to a temp file.
+  char* temp_path = NULL;
+  iree_status_t status = iree_dynamic_library_write_temp_file(
+      buffer, "mem", "dll", allocator, &temp_path);
+
+  if (iree_status_is_ok(status)) {
+    // Load using the normal load from file routine.
+    status = iree_dynamic_library_load_from_file(temp_path, flags, allocator,
+                                                 out_library);
+  }
+  if (iree_status_is_ok(status)) {
+    // Associate the temp path to the library; the temp_path string and the
+    // backing file will be deleted when the library is closed.
+    iree_dynamic_library_t* library = *out_library;
+    library->temp_file_paths[library->temp_file_count++] = temp_path;
+  } else {
+    // NOTE(review): only the path string is freed here. If the load (rather
+    // than the extraction) failed, the extracted temp file itself remains on
+    // disk and is never DeleteFileA'd — confirm whether that is intentional.
+    iree_allocator_free(allocator, temp_path);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys |library| once its reference count hits zero: unloads the module
+// (unless tracing instrumentation is enabled, in which case it is leaked on
+// purpose) and deletes any extracted temp files and their path strings.
+static void iree_dynamic_library_delete(iree_dynamic_library_t* library) {
+  iree_allocator_t allocator = library->allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  // Leak the library when tracing, since the profiler may still be reading it.
+  // TODO(benvanik): move to an atexit handler instead, verify with ASAN/MSAN
+  // TODO(scotttodd): Make this compatible with testing:
+  //     two test cases, one for each function in the same executable
+  //     first test case passes, second fails to open the file (already open)
+#else
+  // Close the library first as it may be loaded from one of the temp files we
+  // are about to delete.
+  if (library->module != NULL) {
+    FreeLibrary(library->module);
+  }
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+  // Cleanup all temp files.
+  for (iree_host_size_t i = 0; i < library->temp_file_count; ++i) {
+    char* file_path = library->temp_file_paths[i];
+    DeleteFileA(file_path);
+    iree_allocator_free(allocator, file_path);
+  }
+
+  iree_allocator_free(allocator, library);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Adds a reference to |library|; a NULL library is a no-op.
+void iree_dynamic_library_retain(iree_dynamic_library_t* library) {
+  if (library) {
+    iree_atomic_ref_count_inc(&library->ref_count);
+  }
+}
+
+// Drops a reference to |library| and deletes it when the last reference is
+// released (the decrement returning 1 means this caller held the final ref).
+void iree_dynamic_library_release(iree_dynamic_library_t* library) {
+  if (library && iree_atomic_ref_count_dec(&library->ref_count) == 1) {
+    iree_dynamic_library_delete(library);
+  }
+}
+
+// Resolves |symbol_name| in |library| via GetProcAddress.
+// On success stores the function pointer in |out_fn|; returns
+// IREE_STATUS_NOT_FOUND (with |*out_fn| left NULL) if not exported.
+iree_status_t iree_dynamic_library_lookup_symbol(
+    iree_dynamic_library_t* library, const char* symbol_name, void** out_fn) {
+  IREE_ASSERT_ARGUMENT(library);
+  IREE_ASSERT_ARGUMENT(symbol_name);
+  IREE_ASSERT_ARGUMENT(out_fn);
+  *out_fn = NULL;
+  void* fn = GetProcAddress(library->module, symbol_name);
+  if (!fn) {
+    return iree_make_status(IREE_STATUS_NOT_FOUND,
+                            "symbol '%s' not found in library", symbol_name);
+  }
+  *out_fn = fn;
+  return iree_ok_status();
+}
+
+#if defined(IREE_HAVE_DYNAMIC_LIBRARY_PDB_SUPPORT)
+
+// State threaded through the EnumerateLoadedModules64 callback below.
+typedef struct {
+  const char* module_path;  // module to look for (input)
+  DWORD64 module_base;      // base address of the match; 0 if none (output)
+  ULONG module_size;        // size in bytes of the matched module (output)
+} ModuleEnumCallbackState;
+
+// EnumerateLoadedModules64 callback: records base/size of the module whose
+// name exactly matches state->module_path, stopping at the first match.
+static BOOL EnumLoadedModulesCallback(PCSTR ModuleName, DWORD64 ModuleBase,
+                                      ULONG ModuleSize, PVOID UserContext) {
+  ModuleEnumCallbackState* state = (ModuleEnumCallbackState*)UserContext;
+  if (strcmp(ModuleName, state->module_path) != 0) {
+    return TRUE;  // not a match; continue
+  }
+  state->module_base = ModuleBase;
+  state->module_size = ModuleSize;
+  return FALSE;  // match found; stop enumeration
+}
+
+// Overlays PDB debug info from |file_path| onto the loaded module so tools
+// using dbghelp can resolve its symbols. Best-effort: if the module cannot be
+// found among the process's loaded modules the PDB is skipped and OK is still
+// returned.
+iree_status_t iree_dynamic_library_attach_symbols_from_file(
+    iree_dynamic_library_t* library, const char* file_path) {
+  IREE_ASSERT_ARGUMENT(library);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // dbghelp is not thread-safe; serialize all use behind the shared lock.
+  IREEDbgHelpLock();
+
+  // Useful for debugging this logic; will print search paths and results:
+  // SymSetOptions(SYMOPT_LOAD_LINES | SYMOPT_DEBUG);
+
+  // Enumerates all loaded modules in the process to extract the module
+  // base/size parameters we need to overlay the PDB. There's other ways to
+  // get this (such as registering a LdrDllNotification callback and snooping
+  // the values during LoadLibrary or using CreateToolhelp32Snapshot), however
+  // EnumerateLoadedModules is in dbghelp which we are using anyway.
+  ModuleEnumCallbackState state;
+  memset(&state, 0, sizeof(state));
+  state.module_path = library->module_path;
+  EnumerateLoadedModules64(GetCurrentProcess(), EnumLoadedModulesCallback,
+                           &state);
+
+  // Load the PDB file and overlay it onto the already-loaded module at the
+  // address range it got loaded into.
+  if (state.module_base != 0) {
+    SymLoadModuleEx(GetCurrentProcess(), NULL, file_path, library->identifier,
+                    state.module_base, state.module_size, NULL, 0);
+  }
+
+  IREEDbgHelpUnlock();
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Extracts an in-memory PDB image to a temp file and attaches it to the
+// module. The temp path is owned by the library and deleted (file and string)
+// when the library is closed. Fails with RESOURCE_EXHAUSTED when the fixed
+// temp_file_paths table is full.
+iree_status_t iree_dynamic_library_attach_symbols_from_memory(
+    iree_dynamic_library_t* library, iree_const_byte_span_t buffer) {
+  IREE_ASSERT_ARGUMENT(library);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  if (library->temp_file_count + 1 > IREE_ARRAYSIZE(library->temp_file_paths)) {
+    // Every return path must close the zone opened above; the early return
+    // here previously leaked it.
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "too many temp files attached");
+  }
+
+  // Extract the library to a temp file.
+  char* temp_path = NULL;
+  iree_status_t status = iree_dynamic_library_write_temp_file(
+      buffer, "mem_", "pdb", library->allocator, &temp_path);
+  if (iree_status_is_ok(status)) {
+    // Associate the temp path to the library; the temp_path string and the
+    // backing file will be deleted when the library is closed.
+    library->temp_file_paths[library->temp_file_count++] = temp_path;
+
+    // Attempt to attach the extracted temp file to the module.
+    status = iree_dynamic_library_attach_symbols_from_file(library, temp_path);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+#else
+
+// PDB support unavailable in this configuration: symbol attachment becomes a
+// benign no-op so callers do not need to special-case it.
+iree_status_t iree_dynamic_library_attach_symbols_from_file(
+    iree_dynamic_library_t* library, const char* file_path) {
+  return iree_ok_status();
+}
+
+iree_status_t iree_dynamic_library_attach_symbols_from_memory(
+    iree_dynamic_library_t* library, iree_const_byte_span_t buffer) {
+  return iree_ok_status();
+}
+
+#endif  // IREE_HAVE_DYNAMIC_LIBRARY_PDB_SUPPORT
+
+#endif  // IREE_PLATFORM_WINDOWS
diff --git a/runtime/src/iree/base/internal/event_pool.c b/runtime/src/iree/base/internal/event_pool.c
new file mode 100644
index 0000000..2cc93d4
--- /dev/null
+++ b/runtime/src/iree/base/internal/event_pool.c
@@ -0,0 +1,166 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/event_pool.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/tracing.h"
+
+struct iree_event_pool_t {
+  // Allocator used to create the event pool.
+  iree_allocator_t host_allocator;
+  // Guards the pool. Since this pool is used to get operating system-level
+  // event objects that will be signaled and waited on using syscalls it's got
+  // relatively low contention: callers are rate limited by how fast they can
+  // signal and wait on the events they get.
+  iree_slim_mutex_t mutex;
+  // Maximum number of events that will be maintained in the pool. More events
+  // may be allocated at any time but when they are no longer needed they will
+  // be disposed directly.
+  iree_host_size_t available_capacity;
+  // Total number of available events currently held in available_list.
+  iree_host_size_t available_count;
+  // Dense left-aligned list of available_count events.
+  iree_event_t available_list[];
+};
+
+// Allocates a pool holding up to |available_capacity| events; all events are
+// pre-initialized unsignaled. On any initialization failure everything created
+// so far is torn down via iree_event_pool_free.
+iree_status_t iree_event_pool_allocate(iree_host_size_t available_capacity,
+                                       iree_allocator_t host_allocator,
+                                       iree_event_pool_t** out_event_pool) {
+  IREE_ASSERT_ARGUMENT(out_event_pool);
+  *out_event_pool = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Single allocation with the event storage trailing the struct.
+  iree_event_pool_t* event_pool = NULL;
+  iree_host_size_t total_size =
+      sizeof(*event_pool) +
+      available_capacity * sizeof(event_pool->available_list[0]);
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0,
+      iree_allocator_malloc(host_allocator, total_size, (void**)&event_pool));
+  event_pool->host_allocator = host_allocator;
+  // The mutex must be initialized before acquire/release can lock it and
+  // before iree_event_pool_free (used on the failure path below)
+  // deinitializes it; it was previously left uninitialized.
+  iree_slim_mutex_initialize(&event_pool->mutex);
+  event_pool->available_capacity = available_capacity;
+  event_pool->available_count = 0;
+
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < available_capacity; ++i) {
+    status = iree_event_initialize(
+        /*initial_state=*/false,
+        &event_pool->available_list[event_pool->available_count++]);
+    if (!iree_status_is_ok(status)) break;
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_event_pool = event_pool;
+  } else {
+    // NOTE(review): available_count was already incremented for the event
+    // that failed to initialize, so free() will deinitialize it too — confirm
+    // iree_event_deinitialize tolerates a partially-initialized event.
+    iree_event_pool_free(event_pool);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees |event_pool|, deinitializing every event still sitting in the pool
+// and the pool mutex. Events currently acquired by callers are not tracked
+// here; the API requires they be released back before this is called.
+void iree_event_pool_free(iree_event_pool_t* event_pool) {
+  iree_allocator_t host_allocator = event_pool->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < event_pool->available_count; ++i) {
+    iree_event_deinitialize(&event_pool->available_list[i]);
+  }
+  iree_slim_mutex_deinitialize(&event_pool->mutex);
+  iree_allocator_free(host_allocator, event_pool);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Acquires |event_count| events into |out_events|, taking as many as possible
+// from the pool (one memcpy, under the lock) and initializing fresh events
+// for the remainder outside the lock. On failure all events handed out so far
+// by this call are returned via iree_event_pool_release.
+iree_status_t iree_event_pool_acquire(iree_event_pool_t* event_pool,
+                                      iree_host_size_t event_count,
+                                      iree_event_t* out_events) {
+  IREE_ASSERT_ARGUMENT(event_pool);
+  if (!event_count) return iree_ok_status();
+  IREE_ASSERT_ARGUMENT(out_events);
+
+  // We'll try to get what we can from the pool and fall back to initializing
+  // new events.
+  iree_host_size_t remaining_count = event_count;
+
+  // Try first to grab from the pool.
+  iree_slim_mutex_lock(&event_pool->mutex);
+  iree_host_size_t from_pool_count =
+      iree_min(event_pool->available_count, event_count);
+  if (from_pool_count > 0) {
+    // Take from the top (right end) of the dense available list.
+    iree_host_size_t pool_base_index =
+        event_pool->available_count - from_pool_count;
+    memcpy(out_events, &event_pool->available_list[pool_base_index],
+           from_pool_count * sizeof(iree_event_t));
+    event_pool->available_count -= from_pool_count;
+    remaining_count -= from_pool_count;
+  }
+  iree_slim_mutex_unlock(&event_pool->mutex);
+
+  // Allocate the rest of the events.
+  if (remaining_count > 0) {
+    IREE_TRACE_ZONE_BEGIN(z0);
+    iree_status_t status = iree_ok_status();
+    for (iree_host_size_t i = 0; i < remaining_count; ++i) {
+      status = iree_event_initialize(/*initial_state=*/false,
+                                     &out_events[from_pool_count + i]);
+      if (!iree_status_is_ok(status)) {
+        // Must release all events we've acquired so far.
+        iree_event_pool_release(event_pool, from_pool_count + i, out_events);
+        IREE_TRACE_ZONE_END(z0);
+        return status;
+      }
+    }
+    IREE_TRACE_ZONE_END(z0);
+  }
+
+  return iree_ok_status();
+}
+
+// Returns |event_count| events to the pool, resetting those that fit within
+// available_capacity and deinitializing the overflow.
+void iree_event_pool_release(iree_event_pool_t* event_pool,
+                             iree_host_size_t event_count,
+                             iree_event_t* events) {
+  IREE_ASSERT_ARGUMENT(event_pool);
+  if (!event_count) return;
+  IREE_ASSERT_ARGUMENT(events);
+
+  // We'll try to release all we can back to the pool and then deinitialize
+  // the ones that won't fit.
+  iree_host_size_t remaining_count = event_count;
+
+  // Try first to release to the pool.
+  // Note that we reset the events we add back to the pool so that they are
+  // ready to be acquired again.
+  iree_slim_mutex_lock(&event_pool->mutex);
+  iree_host_size_t to_pool_count =
+      iree_min(event_pool->available_capacity - event_pool->available_count,
+               event_count);
+  if (to_pool_count > 0) {
+    // Append at the top (right end) of the dense available list.
+    iree_host_size_t pool_base_index = event_pool->available_count;
+    for (iree_host_size_t i = 0; i < to_pool_count; ++i) {
+      iree_event_reset(&events[i]);
+    }
+    memcpy(&event_pool->available_list[pool_base_index], events,
+           to_pool_count * sizeof(iree_event_t));
+    event_pool->available_count += to_pool_count;
+    remaining_count -= to_pool_count;
+  }
+  iree_slim_mutex_unlock(&event_pool->mutex);
+
+  // Deallocate the rest of the events. We don't bother resetting them as we are
+  // getting rid of them.
+  if (remaining_count > 0) {
+    IREE_TRACE_ZONE_BEGIN(z0);
+    for (iree_host_size_t i = 0; i < remaining_count; ++i) {
+      iree_event_deinitialize(&events[to_pool_count + i]);
+    }
+    IREE_TRACE_ZONE_END(z0);
+  }
+}
diff --git a/runtime/src/iree/base/internal/event_pool.h b/runtime/src/iree/base/internal/event_pool.h
new file mode 100644
index 0000000..7ac56cb
--- /dev/null
+++ b/runtime/src/iree/base/internal/event_pool.h
@@ -0,0 +1,49 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_EVENT_POOL_H_
+#define IREE_BASE_INTERNAL_EVENT_POOL_H_
+
+#include "iree/base/api.h"
+#include "iree/base/internal/wait_handle.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// A simple pool of iree_event_ts to recycle.
+//
+// Thread-safe; multiple threads may acquire and release events from the pool.
+typedef struct iree_event_pool_t iree_event_pool_t;
+
+// Allocates a new event pool with up to |available_capacity| events.
+iree_status_t iree_event_pool_allocate(iree_host_size_t available_capacity,
+                                       iree_allocator_t host_allocator,
+                                       iree_event_pool_t** out_event_pool);
+
+// Deallocates an event pool and destroys all events.
+// All events that were acquired from the pool must have already been released
+// back to it prior to deallocation.
+void iree_event_pool_free(iree_event_pool_t* event_pool);
+
+// Acquires one or more events from the event pool.
+// The returned events will be unsignaled and ready for use. Callers may set and
+// reset the events as much as they want prior to releasing them back to the
+// pool with iree_event_pool_release.
+iree_status_t iree_event_pool_acquire(iree_event_pool_t* event_pool,
+                                      iree_host_size_t event_count,
+                                      iree_event_t* out_events);
+
+// Releases one or more events back to the event pool.
+void iree_event_pool_release(iree_event_pool_t* event_pool,
+                             iree_host_size_t event_count,
+                             iree_event_t* events);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_INTERNAL_EVENT_POOL_H_
diff --git a/runtime/src/iree/base/internal/file_io.c b/runtime/src/iree/base/internal/file_io.c
new file mode 100644
index 0000000..d9b8076
--- /dev/null
+++ b/runtime/src/iree/base/internal/file_io.c
@@ -0,0 +1,276 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/file_io.h"
+
+#include "iree/base/config.h"
+
+#if IREE_FILE_IO_ENABLE
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+#if defined(IREE_PLATFORM_WINDOWS)
+#include <fcntl.h>
+#include <io.h>
+#define IREE_SET_BINARY_MODE(handle) _setmode(_fileno(handle), O_BINARY)
+#else
+#define IREE_SET_BINARY_MODE(handle) ((void)0)
+#endif  // IREE_PLATFORM_WINDOWS
+
+// We could take alignment as an arg, but roughly page aligned should be
+// acceptable for all uses - if someone cares about memory usage they won't
+// be using this method.
+#define IREE_FILE_BASE_ALIGNMENT 4096
+
+// Returns OK if stat() succeeds for |path| and IREE_STATUS_NOT_FOUND
+// otherwise; success does not guarantee the file is readable or writable.
+iree_status_t iree_file_exists(const char* path) {
+  IREE_ASSERT_ARGUMENT(path);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  struct stat stat_buf;
+  iree_status_t status =
+      stat(path, &stat_buf) == 0
+          ? iree_ok_status()
+          : iree_make_status(IREE_STATUS_NOT_FOUND, "'%s'", path);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// iree_allocator_t control function backing iree_file_contents_deallocator.
+// Accepts only FREE commands targeting the contents buffer pointer and frees
+// the owning iree_file_contents_t (which owns the buffer storage).
+iree_status_t iree_file_contents_allocator_ctl(void* self,
+                                               iree_allocator_command_t command,
+                                               const void* params,
+                                               void** inout_ptr) {
+  if (command != IREE_ALLOCATOR_COMMAND_FREE) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "file contents deallocator must only be used to "
+                            "deallocate file contents");
+  }
+  iree_file_contents_t* contents = (iree_file_contents_t*)self;
+  if (contents->buffer.data != *inout_ptr) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "only the file contents buffer is valid");
+  }
+  // Copy the allocator out before freeing the struct that holds it.
+  iree_allocator_t allocator = contents->allocator;
+  iree_allocator_free(allocator, contents);
+  return iree_ok_status();
+}
+
+// Wraps |contents| as an iree_allocator_t whose free() releases the whole
+// contents struct; lets APIs that take a deallocator adopt file contents.
+iree_allocator_t iree_file_contents_deallocator(
+    iree_file_contents_t* contents) {
+  iree_allocator_t allocator = {
+      .self = contents,
+      .ctl = iree_file_contents_allocator_ctl,
+  };
+  return allocator;
+}
+
+// Frees |contents| (struct and inline buffer); NULL is a no-op.
+void iree_file_contents_free(iree_file_contents_t* contents) {
+  if (!contents) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_allocator_free(contents->allocator, contents);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Reads the full contents of |file| into a newly allocated
+// iree_file_contents_t. The buffer is aligned to IREE_FILE_BASE_ALIGNMENT and
+// NUL-terminated so it can be used directly as a C string.
+static iree_status_t iree_file_read_contents_impl(
+    FILE* file, iree_allocator_t allocator,
+    iree_file_contents_t** out_contents) {
+  // Seek to the end of the file.
+  if (fseek(file, 0, SEEK_END) == -1) {
+    return iree_make_status(iree_status_code_from_errno(errno), "seek (end)");
+  }
+
+  // Query the position, telling us the total file length in bytes.
+  // NOTE: ftell returns long; the -1L error value survives the implicit
+  // conversion to size_t for the comparison below.
+  size_t file_size = ftell(file);
+  if (file_size == -1L) {
+    return iree_make_status(iree_status_code_from_errno(errno), "size query");
+  }
+
+  // Seek back to the file start.
+  if (fseek(file, 0, SEEK_SET) == -1) {
+    return iree_make_status(iree_status_code_from_errno(errno), "seek (beg)");
+  }
+
+  // Compute total size with alignment padding.
+  // We allocate +1 to force a trailing \0 in case this is used as a cstring.
+  iree_file_contents_t* contents = NULL;
+  iree_host_size_t total_size =
+      sizeof(*contents) + IREE_FILE_BASE_ALIGNMENT + file_size + /*NUL*/ 1;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, total_size, (void**)&contents));
+
+  contents->allocator = allocator;
+  contents->buffer.data = (void*)iree_host_align(
+      (uintptr_t)contents + sizeof(*contents), IREE_FILE_BASE_ALIGNMENT);
+  contents->buffer.data_length = file_size;
+
+  // Attempt to read the file into memory. fread with a zero-sized item
+  // returns 0 (!= 1), which previously made reading an empty file fail
+  // spuriously, so empty files skip the read entirely.
+  if (file_size > 0 &&
+      fread(contents->buffer.data, file_size, 1, file) != 1) {
+    iree_allocator_free(allocator, contents);
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "unable to read entire %zu file bytes", file_size);
+  }
+
+  // Add trailing NUL to make the contents C-string compatible.
+  contents->buffer.data[file_size] = 0;  // NUL
+  *out_contents = contents;
+  return iree_ok_status();
+}
+
+// Opens |path| in binary mode and reads its full contents into memory;
+// failures are annotated with the path for easier diagnosis. The caller must
+// free the result with iree_file_contents_free.
+iree_status_t iree_file_read_contents(const char* path,
+                                      iree_allocator_t allocator,
+                                      iree_file_contents_t** out_contents) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(path);
+  IREE_ASSERT_ARGUMENT(out_contents);
+  *out_contents = NULL;
+
+  FILE* file = fopen(path, "rb");
+  if (file == NULL) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "failed to open file '%s'", path);
+  }
+
+  // Read the file contents into memory.
+  iree_status_t status =
+      iree_file_read_contents_impl(file, allocator, out_contents);
+  if (!iree_status_is_ok(status)) {
+    status = iree_status_annotate_f(status, "reading file '%s'", path);
+  }
+
+  fclose(file);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Synchronously writes |content| to |path|, replacing any existing file.
+iree_status_t iree_file_write_contents(const char* path,
+                                       iree_const_byte_span_t content) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(path);
+
+  FILE* file = fopen(path, "wb");
+  if (file == NULL) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "failed to open file '%s'", path);
+  }
+
+  // Write the span as a single item. fwrite with a zero-sized item returns 0
+  // (!= 1), which previously made writing empty contents fail spuriously, so
+  // zero-length spans skip the write (the truncating "wb" open has already
+  // emptied the file).
+  iree_status_t status = iree_ok_status();
+  if (content.data_length > 0 &&
+      fwrite((char*)content.data, content.data_length, 1, file) != 1) {
+    status =
+        iree_make_status(IREE_STATUS_DATA_LOSS,
+                         "unable to write file contents of %zu bytes to '%s'",
+                         content.data_length, path);
+  }
+
+  fclose(file);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Reads stdin until EOF into a growable aligned buffer, doubling capacity as
+// needed. Returns the contents NUL-terminated (the final NUL is reserved by
+// the `capacity - 1` growth check below).
+static iree_status_t iree_stdin_read_contents_impl(
+    iree_allocator_t allocator, iree_file_contents_t** out_contents) {
+  // HACK: fix stdin mode to binary on Windows to match Unix behavior.
+  // Ideally we'd do this in one place for all our tools.
+  IREE_SET_BINARY_MODE(stdin);
+
+  iree_host_size_t capacity = 4096;
+  iree_file_contents_t* contents = NULL;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      allocator, sizeof(*contents) + IREE_FILE_BASE_ALIGNMENT + capacity,
+      (void**)&contents));
+  contents->buffer.data = (void*)iree_host_align(
+      (uintptr_t)contents + sizeof(*contents), IREE_FILE_BASE_ALIGNMENT);
+
+  iree_host_size_t size = 0;
+  for (int c = getchar(); c != EOF; c = getchar()) {
+    if (size >= capacity - /*NUL*/ 1) {
+      // NOTE: if we realloc we may end up with a new alignment and need to move
+      // the data around.
+      uintptr_t old_offset =
+          (uintptr_t)contents->buffer.data - (uintptr_t)contents;
+      iree_host_size_t new_capacity = capacity * 2;
+      iree_file_contents_t* new_contents = contents;
+      iree_status_t status = iree_allocator_realloc(
+          allocator,
+          sizeof(*new_contents) + IREE_FILE_BASE_ALIGNMENT + new_capacity,
+          (void**)&new_contents);
+      if (!iree_status_is_ok(status)) {
+        // Realloc failed; the original block is still valid and must be freed.
+        iree_allocator_free(allocator, contents);
+        return status;
+      }
+      contents = new_contents;
+      // Recompute the aligned data pointer within the (possibly moved) block.
+      uint8_t* old_data = (uint8_t*)new_contents + old_offset;
+      uint8_t* new_data = (uint8_t*)iree_host_align(
+          (uintptr_t)new_contents + sizeof(*new_contents),
+          IREE_FILE_BASE_ALIGNMENT);
+      if (new_data != old_data) {
+        // Alignment changed; move the data with safety for overlapping.
+        memmove(new_data, old_data, size);
+      }
+      contents->buffer.data = new_data;
+      capacity = new_capacity;
+    }
+    contents->buffer.data[size++] = c;
+  }
+
+  contents->allocator = allocator;
+  contents->buffer.data[size] = 0;  // NUL
+  contents->buffer.data_length = size;
+  *out_contents = contents;
+  return iree_ok_status();
+}
+
+// Public wrapper for reading stdin to EOF; adds tracing and clears the
+// output on entry. Caller frees the result with iree_file_contents_free.
+iree_status_t iree_stdin_read_contents(iree_allocator_t allocator,
+                                       iree_file_contents_t** out_contents) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_contents);
+  *out_contents = NULL;
+  iree_status_t status = iree_stdin_read_contents_impl(allocator, out_contents);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+#else
+
+// IREE_FILE_IO_ENABLE=0: every file I/O entry point compiles to a stub that
+// fails with IREE_STATUS_UNAVAILABLE (or is a harmless no-op) so the rest of
+// the runtime links unchanged on platforms without file I/O.
+iree_status_t iree_file_exists(const char* path) {
+  return iree_make_status(IREE_STATUS_UNAVAILABLE, "File I/O is disabled");
+}
+
+iree_allocator_t iree_file_contents_deallocator(
+    iree_file_contents_t* contents) {
+  return iree_allocator_null();
+}
+
+void iree_file_contents_free(iree_file_contents_t* contents) {}
+
+iree_status_t iree_file_read_contents(const char* path,
+                                      iree_allocator_t allocator,
+                                      iree_file_contents_t** out_contents) {
+  return iree_make_status(IREE_STATUS_UNAVAILABLE, "File I/O is disabled");
+}
+
+iree_status_t iree_file_write_contents(const char* path,
+                                       iree_const_byte_span_t content) {
+  return iree_make_status(IREE_STATUS_UNAVAILABLE, "File I/O is disabled");
+}
+
+iree_status_t iree_stdin_read_contents(iree_allocator_t allocator,
+                                       iree_file_contents_t** out_contents) {
+  return iree_make_status(IREE_STATUS_UNAVAILABLE, "File I/O is disabled");
+}
+
+#endif  // IREE_FILE_IO_ENABLE
diff --git a/runtime/src/iree/base/internal/file_io.h b/runtime/src/iree/base/internal/file_io.h
new file mode 100644
index 0000000..3418c62
--- /dev/null
+++ b/runtime/src/iree/base/internal/file_io.h
@@ -0,0 +1,68 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_FILE_IO_H_
+#define IREE_BASE_INTERNAL_FILE_IO_H_
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Checks if a file exists at the provided |path|.
+//
+// Returns an OK status if the file definitely exists. An OK status does not
+// indicate that attempts to read or write the file will succeed.
+// Returns IREE_STATUS_NOT_FOUND if the file does not exist.
+iree_status_t iree_file_exists(const char* path);
+
+// Loaded file contents.
+typedef struct iree_file_contents_t {
+  // Allocator this struct (and its inline buffer) was allocated from; used by
+  // iree_file_contents_free to release the memory.
+  iree_allocator_t allocator;
+  // Contents storage; the union exposes mutable and const views of the same
+  // bytes.
+  union {
+    iree_byte_span_t buffer;
+    iree_const_byte_span_t const_buffer;
+  };
+} iree_file_contents_t;
+
+// Returns an allocator that deallocates the |contents|.
+// This can be passed to functions that require a deallocation mechanism.
+iree_allocator_t iree_file_contents_deallocator(iree_file_contents_t* contents);
+
+// Frees memory associated with |contents|.
+void iree_file_contents_free(iree_file_contents_t* contents);
+
+// Synchronously reads a file's contents into memory.
+//
+// Returns the contents of the file in |out_contents|.
+// |allocator| is used to allocate the memory and the caller must use
+// iree_file_contents_free to release the memory.
+iree_status_t iree_file_read_contents(const char* path,
+                                      iree_allocator_t allocator,
+                                      iree_file_contents_t** out_contents);
+
+// Synchronously writes a byte buffer into a file.
+// Existing contents are overwritten.
+iree_status_t iree_file_write_contents(const char* path,
+                                       iree_const_byte_span_t content);
+
+// Reads the contents of stdin until EOF into memory.
+// The contents will specify up until EOF and the allocation will have a
+// trailing NUL to allow use as a C-string (assuming the contents themselves
+// don't contain NUL).
+//
+// Returns the contents of the file in |out_contents|.
+// |allocator| is used to allocate the memory and the caller must use
+// iree_file_contents_free to release the memory.
+iree_status_t iree_stdin_read_contents(iree_allocator_t allocator,
+                                       iree_file_contents_t** out_contents);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_FILE_IO_H_
diff --git a/runtime/src/iree/base/internal/file_io_test.cc b/runtime/src/iree/base/internal/file_io_test.cc
new file mode 100644
index 0000000..fd975a6
--- /dev/null
+++ b/runtime/src/iree/base/internal/file_io_test.cc
@@ -0,0 +1,82 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/file_io.h"
+
+#include "iree/base/config.h"
+
+#if IREE_FILE_IO_ENABLE
+
+#include <cstdlib>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "iree/base/logging.h"
+#include "iree/base/status_cc.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace file_io {
+namespace {
+
+using ::iree::testing::status::StatusIs;
+
+// Builds a per-test file path under the test temp dir, checking TEST_TMPDIR
+// (bazel), then TMPDIR (unix), then TEMP (windows); aborts if none is set.
+std::string GetUniquePath(const char* unique_name) {
+  char* test_tmpdir = getenv("TEST_TMPDIR");
+  if (!test_tmpdir) {
+    test_tmpdir = getenv("TMPDIR");
+  }
+  if (!test_tmpdir) {
+    test_tmpdir = getenv("TEMP");
+  }
+  IREE_CHECK(test_tmpdir) << "TEST_TMPDIR/TMPDIR/TEMP not defined";
+  return test_tmpdir + std::string("/iree_test_") + unique_name;
+}
+
+// Produces deterministic per-test file contents keyed by |unique_name|.
+std::string GetUniqueContents(const char* unique_name) {
+  return std::string("Test with name ") + unique_name + "\n";
+}
+
+// Round-trips contents through iree_file_write_contents +
+// iree_file_read_contents and verifies byte-for-byte equality.
+TEST(FileIO, ReadWriteContents) {
+  constexpr const char* kUniqueName = "ReadWriteContents";
+  auto path = GetUniquePath(kUniqueName);
+
+  // File must not exist.
+  iree_status_t status = iree_file_exists(path.c_str());
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_NOT_FOUND, status);
+  iree_status_free(status);
+
+  // Generate file contents.
+  auto write_contents = GetUniqueContents(kUniqueName);
+
+  // Write the contents to disk.
+  IREE_ASSERT_OK(iree_file_write_contents(
+      path.c_str(),
+      iree_make_const_byte_span(write_contents.data(), write_contents.size())));
+
+  // Read the contents from disk.
+  iree_file_contents_t* read_contents = NULL;
+  IREE_ASSERT_OK(iree_file_read_contents(path.c_str(), iree_allocator_system(),
+                                         &read_contents));
+
+  // Expect the contents are equal.
+  EXPECT_EQ(write_contents.size(), read_contents->const_buffer.data_length);
+  EXPECT_EQ(memcmp(write_contents.data(), read_contents->const_buffer.data,
+                   read_contents->const_buffer.data_length),
+            0);
+
+  iree_file_contents_free(read_contents);
+}
+
+}  // namespace
+}  // namespace file_io
+}  // namespace iree
+
+#endif  // IREE_FILE_IO_ENABLE
diff --git a/runtime/src/iree/base/internal/file_path.c b/runtime/src/iree/base/internal/file_path.c
new file mode 100644
index 0000000..0499ec0
--- /dev/null
+++ b/runtime/src/iree/base/internal/file_path.c
@@ -0,0 +1,220 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/file_path.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/target_platform.h"
+
+// Duplicates |value| into a new NUL-terminated heap string allocated from
+// |allocator|. On success the caller owns |out_buffer| and must free it with
+// the same allocator.
+static iree_status_t iree_string_view_dup(iree_string_view_t value,
+                                          iree_allocator_t allocator,
+                                          char** out_buffer) {
+  char* buffer = NULL;
+  // +1 reserves room for the terminator written below.
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, value.size + 1, (void**)&buffer));
+  memcpy(buffer, value.data, value.size);
+  buffer[value.size] = 0;  // NUL
+  *out_buffer = buffer;
+  return iree_ok_status();
+}
+
+// Concatenates |lhs| and |rhs| into a single NUL-terminated heap string
+// allocated from |allocator|; the caller owns |out_buffer| on success.
+static iree_status_t iree_string_view_cat(iree_string_view_t lhs,
+                                          iree_string_view_t rhs,
+                                          iree_allocator_t allocator,
+                                          char** out_buffer) {
+  // Single allocation sized for both parts plus the terminator.
+  iree_host_size_t combined_length = lhs.size + rhs.size;
+  char* result = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, combined_length + 1, (void**)&result));
+
+  // Append each part with a moving write cursor.
+  char* write_ptr = result;
+  memcpy(write_ptr, lhs.data, lhs.size);
+  write_ptr += lhs.size;
+  memcpy(write_ptr, rhs.data, rhs.size);
+  write_ptr += rhs.size;
+
+  *write_ptr = 0;  // NUL
+  *out_buffer = result;
+  return iree_ok_status();
+}
+
+// Joins |part_count| strings from |parts| with |separator| inserted between
+// each pair (never after the last part) into a single NUL-terminated heap
+// string from |allocator|. The caller owns |out_buffer| on success.
+static iree_status_t iree_string_view_join(iree_host_size_t part_count,
+                                           const iree_string_view_t* parts,
+                                           iree_string_view_t separator,
+                                           iree_allocator_t allocator,
+                                           char** out_buffer) {
+  // Compute total output size in characters.
+  iree_host_size_t total_length = 0;
+  for (iree_host_size_t i = 0; i < part_count; ++i) {
+    total_length += parts[i].size;
+  }
+  // part_count-1 separators; guard the zero-part case against underflow.
+  total_length += part_count > 0 ? separator.size * (part_count - 1) : 0;
+
+  // Allocate storage buffer with NUL character.
+  char* buffer = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, total_length + 1, (void**)&buffer));
+
+  // Append each part and a separator between each.
+  char* p = buffer;
+  for (iree_host_size_t i = 0; i < part_count; ++i) {
+    memcpy(p, parts[i].data, parts[i].size);
+    p += parts[i].size;
+    if (i != part_count - 1) {
+      memcpy(p, separator.data, separator.size);
+      p += separator.size;
+    }
+  }
+
+  buffer[total_length] = 0;  // NUL
+  *out_buffer = buffer;
+  return iree_ok_status();
+}
+
+// Collapses runs of '/' in |path| to a single '/'. |path| is mutated in
+// place; returns the new (equal or smaller) length and guarantees a NUL
+// terminator at the returned length.
+static iree_host_size_t iree_file_path_canonicalize_unix(
+    char* path, iree_host_size_t path_length) {
+  // Two-pointer compaction in one forward pass: O(n) instead of the
+  // O(n^2) memmove-per-duplicate approach. A character is dropped only when
+  // it is a '/' immediately following a '/' that was already kept.
+  iree_host_size_t dst = 0;
+  for (iree_host_size_t src = 0; src < path_length; ++src) {
+    if (dst > 0 && path[src] == '/' && path[dst - 1] == '/') continue;
+    path[dst++] = path[src];
+  }
+  path[dst] = 0;  // NUL
+  return dst;
+}
+
+// Converts '/' separators to '\' and collapses runs of '\' to a single '\'.
+// |path| is mutated in place; returns the new (equal or smaller) length and
+// guarantees a NUL terminator at the returned length.
+static iree_host_size_t iree_file_path_canonicalize_win32(
+    char* path, iree_host_size_t path_length) {
+  // Single forward pass combining the separator rewrite and the duplicate
+  // compaction: O(n) instead of the O(n^2) memmove-per-duplicate approach.
+  iree_host_size_t dst = 0;
+  for (iree_host_size_t src = 0; src < path_length; ++src) {
+    // Normalize the separator before the duplicate check so mixed `\/`
+    // sequences collapse as well.
+    char c = path[src] == '/' ? '\\' : path[src];
+    if (dst > 0 && c == '\\' && path[dst - 1] == '\\') continue;
+    path[dst++] = c;
+  }
+  path[dst] = 0;  // NUL
+  return dst;
+}
+
+// Dispatches to the platform-specific canonicalization: backslash separators
+// on Windows, forward slashes everywhere else. See file_path.h for the
+// mutation/termination contract.
+iree_host_size_t iree_file_path_canonicalize(char* path,
+                                             iree_host_size_t path_length) {
+#if defined(IREE_PLATFORM_WINDOWS)
+  return iree_file_path_canonicalize_win32(path, path_length);
+#else
+  return iree_file_path_canonicalize_unix(path, path_length);
+#endif  // IREE_PLATFORM_WINDOWS
+}
+
+// Joins |lhs| and |rhs| ensuring exactly one '/' at the seam: inserts one
+// when neither side provides it and drops the duplicate when both sides do.
+// An empty side yields a plain copy of the other. The result is a
+// NUL-terminated string from |allocator| owned by the caller via |out_path|.
+// Note: no other canonicalization is performed (doubled slashes elsewhere
+// are preserved).
+iree_status_t iree_file_path_join(iree_string_view_t lhs,
+                                  iree_string_view_t rhs,
+                                  iree_allocator_t allocator, char** out_path) {
+  // Degenerate cases: one side empty -> duplicate the other side.
+  if (iree_string_view_is_empty(lhs)) {
+    return iree_string_view_dup(rhs, allocator, out_path);
+  }
+  if (iree_string_view_is_empty(rhs)) {
+    return iree_string_view_dup(lhs, allocator, out_path);
+  }
+  if (lhs.data[lhs.size - 1] == '/') {
+    if (rhs.data[0] == '/') {
+      // Both sides provide a separator: strip the leading one from rhs.
+      return iree_string_view_cat(
+          lhs, iree_string_view_substr(rhs, 1, IREE_STRING_VIEW_NPOS),
+          allocator, out_path);
+    }
+  } else {
+    if (rhs.data[0] != '/') {
+      // Neither side provides a separator: insert one between the parts.
+      iree_string_view_t parts[2] = {lhs, rhs};
+      return iree_string_view_join(IREE_ARRAYSIZE(parts), parts,
+                                   iree_make_cstring_view("/"), allocator,
+                                   out_path);
+    }
+  }
+  // Exactly one side provides the separator: plain concatenation.
+  return iree_string_view_cat(lhs, rhs, allocator, out_path);
+}
+
+// Splits |path| at the final '/' into |out_dirname| (the '/' itself is
+// excluded, except for a bare leading '/') and |out_basename|. Both outputs
+// are views into |path|'s storage — no allocation occurs.
+void iree_file_path_split(iree_string_view_t path,
+                          iree_string_view_t* out_dirname,
+                          iree_string_view_t* out_basename) {
+  iree_host_size_t pos = iree_string_view_find_last_of(
+      path, iree_make_cstring_view("/"), IREE_STRING_VIEW_NPOS);
+  if (pos == IREE_STRING_VIEW_NPOS) {
+    // No '/' in path.
+    *out_dirname = iree_string_view_empty();
+    *out_basename = path;
+  } else if (pos == 0) {
+    // Single leading '/' in path.
+    *out_dirname = iree_string_view_substr(path, 0, 1);
+    *out_basename = iree_string_view_substr(path, 1, IREE_STRING_VIEW_NPOS);
+  } else {
+    *out_dirname = iree_string_view_substr(path, 0, pos);
+    *out_basename =
+        iree_string_view_substr(path, pos + 1, IREE_STRING_VIEW_NPOS);
+  }
+}
+
+// Returns everything before the final '/' by splitting and discarding the
+// basename component; see iree_file_path_split for edge-case behavior.
+iree_string_view_t iree_file_path_dirname(iree_string_view_t path) {
+  iree_string_view_t dir_part = iree_string_view_empty();
+  iree_string_view_t base_part = iree_string_view_empty();
+  iree_file_path_split(path, &dir_part, &base_part);
+  return dir_part;
+}
+
+// Returns everything after the final '/' by splitting and discarding the
+// dirname component; see iree_file_path_split for edge-case behavior.
+iree_string_view_t iree_file_path_basename(iree_string_view_t path) {
+  iree_string_view_t dir_part = iree_string_view_empty();
+  iree_string_view_t base_part = iree_string_view_empty();
+  iree_file_path_split(path, &dir_part, &base_part);
+  return base_part;
+}
+
+// Splits the basename of |path| at its final '.' into |out_stem| and
+// |out_extension| (both views into |path|). When there is no '.' the whole
+// basename becomes the stem and the extension is empty; a trailing '.'
+// yields an empty extension.
+void iree_file_path_split_basename(iree_string_view_t path,
+                                   iree_string_view_t* out_stem,
+                                   iree_string_view_t* out_extension) {
+  // Reduce to the basename first so dots in directory names are ignored.
+  path = iree_file_path_basename(path);
+  iree_host_size_t pos = iree_string_view_find_last_of(
+      path, iree_make_cstring_view("."), IREE_STRING_VIEW_NPOS);
+  if (pos == IREE_STRING_VIEW_NPOS) {
+    *out_stem = path;
+    *out_extension = iree_string_view_empty();
+  } else {
+    *out_stem = iree_string_view_substr(path, 0, pos);
+    *out_extension =
+        iree_string_view_substr(path, pos + 1, IREE_STRING_VIEW_NPOS);
+  }
+}
+
+// Returns the basename with its final extension removed by splitting and
+// discarding the extension component.
+iree_string_view_t iree_file_path_stem(iree_string_view_t path) {
+  iree_string_view_t stem_part = iree_string_view_empty();
+  iree_string_view_t ext_part = iree_string_view_empty();
+  iree_file_path_split_basename(path, &stem_part, &ext_part);
+  return stem_part;
+}
+
+// Returns the text after the final '.' of the basename by splitting and
+// discarding the stem component.
+iree_string_view_t iree_file_path_extension(iree_string_view_t path) {
+  iree_string_view_t stem_part = iree_string_view_empty();
+  iree_string_view_t ext_part = iree_string_view_empty();
+  iree_file_path_split_basename(path, &stem_part, &ext_part);
+  return ext_part;
+}
diff --git a/runtime/src/iree/base/internal/file_path.h b/runtime/src/iree/base/internal/file_path.h
new file mode 100644
index 0000000..9893566
--- /dev/null
+++ b/runtime/src/iree/base/internal/file_path.h
@@ -0,0 +1,70 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_FILE_PATH_H_
+#define IREE_BASE_INTERNAL_FILE_PATH_H_
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Canonicalizes the given |path| to the platform convention by replacing `/`
+// with the appropriate character (`\` on Windows) and stripping extraneous
+// slashes that may have ended up in the filename.
+//
+// NOTE: this is *not* the same as canonicalizing the path via system utilities
+// that may, for example, resolve network paths or symlinks.
+//
+// |path| (of character length |path_length|) is mutated in-place and will have
+// the same or smaller length upon return. Returns the new length of the path. A
+// NUL terminator will be ensured at the end.
+iree_host_size_t iree_file_path_canonicalize(char* path,
+                                             iree_host_size_t path_length);
+
+// Joins two paths together by inserting `/` as needed.
+//
+// For example:
+//   iree_file_path_join('foo', 'bar') --> 'foo/bar'
+//   iree_file_path_join('/foo/', '/bar') --> '/foo/bar'
+//
+// Returns the joined path allocated from |allocator| in |out_path|; no
+// canonicalization is performed beyond the seam handling shown above.
+// Callers must free the string when they are done with it.
+iree_status_t iree_file_path_join(iree_string_view_t lhs,
+                                  iree_string_view_t rhs,
+                                  iree_allocator_t allocator, char** out_path);
+
+// Splits |path| into the dirname and basename at the final `/`.
+void iree_file_path_split(iree_string_view_t path,
+                          iree_string_view_t* out_dirname,
+                          iree_string_view_t* out_basename);
+
+// Gets the directory name component of a file |path| (everything before the
+// final `/`).
+iree_string_view_t iree_file_path_dirname(iree_string_view_t path);
+
+// Returns the part of the |path| after the final `/`.
+iree_string_view_t iree_file_path_basename(iree_string_view_t path);
+
+// Returns the parts of the basename of path, split on the final `.`.
+// If there is no `.` in the basename or `.` is the final character in the
+// basename the second value will be empty.
+void iree_file_path_split_basename(iree_string_view_t path,
+                                   iree_string_view_t* out_stem,
+                                   iree_string_view_t* out_extension);
+
+// Returns the part of the basename of |path| prior to the final `.`.
+iree_string_view_t iree_file_path_stem(iree_string_view_t path);
+
+// Returns the part of the basename of |path| after the final `.`.
+iree_string_view_t iree_file_path_extension(iree_string_view_t path);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_FILE_PATH_H_
diff --git a/runtime/src/iree/base/internal/file_path_test.cc b/runtime/src/iree/base/internal/file_path_test.cc
new file mode 100644
index 0000000..456431d
--- /dev/null
+++ b/runtime/src/iree/base/internal/file_path_test.cc
@@ -0,0 +1,170 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/file_path.h"
+
+#include <string>
+
+#include "iree/base/target_platform.h"
+#include "iree/testing/gtest.h"
+
+namespace {
+
+#define _SV(str) iree_make_cstring_view(str)
+
+#define EXPECT_SV_EQ(actual, expected) \
+  EXPECT_TRUE(iree_string_view_equal(actual, expected))
+
+// Exercises iree_file_path_canonicalize through a copy-in/copy-out wrapper;
+// the expected separator differs per platform.
+TEST(FilePathTest, Canonicalize) {
+  // Canonicalizes in place and trims the std::string to the returned length.
+  auto canonicalize = [](std::string value) {
+    value.resize(
+        iree_file_path_canonicalize((char*)value.data(), value.size()));
+    return value;
+  };
+  EXPECT_EQ(canonicalize(""), "");
+  EXPECT_EQ(canonicalize("a"), "a");
+  EXPECT_EQ(canonicalize("ab"), "ab");
+
+#if defined(IREE_PLATFORM_WINDOWS)
+  EXPECT_EQ(canonicalize("/"), "\\");
+  EXPECT_EQ(canonicalize("\\"), "\\");
+  EXPECT_EQ(canonicalize("a/b"), "a\\b");
+  EXPECT_EQ(canonicalize("a//b"), "a\\b");
+  EXPECT_EQ(canonicalize("a////b"), "a\\b");
+  EXPECT_EQ(canonicalize("a\\//b"), "a\\b");
+  EXPECT_EQ(canonicalize("a\\\\b"), "a\\b");
+  EXPECT_EQ(canonicalize("\\a"), "\\a");
+  EXPECT_EQ(canonicalize("/a"), "\\a");
+  EXPECT_EQ(canonicalize("//a"), "\\a");
+  EXPECT_EQ(canonicalize("a/"), "a\\");
+  EXPECT_EQ(canonicalize("a//"), "a\\");
+#else
+  EXPECT_EQ(canonicalize("/"), "/");
+  EXPECT_EQ(canonicalize("a/b"), "a/b");
+  EXPECT_EQ(canonicalize("a//b"), "a/b");
+  EXPECT_EQ(canonicalize("a////b"), "a/b");
+  EXPECT_EQ(canonicalize("/a"), "/a");
+  EXPECT_EQ(canonicalize("//a"), "/a");
+  EXPECT_EQ(canonicalize("a/"), "a/");
+  EXPECT_EQ(canonicalize("a//"), "a/");
+#endif  // IREE_PLATFORM_WINDOWS
+}
+
+// Invokes the C iree_file_path_join and marshals the NUL-terminated result
+// into a std::string, releasing the C allocation before returning.
+static std::string JoinPaths(std::string lhs, std::string rhs) {
+  char* joined_cstr = NULL;
+  IREE_IGNORE_ERROR(
+      iree_file_path_join(iree_make_string_view(lhs.data(), lhs.size()),
+                          iree_make_string_view(rhs.data(), rhs.size()),
+                          iree_allocator_system(), &joined_cstr));
+  std::string result(joined_cstr);
+  iree_allocator_free(iree_allocator_system(), joined_cstr);
+  return result;
+}
+
+// Joining with an empty side returns the other side verbatim.
+TEST(FilePathTest, JoinPathsEmpty) {
+  EXPECT_EQ(JoinPaths("", ""), "");
+  EXPECT_EQ(JoinPaths("", "bar"), "bar");
+  EXPECT_EQ(JoinPaths("foo", ""), "foo");
+}
+
+// Exactly one '/' appears at the seam no matter which side(s) provide one;
+// leading/trailing slashes away from the seam are preserved.
+TEST(FilePathTest, JoinPathsSlash) {
+  EXPECT_EQ(JoinPaths("foo", "bar"), "foo/bar");
+  EXPECT_EQ(JoinPaths("foo", "bar/"), "foo/bar/");
+  EXPECT_EQ(JoinPaths("foo", "/bar"), "foo/bar");
+  EXPECT_EQ(JoinPaths("foo", "/bar/"), "foo/bar/");
+
+  EXPECT_EQ(JoinPaths("foo/", "bar"), "foo/bar");
+  EXPECT_EQ(JoinPaths("foo/", "bar/"), "foo/bar/");
+  EXPECT_EQ(JoinPaths("foo/", "/bar"), "foo/bar");
+  EXPECT_EQ(JoinPaths("foo/", "/bar/"), "foo/bar/");
+
+  EXPECT_EQ(JoinPaths("/foo", "bar"), "/foo/bar");
+  EXPECT_EQ(JoinPaths("/foo", "bar/"), "/foo/bar/");
+  EXPECT_EQ(JoinPaths("/foo", "/bar"), "/foo/bar");
+  EXPECT_EQ(JoinPaths("/foo", "/bar/"), "/foo/bar/");
+
+  EXPECT_EQ(JoinPaths("/foo/", "bar"), "/foo/bar");
+  EXPECT_EQ(JoinPaths("/foo/", "bar/"), "/foo/bar/");
+  EXPECT_EQ(JoinPaths("/foo/", "/bar"), "/foo/bar");
+  EXPECT_EQ(JoinPaths("/foo/", "/bar/"), "/foo/bar/");
+}
+
+// Join does not canonicalize: pre-existing doubled slashes survive.
+TEST(FilePathTest, JoinPathsDoubleSlash) {
+  EXPECT_EQ(JoinPaths("foo//", "bar"), "foo//bar");
+  EXPECT_EQ(JoinPaths("foo", "//bar"), "foo//bar");
+}
+
+// Dirname of an empty path is empty.
+TEST(FilePathTest, DirnameEmpty) {
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("")), _SV(""));
+}
+
+// A bare leading '/' is preserved as the dirname; otherwise the final '/'
+// is excluded from the result.
+TEST(FilePathTest, DirnameAbsolute) {
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("/")), _SV("/"));
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("/foo")), _SV("/"));
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("/foo/")), _SV("/foo"));
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("/foo/bar")), _SV("/foo"));
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("/foo/bar/")), _SV("/foo/bar"));
+}
+
+// Relative paths with no '/' have an empty dirname.
+TEST(FilePathTest, DirnameRelative) {
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("foo")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("foo/")), _SV("foo"));
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("foo/bar")), _SV("foo"));
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("foo/bar/")), _SV("foo/bar"));
+}
+
+// Doubled slashes are not collapsed by the split.
+TEST(FilePathTest, DirnameDoubleSlash) {
+  EXPECT_SV_EQ(iree_file_path_dirname(_SV("foo//")), _SV("foo/"));
+}
+
+// Basename of an empty path is empty.
+TEST(FilePathTest, BasenameEmpty) {
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("")), _SV(""));
+}
+
+// A trailing '/' yields an empty basename; otherwise everything after the
+// final '/' is returned.
+TEST(FilePathTest, BasenameAbsolute) {
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("/")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("/foo")), _SV("foo"));
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("/foo/")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("/foo/bar")), _SV("bar"));
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("/foo/bar/")), _SV(""));
+}
+
+// A path with no '/' is its own basename.
+TEST(FilePathTest, BasenameRelative) {
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("foo")), _SV("foo"));
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("foo/")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("foo/bar")), _SV("bar"));
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("foo/bar/")), _SV(""));
+}
+
+// Doubled trailing slashes still produce an empty basename.
+TEST(FilePathTest, BasenameDoubleSlash) {
+  EXPECT_SV_EQ(iree_file_path_basename(_SV("foo//")), _SV(""));
+}
+
+// Stem: the basename with the text after its final '.' removed; a trailing
+// '.' is dropped, and only the last '.' splits.
+TEST(FilePathTest, Stem) {
+  EXPECT_SV_EQ(iree_file_path_stem(_SV("")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_stem(_SV("foo")), _SV("foo"));
+  EXPECT_SV_EQ(iree_file_path_stem(_SV("foo.")), _SV("foo"));
+  EXPECT_SV_EQ(iree_file_path_stem(_SV("foo.bar")), _SV("foo"));
+  EXPECT_SV_EQ(iree_file_path_stem(_SV("foo..")), _SV("foo."));
+  EXPECT_SV_EQ(iree_file_path_stem(_SV("foo..bar")), _SV("foo."));
+  EXPECT_SV_EQ(iree_file_path_stem(_SV(".bar")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_stem(_SV("..bar")), _SV("."));
+}
+
+// Extension: the text after the final '.' of the basename; empty when the
+// '.' is absent or trailing.
+TEST(FilePathTest, Extension) {
+  EXPECT_SV_EQ(iree_file_path_extension(_SV("")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_extension(_SV("foo")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_extension(_SV("foo.")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_extension(_SV("foo.bar")), _SV("bar"));
+  EXPECT_SV_EQ(iree_file_path_extension(_SV("foo..")), _SV(""));
+  EXPECT_SV_EQ(iree_file_path_extension(_SV("foo..bar")), _SV("bar"));
+  EXPECT_SV_EQ(iree_file_path_extension(_SV(".bar")), _SV("bar"));
+  EXPECT_SV_EQ(iree_file_path_extension(_SV("..bar")), _SV("bar"));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/flags.c b/runtime/src/iree/base/internal/flags.c
new file mode 100644
index 0000000..7f1c961
--- /dev/null
+++ b/runtime/src/iree/base/internal/flags.c
@@ -0,0 +1,545 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/flags.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+
+#if IREE_FLAGS_ENABLE_CLI == 1
+
+#include "iree/base/internal/debugging.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// Flag manipulation utilities
+//===----------------------------------------------------------------------===//
+
+// Allocator control function that forwards every command to the system
+// allocator with leak checking suspended, so allocations intended to live
+// for the process lifetime are not reported as leaks.
+static iree_status_t iree_flags_leaky_allocator_ctl(
+    void* self, iree_allocator_command_t command, const void* params,
+    void** inout_ptr) {
+  IREE_LEAK_CHECK_DISABLE_PUSH();
+  iree_status_t status =
+      iree_allocator_system_ctl(/*self=*/NULL, command, params, inout_ptr);
+  IREE_LEAK_CHECK_DISABLE_POP();
+  return status;
+}
+
+static void iree_flags_leaky_free(void* self, void* ptr) { free(ptr); }
+
+// Returns an allocator whose allocations may be deliberately leaked without
+// tripping leak checkers: flag storage is expected to remain valid for the
+// lifetime of the process. Memory may still be freed, which only trims the
+// private working set early.
+static iree_allocator_t iree_flags_leaky_allocator(void) {
+  iree_allocator_t leaky;
+  leaky.self = NULL;
+  leaky.ctl = iree_flags_leaky_allocator_ctl;
+  return leaky;
+}
+
+//===----------------------------------------------------------------------===//
+// Flag registry
+//===----------------------------------------------------------------------===//
+
+// Storage for registered flags.
+typedef struct iree_flag_t {
+  // __FILE__ of flag definition.
+  const char* file;
+  // __LINE__ of flag definition.
+  int line;
+  // Defines what data is at |storage| and how to parse/print it.
+  iree_flag_type_t type;
+  // Registered callback to issue when the flag is parsed, if any.
+  iree_flag_parse_callback_fn_t parse_callback;
+  // Registered callback to issue when the flag is to be printed, if any.
+  iree_flag_print_callback_fn_t print_callback;
+  // Direct reference to the variable storing the flag value of |type|.
+  void* storage;
+  // Name of the flag on the command line ('foo' => '--foo=value').
+  iree_string_view_t name;
+  // Short description string.
+  iree_string_view_t description;
+} iree_flag_t;
+
+// State used for flag registration and reflection.
+typedef struct iree_flag_registry_t {
+  // Program name and usage text shown by --help; set via
+  // iree_flags_set_usage.
+  const char* program_name;
+  const char* usage;
+
+  // Total number of entries in the |flags| list.
+  int flag_count;
+  // All registered flags in the executable in an undefined order.
+  // Fixed capacity; overflow is asserted in iree_flag_register.
+  iree_flag_t flags[IREE_FLAGS_CAPACITY];
+} iree_flag_registry_t;
+
+// Global flags state.
+// This will persist for the lifetime of the program so that flags can be
+// reparsed/dumped. If you're concerned about the .data overhead then you
+// probably just want to disable the CLI support for flags entirely.
+static iree_flag_registry_t iree_flag_registry = {
+    .program_name = NULL,
+    .usage = NULL,
+    .flag_count = 0,
+};
+
+// Registers a flag with the global registry and returns its ordinal within
+// the registry. |file|/|line| identify the definition site (used to group
+// flags when dumping). |name|, |description|, and |storage| are stored by
+// reference (not copied) and are expected to outlive the registry.
+int iree_flag_register(const char* file, int line, iree_flag_type_t type,
+                       void* storage,
+                       iree_flag_parse_callback_fn_t parse_callback,
+                       iree_flag_print_callback_fn_t print_callback,
+                       iree_string_view_t name,
+                       iree_string_view_t description) {
+  // TODO(benvanik): make the registry a linked list and externalize the
+  // flag storage - then no need for a fixed count. If you're hitting this then
+  // file an issue :)
+  iree_flag_registry_t* registry = &iree_flag_registry;
+  IREE_ASSERT_LE(registry->flag_count + 1, IREE_FLAGS_CAPACITY,
+                 "flag registry overflow; too many flags registered");
+  int flag_ordinal = registry->flag_count++;
+  iree_flag_t* flag = &registry->flags[flag_ordinal];
+  flag->file = file;
+  flag->line = line;
+  flag->type = type;
+  flag->parse_callback = parse_callback;
+  flag->print_callback = print_callback;
+  flag->storage = storage;
+  flag->name = name;
+  flag->description = description;
+  return flag_ordinal;
+}
+
+// Returns the registered flag matching |name| (first match wins) or NULL if
+// no flag with that name exists.
+static iree_flag_t* iree_flag_lookup(iree_string_view_t name) {
+  iree_flag_registry_t* registry = &iree_flag_registry;
+  iree_flag_t* found = NULL;
+  for (int i = 0; i < registry->flag_count && found == NULL; ++i) {
+    iree_flag_t* candidate = &registry->flags[i];
+    if (iree_string_view_equal(candidate->name, name)) {
+      found = candidate;
+    }
+  }
+  return found;
+}
+
+// qsort comparator ordering flags by defining file first and then by line
+// number within the same file.
+static int iree_flag_cmp(const void* lhs_ptr, const void* rhs_ptr) {
+  const iree_flag_t* lhs = (const iree_flag_t*)lhs_ptr;
+  const iree_flag_t* rhs = (const iree_flag_t*)rhs_ptr;
+  const int file_order = strcmp(lhs->file, rhs->file);
+  return file_order != 0 ? file_order : lhs->line - rhs->line;
+}
+
+// Sorts the flags in the flag registry by file > line.
+// Safe to call repeatedly; used before every parse/dump.
+static void iree_flag_registry_sort(iree_flag_registry_t* registry) {
+  qsort(registry->flags, registry->flag_count, sizeof(iree_flag_t),
+        iree_flag_cmp);
+}
+
+//===----------------------------------------------------------------------===//
+// Flag parsing/printing
+//===----------------------------------------------------------------------===//
+
+// Records the program name and usage text displayed by --help.
+// Both strings are stored by reference (not copied) and must remain valid
+// for as long as flags may be parsed or dumped.
+void iree_flags_set_usage(const char* program_name, const char* usage) {
+  iree_flag_registry.program_name = program_name;
+  iree_flag_registry.usage = usage;
+}
+
+// Parses a flag value from the given string and stores it.
+// |value| must reference mutable memory: a NUL terminator is written at
+// value.data[value.size] (see note below) and, for string flags, the stored
+// pointer aliases that memory. Returns a failure status from a callback flag
+// or for an unrecognized flag type; all built-in types succeed.
+static iree_status_t iree_flag_parse(iree_flag_t* flag,
+                                     iree_string_view_t value) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, flag->name.data, flag->name.size);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, value.data, value.size);
+
+  // Insert NUL on the flag value. This is safe as the value is either coming
+  // from C argv memory which is mutable or a flagfile that we loaded into
+  // memory ourselves.
+  char* str_value = (char*)value.data;
+  if (value.size > 0) {
+    str_value[value.size] = 0;
+  }
+
+  iree_status_t status = iree_ok_status();
+  switch (flag->type) {
+    case IREE_FLAG_TYPE_callback:
+      status = flag->parse_callback(flag->name, flag->storage, value);
+      break;
+    case IREE_FLAG_TYPE_bool:
+      // Bare `--flag` (empty value), `true`, and `1` all mean true.
+      if (value.size == 0 || strcmp(str_value, "true") == 0 ||
+          strcmp(str_value, "1") == 0) {
+        *(bool*)flag->storage = true;
+      } else {
+        *(bool*)flag->storage = false;
+      }
+      break;
+    case IREE_FLAG_TYPE_int32_t:
+      *(int32_t*)flag->storage = value.size ? atoi(str_value) : 0;
+      break;
+    case IREE_FLAG_TYPE_int64_t:
+      *(int64_t*)flag->storage = value.size ? atoll(str_value) : 0;
+      break;
+    case IREE_FLAG_TYPE_float:
+      *(float*)flag->storage = value.size ? (float)atof(str_value) : 0.0f;
+      break;
+    case IREE_FLAG_TYPE_double:
+      *(double*)flag->storage = value.size ? atof(str_value) : 0.0;
+      break;
+    case IREE_FLAG_TYPE_string: {
+      iree_host_size_t str_length = value.size;
+      if (str_length > 2) {
+        // Strip double quotes: "foo" -> foo.
+        // This may not be worth the complexity.
+        if (str_value[0] == '"' && str_value[str_length - 1] == '"') {
+          str_value[str_length - 1] = 0;
+          ++str_value;
+          str_length = str_length - 2;
+        }
+      }
+      // Stores a pointer aliasing the (mutated) argv/flagfile memory.
+      *(const char**)flag->storage = str_value;
+      break;
+    }
+    default:
+      status = iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                "invalid flag type %u", flag->type);
+      break;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Prints a flag value to |file| (like 'true' or '5.43').
+// Callback flags delegate entirely to their print callback; flags without
+// backing storage print only `--name` with no value.
+static void iree_flag_print(FILE* file, iree_flag_t* flag) {
+  if (flag->type == IREE_FLAG_TYPE_callback) {
+    flag->print_callback(flag->name, flag->storage, file);
+    return;
+  }
+  fprintf(file, "--%.*s", (int)flag->name.size, flag->name.data);
+  if (flag->storage == NULL) return;
+  switch (flag->type) {
+    case IREE_FLAG_TYPE_bool:
+      fprintf(file, "=%s", (*(bool*)flag->storage) ? "true" : "false");
+      break;
+    case IREE_FLAG_TYPE_int32_t:
+      fprintf(file, "=%" PRId32, *(int32_t*)flag->storage);
+      break;
+    case IREE_FLAG_TYPE_int64_t:
+      fprintf(file, "=%" PRId64, *(int64_t*)flag->storage);
+      break;
+    case IREE_FLAG_TYPE_float:
+      fprintf(file, "=%g", *(float*)flag->storage);
+      break;
+    case IREE_FLAG_TYPE_double:
+      fprintf(file, "=%g", *(double*)flag->storage);
+      break;
+    case IREE_FLAG_TYPE_string:
+      fprintf(file, "=\"%s\"", *(const char**)flag->storage);
+      break;
+    default:
+      fprintf(file, "=<INVALID>");
+      break;
+  }
+  fprintf(file, "\n");
+}
+
+// Dumps a flag definition and value to |file|.
+// In verbose mode each non-empty line of the description is emitted first,
+// prefixed with '# '; the flag value line always follows.
+static void iree_flag_dump(iree_flag_dump_mode_t mode, FILE* file,
+                           iree_flag_t* flag) {
+  if (iree_all_bits_set(mode, IREE_FLAG_DUMP_MODE_VERBOSE)) {
+    if (!iree_string_view_is_empty(flag->description)) {
+      iree_string_view_t description = flag->description;
+      while (!iree_string_view_is_empty(description)) {
+        iree_string_view_t line;
+        iree_string_view_split(description, '\n', &line, &description);
+        if (!iree_string_view_is_empty(line)) {
+          fprintf(file, "# %.*s\n", (int)line.size, line.data);
+        }
+      }
+    }
+  }
+  iree_flag_print(file, flag);
+}
+
+// Parse callback for the built-in --help flag: prints a banner, the optional
+// usage text, and a verbose dump of all registered flags to stdout.
+// All arguments are unused; always returns OK.
+static iree_status_t iree_flags_parse_help(iree_string_view_t flag_name,
+                                           void* storage,
+                                           iree_string_view_t value) {
+  iree_flag_registry_t* registry = &iree_flag_registry;
+
+  fprintf(stdout,
+          "# "
+          "===================================================================="
+          "========\n");
+  fprintf(stdout, "# 👻 IREE: %s\n",
+          registry->program_name ? registry->program_name : "");
+  fprintf(stdout,
+          "# "
+          "===================================================================="
+          "========\n\n");
+  if (registry->usage) {
+    fprintf(stdout, "%s\n", registry->usage);
+  }
+  iree_flags_dump(IREE_FLAG_DUMP_MODE_VERBOSE, stdout);
+  fprintf(stdout, "\n");
+
+  return iree_ok_status();
+}
+// Print callback for the built-in --help flag: emits only the flag name
+// (there is no stored value to show).
+static void iree_flags_print_help(iree_string_view_t flag_name, void* storage,
+                                  FILE* file) {
+  fprintf(file, "# --%.*s\n", (int)flag_name.size, flag_name.data);
+}
+// Registers --help as a callback flag with no backing storage.
+IREE_FLAG_CALLBACK(iree_flags_parse_help, iree_flags_print_help, NULL, help,
+                   "Displays command line usage information.");
+
+// Removes argument |arg| from the argument list by shifting later entries
+// down one slot and decrementing the count.
+// NOTE(review): the copy length (argc - arg) covers elements
+// argv[arg+1..argc] inclusive, i.e. it also moves argv[argc]; that preserves
+// the conventional NULL terminator of a C argv array — assumes argv[argc] is
+// readable (true for main()'s argv; confirm for other callers).
+static void iree_flags_remove_arg(int arg, int* argc_ptr, char*** argv_ptr) {
+  int argc = *argc_ptr;
+  char** argv = *argv_ptr;
+  memmove(&argv[arg], &argv[arg + 1], (argc - arg) * sizeof(char*));
+  *argc_ptr = argc - 1;
+}
+
+// Parses `--flag[=value]` arguments out of |argc_ptr|/|argv_ptr| in place.
+// Recognized flags are consumed (spliced out of argv); positional arguments
+// are left untouched, as are unknown `--` flags when
+// IREE_FLAGS_PARSE_MODE_UNDEFINED_OK is set (otherwise they fail with
+// INVALID_ARGUMENT). Flag values are parsed by mutating the argv memory (a
+// NUL is inserted; see iree_flag_parse). --help exits the process unless
+// IREE_FLAGS_PARSE_MODE_CONTINUE_AFTER_HELP is set, in which case it is
+// kept in argv for later parsers.
+iree_status_t iree_flags_parse(iree_flags_parse_mode_t mode, int* argc_ptr,
+                               char*** argv_ptr) {
+  if (argc_ptr == NULL || argv_ptr == NULL || *argc_ptr == 0) {
+    // No flags; that's fine - in some environments flags aren't supported.
+    return iree_ok_status();
+  }
+
+  // Always sort the registry; though we may parse flags multiple times this is
+  // not a hot path and this is easier than trying to keep track of whether we
+  // need to or not.
+  iree_flag_registry_sort(&iree_flag_registry);
+
+  int argc = *argc_ptr;
+  char** argv = *argv_ptr;
+
+  // Start at 1 to skip the conventional program-name argument.
+  for (int arg_ordinal = 1; arg_ordinal < argc; ++arg_ordinal) {
+    iree_string_view_t arg = iree_make_cstring_view(argv[arg_ordinal]);
+
+    // Strip whitespace.
+    arg = iree_string_view_trim(arg);
+
+    // Position arguments are ignored; they may appear anywhere in the list.
+    if (!iree_string_view_starts_with(arg, iree_make_cstring_view("--"))) {
+      continue;
+    }
+
+    // Strip `--`.
+    arg = iree_string_view_remove_prefix(arg, 2);
+
+    // Split into `flag_name` = `flag_value`.
+    iree_string_view_t flag_name;
+    iree_string_view_t flag_value;
+    iree_string_view_split(arg, '=', &flag_name, &flag_value);
+    flag_name = iree_string_view_trim(flag_name);
+    flag_value = iree_string_view_trim(flag_value);
+
+    // Lookup the flag by name.
+    iree_flag_t* flag = iree_flag_lookup(flag_name);
+    if (!flag) {
+      // If --undefok allows undefined flags then we just skip this one. Note
+      // that we leave it in the argument list so that subsequent flag parsers
+      // can try to handle it.
+      if (iree_all_bits_set(mode, IREE_FLAGS_PARSE_MODE_UNDEFINED_OK)) {
+        continue;
+      }
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "flag '%.*s' not recognized", (int)flag_name.size,
+                              flag_name.data);
+    }
+
+    // Parse and store the flag value.
+    IREE_RETURN_IF_ERROR(iree_flag_parse(flag, flag_value));
+
+    // --help gets special handling due to interop with external libraries that
+    // may also need to find it. If indicated we keep --help in the argument
+    // list and don't exit.
+    if (iree_string_view_equal(flag_name, iree_make_cstring_view("help"))) {
+      if (iree_all_bits_set(mode, IREE_FLAGS_PARSE_MODE_CONTINUE_AFTER_HELP)) {
+        continue;  // don't remove the arg below
+      }
+      exit(0);  // --help exits by default.
+    }
+
+    // Splice out the flag from the argv list.
+    iree_flags_remove_arg(arg_ordinal, &argc, &argv);
+    // Re-examine the slot that just received the next argument.
+    --arg_ordinal;
+  }
+
+  *argc_ptr = argc;
+  return iree_ok_status();
+}
+
+// Like iree_flags_parse but on failure prints the error status to stderr and
+// terminates the process with EXIT_FAILURE. All arguments are also appended
+// to the active trace zone for diagnostics.
+void iree_flags_parse_checked(iree_flags_parse_mode_t mode, int* argc,
+                              char*** argv) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  for (int i = 0; i < *argc; ++i) {
+    IREE_TRACE_ZONE_APPEND_TEXT_CSTRING(z0, (*argv)[i]);
+  }
+  iree_status_t status = iree_flags_parse(mode, argc, argv);
+  IREE_TRACE_ZONE_END(z0);
+  if (iree_status_is_ok(status)) return;
+
+  fprintf(stderr, "\x1b[31mFLAGS ERROR: (╯°□°)╯︵👻\x1b[0m\n");
+  iree_status_fprint(stderr, status);
+  fflush(stderr);
+
+  exit(EXIT_FAILURE);
+}
+
+// Dumps all registered flags (and their current values) to |file|.
+// In verbose mode a banner is emitted each time the defining source file of
+// the dumped flags changes.
+void iree_flags_dump(iree_flag_dump_mode_t mode, FILE* file) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Always sort the registry; though we may dump flags multiple times this is
+  // not a hot path and this is easier than trying to keep track of whether we
+  // need to or not.
+  iree_flag_registry_sort(&iree_flag_registry);
+
+  // Tracks the file owning the previously dumped flag so the verbose banner
+  // fires on each file change (assumes the sort above groups flags by file —
+  // TODO confirm against iree_flag_registry_sort).
+  const char* last_file = NULL;
+  for (size_t i = 0; i < iree_flag_registry.flag_count; ++i) {
+    iree_flag_t* flag = &iree_flag_registry.flags[i];
+    if (iree_all_bits_set(mode, IREE_FLAG_DUMP_MODE_VERBOSE)) {
+      if (last_file) {
+        fprintf(file, "\n");
+      }
+      if (!last_file || strcmp(flag->file, last_file) != 0) {
+        fprintf(file,
+                "# "
+                "===-----------------------------------------------------------"
+                "-----------===\n");
+        fprintf(file, "# Flags in %s:%d\n", flag->file, flag->line);
+        fprintf(file,
+                "# "
+                "===-----------------------------------------------------------"
+                "-----------===\n\n");
+        last_file = flag->file;
+      }
+    }
+    iree_flag_dump(mode, file, flag);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//===----------------------------------------------------------------------===//
+// --flagfile= support
+//===----------------------------------------------------------------------===//
+// NOTE: this is conditionally enabled as some platforms may not have IO.
+
+#if IREE_FLAGS_ENABLE_FLAG_FILE == 1
+
+#include "iree/base/internal/file_io.h"
+
+// Parses a newline-separated list of flags from a file.
+// Lines may be blank or comments (prefixed with `#` or `//`); all other lines
+// must be `--flag[=value]` style — positional arguments are rejected.
+static iree_status_t iree_flags_parse_file(iree_string_view_t file_path) {
+  // Read file contents.
+  // NOTE: we intentionally leak the contents here so that the flags remain in
+  // memory in case they are referenced.
+  // NOTE: safe to use file_path.data here as it will always have a NUL
+  // terminator.
+  iree_allocator_t allocator = iree_flags_leaky_allocator();
+  iree_file_contents_t* file_contents = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_file_read_contents(file_path.data, allocator, &file_contents),
+      "while trying to parse flagfile");
+
+  // Run through the file line-by-line.
+  // |line_number| is 1-based and counts every line (including blanks and
+  // comments) so error messages map back to the file as authored.
+  int line_number = 0;
+  iree_string_view_t contents =
+      iree_make_string_view((const char*)file_contents->buffer.data,
+                            file_contents->buffer.data_length);
+  while (!iree_string_view_is_empty(contents)) {
+    // Split into a single line and the entire rest of the file contents.
+    iree_string_view_t line;
+    iree_string_view_split(contents, '\n', &line, &contents);
+    ++line_number;
+
+    // Strip whitespace.
+    line = iree_string_view_trim(line);
+    if (iree_string_view_is_empty(line)) continue;
+
+    // Ignore comments.
+    if (iree_string_view_starts_with(line, iree_make_cstring_view("#")) ||
+        iree_string_view_starts_with(line, iree_make_cstring_view("//"))) {
+      continue;
+    }
+
+    // Strip `--`.
+    if (!iree_string_view_starts_with(line, iree_make_cstring_view("--"))) {
+      // Positional arguments can't be specified in flag files.
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "%.*s:%d: positional arguments not allowed in flag files",
+          (int)file_path.size, file_path.data, line_number);
+    }
+    line = iree_string_view_remove_prefix(line, 2);
+
+    // Split into `flag_name` = `flag_value`.
+    iree_string_view_t flag_name;
+    iree_string_view_t flag_value;
+    iree_string_view_split(line, '=', &flag_name, &flag_value);
+    flag_name = iree_string_view_trim(flag_name);
+    flag_value = iree_string_view_trim(flag_value);
+
+    // Lookup the flag by name.
+    iree_flag_t* flag = iree_flag_lookup(flag_name);
+    if (!flag) {
+      // Unlike command-line parsing there is no undefok mode here: any
+      // unrecognized flag in a flagfile is an error.
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "%.*s:%d: flag '%.*s' not recognized",
+                              (int)file_path.size, file_path.data, line_number,
+                              (int)flag_name.size, flag_name.data);
+    }
+
+    // Parse the flag value.
+    IREE_RETURN_IF_ERROR(iree_flag_parse(flag, flag_value),
+                         "%.*s:%d: while parsing flag '%.*s'",
+                         (int)file_path.size, file_path.data, line_number,
+                         (int)line.size, line.data);
+  }
+
+  // NOTE: we intentionally leak the memory as flags may continue to reference
+  // segments of it for their string values.
+  return iree_ok_status();
+}
+
+// Flag parse callback for `--flagfile=`: loads and parses the referenced file
+// at the point the flag appears in the argument list so that flags following
+// it on the command line can still override the file-provided values.
+static iree_status_t iree_flags_parse_flagfile(iree_string_view_t flag_name,
+                                               void* storage,
+                                               iree_string_view_t value) {
+  if (iree_string_view_is_empty(value)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "--%.*s= requires a file path", (int)flag_name.size,
+                            flag_name.data);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, value.data, value.size);
+  iree_status_t status = iree_flags_parse_file(value);
+  IREE_TRACE_ZONE_END(z0);
+
+  return status;
+}
+// Flag print callback for `--flagfile=`: the flag retains no value, so only a
+// usage placeholder is printed.
+static void iree_flags_print_flagfile(iree_string_view_t flag_name,
+                                      void* storage, FILE* file) {
+  fprintf(file, "# --%.*s=[path]\n", (int)flag_name.size, flag_name.data);
+}
+IREE_FLAG_CALLBACK(iree_flags_parse_flagfile, iree_flags_print_flagfile, NULL,
+                   flagfile,
+                   "Parses a newline-separated list of flags from a file.\n"
+                   "Flags are parsed at the point where the flagfile is "
+                   "specified\nand following flags may override the parsed "
+                   "values.");
+
+#endif  // IREE_FLAGS_ENABLE_FLAG_FILE
+
+#endif  // IREE_FLAGS_ENABLE_CLI
diff --git a/runtime/src/iree/base/internal/flags.h b/runtime/src/iree/base/internal/flags.h
new file mode 100644
index 0000000..213c1f3
--- /dev/null
+++ b/runtime/src/iree/base/internal/flags.h
@@ -0,0 +1,297 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_FLAGS_H_
+#define IREE_BASE_INTERNAL_FLAGS_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Flags configuration
+//===----------------------------------------------------------------------===//
+
+// 1 to enable command line parsing from argc/argv; 0 otherwise.
+// When parsing is disabled flags are just variables that can still be queried
+// and manually overridden by code if desired.
+#if !defined(IREE_FLAGS_ENABLE_CLI)
+#define IREE_FLAGS_ENABLE_CLI 1
+#endif  // !IREE_FLAGS_ENABLE_CLI
+
+// 1 to enable --flagfile= support.
+#if !defined(IREE_FLAGS_ENABLE_FLAG_FILE)
+#define IREE_FLAGS_ENABLE_FLAG_FILE 1
+#endif  // !IREE_FLAGS_ENABLE_FLAG_FILE
+
+// Maximum number of flags that can be registered in a single binary.
+#if !defined(IREE_FLAGS_CAPACITY)
+#define IREE_FLAGS_CAPACITY 64
+#endif  // !IREE_FLAGS_CAPACITY
+
+//===----------------------------------------------------------------------===//
+// Static initialization utility
+//===----------------------------------------------------------------------===//
+// This declares a static initialization function with the given name.
+// Usage:
+//   IREE_STATIC_INITIALIZER(initializer_name) {
+//     // Do something here! Note that initialization order is undefined and
+//     // what you do should be tolerant to that.
+//
+//     // If you want a finalizer (you probably don't; they may not get run)
+//     // then you can use atexit:
+//     atexit(some_finalizer_fn);
+//   }
+
+#ifdef __cplusplus
+
+#define IREE_STATIC_INITIALIZER(f) \
+  static void f(void);             \
+  struct f##_t_ {                  \
+    f##_t_(void) { f(); }          \
+  };                               \
+  static f##_t_ f##_;              \
+  static void f(void)
+
+#elif defined(IREE_COMPILER_MSVC)
+
+// `__attribute__((constructor))`-like behavior in MSVC. See:
+// https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-initialization?view=msvc-160
+
+#pragma section(".CRT$XCU", read)
+#define IREE_STATIC_INITIALIZER_IMPL(f, p)                 \
+  static void f(void);                                     \
+  __declspec(allocate(".CRT$XCU")) void (*f##_)(void) = f; \
+  __pragma(comment(linker, "/include:" p #f "_")) static void f(void)
+#ifdef _WIN64
+#define IREE_STATIC_INITIALIZER(f) IREE_STATIC_INITIALIZER_IMPL(f, "")
+#else
+#define IREE_STATIC_INITIALIZER(f) IREE_STATIC_INITIALIZER_IMPL(f, "_")
+#endif  // _WIN64
+
+#else
+
+#define IREE_STATIC_INITIALIZER(f)                  \
+  static void f(void) __attribute__((constructor)); \
+  static void f(void)
+
+#endif  // __cplusplus / MSVC
+
+//===----------------------------------------------------------------------===//
+// Flag definition
+//===----------------------------------------------------------------------===//
+
+enum iree_flag_dump_mode_bits_t {
+  IREE_FLAG_DUMP_MODE_DEFAULT = 0u,
+  IREE_FLAG_DUMP_MODE_VERBOSE = 1u << 0,
+};
+typedef uint32_t iree_flag_dump_mode_t;
+
+#define IREE_FLAG_CTYPE_bool bool
+#define IREE_FLAG_CTYPE_int32_t int32_t
+#define IREE_FLAG_CTYPE_int64_t int64_t
+#define IREE_FLAG_CTYPE_float float
+#define IREE_FLAG_CTYPE_double double
+#define IREE_FLAG_CTYPE_string const char*
+
+#if IREE_FLAGS_ENABLE_CLI == 1
+
+// Types of flags supported by the parser.
+typedef enum iree_flag_type_e {
+  // Empty/unspecified sentinel.
+  IREE_FLAG_TYPE_none = 0,
+  // Custom parsing callback; see IREE_FLAG_CALLBACK.
+  IREE_FLAG_TYPE_callback = 1,
+  // Boolean flag:
+  //  --foo (set true)
+  //  --foo=true | --foo=false
+  IREE_FLAG_TYPE_bool,
+  // 32-bit integer flag:
+  //  --foo=123
+  IREE_FLAG_TYPE_int32_t,
+  // 64-bit integer flag:
+  //  --foo=123
+  IREE_FLAG_TYPE_int64_t,
+  // 32-bit floating-point flag:
+  //  --foo=1.2
+  IREE_FLAG_TYPE_float,
+  // 64-bit floating-point flag:
+  //  --foo=1.2
+  IREE_FLAG_TYPE_double,
+  // String flag:
+  //  --foo=abc
+  //  --foo="a b c"
+  // Holds a reference to constant string data; assigned values must remain
+  // live for as long as the flag value references them.
+  IREE_FLAG_TYPE_string,
+} iree_flag_type_t;
+
+// Custom callback issued for each time the flag is seen during parsing.
+// The |value| provided will already be trimmed and may be empty. For
+// compatibility with non-IREE APIs there will be a NUL terminator immediately
+// following the flag value in memory such that `value.data` can be used as a
+// C-string.
+typedef iree_status_t(IREE_API_PTR* iree_flag_parse_callback_fn_t)(
+    iree_string_view_t flag_name, void* storage, iree_string_view_t value);
+
+// Custom callback issued for each time the flag is to be printed.
+// The callback should print the flag and its value to |file|.
+// Example: `--my_flag=value\n`
+typedef void(IREE_API_PTR* iree_flag_print_callback_fn_t)(
+    iree_string_view_t flag_name, void* storage, FILE* file);
+
+int iree_flag_register(const char* file, int line, iree_flag_type_t type,
+                       void* storage,
+                       iree_flag_parse_callback_fn_t parse_callback,
+                       iree_flag_print_callback_fn_t print_callback,
+                       iree_string_view_t name, iree_string_view_t description);
+
+// Defines a flag with the given |type| and |name|.
+//
+// Conceptually the flag is just a variable and can be loaded/stored:
+//   IREE_FLAG(bool, foo, true, "hello");
+//  =>
+//   static bool FLAG_foo = true;
+//  ...
+//   if (FLAG_foo) do_something();
+//
+// If flag parsing is enabled with IREE_FLAGS_ENABLE_CLI == 1 then the flag
+// value can be specified on the command line with --name:
+//   --foo
+//   --foo=true
+//
+// See iree_flag_type_t for the types supported and how they are parsed.
+#define IREE_FLAG(type, name, default_value, description)                      \
+  static IREE_FLAG_CTYPE_##type FLAG_##name = (default_value);                 \
+  IREE_STATIC_INITIALIZER(iree_flag_register_##name) {                         \
+    iree_flag_register(__FILE__, __LINE__, IREE_FLAG_TYPE_##type,              \
+                       (void**)&(FLAG_##name), /*parse_callback=*/NULL,        \
+                       /*print_callback=*/NULL, iree_make_cstring_view(#name), \
+                       iree_make_cstring_view(description));                   \
+  }
+
+// Defines a flag that issues |parse_callback| for custom parsing.
+//
+// Usage:
+//  iree_status_t parse_callback(iree_string_view_t flag_name, void* storage,
+//                               iree_string_view_t value) {
+//    // Parse |value| and store in |storage|, however you want.
+//    // Returning IREE_STATUS_INVALID_ARGUMENT will trigger --help.
+//    int* storage_ptr = (int*)storage;
+//    printf("hello! %d", (*storage_ptr)++);
+//    return iree_ok_status();
+//  }
+//  void print_callback(iree_string_view_t flag_name, void* storage,
+//                      FILE* file) {
+//    // Print the value in |storage|, however you want. For repeated fields
+//    // you can print multiple separated by newlines.
+//    int* storage_ptr = (int*)storage;
+//    fprintf(file, "--say_hello=%d\n", *storage_ptr);
+//  }
+//  int my_storage = 0;
+//  IREE_FLAG_CALLBACK(parse_callback, print_callback, &my_storage,
+//                     say_hello, "Say hello!");
+#define IREE_FLAG_CALLBACK(parse_callback, print_callback, storage, name, \
+                           description)                                   \
+  IREE_STATIC_INITIALIZER(iree_flag_register_##name) {                    \
+    iree_flag_register(__FILE__, __LINE__, IREE_FLAG_TYPE_callback,       \
+                       (void*)storage, parse_callback, print_callback,    \
+                       iree_make_cstring_view(#name),                     \
+                       iree_make_cstring_view(description));              \
+  }
+
+#else
+
+#define IREE_FLAG(type, name, default_value, description) \
+  static const IREE_FLAG_CTYPE_##type FLAG_##name = (default_value);
+
+#define IREE_FLAG_CALLBACK(parse_callback, print_callback, storage, name, \
+                           description)
+
+#endif  // IREE_FLAGS_ENABLE_CLI
+
+//===----------------------------------------------------------------------===//
+// Flag parsing
+//===----------------------------------------------------------------------===//
+
+// Controls how flag parsing is performed.
+enum iree_flags_parse_mode_bits_t {
+  IREE_FLAGS_PARSE_MODE_DEFAULT = 0,
+  // Do not error out on undefined flags; leave them in the list.
+  // Useful when needing to chain multiple flag parsers together.
+  IREE_FLAGS_PARSE_MODE_UNDEFINED_OK = 1u << 0,
+  // Continues parsing and returns success without exiting when `--help` is
+  // encountered. This allows for IREE flag parsing to happen before another
+  // external library parses its flags. `--help` will remain in the flag set
+  // such that the subsequent parsing can find it.
+  IREE_FLAGS_PARSE_MODE_CONTINUE_AFTER_HELP = 1u << 1,
+};
+typedef uint32_t iree_flags_parse_mode_t;
+
+#if IREE_FLAGS_ENABLE_CLI == 1
+
+// Sets the usage information printed when --help is passed on the command line.
+// Both strings must remain live for the lifetime of the program.
+void iree_flags_set_usage(const char* program_name, const char* usage);
+
+// Parses flags from the given command line arguments.
+// All flag-style arguments ('--foo', '-f', etc) will be consumed and argc/argv
+// will be updated to contain only the program name (index 0) and any remaining
+// positional arguments.
+//
+// Returns OK if all flags were parsed and execution should continue.
+// Returns a failure status if parsing fails (such as an unrecognized flag
+// when IREE_FLAGS_PARSE_MODE_UNDEFINED_OK is not set in |mode|).
+// NOTE: `--help` exits the process by default; set
+// IREE_FLAGS_PARSE_MODE_CONTINUE_AFTER_HELP in |mode| to continue instead.
+//
+// Usage:
+//   extern "C" int main(int argc, char** argv) {
+//     iree_status_t status =
+//         iree_flags_parse(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
+//     if (!iree_status_is_ok(status)) { exit(1); }
+//     consume_positional_args(argc, argv);
+//     return 0;
+//   }
+//
+// Example:
+//   argc = 3, argv = ['program', 'abc', '--flag=2']
+// Results:
+//   argc = 2, argv = ['program', 'abc']
+iree_status_t iree_flags_parse(iree_flags_parse_mode_t mode, int* argc,
+                               char*** argv);
+
+// Parses flags as with iree_flags_parse but will use exit() or abort().
+// WARNING: this is almost always what you want in a command line tool and
+// *never* what you want when embedded in a host process. You don't want to
+// have a flag typo and shut down your entire server/sandbox/Android app/etc.
+void iree_flags_parse_checked(iree_flags_parse_mode_t mode, int* argc,
+                              char*** argv);
+
+// Dumps all flags and their current values to the given |file|.
+void iree_flags_dump(iree_flag_dump_mode_t mode, FILE* file);
+
+#else
+
+// CLI support compiled out: flags remain plain variables and parsing becomes
+// a no-op. These stubs keep call sites building without #ifdefs.
+//
+// NOTE: `static inline` (not bare `inline`) so that a header-only definition
+// never requires an external definition under C99/C11 inline semantics, and
+// `iree_flags_parse` returns iree_status_t to match the declaration used when
+// IREE_FLAGS_ENABLE_CLI is 1 so callers can unconditionally use
+// iree_status_is_ok()/iree_status_ignore().
+static inline void iree_flags_set_usage(const char* program_name,
+                                        const char* usage) {}
+static inline iree_status_t iree_flags_parse(iree_flags_parse_mode_t mode,
+                                             int* argc, char*** argv) {
+  return iree_ok_status();
+}
+static inline void iree_flags_parse_checked(iree_flags_parse_mode_t mode,
+                                            int* argc, char*** argv) {}
+static inline void iree_flags_dump(iree_flag_dump_mode_t mode, FILE* file) {}
+
+#endif  // IREE_FLAGS_ENABLE_CLI
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_INTERNAL_FLAGS_H_
diff --git a/runtime/src/iree/base/internal/flags_demo.c b/runtime/src/iree/base/internal/flags_demo.c
new file mode 100644
index 0000000..82f1213
--- /dev/null
+++ b/runtime/src/iree/base/internal/flags_demo.c
@@ -0,0 +1,62 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/flags.h"
+
+IREE_FLAG(bool, test_bool, false, "A boolean value.");
+IREE_FLAG(int32_t, test_int32, 123, "An int32_t value.");
+IREE_FLAG(int64_t, test_int64, 555, "An int64_t value.");
+IREE_FLAG(float, test_float, 1.0f, "A float value.");
+IREE_FLAG(string, test_string, "some default", "A string\nvalue.");
+
+// Example parse callback: accumulates the integer value of each flag
+// occurrence into |storage| (an int*) and returns an error for the magic
+// value "FORCE_FAILURE" to demonstrate callback-side validation.
+static iree_status_t parse_callback(iree_string_view_t flag_name, void* storage,
+                                    iree_string_view_t value) {
+  int* count_ptr = (int*)storage;
+  // |value| has a trailing NUL per the iree_flag_parse_callback_fn_t contract
+  // so value.data is safe to pass to C string APIs.
+  if (strcmp(value.data, "FORCE_FAILURE") == 0) {
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "callbacks can do verification");
+  }
+  *count_ptr += atoi(value.data);
+  return iree_ok_status();
+}
+// Example print callback: dumps the accumulated count as `--flag=N`.
+static void print_callback(iree_string_view_t flag_name, void* storage,
+                           FILE* file) {
+  int* count_ptr = (int*)storage;
+  fprintf(file, "--%.*s=%d\n", (int)flag_name.size, flag_name.data, *count_ptr);
+}
+// Storage for --test_callback, incremented by parse_callback above.
+static int callback_count = 0;
+IREE_FLAG_CALLBACK(parse_callback, print_callback, &callback_count,
+                   test_callback, "Callback!");
+
+// Demo entry point exercised by flags_test.txt: parses flags, echoes their
+// values and positional arguments, then round-trips a full flag dump.
+int main(int argc, char** argv) {
+  // Parse flags, updating argc/argv with positional arguments.
+  // Uses the checked variant: any parse error prints to stderr and exits.
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
+
+  // Report parsed flag values:
+  printf("FLAG[test_bool] = %s\n", FLAG_test_bool ? "true" : "false");
+  printf("FLAG[test_int32] = %" PRId32 "\n", FLAG_test_int32);
+  printf("FLAG[test_int64] = %" PRId64 "\n", FLAG_test_int64);
+  printf("FLAG[test_float] = %g\n", FLAG_test_float);
+  printf("FLAG[test_string] = %s\n", FLAG_test_string);
+  printf("FLAG[test_callback] = %d\n", callback_count);
+
+  // Report positional arguments (index 0 is the program name):
+  for (int i = 0; i < argc; ++i) {
+    printf("ARG(%d) = %s\n", i, argv[i]);
+  }
+
+  // Dump all flags back out for round-tripping:
+  iree_flags_dump(IREE_FLAG_DUMP_MODE_DEFAULT, stdout);
+
+  return 0;
+}
diff --git a/runtime/src/iree/base/internal/flags_test.txt b/runtime/src/iree/base/internal/flags_test.txt
new file mode 100644
index 0000000..74f14ed
--- /dev/null
+++ b/runtime/src/iree/base/internal/flags_test.txt
@@ -0,0 +1,94 @@
+// RUN: ( flags_demo )  | FileCheck --check-prefix=NO-FLAGS %s
+// NO-FLAGS: FLAG[test_bool] = false
+// NO-FLAGS: FLAG[test_int32] = 123
+// NO-FLAGS: FLAG[test_int64] = 555
+// NO-FLAGS: FLAG[test_float] = 1
+// NO-FLAGS: FLAG[test_string] = some default
+// NO-FLAGS: FLAG[test_callback] = 0
+// NO-FLAGS: ARG(0) ={{.+}}flags_demo
+
+// RUN: ( flags_demo --help )  | FileCheck --check-prefix=FLAGS-HELP %s
+// FLAGS-HELP: # {{.+}} IREE
+// FLAGS-HELP: # Flags in {{.+}}flags.c
+// FLAGS-HELP: # Displays command line usage information.
+// FLAGS-HELP: --help
+// FLAGS-HELP: # Flags in {{.+}}flags_demo.c
+// FLAGS-HELP: # A boolean value.
+// FLAGS-HELP: --test_bool=false
+// FLAGS-HELP: # An int32_t value.
+// FLAGS-HELP: --test_int32=123
+// FLAGS-HELP: # An int64_t value.
+// FLAGS-HELP: --test_int64=555
+// FLAGS-HELP: # A float value.
+// FLAGS-HELP: --test_float=1
+// FLAGS-HELP: # A string
+// FLAGS-HELP: # value.
+// FLAGS-HELP: --test_string="some default"
+// FLAGS-HELP: # Callback!
+// FLAGS-HELP: --test_callback=0
+
+// RUN: ( flags_demo --unknown-flag 2>&1 || [[ $? == 1 ]] ) | FileCheck --check-prefix=UNKNOWN-FLAG %s
+// UNKNOWN-FLAG: INVALID_ARGUMENT; flag 'unknown-flag' not recognized
+
+// RUN: ( flags_demo --test_bool=true ) | FileCheck --check-prefix=FLAG-BOOL-TRUE %s
+// FLAG-BOOL-TRUE: FLAG[test_bool] = true
+// RUN: ( flags_demo --test_bool=1 ) | FileCheck --check-prefix=FLAG-BOOL-1 %s
+// FLAG-BOOL-1: FLAG[test_bool] = true
+// RUN: ( flags_demo --test_bool=true --test_bool=false ) | FileCheck --check-prefix=FLAG-BOOL-OVERRIDE %s
+// FLAG-BOOL-OVERRIDE: FLAG[test_bool] = false
+
+// RUN: ( flags_demo --test_int32=456 ) | FileCheck --check-prefix=FLAG-INT32 %s
+// FLAG-INT32: FLAG[test_int32] = 456
+// RUN: ( flags_demo --test_int32=-2147483648 ) | FileCheck --check-prefix=FLAG-INT32-MIN %s
+// FLAG-INT32-MIN: FLAG[test_int32] = -2147483648
+// RUN: ( flags_demo --test_int32=2147483647 ) | FileCheck --check-prefix=FLAG-INT32-MAX %s
+// FLAG-INT32-MAX: FLAG[test_int32] = 2147483647
+
+// RUN: ( flags_demo --test_int64=902834 ) | FileCheck --check-prefix=FLAG-INT64 %s
+// FLAG-INT64: FLAG[test_int64] = 902834
+// RUN: ( flags_demo --test_int64=-9223372036854775808 ) | FileCheck --check-prefix=FLAG-INT64-MIN %s
+// FLAG-INT64-MIN: FLAG[test_int64] = -9223372036854775808
+// RUN: ( flags_demo --test_int64=9223372036854775807 ) | FileCheck --check-prefix=FLAG-INT64-MAX %s
+// FLAG-INT64-MAX: FLAG[test_int64] = 9223372036854775807
+
+// RUN: ( flags_demo --test_float=1.1234 ) | FileCheck --check-prefix=FLAG-FLOAT %s
+// FLAG-FLOAT: FLAG[test_float] = 1.1234
+
+// RUN: ( flags_demo --test_string= ) | FileCheck --check-prefix=FLAG-STRING-EMPTY %s
+// FLAG-STRING-EMPTY: FLAG[test_string] =
+// RUN: ( flags_demo --test_string=abc ) | FileCheck --check-prefix=FLAG-STRING-ABC %s
+// FLAG-STRING-ABC: FLAG[test_string] = abc
+// RUN: ( flags_demo --test_string="with some space" ) | FileCheck --check-prefix=FLAG-STRING-SPACES %s
+// FLAG-STRING-SPACES: FLAG[test_string] = with some space
+
+// RUN: ( flags_demo --test_callback=1 ) | FileCheck --check-prefix=FLAG-CALLBACK-1 %s
+// FLAG-CALLBACK-1: FLAG[test_callback] = 1
+// RUN: ( flags_demo --test_callback=4 ) | FileCheck --check-prefix=FLAG-CALLBACK-4 %s
+// FLAG-CALLBACK-4: FLAG[test_callback] = 4
+// RUN: ( flags_demo --test_callback=FORCE_FAILURE 2>&1 || [[ $? == 1 ]] ) | FileCheck --check-prefix=FLAG-CALLBACK-ERROR %s
+// FLAG-CALLBACK-ERROR: INTERNAL; callbacks can do verification
+
+// RUN: ( flags_demo arg1 ) | FileCheck --check-prefix=FLAG-POSITIONAL-1 %s
+// FLAG-POSITIONAL-1: ARG(1) = arg1
+// RUN: ( flags_demo arg1 arg2 arg3 ) | FileCheck --check-prefix=FLAG-POSITIONAL-3 %s
+// FLAG-POSITIONAL-3: ARG(1) = arg1
+// FLAG-POSITIONAL-3: ARG(2) = arg2
+// FLAG-POSITIONAL-3: ARG(3) = arg3
+
+// RUN: ( flags_demo --test_bool=true --flagfile=not_found.txt 2>&1 || [[ $? == 1 ]] ) | FileCheck --check-prefix=MISSING-FLAGFILE %s
+// MISSING-FLAGFILE: NOT_FOUND; failed to open file 'not_found.txt'
+
+// RUN: ( flags_demo --test_bool=true --flagfile=%s ) | FileCheck --check-prefix=FLAGFILE %s
+# Comments are ignored.
+// FLAGFILE: FLAG[test_bool] = false
+--test_bool=false
+// FLAGFILE: FLAG[test_int64] = 123111
+// Note that whitespace is ignored in case you are copy/pasting flags around.
+  --test_int64=123111
+// FLAGFILE: FLAG[test_float] = 55.1
+--test_float=55.1
+// FLAGFILE: FLAG[test_string] = override spaces
+--test_string="override spaces"
+
+
+# NOTE: above two lines are to test that vertical whitespace is ok.
diff --git a/runtime/src/iree/base/internal/flatcc/BUILD b/runtime/src/iree/base/internal/flatcc/BUILD
new file mode 100644
index 0000000..c7a93dd
--- /dev/null
+++ b/runtime/src/iree/base/internal/flatcc/BUILD
@@ -0,0 +1,53 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/bazel:iree_flatcc.bzl", "iree_flatbuffer_c_library")
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# Header-only wrapper exporting the flatcc builder APIs in a safe include
+# order; see building.h.
+iree_runtime_cc_library(
+    name = "building",
+    hdrs = ["building.h"],
+    deps = [
+        ":dummy",
+        ":parsing",
+        "@com_github_dvidelabs_flatcc//:runtime",
+    ],
+)
+
+# Header-only wrapper exporting the flatcc JSON parser/printer APIs; see
+# debugging.h.
+iree_runtime_cc_library(
+    name = "debugging",
+    hdrs = ["debugging.h"],
+    deps = [
+        ":dummy",
+        "@com_github_dvidelabs_flatcc//:runtime",
+    ],
+)
+
+# Header-only wrapper exporting the flatcc reader/parsing APIs; see parsing.h.
+iree_runtime_cc_library(
+    name = "parsing",
+    hdrs = ["parsing.h"],
+    deps = [
+        ":dummy",
+        "@com_github_dvidelabs_flatcc//:parsing",
+    ],
+)
+
+# Generated code for dummy.fbs: exists so the wrapper headers above always
+# have generated flatcc headers to include (see the HACK note in dummy.fbs).
+iree_flatbuffer_c_library(
+    name = "dummy",
+    srcs = ["dummy.fbs"],
+    flatcc_args = [
+        "--reader",
+        "--builder",
+        "--verifier",
+        "--json",
+    ],
+)
diff --git a/runtime/src/iree/base/internal/flatcc/CMakeLists.txt b/runtime/src/iree/base/internal/flatcc/CMakeLists.txt
new file mode 100644
index 0000000..92d2ee7
--- /dev/null
+++ b/runtime/src/iree/base/internal/flatcc/CMakeLists.txt
@@ -0,0 +1,60 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/base/internal/flatcc/BUILD                                  #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    building
+  HDRS
+    "building.h"
+  DEPS
+    ::dummy
+    ::parsing
+    flatcc::runtime
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    debugging
+  HDRS
+    "debugging.h"
+  DEPS
+    ::dummy
+    flatcc::runtime
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    parsing
+  HDRS
+    "parsing.h"
+  DEPS
+    ::dummy
+    flatcc::parsing
+  PUBLIC
+)
+
+flatbuffer_c_library(
+  NAME
+    dummy
+  SRCS
+    "dummy.fbs"
+  FLATCC_ARGS
+    "--reader"
+    "--builder"
+    "--verifier"
+    "--json"
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/base/internal/flatcc/building.h b/runtime/src/iree/base/internal/flatcc/building.h
new file mode 100644
index 0000000..14fa965
--- /dev/null
+++ b/runtime/src/iree/base/internal/flatcc/building.h
@@ -0,0 +1,32 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_FLATCC_BUILDING_H_
+#define IREE_BASE_INTERNAL_FLATCC_BUILDING_H_
+
+//===----------------------------------------------------------------------===//
+// flatcc include order fixes
+//===----------------------------------------------------------------------===//
+//
+// This header merely wraps the flatcc headers that are generally useful to
+// include in various places that may not know the specific messages they are
+// working with.
+//
+// If using flatcc prefer to include this file over any hard-to-handle flatcc
+// file such as flatbuffers_common_reader.h or flatbuffers_common_builder.h.
+//
+// NOTE: order matters for these includes so stop clang from messing with it:
+// clang-format off
+
+// parsing.h first: include order matters (see note above).
+#include "iree/base/internal/flatcc/parsing.h"
+
+// Builder-side APIs plus the generated dummy builder.
+#include "flatcc/flatcc_builder.h" // IWYU pragma: export
+#include "flatcc/reflection/flatbuffers_common_builder.h" // IWYU pragma: export
+#include "iree/base/internal/flatcc/dummy_builder.h" // IWYU pragma: export
+
+// clang-format on
+
+#endif  // IREE_BASE_INTERNAL_FLATCC_BUILDING_H_
diff --git a/runtime/src/iree/base/internal/flatcc/debugging.h b/runtime/src/iree/base/internal/flatcc/debugging.h
new file mode 100644
index 0000000..fdbc7e5
--- /dev/null
+++ b/runtime/src/iree/base/internal/flatcc/debugging.h
@@ -0,0 +1,34 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_FLATCC_DEBUGGING_H_
+#define IREE_BASE_INTERNAL_FLATCC_DEBUGGING_H_
+
+//===----------------------------------------------------------------------===//
+// flatcc include order fixes
+//===----------------------------------------------------------------------===//
+//
+// This header merely wraps the flatcc headers that are generally useful to
+// include in various places that may not know the specific messages they are
+// working with.
+//
+// If using flatcc prefer to include this file over any hard-to-handle flatcc
+// file such as flatbuffers_common_reader.h or flatbuffers_common_builder.h.
+//
+// NOTE: order matters for these includes so stop clang from messing with it:
+// clang-format off
+
+// parsing.h first: include order matters (see note above).
+#include "iree/base/internal/flatcc/parsing.h"
+
+// JSON parsing support (flatcc runtime + generated dummy parser).
+#include "flatcc/flatcc_json_parser.h" // IWYU pragma: export
+#include "iree/base/internal/flatcc/dummy_json_parser.h" // IWYU pragma: export
+
+// JSON printing support (flatcc runtime + generated dummy printer).
+#include "flatcc/flatcc_json_printer.h" // IWYU pragma: export
+#include "iree/base/internal/flatcc/dummy_json_printer.h" // IWYU pragma: export
+
+// clang-format on
+
+#endif  // IREE_BASE_INTERNAL_FLATCC_DEBUGGING_H_
diff --git a/runtime/src/iree/base/internal/flatcc/dummy.fbs b/runtime/src/iree/base/internal/flatcc/dummy.fbs
new file mode 100644
index 0000000..626af1e
--- /dev/null
+++ b/runtime/src/iree/base/internal/flatcc/dummy.fbs
@@ -0,0 +1,22 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+namespace iree_flatcc;
+
+// HACK: flatcc public API headers are incomplete and some things only exist
+// when pulled in via generated headers. So here we give ourselves something to
+// include that's always available and cheap.
+//
+// Instead of directly including this file use iree/base/internal/flatcc/*.h.
+//
+// Normally including any generated file will include the appropriate headers in
+// the required order (as they are non-hermetic), but that requires that we have
+// a generated file. Though most of the API is exposed through the main includes
+// there are various types that only get generated and included by way of the
+// common headers that are not easily included.
// Minimal placeholder type (see HACK comment above): exists only so flatcc
// emits generated headers with the full set of common includes; never read
// or written at runtime.
struct __IncludeWorkaround {
  reserved:int;
}
diff --git a/runtime/src/iree/base/internal/flatcc/parsing.h b/runtime/src/iree/base/internal/flatcc/parsing.h
new file mode 100644
index 0000000..4e1c675
--- /dev/null
+++ b/runtime/src/iree/base/internal/flatcc/parsing.h
@@ -0,0 +1,32 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_FLATCC_PARSING_H_
+#define IREE_BASE_INTERNAL_FLATCC_PARSING_H_
+
+//===----------------------------------------------------------------------===//
+// flatcc include order fixes
+//===----------------------------------------------------------------------===//
+//
+// This header merely wraps the flatcc headers that are generally useful to
+// include in various places that may not know the specific messages they are
+// working with.
+//
+// If using flatcc prefer to include this file over any hard-to-handle flatcc
+// file such as flatbuffers_common_reader.h or flatbuffers_common_builder.h.
+//
+// NOTE: order matters for these includes so stop clang from messing with it:
+// clang-format off
+
+#include "flatcc/reflection/flatbuffers_common_reader.h"  // IWYU pragma: export
+#include "iree/base/internal/flatcc/dummy_reader.h" // IWYU pragma: export
+
+#include "flatcc/flatcc_verifier.h" // IWYU pragma: export
+#include "iree/base/internal/flatcc/dummy_verifier.h" // IWYU pragma: export
+
+// clang-format on
+
+#endif  // IREE_BASE_INTERNAL_FLATCC_PARSING_H_
diff --git a/runtime/src/iree/base/internal/fpu_state.c b/runtime/src/iree/base/internal/fpu_state.c
new file mode 100644
index 0000000..f44af3b
--- /dev/null
+++ b/runtime/src/iree/base/internal/fpu_state.c
@@ -0,0 +1,108 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/fpu_state.h"
+
+#include <stdbool.h>
+
+#include "iree/base/target_platform.h"
+
+#if defined(IREE_ARCH_X86_32) || defined(IREE_ARCH_X86_64)
+#include <xmmintrin.h>
+#endif  // IREE_ARCH_X86_*
+
+#if defined(IREE_COMPILER_MSVC)
+#include <intrin.h>
+#endif  // IREE_COMPILER_MSVC
+
+//==============================================================================
+// iree_fpu_state_t
+//==============================================================================
+// https://github.com/petewarden/tensorflow_makefile/blob/master/tensorflow/core/platform/denormal.cc
+// https://chromium.googlesource.com/chromium/blink/+/master/Source/platform/audio/DenormalDisabler.h
+
// Returns |state| with the architecture-specific "denormals are zero /
// flush to zero" control bits set to |denormals_to_zero|; all other bits of
// |state| are preserved.
static uint64_t iree_fpu_state_set_dtz(uint64_t state, bool denormals_to_zero);

#if defined(IREE_ARCH_ARM_32)
static uint64_t iree_fpu_state_set_dtz(uint64_t state, bool denormals_to_zero) {
  // 0x1000000: presumably the FPSCR FZ (flush-to-zero) bit — confirm against
  // the Arm architecture reference manual.
  return (state & ~0x1000000) | (denormals_to_zero ? 0x1000000 : 0);
}
#elif defined(IREE_ARCH_ARM_64)
static uint64_t iree_fpu_state_set_dtz(uint64_t state, bool denormals_to_zero) {
  // 0x1080000: presumably the FPCR FZ plus half-precision FZ16 bits — confirm
  // against the Arm architecture reference manual.
  return (state & ~0x1080000) | (denormals_to_zero ? 0x1080000 : 0);
}
#elif defined(IREE_ARCH_X86_32) || defined(IREE_ARCH_X86_64)
static uint64_t iree_fpu_state_set_dtz(uint64_t state, bool denormals_to_zero) {
  // 0x8040: MXCSR FTZ (bit 15) | DAZ (bit 6) — flush both denormal results
  // and denormal inputs.
  return (state & ~0x8040) | (denormals_to_zero ? 0x8040 : 0);
}
#else
// Unknown architecture: leave the state untouched; denormal behavior is
// whatever the platform default happens to be.
static uint64_t iree_fpu_state_set_dtz(uint64_t state, bool denormals_to_zero) {
  return state;
}
#endif  // IREE_ARCH_*
+
// Loads/stores the architecture-specific floating-point control/status
// register (FPSCR on ARM32, FPCR on ARM64, MXCSR on x86) as a generic 64-bit
// value. The unknown-architecture fallback is a no-op reporting state 0.
static uint64_t iree_fpu_load_state(void);
static void iree_fpu_store_state(uint64_t state);

#if defined(IREE_ARCH_ARM_32) && defined(IREE_COMPILER_MSVC)
// MSVC has no ARM32 inline assembly; FPSCR is reached through the coprocessor
// access intrinsics (coprocessor 10).
static uint64_t iree_fpu_load_state(void) {
  return (uint64_t)_MoveFromCoprocessor(10, 7, 1, 0, 0);
}
static void iree_fpu_store_state(uint64_t state) {
  _MoveToCoprocessor((int)state, 10, 7, 1, 0, 0);
}
#elif defined(IREE_ARCH_ARM_32)
// FIX: was declared `iree_fpu_load_state()`; in C an empty parameter list is
// an old-style unprototyped declaration. Use (void) to match the prototype
// above and every other variant in this file.
static uint64_t iree_fpu_load_state(void) {
  uint32_t fpscr;
  __asm__ __volatile__("VMRS %[fpscr], fpscr" : [ fpscr ] "=r"(fpscr));
  return (uint64_t)fpscr;
}
static void iree_fpu_store_state(uint64_t state) {
  __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [ fpscr ] "r"(state));
}
#elif defined(IREE_ARCH_ARM_64) && defined(IREE_COMPILER_MSVC)
// 0x5A20: presumably the MSVC system-register encoding for FPCR used with
// _ReadStatusReg/_WriteStatusReg — confirm against MSVC ARM64 intrinsics docs.
static uint64_t iree_fpu_load_state(void) {
  return (uint64_t)_ReadStatusReg(0x5A20);
}
static void iree_fpu_store_state(uint64_t state) {
  _WriteStatusReg(0x5A20, (__int64)state);
}
#elif defined(IREE_ARCH_ARM_64)
static uint64_t iree_fpu_load_state(void) {
  uint64_t fpcr;
  __asm__ __volatile__("MRS %[fpcr], fpcr" : [ fpcr ] "=r"(fpcr));
  return fpcr;
}
static void iree_fpu_store_state(uint64_t state) {
  __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [ fpcr ] "r"(state));
}
#elif defined(IREE_ARCH_X86_32) || defined(IREE_ARCH_X86_64)
// MXCSR via the SSE intrinsics from xmmintrin.h (included at file top).
static uint64_t iree_fpu_load_state(void) { return (uint64_t)_mm_getcsr(); }
static void iree_fpu_store_state(uint64_t state) {
  _mm_setcsr((unsigned int)state);
}
#else
// Unknown architecture: FPU state manipulation is a no-op.
static uint64_t iree_fpu_load_state(void) { return 0; }
static void iree_fpu_store_state(uint64_t state) {}
#endif  // IREE_ARCH_*
+
+iree_fpu_state_t iree_fpu_state_push(iree_fpu_state_flags_t flags) {
+  iree_fpu_state_t state;
+  state.current_value = state.previous_value = iree_fpu_load_state();
+  state.current_value = iree_fpu_state_set_dtz(
+      state.current_value,
+      (flags & IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO) ? true : false);
+  if (state.previous_value != state.current_value) {
+    iree_fpu_store_state(state.current_value);
+  }
+  return state;
+}
+
+void iree_fpu_state_pop(iree_fpu_state_t state) {
+  if (state.previous_value != state.current_value) {
+    iree_fpu_store_state(state.previous_value);
+  }
+}
diff --git a/runtime/src/iree/base/internal/fpu_state.h b/runtime/src/iree/base/internal/fpu_state.h
new file mode 100644
index 0000000..fc9a36c
--- /dev/null
+++ b/runtime/src/iree/base/internal/fpu_state.h
@@ -0,0 +1,59 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_FPU_STATE_H_
+#define IREE_BASE_INTERNAL_FPU_STATE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//==============================================================================
+// iree_fpu_state_*
+//==============================================================================
+
+// Flags controlling FPU features.
+enum iree_fpu_state_flag_bits_t {
+  // Platform default.
+  IREE_FPU_STATE_DEFAULT = 0,
+
+  // Denormals can cause some serious slowdowns in certain ISAs where they may
+  // be implemented in microcode. Flushing them to zero instead of letting them
+  // propagate ensures that the slow paths aren't hit. This is a fast-math style
+  // optimization (and is often part of all compiler's fast-math set of flags).
+  //
+  // https://en.wikipedia.org/wiki/Denormal_number
+  // https://carlh.net/plugins/denormals.php
+  // https://www.xspdf.com/resolution/50507310.html
+  IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO = 1 << 0,
+};
+typedef uint32_t iree_fpu_state_flags_t;
+
+// Opaque FPU state vector manipulated with iree_fpu_* functions.
+typedef struct iree_fpu_state_t {
+  uint64_t previous_value;
+  uint64_t current_value;
+} iree_fpu_state_t;
+
+// Pushes a new floating-point unit (FPU) state for the current thread.
+// May lead to a pipeline flush; avoid if possible.
+iree_fpu_state_t iree_fpu_state_push(iree_fpu_state_flags_t flags);
+
+// Restores the FPU state of the thread to its original value.
+// May lead to a pipeline flush; avoid if possible.
+void iree_fpu_state_pop(iree_fpu_state_t state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_FPU_STATE_H_
diff --git a/runtime/src/iree/base/internal/fpu_state_benchmark.cc b/runtime/src/iree/base/internal/fpu_state_benchmark.cc
new file mode 100644
index 0000000..ff8ffa7
--- /dev/null
+++ b/runtime/src/iree/base/internal/fpu_state_benchmark.cc
@@ -0,0 +1,124 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cstddef>
+
+#include "benchmark/benchmark.h"
+#include "iree/base/api.h"
+#include "iree/base/internal/fpu_state.h"
+
+namespace {
+
+constexpr size_t kElementBufferSize = 2048;
+
// Scales a buffer of floats by |scale| and disables autovectorization.
// Will generally be normal scalar floating point math and indicate whether the
// FPU has issues with denormals.
// Returns the sum of the scaled elements so the work remains observable.
static float UnvectorizedScaleBufferByValue(float scale) {
  float buffer[kElementBufferSize];
  for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
    buffer[i] = 1.0f;
  }
  benchmark::DoNotOptimize(*buffer);
  for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
    buffer[i] *= scale;
    // The per-element DoNotOptimize here is what defeats autovectorization of
    // this scaling loop; do not hoist it.
    benchmark::DoNotOptimize(buffer[i]);
  }
  benchmark::DoNotOptimize(*buffer);
  float sum = 0.0f;
  for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
    sum += buffer[i];
  }
  return sum;
}
+
// Scales a buffer of floats by |scale| and allows autovectorization.
// Will generally be SIMD floating point math and indicate whether the vector
// units (NEON, AVX, etc) have issues with denormals.
// Returns the sum of the scaled elements so the work remains observable.
static float VectorizedScaleBufferByValue(float scale) {
  float buffer[kElementBufferSize];
  for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
    buffer[i] = 1.0f;
  }
  benchmark::DoNotOptimize(*buffer);
  // Unlike the unvectorized variant there is no per-element DoNotOptimize in
  // this loop, leaving the compiler free to vectorize it.
  for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
    buffer[i] *= scale;
  }
  benchmark::DoNotOptimize(*buffer);
  float sum = 0.0f;
  for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
    sum += buffer[i];
  }
  return sum;
}
+
// Benchmark matrix: {unvectorized, vectorized} scaling ×
// {normal values, denormals, denormals with flush-to-zero pushed, denormals
// with the default FPU state pushed}. 1e-39f is used as a denormal input.
void BM_UnvectorizedNormals(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(UnvectorizedScaleBufferByValue(1.0f));
  }
}
BENCHMARK(BM_UnvectorizedNormals);

void BM_UnvectorizedDenormals(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(UnvectorizedScaleBufferByValue(1e-39f));
  }
}
BENCHMARK(BM_UnvectorizedDenormals);

void BM_UnvectorizedDenormalsFlushedToZero(benchmark::State& state) {
  iree_fpu_state_t fpu_state =
      iree_fpu_state_push(IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO);
  for (auto _ : state) {
    benchmark::DoNotOptimize(UnvectorizedScaleBufferByValue(1e-39f));
  }
  iree_fpu_state_pop(fpu_state);
}
BENCHMARK(BM_UnvectorizedDenormalsFlushedToZero);

void BM_UnvectorizedDenormalsNotFlushedToZero(benchmark::State& state) {
  iree_fpu_state_t fpu_state = iree_fpu_state_push(IREE_FPU_STATE_DEFAULT);
  for (auto _ : state) {
    benchmark::DoNotOptimize(UnvectorizedScaleBufferByValue(1e-39f));
  }
  iree_fpu_state_pop(fpu_state);
}
BENCHMARK(BM_UnvectorizedDenormalsNotFlushedToZero);

void BM_VectorizedNormals(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(VectorizedScaleBufferByValue(1.0f));
  }
}
BENCHMARK(BM_VectorizedNormals);

void BM_VectorizedDenormals(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(VectorizedScaleBufferByValue(1e-39f));
  }
}
BENCHMARK(BM_VectorizedDenormals);

void BM_VectorizedDenormalsFlushedToZero(benchmark::State& state) {
  iree_fpu_state_t fpu_state =
      iree_fpu_state_push(IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO);
  for (auto _ : state) {
    benchmark::DoNotOptimize(VectorizedScaleBufferByValue(1e-39f));
  }
  iree_fpu_state_pop(fpu_state);
}
BENCHMARK(BM_VectorizedDenormalsFlushedToZero);

void BM_VectorizedDenormalsNotFlushedToZero(benchmark::State& state) {
  iree_fpu_state_t fpu_state = iree_fpu_state_push(IREE_FPU_STATE_DEFAULT);
  for (auto _ : state) {
    benchmark::DoNotOptimize(VectorizedScaleBufferByValue(1e-39f));
  }
  iree_fpu_state_pop(fpu_state);
}
BENCHMARK(BM_VectorizedDenormalsNotFlushedToZero);
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/fpu_state_test.cc b/runtime/src/iree/base/internal/fpu_state_test.cc
new file mode 100644
index 0000000..74bc0eb
--- /dev/null
+++ b/runtime/src/iree/base/internal/fpu_state_test.cc
@@ -0,0 +1,28 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/fpu_state.h"
+
+#include "iree/testing/gtest.h"
+
+namespace {
+
// NOTE: depending on compiler options or architecture denormals may always be
// flushed to zero. Here we just test that they are flushed when we request them
// to be.
TEST(FPUStateTest, FlushDenormalsToZero) {
  iree_fpu_state_t fpu_state =
      iree_fpu_state_push(IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO);

  // 1.0f * 1e-39f is below FLT_MIN (~1.18e-38f) and thus denormal; with
  // flush-to-zero enabled the hardware should produce exactly 0.0f.
  // The volatile pointer keeps the compiler from constant-folding the multiply.
  float f = 1.0f;
  volatile float* fp = &f;
  *fp = *fp * 1e-39f;
  EXPECT_EQ(0.0f, f);

  iree_fpu_state_pop(fpu_state);
}
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/inline_array.h b/runtime/src/iree/base/internal/inline_array.h
new file mode 100644
index 0000000..ccbaf14
--- /dev/null
+++ b/runtime/src/iree/base/internal/inline_array.h
@@ -0,0 +1,59 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_INLINE_ARRAY_H_
+#define IREE_BASE_INTERNAL_INLINE_ARRAY_H_
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//==============================================================================
+// iree_inline_array_t
+//==============================================================================
+
+// Maximum number of bytes that can be allocated from the stack.
+// Arrays exceeding this size will incur a heap allocation.
+#define IREE_INLINE_ARRAY_MAX_STACK_ALLOCATION 512
+
// Declares and initializes a local array named |variable| holding
// |initial_size| elements of |type|. Storage comes from the stack (via
// iree_alloca) when the total byte size is at or under
// IREE_INLINE_ARRAY_MAX_STACK_ALLOCATION and otherwise from |allocator|;
// every use must be paired with iree_inline_array_deinitialize to release
// heap-backed storage.
// NOTE: |initial_size| is evaluated multiple times — avoid side-effecting
// expressions.
#define iree_inline_array(type, variable, initial_size, allocator)       \
  const iree_allocator_t variable##_allocator = (allocator);             \
  struct {                                                               \
    iree_host_size_t size;                                               \
    type* data;                                                          \
  } variable = {                                                         \
      (initial_size),                                                    \
      NULL,                                                              \
  };                                                                     \
  if (IREE_UNLIKELY(sizeof(type) * (initial_size) >                      \
                    IREE_INLINE_ARRAY_MAX_STACK_ALLOCATION)) {           \
    IREE_CHECK_OK(iree_allocator_malloc(variable##_allocator,            \
                                        sizeof(type) * (initial_size),   \
                                        (void**)&(variable).data));      \
  } else {                                                               \
    (variable).data = (type*)iree_alloca(sizeof(type) * (initial_size)); \
  }
+
// Releases storage acquired by iree_inline_array. Recomputes the same
// stack-vs-heap decision as the initializer (size is never changed after
// initialization), so only heap-backed arrays call the allocator.
#define iree_inline_array_deinitialize(variable)                 \
  if (IREE_UNLIKELY(sizeof(*(variable).data) * (variable).size > \
                    IREE_INLINE_ARRAY_MAX_STACK_ALLOCATION)) {   \
    iree_allocator_free(variable##_allocator, (variable).data);  \
  }
+
// Returns the number of elements in the array.
#define iree_inline_array_size(variable) (variable).size

// Returns the element capacity of the array.
// FIX: the struct declared by iree_inline_array has only |size| and |data|
// fields — the previous definition referenced a nonexistent `.capacity`
// member and failed to compile when used. Arrays are allocated at exactly
// their initial size, so capacity is always equal to size.
#define iree_inline_array_capacity(variable) (variable).size
// Returns a pointer to the (possibly stack-allocated) array storage.
#define iree_inline_array_data(variable) (variable).data

// Returns a pointer to the element at |index|; no bounds checking.
#define iree_inline_array_at(variable, index) (&(variable).data[(index)])
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_INLINE_ARRAY_H_
diff --git a/runtime/src/iree/base/internal/main.h b/runtime/src/iree/base/internal/main.h
new file mode 100644
index 0000000..1321832
--- /dev/null
+++ b/runtime/src/iree/base/internal/main.h
@@ -0,0 +1,20 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_MAIN_H_
+#define IREE_BASE_INTERNAL_MAIN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+int iree_main(int argc, char** argv);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_INTERNAL_MAIN_H_
diff --git a/runtime/src/iree/base/internal/main_posix.c b/runtime/src/iree/base/internal/main_posix.c
new file mode 100644
index 0000000..cf884a3
--- /dev/null
+++ b/runtime/src/iree/base/internal/main_posix.c
@@ -0,0 +1,15 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/main.h"
+#include "iree/base/target_platform.h"
+
#if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_APPLE) || \
    defined(IREE_PLATFORM_LINUX)

// POSIX-style platforms: the standard main() simply forwards to the
// platform-independent iree_main() entry point declared in main.h.
int main(int argc, char** argv) { return iree_main(argc, argv); }

#endif  // IREE_PLATFORM_*
diff --git a/runtime/src/iree/base/internal/main_win32.c b/runtime/src/iree/base/internal/main_win32.c
new file mode 100644
index 0000000..119ed19
--- /dev/null
+++ b/runtime/src/iree/base/internal/main_win32.c
@@ -0,0 +1,35 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdlib.h>
+
+#include "iree/base/internal/main.h"
+#include "iree/base/target_platform.h"
+
#if defined(IREE_PLATFORM_WINDOWS)

#include <combaseapi.h>  // CoInitializeEx

// Entry point when using /SUBSYSTEM:CONSOLE is the standard main().
int main(int argc, char** argv) { return iree_main(argc, argv); }

// Entry point when using /SUBSYSTEM:WINDOWS.
// https://docs.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-winmain
int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance,
                   LPSTR lpCmdLine, int nShowCmd) {
  // Setup COM on the main thread.
  // NOTE: this may fail if COM has already been initialized - that's OK.
  // NOTE(review): there is no matching CoUninitialize; presumably acceptable
  // for process-lifetime initialization — confirm intent.
  CoInitializeEx(NULL, COINIT_MULTITHREADED);

  // Run standard main function.
  // We use the MSVCRT __argc/__argv to get access to the standard argc/argv
  // vs. using the flattened string passed to WinMain (that would require
  // complex unicode splitting/etc).
  // https://docs.microsoft.com/en-us/cpp/c-runtime-library/argc-argv-wargv
  return iree_main(__argc, __argv);
}

#endif  // IREE_PLATFORM_WINDOWS
diff --git a/runtime/src/iree/base/internal/math.h b/runtime/src/iree/base/internal/math.h
new file mode 100644
index 0000000..af767a7
--- /dev/null
+++ b/runtime/src/iree/base/internal/math.h
@@ -0,0 +1,310 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_MATH_H_
+#define IREE_BASE_INTERNAL_MATH_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/alignment.h"
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+
+// Haswell or later, gcc compile time option: -mlzcnt
+#if defined(__LZCNT__)
+#include <x86intrin.h>
+#endif
+
+// Clang on Windows has __builtin_clzll; otherwise we need to use the
+// windows intrinsic functions.
+#if defined(IREE_COMPILER_MSVC)
+#include <intrin.h>
+#if defined(IREE_ARCH_ARM_64) || defined(IREE_ARCH_X86_64)
+#pragma intrinsic(_BitScanReverse64)
+#pragma intrinsic(_BitScanForward64)
+#endif
+#pragma intrinsic(_BitScanReverse)
+#pragma intrinsic(_BitScanForward)
+#endif  // IREE_COMPILER_MSVC
+
// Logical right shift that is well-defined for all shift amounts: yields 0
// when |shamt| >= the bit width of |value|, where a raw >> would be undefined
// behavior in C. NOTE: arguments are evaluated multiple times.
#define iree_shr(value, shamt) \
  (((shamt) < sizeof(value) * 8) ? ((value) >> (shamt)) : 0)
+
+//==============================================================================
+// Bitwise rotation (aka circular shifts)
+//==============================================================================
+
// Unsigned rotate-left a 64-bit integer.
// https://en.cppreference.com/w/cpp/numeric/rotl
//
//
// NOTE: this exact form is confirmed to be recognized by the compilers we care
// about; do not modify: https://godbolt.org/z/xzof9d
static inline uint64_t iree_math_rotl_u64(const uint64_t n, uint32_t c) {
  const uint32_t mask = 8 * sizeof(n) - 1;
  c &= mask;
  if (!c) return n;
  // c is in [1, 63] here, so (64 - c) is also in [1, 63] and the right shift
  // below is well-defined.
  return (n << c) | (n >> (64 - c));
}

// Unsigned rotate-right a 64-bit integer.
// https://en.cppreference.com/w/cpp/numeric/rotr
//
// NOTE: this exact form is confirmed to be recognized by the compilers we care
// about **except MSVC**; do not modify: https://godbolt.org/z/xzof9d
static inline uint64_t iree_math_rotr_u64(const uint64_t n, uint32_t c) {
  const uint32_t mask = 8 * sizeof(n) - 1;
  c &= mask;
  if (!c) return n;
  return (n >> c) | (n << ((-c) & mask));
}
+
+//==============================================================================
+// Bit scanning/counting
+//==============================================================================
+
// Returns the number of leading zero bits in |n|; 32 when n == 0.
static inline int iree_math_count_leading_zeros_u32(const uint32_t n) {
#if defined(IREE_COMPILER_MSVC)
  unsigned long result = 0;  // NOLINT(runtime/int)
  if (_BitScanReverse(&result, n)) {
    return (int)(31 - result);
  }
  return 32;  // n == 0: no set bit found.
#elif defined(IREE_COMPILER_GCC_COMPAT)
// NOTE(review): `__LCZNT__` appears to be a typo for `__LZCNT__` (the macro
// tested at the top of this file), which would make this branch dead code.
// Confirm intent before changing — enabling it alters codegen on
// LZCNT-capable targets.
#if defined(__LCZNT__)
  // NOTE: LZCNT is a risky instruction; it is not supported on architectures
  // before Haswell, yet it is encoded as 'rep bsr', which typically ignores
  // invalid rep prefixes, and interprets it as the 'bsr' instruction, which
  // returns the index of the value rather than the count, resulting in
  // incorrect code.
  return (int)__lzcnt32(n);
#endif  // defined(__LCZNT__)

  // Handle 0 as a special case because __builtin_clz(0) is undefined.
  if (n == 0) return 32;
  // Use __builtin_clz, which uses the following instructions:
  //  x86: bsr
  //  ARM64: clz
  //  PPC: cntlzd
  return (int)__builtin_clz(n);
#else
#error No clz for this arch.
#endif  // IREE_COMPILER_MSVC / IREE_COMPILER_GCC_COMPAT
}
+
// Returns the number of leading zero bits in |n|; 64 when n == 0.
static inline int iree_math_count_leading_zeros_u64(uint64_t n) {
#if defined(IREE_COMPILER_MSVC) && \
    (defined(IREE_ARCH_ARM_64) || defined(IREE_ARCH_X86_64))
  // MSVC does not have __builtin_clzll. Use _BitScanReverse64.
  unsigned long result = 0;  // NOLINT(runtime/int)
  if (_BitScanReverse64(&result, n)) {
    return (int)(63 - result);
  }
  return 64;
#elif defined(IREE_COMPILER_MSVC)
  // MSVC does not have __builtin_clzll. Compose two calls to _BitScanReverse:
  // scan the high 32 bits first, then fall back to the low 32 bits.
  unsigned long result = 0;  // NOLINT(runtime/int)
  if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
    return (int)(31 - result);
  }
  if (_BitScanReverse(&result, n)) {
    return (int)(63 - result);
  }
  return 64;
#elif defined(IREE_COMPILER_GCC_COMPAT)
// NOTE(review): `__LCZNT__` appears to be a typo for `__LZCNT__` (the macro
// tested at the top of this file), which would make this branch dead code —
// confirm intent before changing.
#if defined(__LCZNT__)
  // NOTE: LZCNT is a risky instruction; it is not supported on architectures
  // before Haswell, yet it is encoded as 'rep bsr', which typically ignores
  // invalid rep prefixes, and interprets it as the 'bsr' instruction, which
  // returns the index of the value rather than the count, resulting in
  // incorrect code.
  return __lzcnt64(n);
#elif defined(__aarch64__) || defined(__powerpc64__)
  // Empirically verified that __builtin_clzll(0) works as expected.
  return (int)__builtin_clzll(n);
#endif
  // Handle 0 as a special case because __builtin_clzll(0) is undefined.
  if (!n) return 64;
  // Use __builtin_clzll, which uses the following instructions:
  //    x86: bsr
  //    PPC: cntlzd
  //   WASM: i32.clz
  // RISC-V: __clzsi2 in GCC, splat out in clang
  return (int)__builtin_clzll(n);
#else
#error No clz for this arch.
#endif  // IREE_COMPILER_MSVC / IREE_COMPILER_GCC_COMPAT
}
+
// Returns the number of trailing zero bits in |n|.
// NOTE: results for n == 0 are path-dependent: _BitScanForward leaves |result|
// unspecified and __builtin_ctz(0) is undefined, while the portable fallback
// returns 31 — callers should only pass nonzero values.
static inline int iree_math_count_trailing_zeros_u32(uint32_t n) {
#if defined(IREE_COMPILER_MSVC)
  unsigned long result = 0;  // NOLINT(runtime/int)
  _BitScanForward(&result, n);
  return (int)result;
#elif defined(IREE_COMPILER_GCC_COMPAT)
  return (int)__builtin_ctz(n);
#else
  // Portable branch-reduced fallback: isolate the lowest set bit, then
  // classify its position with mask tests.
  int c = 31;
  n &= ~n + 1;
  if (n & 0x0000FFFFu) c -= 16;
  if (n & 0x00FF00FFu) c -= 8;
  if (n & 0x0F0F0F0Fu) c -= 4;
  if (n & 0x33333333u) c -= 2;
  if (n & 0x55555555u) c -= 1;
  return c;
#endif  // IREE_COMPILER_MSVC / IREE_COMPILER_GCC_COMPAT
}
+
// Returns the number of trailing zero bits in |n|.
// NOTE: as with the u32 variant, results for n == 0 are path-dependent;
// callers should only pass nonzero values.
static inline int iree_math_count_trailing_zeros_u64(uint64_t n) {
#if defined(IREE_COMPILER_MSVC) && defined(IREE_PTR_SIZE_64)
  unsigned long result = 0;  // NOLINT(runtime/int)
  _BitScanForward64(&result, n);
  return (int)result;
#elif defined(IREE_COMPILER_MSVC) && defined(IREE_PTR_SIZE_32)
  // 32-bit MSVC: compose two 32-bit scans, low word first.
  unsigned long result = 0;  // NOLINT(runtime/int)
  if ((uint32_t)(n) == 0) {
    _BitScanForward(&result, n >> 32);
    return result + 32;
  }
  _BitScanForward(&result, n);
  return (int)result;
#elif defined(IREE_COMPILER_GCC_COMPAT)
  // Use __builtin_ctzll (the previous comment here described __builtin_clzll;
  // this is the trailing-zero count).
  return __builtin_ctzll(n);
#else
  // Portable branch-reduced fallback: isolate the lowest set bit, then
  // classify its position with mask tests.
  int c = 63;
  n &= ~n + 1;
  if (n & 0x00000000FFFFFFFFull) c -= 32;
  if (n & 0x0000FFFF0000FFFFull) c -= 16;
  if (n & 0x00FF00FF00FF00FFull) c -= 8;
  if (n & 0x0F0F0F0F0F0F0F0Full) c -= 4;
  if (n & 0x3333333333333333ull) c -= 2;
  if (n & 0x5555555555555555ull) c -= 1;
  return c;
#endif  // IREE_COMPILER_MSVC / IREE_COMPILER_GCC_COMPAT
}
+
+//==============================================================================
+// Population count
+//==============================================================================
+
// Returns the number of 1 bits in a 32 bit value.
static inline int iree_math_count_ones_u32(uint32_t n) {
  // Parallel (SWAR) bit count: reduce to per-2-bit counts, then per-4-bit
  // counts, then sum all byte counts at once via a multiply.
  const uint32_t pair_counts = n - ((n >> 1) & 0x55555555u);
  const uint32_t nibble_counts =
      (pair_counts & 0x33333333u) + ((pair_counts >> 2) & 0x33333333u);
  const uint32_t byte_counts =
      (nibble_counts + (nibble_counts >> 4)) & 0x0F0F0F0Fu;
  return (int)((byte_counts * 0x01010101u) >> 24);
}
+
// Returns the number of 1 bits in a 64 bit value.
static inline int iree_math_count_ones_u64(uint64_t n) {
  // 64-bit SWAR popcount: same reduction as the u32 variant but performed in
  // a single 64-bit pass rather than two 32-bit halves.
  n -= (n >> 1) & 0x5555555555555555ull;
  n = ((n >> 2) & 0x3333333333333333ull) + (n & 0x3333333333333333ull);
  n = (n + (n >> 4)) & 0x0F0F0F0F0F0F0F0Full;
  return (int)((n * 0x0101010101010101ull) >> 56);
}
+
+//==============================================================================
+// Rounding and alignment
+//==============================================================================
+// There are certain platforms - mostly those with poorer quality compilers or
+// more restricted instruction sets - where we want to avoid the clz path as
+// it is emulated and instead we use some bit-twiddling hacks. On other
+// platforms it's the opposite - they may emulate clz but doing so saves
+// dozens of bytes that otherwise would have been the shift/or tree.
+//
+// Which to choose is entirely determined by fiddling on godbolt for the
+// target platform: https://godbolt.org/z/h4vPzo
+
// Rounds up the value to the nearest power of 2 (if not already a power of 2).
// For 32-bit numbers this only supports values <= 2^31; higher will wrap
// (an input of 0 also yields 0).
//
// Implementation: smear the highest set bit of (n - 1) down into every lower
// bit position, then add 1.
// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
static inline uint32_t iree_math_round_up_to_pow2_u32(uint32_t n) {
  uint32_t smeared = n - 1;
  for (uint32_t shift = 1; shift < 32; shift <<= 1) {
    smeared |= smeared >> shift;
  }
  return smeared + 1;
}
+
// Rounds up the value to the nearest power of 2 (if not already a power of 2).
// For 64-bit numbers this only supports values <= 2^63; higher will wrap
// (an input of 0 also yields 0).
//
// Implementation: smear the highest set bit of (n - 1) down into every lower
// bit position, then add 1.
// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
static inline uint64_t iree_math_round_up_to_pow2_u64(uint64_t n) {
  uint64_t smeared = n - 1;
  for (uint32_t shift = 1; shift < 64; shift <<= 1) {
    smeared |= smeared >> shift;
  }
  return smeared + 1;
}
+
+//==============================================================================
+// FP16 support
+//==============================================================================
+
// Converts a 16-bit floating-point value to a 32-bit C `float`.
//
// NOTE: this implementation does not handle corner cases around NaN and such;
// we can improve this implementation over time if it is used for such cases.
// Denormal/zero f16 inputs (biased exponent of 0) map to a (signed) 0.0f.
static inline float iree_math_f16_to_f32(const uint16_t f16_value) {
  const uint32_t biased_exp16 = (f16_value >> 10) & 0x1Fu;
  // Start from just the sign bit relocated to bit 31.
  uint32_t f32_bits = ((uint32_t)(f16_value >> 15)) << 31;
  if (biased_exp16 != 0) {
    // Rebias the exponent from f16 (bias 15) to f32 (bias 127) and widen the
    // 10-bit mantissa to 23 bits.
    f32_bits |= (biased_exp16 + (127 - 15)) << 23;
    f32_bits |= ((uint32_t)(f16_value & 0x3FFu)) << (23 - 10);
  }
  float f32_value;
  memcpy(&f32_value, &f32_bits, sizeof(f32_value));
  return f32_value;
}
+
// Converts a 32-bit C `float` value to a 16-bit floating-point value.
//
// NOTE: this implementation does not handle corner cases around NaN and such;
// we can improve this implementation over time if it is used for such cases.
// The mantissa is truncated (no rounding) and overflowing/underflowing
// exponents are clamped to the f16 extremes.
static inline uint16_t iree_math_f32_to_f16(const float f32_value) {
  uint32_t f32_bits;
  memcpy(&f32_bits, &f32_value, sizeof(f32_bits));
  const uint32_t sign16 = (f32_bits >> 31) << 15;
  const uint32_t mantissa16 = (f32_bits & 0x007FFFFFu) >> (23 - 10);
  // Rebias the exponent from f32 (bias 127) to f16 (bias 15): -127 + 15.
  const int32_t biased_exp16 = (int32_t)((f32_bits >> 23) & 0xFFu) - 127 + 15;
  uint32_t exp_field;
  if (biased_exp16 > 31) {
    exp_field = 31u << 10;  // clamp overflow to the max f16 exponent
  } else if (biased_exp16 < 0) {
    exp_field = 0;  // clamp underflow toward (signed) zero
  } else {
    exp_field = (uint32_t)biased_exp16 << 10;
  }
  return (uint16_t)(sign16 | exp_field | mantissa16);
}
+
+#endif  // IREE_BASE_INTERNAL_MATH_H_
diff --git a/runtime/src/iree/base/internal/math_test.cc b/runtime/src/iree/base/internal/math_test.cc
new file mode 100644
index 0000000..8984e54
--- /dev/null
+++ b/runtime/src/iree/base/internal/math_test.cc
@@ -0,0 +1,202 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/math.h"
+
+#include <cfloat>
+
+#include "iree/testing/gtest.h"
+
+namespace {
+
+//==============================================================================
+// Bitwise rotation (aka circular shifts)
+//==============================================================================
+
+TEST(BitwiseRotationTest, ROTL64) {
+  EXPECT_EQ(0ull, iree_math_rotl_u64(0ull, 0u));
+  EXPECT_EQ(0ull, iree_math_rotl_u64(0ull, 0u));
+  EXPECT_EQ(1ull, iree_math_rotl_u64(1ull, 0u));
+  EXPECT_EQ(1ull, iree_math_rotl_u64(1ull, 0u));
+
+  EXPECT_EQ(2ull, iree_math_rotl_u64(1ull, 1u));
+  EXPECT_EQ(2ull, iree_math_rotl_u64(1ull, 1u));
+  EXPECT_EQ(UINT64_MAX, iree_math_rotl_u64(UINT64_MAX, 63u));
+  EXPECT_EQ(UINT64_MAX, iree_math_rotl_u64(UINT64_MAX, 64u));
+}
+
+TEST(BitwiseRotationTest, ROTR64) {
+  // Rotation by zero bits is the identity, regardless of value.
+  EXPECT_EQ(0ull, iree_math_rotr_u64(0ull, 0u));
+  EXPECT_EQ(0ull, iree_math_rotr_u64(0ull, 0u));
+  EXPECT_EQ(1ull, iree_math_rotr_u64(1ull, 0u));
+  EXPECT_EQ(1ull, iree_math_rotr_u64(1ull, 0u));
+
+  // Low bits rotate off the bottom and reappear at the top (bits 63/62).
+  EXPECT_EQ(1ull, iree_math_rotr_u64(2ull, 1u));
+  EXPECT_EQ(0x8000000000000000ull, iree_math_rotr_u64(2ull, 2u));
+  EXPECT_EQ(0x8000000000000000ull, iree_math_rotr_u64(1ull, 1u));
+  EXPECT_EQ(0x4000000000000000ull, iree_math_rotr_u64(1ull, 2u));
+}
+
+//==============================================================================
+// Bit scanning/counting
+//==============================================================================
+
+TEST(BitwiseScansTest, CLZ32) {
+  EXPECT_EQ(32, iree_math_count_leading_zeros_u32(uint32_t{}));
+  EXPECT_EQ(0, iree_math_count_leading_zeros_u32(~uint32_t{}));
+  for (int index = 0; index < 32; index++) {
+    uint32_t x = 1u << index;
+    const int cnt = 31 - index;
+    ASSERT_EQ(cnt, iree_math_count_leading_zeros_u32(x)) << index;
+    ASSERT_EQ(cnt, iree_math_count_leading_zeros_u32(x + x - 1)) << index;
+  }
+}
+
+TEST(BitwiseScansTest, CLZ64) {
+  // Degenerate cases: all zeros and all ones.
+  EXPECT_EQ(64, iree_math_count_leading_zeros_u64(uint64_t{}));
+  EXPECT_EQ(0, iree_math_count_leading_zeros_u64(~uint64_t{}));
+  for (int index = 0; index < 64; index++) {
+    uint64_t x = 1ull << index;
+    const int cnt = 63 - index;
+    // The lone bit and the all-ones-below pattern (x + x - 1) share the same
+    // leading-zero count.
+    ASSERT_EQ(cnt, iree_math_count_leading_zeros_u64(x)) << index;
+    ASSERT_EQ(cnt, iree_math_count_leading_zeros_u64(x + x - 1)) << index;
+  }
+}
+
+TEST(BitwiseScansTest, CTZ32) {
+  EXPECT_EQ(0, iree_math_count_trailing_zeros_u32(~uint32_t{}));
+  for (int index = 0; index < 32; index++) {
+    uint32_t x = static_cast<uint32_t>(1) << index;
+    const int cnt = index;
+    ASSERT_EQ(cnt, iree_math_count_trailing_zeros_u32(x)) << index;
+    ASSERT_EQ(cnt, iree_math_count_trailing_zeros_u32(~(x - 1))) << index;
+  }
+}
+
+TEST(BitwiseScansTest, CTZ64) {
+  // All-ones has no trailing zeros.
+  EXPECT_EQ(0, iree_math_count_trailing_zeros_u64(~uint64_t{}));
+  for (int index = 0; index < 64; index++) {
+    uint64_t x = static_cast<uint64_t>(1) << index;
+    const int cnt = index;
+    // The lone bit and its "that bit and everything above" complement
+    // ~(x - 1) share the same trailing-zero count.
+    ASSERT_EQ(cnt, iree_math_count_trailing_zeros_u64(x)) << index;
+    ASSERT_EQ(cnt, iree_math_count_trailing_zeros_u64(~(x - 1))) << index;
+  }
+}
+
+//==============================================================================
+// Population count
+//==============================================================================
+
+TEST(PopulationCountTest, Ones32) {
+  EXPECT_EQ(0, iree_math_count_ones_u32(0u));
+  EXPECT_EQ(1, iree_math_count_ones_u32(1u));
+  // -15u == ~14u; 14 has 3 set bits so its complement has 32 - 3 = 29.
+  EXPECT_EQ(29, iree_math_count_ones_u32(-15u));
+  // 341 == 0b101010101.
+  EXPECT_EQ(5, iree_math_count_ones_u32(341u));
+  EXPECT_EQ(32, iree_math_count_ones_u32(UINT32_MAX));
+  EXPECT_EQ(31, iree_math_count_ones_u32(UINT32_MAX - 1));
+}
+
+TEST(PopulationCountTest, Ones64) {
+  EXPECT_EQ(0, iree_math_count_ones_u64(0ull));
+  EXPECT_EQ(1, iree_math_count_ones_u64(1ull));
+  // -15ull == ~14ull; 14 has 3 set bits so its complement has 64 - 3 = 61.
+  EXPECT_EQ(61, iree_math_count_ones_u64(-15ull));
+  // 341 == 0b101010101.
+  EXPECT_EQ(5, iree_math_count_ones_u64(341ull));
+  EXPECT_EQ(64, iree_math_count_ones_u64(UINT64_MAX));
+  EXPECT_EQ(63, iree_math_count_ones_u64(UINT64_MAX - 1ull));
+}
+
+//==============================================================================
+// Rounding and alignment
+//==============================================================================
+
+TEST(RoundingTest, UpToNextPow232) {
+  constexpr uint32_t kUint16Max = UINT16_MAX;
+  constexpr uint32_t kUint32Max = UINT32_MAX;
+  // Exact powers of two (and 0/1) map to themselves; everything else rounds
+  // up to the next power of two.
+  EXPECT_EQ(0u, iree_math_round_up_to_pow2_u32(0u));
+  EXPECT_EQ(1u, iree_math_round_up_to_pow2_u32(1u));
+  EXPECT_EQ(2u, iree_math_round_up_to_pow2_u32(2u));
+  EXPECT_EQ(4u, iree_math_round_up_to_pow2_u32(3u));
+  EXPECT_EQ(8u, iree_math_round_up_to_pow2_u32(8u));
+  EXPECT_EQ(16u, iree_math_round_up_to_pow2_u32(9u));
+  EXPECT_EQ(kUint16Max + 1u, iree_math_round_up_to_pow2_u32(kUint16Max - 1u));
+  EXPECT_EQ(kUint16Max + 1u, iree_math_round_up_to_pow2_u32(kUint16Max));
+  EXPECT_EQ(kUint16Max + 1u, iree_math_round_up_to_pow2_u32(kUint16Max + 1u));
+  EXPECT_EQ(131072u, iree_math_round_up_to_pow2_u32(kUint16Max + 2u));
+  EXPECT_EQ(262144u, iree_math_round_up_to_pow2_u32(262144u - 1u));
+  EXPECT_EQ(0x80000000u, iree_math_round_up_to_pow2_u32(0x7FFFFFFFu));
+  EXPECT_EQ(0x80000000u, iree_math_round_up_to_pow2_u32(0x80000000u));
+
+  // NOTE: wrap to 0.
+  // Inputs above 2^31 have no representable next power of two in 32 bits.
+  EXPECT_EQ(0u, iree_math_round_up_to_pow2_u32(0x80000001u));
+  EXPECT_EQ(0u, iree_math_round_up_to_pow2_u32(kUint32Max - 1u));
+  EXPECT_EQ(0u, iree_math_round_up_to_pow2_u32(kUint32Max));
+}
+
+TEST(RoundingTest, UpToNextPow264) {
+  constexpr uint64_t kUint16Max = UINT16_MAX;
+  constexpr uint64_t kUint64Max = UINT64_MAX;
+  // Exact powers of two (and 0/1) map to themselves; everything else rounds
+  // up to the next power of two.
+  EXPECT_EQ(0ull, iree_math_round_up_to_pow2_u64(0ull));
+  EXPECT_EQ(1ull, iree_math_round_up_to_pow2_u64(1ull));
+  EXPECT_EQ(2ull, iree_math_round_up_to_pow2_u64(2ull));
+  EXPECT_EQ(4ull, iree_math_round_up_to_pow2_u64(3ull));
+  EXPECT_EQ(8ull, iree_math_round_up_to_pow2_u64(8ull));
+  EXPECT_EQ(16ull, iree_math_round_up_to_pow2_u64(9ull));
+  EXPECT_EQ(kUint16Max + 1ull,
+            iree_math_round_up_to_pow2_u64(kUint16Max - 1ull));
+  EXPECT_EQ(kUint16Max + 1ull, iree_math_round_up_to_pow2_u64(kUint16Max));
+  EXPECT_EQ(kUint16Max + 1ull,
+            iree_math_round_up_to_pow2_u64(kUint16Max + 1ull));
+  EXPECT_EQ(131072ull, iree_math_round_up_to_pow2_u64(kUint16Max + 2ull));
+  // Values around the 32-bit boundary must not truncate.
+  EXPECT_EQ(0x100000000ull, iree_math_round_up_to_pow2_u64(0xFFFFFFFEull));
+  EXPECT_EQ(0x100000000ull, iree_math_round_up_to_pow2_u64(0xFFFFFFFFull));
+  EXPECT_EQ(0x80000000ull, iree_math_round_up_to_pow2_u64(0x7FFFFFFFull));
+  EXPECT_EQ(0x80000000ull, iree_math_round_up_to_pow2_u64(0x80000000ull));
+  EXPECT_EQ(0x100000000ull, iree_math_round_up_to_pow2_u64(0x80000001ull));
+
+  // NOTE: wrap to 0.
+  // Inputs above 2^63 have no representable next power of two in 64 bits.
+  EXPECT_EQ(0ull, iree_math_round_up_to_pow2_u64(0x8000000000000001ull));
+  EXPECT_EQ(0ull, iree_math_round_up_to_pow2_u64(kUint64Max - 1ull));
+  EXPECT_EQ(0ull, iree_math_round_up_to_pow2_u64(kUint64Max));
+}
+
+//==============================================================================
+// FP16 support
+//==============================================================================
+
+TEST(F16ConversionTest, F32ToF16) {
+  // Within range, normal truncation.
+  EXPECT_EQ(0x3400, iree_math_f32_to_f16(0.25f));
+  EXPECT_EQ(0xd646, iree_math_f32_to_f16(-100.375f));
+  // Overflow
+  // The implementation saturates the exponent field to all-ones but keeps
+  // the truncated mantissa, hence 0x7fff rather than infinity (0x7c00).
+  EXPECT_EQ(0x7fff, iree_math_f32_to_f16(FLT_MAX));
+  EXPECT_EQ(0xffff, iree_math_f32_to_f16(-FLT_MAX));
+  // Underflow
+  // Values below the f16 normal range flush to (signed) zero.
+  EXPECT_EQ(0, iree_math_f32_to_f16(FLT_MIN));
+  EXPECT_EQ(0x8000, iree_math_f32_to_f16(-FLT_MIN));
+}
+
+TEST(F16ConversionTest, F32ToF16ToF32) {
+  // Largest/smallest normal f16 magnitudes, expressed as f32.
+  constexpr float kF16Max = 65504.f;
+  constexpr float kF16Min = 0.0000610351563f;
+  // Within range, should just recover.
+  EXPECT_EQ(0.25f, iree_math_f16_to_f32(iree_math_f32_to_f16(0.25f)));
+  EXPECT_EQ(-100.375f, iree_math_f16_to_f32(iree_math_f32_to_f16(-100.375f)));
+  EXPECT_EQ(kF16Max, iree_math_f16_to_f32(iree_math_f32_to_f16(kF16Max)));
+  EXPECT_EQ(kF16Min, iree_math_f16_to_f32(iree_math_f32_to_f16(kF16Min)));
+  // Overflow
+  // Round-tripping out-of-range values is lossy but must stay bounded.
+  EXPECT_GE(FLT_MAX, iree_math_f16_to_f32(iree_math_f32_to_f16(FLT_MAX)));
+  EXPECT_LT(-FLT_MAX, iree_math_f16_to_f32(iree_math_f32_to_f16(-FLT_MAX)));
+  EXPECT_GT(kF16Max + 1.f,
+            iree_math_f16_to_f32(iree_math_f32_to_f16(kF16Max + 1.f)));
+  // Underflow
+  // Values below the f16 normal range flush to zero on the way through.
+  EXPECT_EQ(0.0f, iree_math_f16_to_f32(iree_math_f32_to_f16(FLT_MIN)));
+  EXPECT_EQ(0.0f, iree_math_f16_to_f32(iree_math_f32_to_f16(-FLT_MIN)));
+  EXPECT_EQ(0.0f,
+            iree_math_f16_to_f32(iree_math_f32_to_f16(kF16Min - kF16Min / 2)));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/prng.h b/runtime/src/iree/base/internal/prng.h
new file mode 100644
index 0000000..8a97df8
--- /dev/null
+++ b/runtime/src/iree/base/internal/prng.h
@@ -0,0 +1,205 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//==============================================================================
+//
+// Pseudo-random number generators (PRNGs): **NOT CRYPTOGRAPHICALLY SECURE**
+//
+// Only use these tiny little PRNGs to introduce a bit of randomnessish behavior
+// to things like balancing and backoff algorithms.
+//
+//==============================================================================
+
+#ifndef IREE_BASE_INTERNAL_PRNG_H_
+#define IREE_BASE_INTERNAL_PRNG_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/target_platform.h"
+
+#if defined(IREE_ARCH_ARM_64)
+#include <arm_neon.h>
+#endif  // IREE_ARCH_ARM_64
+
+//==============================================================================
+// Pseudo-random number generators (PRNGs): **NOT CRYPTOGRAPHICALLY SECURE**
+//==============================================================================
+
+// A fixed-increment version of Java 8's SplittableRandom generator
+// See http://dx.doi.org/10.1145/2714064.2660195 and
+// http://docs.oracle.com/javase/8/docs/api/java/util/SplittableRandom.html
+//
+// SplitMix64 as recommended for use with xoroshiro by the authors:
+// http://prng.di.unimi.it/splitmix64.c
+// http://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64
+typedef uint64_t iree_prng_splitmix64_state_t;
+
+// Initializes a SplitMix64 PRNG state vector; |out_state| is overwritten.
+// |seed| may be any 64-bit value.
+static inline void iree_prng_splitmix64_initialize(
+    uint64_t seed, iree_prng_splitmix64_state_t* out_state) {
+  // SplitMix64 state is just the raw 64-bit counter; any seed (including 0)
+  // is valid as the increment in each step guarantees progression.
+  *out_state = seed;
+}
+
+// Steps a SplitMix64 PRNG state vector and yields a value for use.
+static inline uint64_t iree_prng_splitmix64_next(
+    iree_prng_splitmix64_state_t* state) {
+  uint64_t z = (*state += 0x9E3779B97F4A7C15ull);
+  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+  return z ^ (z >> 31);
+}
+
+// A small **pseudorandom** number generator (named after the operations used).
+// http://prng.di.unimi.it/
+typedef struct {
+  uint64_t value[2];
+} iree_prng_xoroshiro128_state_t;
+
+// Initializes a xoroshiro128+ PRNG state vector; |out_state| is overwritten.
+// |seed| may be any 64-bit value.
+static inline void iree_prng_xoroshiro128_initialize(
+    uint64_t seed, iree_prng_xoroshiro128_state_t* out_state) {
+  // The authors recommend using SplitMix64 to go from a single int seed
+  // into the two state values we need. It's critical that we don't use a
+  // xoroshiro128 for this as seeding a PRNG with the results of itself is...
+  // unsound.
+  iree_prng_splitmix64_state_t init_state;
+  iree_prng_splitmix64_initialize(seed, &init_state);
+  out_state->value[0] = iree_prng_splitmix64_next(&seed);
+  out_state->value[1] = iree_prng_splitmix64_next(&seed);
+
+  // A state of 0 will never produce anything but zeros so ensure that doesn't
+  // happen; of course, after running splitmix that should be closer to the
+  // side of never than not.
+  if (!out_state->value[0] && !out_state->value[1]) {
+    out_state->value[0] = 1;
+  }
+}
+
+// Steps a xoroshiro128 state vector and yields a value for use.
+// xoroshiro128+ variant: produces a single 64-bit value per step.
+// This is the fastest variant but the lower 4 bits of the returned value may
+// not be sufficiently well-distributed. This is fine if the usage requires
+// fewer than 60 bits such as when sampling bools or array indices.
+// Note also that this works great for floating-point numbers where only 23 or
+// 53 bits are required to populate a mantissa and an additional step can be
+// used to generate the sign/exponent when required.
+//
+//   footprint: 128-bits
+//      period: 2^128 - 1
+//  ns/64-bits: 0.72
+// cycles/byte: 0.29
+//
+// http://prng.di.unimi.it/xoroshiro128plus.c
+static inline uint64_t iree_prng_xoroshiro128plus_next_uint60(
+    iree_prng_xoroshiro128_state_t* state) {
+  uint64_t s0 = state->value[0];
+  uint64_t s1 = state->value[1];
+  const uint64_t result = s0 + s1;
+  s1 ^= s0;
+  state->value[0] = iree_math_rotl_u64(s0, 24) ^ s1 ^ (s1 << 16);  // a, b
+  state->value[1] = iree_math_rotl_u64(s1, 37);                    // c
+  return result;
+}
+
+// Steps a xoroshiro128 state vector and yields a single boolean value for use.
+// See iree_prng_xoroshiro128plus_next_uint60 for details.
+static inline bool iree_prng_xoroshiro128plus_next_bool(
+    iree_prng_xoroshiro128_state_t* state) {
+  return (bool)(iree_prng_xoroshiro128plus_next_uint60(state) >> (64 - 1));
+}
+
+// Steps a xoroshiro128 state vector and yields a single uint8_t value for use.
+// See iree_prng_xoroshiro128plus_next_uint60 for details.
+static inline uint8_t iree_prng_xoroshiro128plus_next_uint8(
+    iree_prng_xoroshiro128_state_t* state) {
+  return (uint8_t)(iree_prng_xoroshiro128plus_next_uint60(state) >> (64 - 8));
+}
+
+// Steps a xoroshiro128 state vector and yields a single uint32_t value for use.
+// See iree_prng_xoroshiro128plus_next_uint60 for details.
+static inline uint32_t iree_prng_xoroshiro128plus_next_uint32(
+    iree_prng_xoroshiro128_state_t* state) {
+  return (uint32_t)(iree_prng_xoroshiro128plus_next_uint60(state) >> (64 - 32));
+}
+
+// Steps a xoroshiro128 state vector and yields a value for use.
+// xoroshiro128** variant: produces a single 64-bit value per step.
+// Prefer this to xoroshiro128+ when good distribution over the integer range
+// is required; see xoroshiro128+ for details of its issues.
+//
+//   footprint: 128-bits
+//      period: 2^128 - 1
+//  ns/64-bits: 0.93
+// cycles/byte: 0.42
+//
+// http://prng.di.unimi.it/xoroshiro128starstar.c
+static inline uint64_t iree_prng_xoroshiro128starstar_next_uint64(
+    iree_prng_xoroshiro128_state_t* state) {
+  uint64_t s0 = state->value[0];
+  uint64_t s1 = state->value[1];
+  const uint64_t result = iree_math_rotl_u64(s0 * 5, 7) * 9;
+  s1 ^= s0;
+  state->value[0] = iree_math_rotl_u64(s0, 24) ^ s1 ^ (s1 << 16);  // a, b
+  state->value[1] = iree_math_rotl_u64(s1, 37);                    // c
+  return result;
+}
+
+// MiniLcg by @bjacob: A shot at the cheapest possible PRNG on ARM NEON
+// https://gist.github.com/bjacob/7d635b91acd02559d73a6d159fe9cfbe
+// I have no idea what the entropy characteristics of it are but it's really
+// fast and in a lot of places that's all we need. For example, whatever number
+// we generate when doing worker thread selection is going to get AND'ed with
+// some other bitmasks by the caller -- and once you do that to a random number
+// you've pretty much admitted it's ok to not be so strong and may as well
+// capitalize on it!
+typedef iree_alignas(iree_max_align_t) struct {
+  uint8_t value[16];  // first to ensure alignment
+  int8_t remaining;   // number of remaining valid values in the state
+} iree_prng_minilcg128_state_t;
+
+#define IREE_PRNG_MINILCG_INIT_MUL_CONSTANT 13
+#define IREE_PRNG_MINILCG_INIT_ADD_CONSTANT 47
+#define IREE_PRNG_MINILCG_NEXT_MUL_CONSTANT 37
+#define IREE_PRNG_MINILCG_NEXT_ADD_CONSTANT 47
+
+// Initializes a MiniLcg PRNG state vector; |out_state| is overwritten.
+// |seed| may be any 8-bit value.
+static inline void iree_prng_minilcg128_initialize(
+    uint64_t seed, iree_prng_minilcg128_state_t* out_state) {
+  uint8_t value = (seed ^ 11400714819323198485ull) & 0xFF;
+  for (size_t i = 0; i < 16; ++i) {
+    out_state->value[i] = value;
+    value = value * IREE_PRNG_MINILCG_INIT_MUL_CONSTANT +
+            IREE_PRNG_MINILCG_INIT_ADD_CONSTANT;
+  }
+  out_state->remaining = 16;
+}
+
+// Steps the MiniLcg state and returns the next uint8_t value.
+// Values are produced in 16-lane batches; the LCG step only runs once every
+// 16 calls (vectorized across all lanes on ARM64 NEON).
+static inline uint8_t iree_prng_minilcg128_next_uint8(
+    iree_prng_minilcg128_state_t* state) {
+  // Refill the batch only once all previously generated values are consumed.
+  if (IREE_UNLIKELY(--state->remaining < 0)) {
+#if defined(IREE_ARCH_ARM_64)
+    // One fused multiply-add advances all 16 lanes at once.
+    uint8x16_t kmul = vdupq_n_u8(IREE_PRNG_MINILCG_NEXT_MUL_CONSTANT);
+    uint8x16_t kadd = vdupq_n_u8(IREE_PRNG_MINILCG_NEXT_ADD_CONSTANT);
+    vst1q_u8(state->value, vmlaq_u8(kadd, kmul, vld1q_u8(state->value)));
+#else
+    // Scalar fallback; uint8_t arithmetic wraps mod 256 as the LCG requires.
+    for (size_t i = 0; i < 16; ++i) {
+      state->value[i] = state->value[i] * IREE_PRNG_MINILCG_NEXT_MUL_CONSTANT +
+                        IREE_PRNG_MINILCG_NEXT_ADD_CONSTANT;
+    }
+#endif  // IREE_ARCH_ARM_64
+    // 15 left after this call consumes value[0] below.
+    state->remaining = 15;
+  }
+  // Consume lanes in order value[0]..value[15] as |remaining| counts down.
+  return state->value[16 - state->remaining - 1];
+}
+
+#endif  // IREE_BASE_INTERNAL_PRNG_H_
diff --git a/runtime/src/iree/base/internal/prng_test.cc b/runtime/src/iree/base/internal/prng_test.cc
new file mode 100644
index 0000000..95adb12
--- /dev/null
+++ b/runtime/src/iree/base/internal/prng_test.cc
@@ -0,0 +1,91 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//==============================================================================
+// Pseudo-random number generators (PRNGs): **NOT CRYPTOGRAPHICALLY SECURE**
+//==============================================================================
+// NOTE: we leave the real testing to the authors; this just ensures we aren't
+// `return 4;`ing it or ignoring the seed.
+
+#include "iree/base/internal/prng.h"
+
+#include "iree/testing/gtest.h"
+
+namespace {
+
+TEST(PRNG, SplitMix64) {
+  iree_prng_splitmix64_state_t state;
+
+  // Golden values pinning the output sequence for a few representative seeds;
+  // any change here indicates the generator algorithm changed.
+  iree_prng_splitmix64_initialize(/*seed=*/0ull, &state);
+  EXPECT_EQ(16294208416658607535ull, iree_prng_splitmix64_next(&state));
+  EXPECT_EQ(7960286522194355700ull, iree_prng_splitmix64_next(&state));
+
+  iree_prng_splitmix64_initialize(/*seed=*/1ull, &state);
+  EXPECT_EQ(10451216379200822465ull, iree_prng_splitmix64_next(&state));
+  EXPECT_EQ(13757245211066428519ull, iree_prng_splitmix64_next(&state));
+
+  iree_prng_splitmix64_initialize(/*seed=*/UINT64_MAX, &state);
+  EXPECT_EQ(16490336266968443936ull, iree_prng_splitmix64_next(&state));
+  EXPECT_EQ(16834447057089888969ull, iree_prng_splitmix64_next(&state));
+}
+
+TEST(PRNG, Xoroshiro128) {
+  iree_prng_xoroshiro128_state_t state;
+
+  // Golden values pinning the output of each accessor for a few seeds; the
+  // accessors are exercised in sequence so ordering matters (each call
+  // advances the shared state).
+  iree_prng_xoroshiro128_initialize(/*seed=*/0ull, &state);
+  EXPECT_EQ(5807750865143411619ull,
+            iree_prng_xoroshiro128plus_next_uint60(&state));
+  EXPECT_TRUE(iree_prng_xoroshiro128plus_next_bool(&state));
+  EXPECT_EQ(218u, iree_prng_xoroshiro128plus_next_uint8(&state));
+  EXPECT_EQ(1647201753u, iree_prng_xoroshiro128plus_next_uint32(&state));
+  EXPECT_EQ(7260361800523965311ull,
+            iree_prng_xoroshiro128starstar_next_uint64(&state));
+
+  iree_prng_xoroshiro128_initialize(/*seed=*/1ull, &state);
+  EXPECT_EQ(5761717516557699368ull,
+            iree_prng_xoroshiro128plus_next_uint60(&state));
+  EXPECT_TRUE(iree_prng_xoroshiro128plus_next_bool(&state));
+  EXPECT_EQ(103u, iree_prng_xoroshiro128plus_next_uint8(&state));
+  EXPECT_EQ(2242241045u, iree_prng_xoroshiro128plus_next_uint32(&state));
+  EXPECT_EQ(661144386810419178ull,
+            iree_prng_xoroshiro128starstar_next_uint64(&state));
+
+  iree_prng_xoroshiro128_initialize(/*seed=*/UINT64_MAX, &state);
+  EXPECT_EQ(14878039250348781289ull,
+            iree_prng_xoroshiro128plus_next_uint60(&state));
+  EXPECT_FALSE(iree_prng_xoroshiro128plus_next_bool(&state));
+  EXPECT_EQ(137u, iree_prng_xoroshiro128plus_next_uint8(&state));
+  EXPECT_EQ(2111322015u, iree_prng_xoroshiro128plus_next_uint32(&state));
+  EXPECT_EQ(138107609852220106ull,
+            iree_prng_xoroshiro128starstar_next_uint64(&state));
+}
+
+TEST(PRNG, MiniLcg128) {
+  iree_prng_minilcg128_state_t state;
+
+  // For each seed: pin the first value, skip 100 values (crossing several
+  // 16-lane batch refills), and pin the value that follows.
+  iree_prng_minilcg128_initialize(/*seed=*/0ull, &state);
+  EXPECT_EQ(21u, iree_prng_minilcg128_next_uint8(&state));
+  for (int i = 0; i < 100; ++i) {
+    iree_prng_minilcg128_next_uint8(&state);
+  }
+  EXPECT_EQ(18u, iree_prng_minilcg128_next_uint8(&state));
+
+  iree_prng_minilcg128_initialize(/*seed=*/1ull, &state);
+  EXPECT_EQ(20u, iree_prng_minilcg128_next_uint8(&state));
+  for (int i = 0; i < 100; ++i) {
+    iree_prng_minilcg128_next_uint8(&state);
+  }
+  EXPECT_EQ(13u, iree_prng_minilcg128_next_uint8(&state));
+
+  iree_prng_minilcg128_initialize(/*seed=*/UINT64_MAX, &state);
+  EXPECT_EQ(234u, iree_prng_minilcg128_next_uint8(&state));
+  for (int i = 0; i < 100; ++i) {
+    iree_prng_minilcg128_next_uint8(&state);
+  }
+  EXPECT_EQ(59u, iree_prng_minilcg128_next_uint8(&state));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/span.h b/runtime/src/iree/base/internal/span.h
new file mode 100644
index 0000000..82fd91c
--- /dev/null
+++ b/runtime/src/iree/base/internal/span.h
@@ -0,0 +1,187 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_SPAN_H_
+#define IREE_BASE_INTERNAL_SPAN_H_
+#ifdef __cplusplus
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <initializer_list>
+#include <iterator>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+// std::span is available starting in C++20.
+// Prior to that we fall back to our simplified implementation below.
+#if defined(__has_include)
+#if __has_include(<span>) && __cplusplus >= 202002L
+#define IREE_HAVE_STD_SPAN 1
+#include <span>
+#endif  // __has_include(<span>)
+#endif  // __has_include
+
+#ifndef IREE_HAVE_STD_SPAN
+#include <limits>
+#endif
+
+namespace iree {
+
+#if defined(IREE_HAVE_STD_SPAN)
+
+// Alias. Once we bump up our minimum C++ version we can drop this entire file.
+template <typename T>
+using span = std::span<T>;
+
+#else
+
+constexpr std::size_t dynamic_extent = std::numeric_limits<std::size_t>::max();
+
+// A pared down version of std::span doing just enough for our uses in IREE.
+// Most of the IREE code started using absl::Span which while close to std::span
+// has some additional functionality of its own and is missing some from std.
+// The benefit here is that means we only need to implement the intersection of
+// the two as none of our code uses those newer std features.
+//
+// https://en.cppreference.com/w/cpp/container/span/subspan
+template <typename T>
+class span {
+ private:
+  template <typename V>
+  using remove_cv_t = typename std::remove_cv<V>::type;
+  template <typename V>
+  using decay_t = typename std::decay<V>::type;
+
+  // Tag-dispatched data() accessor: GetData passes the int literal 0, so for
+  // std::string the non-template (int) overload is the exact match and
+  // returns a mutable char* (valid even pre-C++17 where string::data() is
+  // const-only); for every other container overload resolution falls back to
+  // the template via the int -> char conversion and uses c.data().
+  template <typename C>
+  static constexpr auto GetDataImpl(C& c, char) noexcept -> decltype(c.data()) {
+    return c.data();
+  }
+  static inline char* GetDataImpl(std::string& s, int) noexcept {
+    return &s[0];
+  }
+  template <typename C>
+  static constexpr auto GetData(C& c) noexcept -> decltype(GetDataImpl(c, 0)) {
+    return GetDataImpl(c, 0);
+  }
+
+  // SFINAE probes: C must expose an integral size() and a data() pointer
+  // convertible to T* for the container constructors to participate.
+  template <typename C>
+  using HasSize =
+      std::is_integral<decay_t<decltype(std::declval<C&>().size())> >;
+
+  template <typename V, typename C>
+  using HasData =
+      std::is_convertible<decay_t<decltype(GetData(std::declval<C&>()))>*,
+                          V* const*>;
+
+  template <typename C>
+  using EnableIfConvertibleFrom =
+      typename std::enable_if<HasData<T, C>::value && HasSize<C>::value>::type;
+
+  // Enabled only when T is const (read-only view) / non-const (mutable view).
+  template <typename U>
+  using EnableIfConstView =
+      typename std::enable_if<std::is_const<T>::value, U>::type;
+
+  template <typename U>
+  using EnableIfMutableView =
+      typename std::enable_if<!std::is_const<T>::value, U>::type;
+
+ public:
+  using value_type = remove_cv_t<T>;
+  using pointer = T*;
+  using const_pointer = const T*;
+  using reference = T&;
+  using const_reference = const T&;
+  using iterator = pointer;
+  using const_iterator = const_pointer;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+  using size_type = size_t;
+  using difference_type = ptrdiff_t;
+
+  constexpr span() noexcept : span(nullptr, 0) {}
+  constexpr span(pointer array, size_type length) noexcept
+      : ptr_(array), len_(length) {}
+
+  // Implicit conversion from a C array; the static length is preserved.
+  template <size_type N>
+  constexpr span(T (&a)[N]) noexcept : span(a, N) {}
+
+  // Mutable views convert explicitly from containers; const views convert
+  // implicitly.
+  template <typename V, typename = EnableIfConvertibleFrom<V>,
+            typename = EnableIfMutableView<V> >
+  explicit span(V& v) noexcept : span(GetData(v), v.size()) {}
+
+  template <typename V, typename = EnableIfConvertibleFrom<V>,
+            typename = EnableIfConstView<V> >
+  constexpr span(const V& v) noexcept : span(GetData(v), v.size()) {}
+
+  // initializer_list binds only to const-element spans: the backing storage
+  // is temporary and read-only.
+  template <typename LazyT = T, typename = EnableIfConstView<LazyT> >
+  span(std::initializer_list<value_type> v) noexcept
+      : span(v.begin(), v.size()) {}
+
+  constexpr pointer data() const noexcept { return ptr_; }
+
+  constexpr size_type size() const noexcept { return len_; }
+
+  constexpr size_type length() const noexcept { return size(); }
+
+  constexpr bool empty() const noexcept { return size() == 0; }
+
+  constexpr reference operator[](size_type i) const noexcept {
+    // MSVC 2015 accepts this as constexpr, but not ptr_[i]
+    assert(i < size());
+    return *(data() + i);
+  }
+
+  // NOTE: aborts on out-of-range rather than throwing std::out_of_range.
+  constexpr reference at(size_type i) const {
+    return i < size() ? *(data() + i) : (std::abort(), *(data() + i));
+  }
+
+  constexpr reference front() const noexcept {
+    assert(size() > 0);
+    return *data();
+  }
+  constexpr reference back() const noexcept {
+    assert(size() > 0);
+    return *(data() + size() - 1);
+  }
+
+  constexpr iterator begin() const noexcept { return data(); }
+  constexpr iterator end() const noexcept { return data() + size(); }
+
+  constexpr reverse_iterator rbegin() const noexcept {
+    return reverse_iterator(end());
+  }
+  constexpr reverse_iterator rend() const noexcept {
+    return reverse_iterator(begin());
+  }
+
+  // Subviews: out-of-range requests abort rather than invoking UB. subspan
+  // clamps |len| to the available tail, matching std::span's dynamic_extent
+  // behavior.
+  constexpr span subspan(size_type pos = 0,
+                         size_type len = iree::dynamic_extent) const {
+    return (pos <= size()) ? span(data() + pos, std::min(size() - pos, len))
+                           : (std::abort(), span());
+  }
+
+  constexpr span first(size_type len) const {
+    return (len <= size()) ? span(data(), len) : (std::abort(), span());
+  }
+
+  constexpr span last(size_type len) const {
+    return (len <= size()) ? span(size() - len + data(), len)
+                           : (std::abort(), span());
+  }
+
+ private:
+  pointer ptr_;
+  size_type len_;
+};
+
+#endif  // IREE_HAVE_STD_SPAN
+
+}  // namespace iree
+
+#endif  // __cplusplus
+#endif  // IREE_BASE_INTERNAL_SPAN_H_
diff --git a/runtime/src/iree/base/internal/synchronization.c b/runtime/src/iree/base/internal/synchronization.c
new file mode 100644
index 0000000..936238c
--- /dev/null
+++ b/runtime/src/iree/base/internal/synchronization.c
@@ -0,0 +1,778 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/synchronization.h"
+
+#include <assert.h>
+#include <string.h>
+
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+// Disabled.
+
+#elif defined(IREE_PLATFORM_EMSCRIPTEN)
+
+#include <emscripten/threading.h>
+#include <errno.h>
+
+#elif defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_LINUX)
+
+#include <errno.h>
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+// Oh Android...
+#ifndef SYS_futex
+#define SYS_futex __NR_futex
+#endif  // !SYS_futex
+#ifndef FUTEX_PRIVATE_FLAG
+#define FUTEX_PRIVATE_FLAG 128
+#endif  // !FUTEX_PRIVATE_FLAG
+
+#endif  // IREE_PLATFORM_*
+
+#if defined(NDEBUG)
+#define SYNC_ASSERT(x) (void)(x)
+#else
+#define SYNC_ASSERT(x) assert(x)
+#endif  // NDEBUG
+
+// Tag functions in .c files with this to indicate that thread safety analysis
+// warnings should not show. This is useful on our implementation functions as
+// clang cannot reason about lock-free magic.
+#define IREE_DISABLE_THREAD_SAFETY_ANALYSIS \
+  IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis)
+
+//==============================================================================
+// Cross-platform futex mappings (where supported)
+//==============================================================================
+
+#if defined(IREE_PLATFORM_HAS_FUTEX)
+
+// Waits in the OS for the value at the specified |address| to change.
+// If the contents of |address| do not match |expected_value| the wait will
+// fail and return IREE_STATUS_UNAVAILABLE and should be retried.
+//
+// |timeout_ms| can be either IREE_INFINITE_TIMEOUT_MS to wait forever or a
+// relative number of milliseconds to wait prior to returning early with
+// IREE_STATUS_DEADLINE_EXCEEDED.
+static inline iree_status_code_t iree_futex_wait(void* address,
+                                                 uint32_t expected_value,
+                                                 uint32_t timeout_ms);
+
+// Wakes at most |count| threads waiting for the |address| to change.
+// Use IREE_ALL_WAITERS to wake all waiters. Which waiters are woken is
+// undefined and it is not guaranteed that higher priority waiters will be woken
+// over lower priority waiters.
+static inline void iree_futex_wake(void* address, int32_t count);
+
+#if defined(IREE_PLATFORM_EMSCRIPTEN)
+
+// Maps emscripten_futex_wait results (0 on wake, negative errno on failure)
+// onto the iree_futex_wait contract declared above.
+static inline iree_status_code_t iree_futex_wait(void* address,
+                                                 uint32_t expected_value,
+                                                 uint32_t timeout_ms) {
+  int rc = emscripten_futex_wait(address, expected_value, (double)timeout_ms);
+  if (rc == -ETIMEDOUT) {
+    return IREE_STATUS_DEADLINE_EXCEEDED;  // relative timeout elapsed
+  } else if (rc == -EWOULDBLOCK) {
+    return IREE_STATUS_UNAVAILABLE;  // *address != expected_value; retry
+  }
+  return IREE_STATUS_OK;
+}
+
+// Wakes up to |count| waiters via the emscripten futex shim; INT32_MAX
+// (IREE_ALL_WAITERS) wakes all waiters.
+static inline void iree_futex_wake(void* address, int32_t count) {
+  emscripten_futex_wake(address, count);
+}
+
+#elif defined(IREE_PLATFORM_WINDOWS)
+
+#pragma comment(lib, "Synchronization.lib")
+
+// Waits on |address| using WaitOnAddress; see the contract declared above.
+static inline iree_status_code_t iree_futex_wait(void* address,
+                                                 uint32_t expected_value,
+                                                 uint32_t timeout_ms) {
+  // NOTE: WaitOnAddress compares against a copy of |expected_value|.
+  BOOL woke = WaitOnAddress(address, &expected_value, sizeof(expected_value),
+                            timeout_ms);
+  if (IREE_LIKELY(woke == TRUE)) return IREE_STATUS_OK;
+  // Distinguish a timeout from a value mismatch/spurious failure.
+  return GetLastError() == ERROR_TIMEOUT ? IREE_STATUS_DEADLINE_EXCEEDED
+                                         : IREE_STATUS_UNAVAILABLE;
+}
+
+// Wakes up to |count| waiters on |address|; INT32_MAX (IREE_ALL_WAITERS)
+// takes the wake-all fast path.
+static inline void iree_futex_wake(void* address, int32_t count) {
+  if (count == INT32_MAX) {
+    WakeByAddressAll(address);
+  } else {
+    while (count > 0) {
+      WakeByAddressSingle(address);
+      --count;
+    }
+  }
+}
+
+#elif defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_LINUX)
+
+// Waits on |address| via the futex syscall; see the contract declared above.
+// A NULL timeout pointer blocks forever; otherwise the timeout is relative.
+static inline iree_status_code_t iree_futex_wait(void* address,
+                                                 uint32_t expected_value,
+                                                 uint32_t timeout_ms) {
+  struct timespec timeout = {
+      .tv_sec = timeout_ms / 1000,
+      .tv_nsec = (timeout_ms % 1000) * 1000000,
+  };
+  int rc = syscall(
+      SYS_futex, address, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, expected_value,
+      timeout_ms == IREE_INFINITE_TIMEOUT_MS ? NULL : &timeout, NULL, 0);
+  if (IREE_LIKELY(rc == 0)) {
+    return IREE_STATUS_OK;  // woken by iree_futex_wake
+  } else if (errno == ETIMEDOUT) {
+    return IREE_STATUS_DEADLINE_EXCEEDED;
+  }
+  // EAGAIN/EWOULDBLOCK (*address != expected_value at wait time) and EINTR
+  // land here. Per the contract above a value mismatch must report
+  // UNAVAILABLE so callers re-check state and retry; the previous code mapped
+  // EAGAIN to OK, falsely signaling that a wake occurred.
+  return IREE_STATUS_UNAVAILABLE;
+}
+
+// Wakes up to |count| waiters blocked in iree_futex_wait on |address|.
+// The syscall result is intentionally ignored: callers must re-check their
+// condition after waking anyway, so a failed wake is indistinguishable from
+// a spurious one.
+static inline void iree_futex_wake(void* address, int32_t count) {
+  syscall(SYS_futex, address, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, count, NULL,
+          NULL, 0);
+}
+
+#endif  // IREE_PLATFORM_*
+
+#endif  // IREE_PLATFORM_HAS_FUTEX
+
+//==============================================================================
+// iree_mutex_t
+//==============================================================================
+
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+#define iree_mutex_impl_initialize(mutex)
+#define iree_mutex_impl_deinitialize(mutex)
+#define iree_mutex_impl_lock(mutex)
+#define iree_mutex_impl_try_lock(mutex) true
+#define iree_mutex_impl_unlock(mutex)
+
+#elif defined(IREE_PLATFORM_WINDOWS) && defined(IREE_MUTEX_USE_WIN32_SRW)
+
+// Win32 Slim Reader/Writer (SRW) Lock (same as std::mutex)
+#define iree_mutex_impl_initialize(mutex) InitializeSRWLock(&(mutex)->value)
+#define iree_mutex_impl_deinitialize(mutex)
+#define iree_mutex_impl_lock(mutex) AcquireSRWLockExclusive(&(mutex)->value)
+#define iree_mutex_impl_try_lock(mutex) \
+  (TryAcquireSRWLockExclusive(&(mutex)->value) == TRUE)
+#define iree_mutex_impl_unlock(mutex) ReleaseSRWLockExclusive(&(mutex)->value)
+
+#elif defined(IREE_PLATFORM_WINDOWS)
+
+// Win32 CRITICAL_SECTION
+#define IREE_WIN32_CRITICAL_SECTION_FLAG_DYNAMIC_SPIN 0x02000000
+#define iree_mutex_impl_initialize(mutex)            \
+  InitializeCriticalSectionEx(&(mutex)->value, 4000, \
+                              IREE_WIN32_CRITICAL_SECTION_FLAG_DYNAMIC_SPIN)
+#define iree_mutex_impl_deinitialize(mutex) \
+  DeleteCriticalSection(&(mutex)->value)
+#define iree_mutex_impl_lock(mutex) EnterCriticalSection(&(mutex)->value)
+#define iree_mutex_impl_try_lock(mutex) \
+  (TryEnterCriticalSection(&(mutex)->value) == TRUE)
+#define iree_mutex_impl_unlock(mutex) LeaveCriticalSection(&(mutex)->value)
+
+#else
+
+// pthreads pthread_mutex_t
+#define iree_mutex_impl_initialize(mutex) \
+  pthread_mutex_init(&(mutex)->value, NULL)
+#define iree_mutex_impl_deinitialize(mutex) \
+  pthread_mutex_destroy(&(mutex)->value)
+#define iree_mutex_impl_lock(mutex) pthread_mutex_lock(&(mutex)->value)
+#define iree_mutex_impl_try_lock(mutex) \
+  (pthread_mutex_trylock(&(mutex)->value) == 0)
+#define iree_mutex_impl_unlock(mutex) pthread_mutex_unlock(&(mutex)->value)
+
+#endif  // IREE_PLATFORM_*
+
+#if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_SLOW_LOCKS)
+
+// NOTE: the tracy mutex tracing code takes locks itself (which makes it slower
+// and may cause deadlocks).
+
+// Initializes |out_mutex| and registers it with the tracing system using the
+// captured source location so it can be identified in the profiler UI.
+void iree_mutex_initialize_impl(const iree_tracing_location_t* src_loc,
+                                iree_mutex_t* out_mutex) {
+  memset(out_mutex, 0, sizeof(*out_mutex));
+  iree_tracing_mutex_announce(src_loc, &out_mutex->lock_id);
+  iree_mutex_impl_initialize(out_mutex);
+}
+
+// Deinitializes |mutex| and unregisters it from tracing.
+// Callers must ensure the mutex is unlocked and no longer in use.
+void iree_mutex_deinitialize(iree_mutex_t* mutex) {
+  iree_mutex_impl_deinitialize(mutex);
+  iree_tracing_mutex_terminate(mutex->lock_id);
+  memset(mutex, 0, sizeof(*mutex));
+}
+
+// Blocking lock; tracing brackets the acquisition so contention shows up as
+// wait time in captures.
+void iree_mutex_lock(iree_mutex_t* mutex) IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_tracing_mutex_before_lock(mutex->lock_id);
+  iree_mutex_impl_lock(mutex);
+  iree_tracing_mutex_after_lock(mutex->lock_id);
+}
+
+// Non-blocking lock attempt; returns true if the lock was acquired.
+bool iree_mutex_try_lock(iree_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  bool was_acquired = iree_mutex_impl_try_lock(mutex);
+  iree_tracing_mutex_after_try_lock(mutex->lock_id, was_acquired);
+  return was_acquired;
+}
+
+// Unlocks |mutex|; must be called by the thread holding the lock.
+void iree_mutex_unlock(iree_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_mutex_impl_unlock(mutex);
+  iree_tracing_mutex_after_unlock(mutex->lock_id);
+}
+
+#else
+
+// Initializes |out_mutex| for use; tracing is disabled in this configuration
+// so these are thin wrappers over the platform impl macros.
+void iree_mutex_initialize(iree_mutex_t* out_mutex) {
+  memset(out_mutex, 0, sizeof(*out_mutex));
+  iree_mutex_impl_initialize(out_mutex);
+}
+
+// Deinitializes |mutex|; callers must ensure it is unlocked and unused.
+void iree_mutex_deinitialize(iree_mutex_t* mutex) {
+  iree_mutex_impl_deinitialize(mutex);
+  memset(mutex, 0, sizeof(*mutex));
+}
+
+// Blocking lock.
+void iree_mutex_lock(iree_mutex_t* mutex) IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_mutex_impl_lock(mutex);
+}
+
+// Non-blocking lock attempt; returns true if the lock was acquired.
+bool iree_mutex_try_lock(iree_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  return iree_mutex_impl_try_lock(mutex);
+}
+
+// Unlocks |mutex|; must be called by the thread holding the lock.
+void iree_mutex_unlock(iree_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_mutex_impl_unlock(mutex);
+}
+
+#endif  // IREE_TRACING_FEATURE_SLOW_LOCKS
+
+//==============================================================================
+// iree_slim_mutex_t
+//==============================================================================
+
+#if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS)
+
+// Turn fast locks into slow locks.
+// This lets us just reuse that code at the cost of obscuring our lock
+// performance; but at the time you are recording 2+ tracy messages per lock use
+// there's not much interesting to gain from that level of granularity anyway.
+// If these start showing up in traces it means that the higher-level algorithm
+// is taking too many locks and not that this taking time is the core issue.
+
+// When fast-lock tracing is enabled slim mutexes are backed by a full
+// iree_mutex_t so all of these simply forward to the fat-mutex entry points.
+void iree_slim_mutex_initialize_impl(const iree_tracing_location_t* src_loc,
+                                     iree_slim_mutex_t* out_mutex) {
+  iree_mutex_initialize_impl(src_loc, &out_mutex->impl);
+}
+
+void iree_slim_mutex_deinitialize(iree_slim_mutex_t* mutex) {
+  iree_mutex_deinitialize(&mutex->impl);
+}
+
+void iree_slim_mutex_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_mutex_lock(&mutex->impl);
+}
+
+bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  return iree_mutex_try_lock(&mutex->impl);
+}
+
+void iree_slim_mutex_unlock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_mutex_unlock(&mutex->impl);
+}
+
+#else
+
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+// No-op implementations used when synchronization is disabled: the program
+// is guaranteed to be single-threaded so locks are always available.
+
+void iree_slim_mutex_initialize(iree_slim_mutex_t* out_mutex) {}
+
+void iree_slim_mutex_deinitialize(iree_slim_mutex_t* mutex) {}
+
+void iree_slim_mutex_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {}
+
+bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  // Single-threaded mode: acquisition always succeeds. The previous code
+  // reinterpreted |mutex->reserved| as an iree_mutex_t* and called
+  // iree_mutex_try_lock on it, which is inconsistent with the no-op
+  // lock/unlock siblings above and relies on type punning for no benefit.
+  return true;
+}
+
+void iree_slim_mutex_unlock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {}
+
+#elif defined(IREE_PLATFORM_APPLE)
+
+// Apple os_unfair_lock: a pointer-sized lock initialized by value.
+void iree_slim_mutex_initialize(iree_slim_mutex_t* out_mutex) {
+  out_mutex->value = OS_UNFAIR_LOCK_INIT;
+}
+
+// os_unfair_lock has no destroy API; just assert it is not currently held.
+void iree_slim_mutex_deinitialize(iree_slim_mutex_t* mutex) {
+  os_unfair_lock_assert_not_owner(&mutex->value);
+}
+
+void iree_slim_mutex_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  os_unfair_lock_lock(&mutex->value);
+}
+
+// Returns true if the lock was acquired without blocking.
+bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  return os_unfair_lock_trylock(&mutex->value);
+}
+
+void iree_slim_mutex_unlock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  os_unfair_lock_unlock(&mutex->value);
+}
+
+#elif defined(IREE_PLATFORM_WINDOWS) && defined(IREE_MUTEX_USE_WIN32_SRW)
+
+// The SRW on Windows is pointer-sized and slightly better than what we emulate
+// with the futex so let's just use that.
+
+// Forwards to the same SRW lock macros used by iree_mutex_t (exclusive mode
+// only; the shared/reader mode of SRW locks is unused here).
+void iree_slim_mutex_initialize(iree_slim_mutex_t* out_mutex) {
+  iree_mutex_impl_initialize(out_mutex);
+}
+
+// SRW locks require no teardown; the impl macro is a no-op.
+void iree_slim_mutex_deinitialize(iree_slim_mutex_t* mutex) {
+  iree_mutex_impl_deinitialize(mutex);
+}
+
+void iree_slim_mutex_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_mutex_impl_lock(mutex);
+}
+
+// Returns true if the lock was acquired without blocking.
+bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  return iree_mutex_impl_try_lock(mutex);
+}
+
+void iree_slim_mutex_unlock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_mutex_impl_unlock(mutex);
+}
+
+#elif defined(IREE_PLATFORM_HAS_FUTEX)
+
+// This implementation is a combo of several sources:
+//
+// Basics of Futexes by Eli Bendersky:
+// https://eli.thegreenplace.net/2018/basics-of-futexes/
+//
+// Futex based locks for C11’s generic atomics by Jens Gustedt:
+// https://hal.inria.fr/hal-01236734/document
+//
+// Mutexes and Condition Variables using Futexes:
+// http://locklessinc.com/articles/mutex_cv_futex/
+//
+// The high bit of the atomic value indicates whether the lock is held; each
+// thread tries to transition the bit from 0->1 to acquire the lock and 1->0 to
+// release it. The lower bits of the value are whether there are any interested
+// waiters. We track these waiters so that we know when we can avoid performing
+// the futex wake syscall.
+
+#define iree_slim_mutex_value(value) (0x80000000u | (value))
+#define iree_slim_mutex_is_locked(value) (0x80000000u & (value))
+
+// An unlocked futex mutex is all-zeros: lock bit clear, no waiters.
+void iree_slim_mutex_initialize(iree_slim_mutex_t* out_mutex) {
+  memset(out_mutex, 0, sizeof(*out_mutex));
+}
+
+void iree_slim_mutex_deinitialize(iree_slim_mutex_t* mutex) {
+  // Assert unlocked (callers must ensure the mutex is no longer in use).
+  SYNC_ASSERT(
+      iree_atomic_load_int32(&mutex->value, iree_memory_order_seq_cst) == 0);
+}
+
+// Blocking lock over the futex-backed state described above: the high bit is
+// the lock flag and the low bits count interested waiters.
+void iree_slim_mutex_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  // Try first to acquire the lock from an unlocked state.
+  // Note that the weak form can fail spuriously. That's fine, as the perf
+  // benefit in the uncontended cases is worth the additional loop below that
+  // will correctly handle any such failures in contended cases.
+  int32_t value = 0;
+  if (iree_atomic_compare_exchange_weak_int32(
+          &mutex->value, &value, iree_slim_mutex_value(1),
+          iree_memory_order_acquire, iree_memory_order_relaxed)) {
+    // Successfully took the lock and there were no other waiters.
+    return;
+  }
+
+  // Increment the count bits to indicate that we want the lock and are willing
+  // to wait for it to be available. Note that between the CAS above and this
+  // the lock could have been made available and we want to ensure we don't
+  // change the lock bit.
+  value =
+      iree_atomic_fetch_add_int32(&mutex->value, 1, iree_memory_order_relaxed) +
+      1;
+
+  while (true) {
+    // While the lock is available: try to acquire it for this thread.
+    while (!iree_slim_mutex_is_locked(value)) {
+      // iree_slim_mutex_value(value) sets the lock bit while preserving the
+      // waiter count (which still includes this thread).
+      if (iree_atomic_compare_exchange_weak_int32(
+              &mutex->value, &value, iree_slim_mutex_value(value),
+              iree_memory_order_acquire, iree_memory_order_relaxed)) {
+        // Successfully took the lock.
+        return;
+      }
+
+      // Spin a small amount to give us a tiny chance of falling through to the
+      // wait. We can tune this value based on likely contention, however 10-60
+      // is the recommended value and we should keep it in that order of
+      // magnitude. A way to think of this is "how many spins would we have to
+      // do to equal one call to iree_futex_wait" - if it's faster just to do
+      // a futex wait then we shouldn't be spinning!
+      // TODO(benvanik): measure on real workload on ARM; maybe remove entirely.
+      int spin_count = 100;
+      for (int i = 0; i < spin_count && iree_slim_mutex_is_locked(value); ++i) {
+        value =
+            iree_atomic_load_int32(&mutex->value, iree_memory_order_relaxed);
+      }
+    }
+
+    // While the lock is unavailable: wait for it to become available.
+    while (iree_slim_mutex_is_locked(value)) {
+      // NOTE: we don't care about wait failure here as we are going to loop
+      // and check again anyway.
+      iree_futex_wait(&mutex->value, value, IREE_INFINITE_TIMEOUT_MS);
+      value = iree_atomic_load_int32(&mutex->value, iree_memory_order_relaxed);
+    }
+  }
+}
+
+// Single-shot attempt to take the lock from the fully-unlocked (0) state;
+// returns true if the lock was acquired. Never registers as a waiter.
+bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  // Attempt to acquire the lock from an unlocked state.
+  // We don't care if this fails spuriously as that's the whole point of a try.
+  int32_t value = 0;
+  return iree_atomic_compare_exchange_weak_int32(
+      &mutex->value, &value, iree_slim_mutex_value(1),
+      iree_memory_order_acquire, iree_memory_order_relaxed);
+}
+
+// Releases the lock; must be called by the thread holding it. The single
+// atomic subtraction clears the lock bit and removes this thread's count.
+void iree_slim_mutex_unlock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  // Transition 1->0 (unlocking with no waiters) or 2->1 (with waiters).
+  if (iree_atomic_fetch_sub_int32(&mutex->value, iree_slim_mutex_value(1),
+                                  iree_memory_order_release) !=
+      iree_slim_mutex_value(1)) {
+    // One (or more) waiters; wake a single one to avoid a thundering herd of
+    // multiple threads all waking and trying to grab the lock (as only one will
+    // win).
+    //
+    // Note that futexes (futeces? futices? futii?) are unfair and what thread
+    // gets woken is undefined (not FIFO on waiters).
+    iree_futex_wake(&mutex->value, 1);
+  }
+}
+
+#else
+
+// Pass-through to iree_mutex_t as a fallback for platforms without a futex we
+// can use to implement a slim lock. Note that since we are reusing iree_mutex_t
+// when tracing all slim mutexes will be traced along with the fat mutexes.
+
+// Fat-mutex passthroughs (see the note above about tracing implications).
+void iree_slim_mutex_initialize(iree_slim_mutex_t* out_mutex) {
+  iree_mutex_initialize(&out_mutex->impl);
+}
+
+void iree_slim_mutex_deinitialize(iree_slim_mutex_t* mutex) {
+  iree_mutex_deinitialize(&mutex->impl);
+}
+
+void iree_slim_mutex_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_mutex_lock(&mutex->impl);
+}
+
+// Returns true if the lock was acquired without blocking.
+bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  return iree_mutex_try_lock(&mutex->impl);
+}
+
+void iree_slim_mutex_unlock(iree_slim_mutex_t* mutex)
+    IREE_DISABLE_THREAD_SAFETY_ANALYSIS {
+  iree_mutex_unlock(&mutex->impl);
+}
+
+#endif  // IREE_PLATFORM_*
+
+#endif  //  IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_SLOW_LOCKS
+
+//==============================================================================
+// iree_notification_t
+//==============================================================================
+
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+// No-op implementation that is only used when there is guaranteed to be one
+// thread at a time touching IREE-related code. It is unsafe to use in any
+// situation where either IREE or a user of IREE has multiple threads!
+
+void iree_notification_initialize(iree_notification_t* out_notification) {
+  memset(out_notification, 0, sizeof(*out_notification));
+}
+
+void iree_notification_deinitialize(iree_notification_t* notification) {}
+
+void iree_notification_post(iree_notification_t* notification, int32_t count) {}
+
+// No waits can occur single-threaded; the token value is irrelevant.
+iree_wait_token_t iree_notification_prepare_wait(
+    iree_notification_t* notification) {
+  return (iree_wait_token_t)0;
+}
+
+// Always reports success: with one thread the condition cannot change while
+// "waiting" so there is nothing to block on.
+bool iree_notification_commit_wait(iree_notification_t* notification,
+                                   iree_wait_token_t wait_token,
+                                   iree_time_t deadline_ns) {
+  return true;
+}
+
+void iree_notification_cancel_wait(iree_notification_t* notification) {}
+
+#elif !defined(IREE_PLATFORM_HAS_FUTEX)
+
+// Emulation of a lock-free futex-backed notification using pthreads.
+// This is a normal cond-var-like usage with support for our prepare/cancel API
+// so that users can still perform their own wait logic.
+
+// Zeroes the epoch/waiter counters and creates the mutex+condvar pair that
+// emulates the futex path.
+void iree_notification_initialize(iree_notification_t* out_notification) {
+  memset(out_notification, 0, sizeof(*out_notification));
+  pthread_mutex_init(&out_notification->mutex, NULL);
+  pthread_cond_init(&out_notification->cond, NULL);
+}
+
+void iree_notification_deinitialize(iree_notification_t* notification) {
+  // Assert no more waiters (callers must tear down waiters first).
+  pthread_mutex_lock(&notification->mutex);
+  SYNC_ASSERT(notification->waiters == 0);
+  pthread_cond_destroy(&notification->cond);
+  pthread_mutex_unlock(&notification->mutex);
+  pthread_mutex_destroy(&notification->mutex);
+}
+
+// Advances the notification epoch and wakes up to |count| waiting threads
+// (IREE_ALL_WAITERS broadcasts to everyone).
+void iree_notification_post(iree_notification_t* notification, int32_t count) {
+  pthread_mutex_lock(&notification->mutex);
+  // Bump the epoch so both current and future waiters observe the post.
+  ++notification->epoch;
+  // Only perform signal syscalls when someone is actively listening.
+  if (notification->waiters > 0) {
+    if (count == IREE_ALL_WAITERS) {
+      pthread_cond_broadcast(&notification->cond);
+    } else {
+      int32_t remaining = count;
+      while (remaining-- > 0) {
+        pthread_cond_signal(&notification->cond);
+      }
+    }
+  }
+  pthread_mutex_unlock(&notification->mutex);
+}
+
+// Registers the calling thread as a waiter and snapshots the current epoch.
+// Both updates happen under the lock so a concurrent post either precedes
+// the snapshot (and the commit returns immediately) or will signal us.
+iree_wait_token_t iree_notification_prepare_wait(
+    iree_notification_t* notification) {
+  pthread_mutex_lock(&notification->mutex);
+  ++notification->waiters;
+  iree_wait_token_t wait_token = notification->epoch;
+  pthread_mutex_unlock(&notification->mutex);
+  return wait_token;
+}
+
+// Blocks until the epoch advances past |wait_token| or |deadline_ns| passes.
+// Returns true if notified and false on timeout (or other wait failure).
+bool iree_notification_commit_wait(iree_notification_t* notification,
+                                   iree_wait_token_t wait_token,
+                                   iree_time_t deadline_ns) {
+  // NOTE(review): this assumes |deadline_ns| shares the epoch/clock that
+  // pthread_cond_timedwait uses for its absolute timeout (CLOCK_REALTIME by
+  // default) - confirm against iree_timeout_as_deadline_ns.
+  struct timespec abs_ts = {
+      .tv_sec = (time_t)(deadline_ns / 1000000000ull),
+      .tv_nsec = (long)(deadline_ns % 1000000000ull),
+  };
+
+  pthread_mutex_lock(&notification->mutex);
+
+  // Spin until notified and the epoch increments from what we captured during
+  // iree_notification_prepare_wait.
+  bool result = true;
+  while (notification->epoch == wait_token) {
+    int ret = pthread_cond_timedwait(&notification->cond, &notification->mutex,
+                                     &abs_ts);
+    if (ret != 0) {
+      // Wait failed (timeout/etc); cancel the wait.
+      // This may happen in spurious wakes but that's fine - the caller is
+      // designed to handle looping again and may want the chance to do some
+      // bookkeeping while it has the thread.
+      result = false;
+      break;
+    }
+  }
+
+  // Remove us from the waiter list - the caller will need to reacquire a wait
+  // token if it wants to wait again.
+  SYNC_ASSERT(notification->waiters > 0);
+  --notification->waiters;
+
+  pthread_mutex_unlock(&notification->mutex);
+
+  return result;
+}
+
+// Deregisters a waiter that decided not to commit its wait (e.g. the
+// condition became true between prepare and commit).
+void iree_notification_cancel_wait(iree_notification_t* notification) {
+  pthread_mutex_lock(&notification->mutex);
+  SYNC_ASSERT(notification->waiters > 0);
+  --notification->waiters;
+  pthread_mutex_unlock(&notification->mutex);
+}
+
+#else
+
+// The 64-bit value used to atomically read-modify-write (RMW) the state is
+// split in two and treated as independent 32-bit ints:
+//
+//  MSB (63)                           32                               LSB (0)
+// +-------------------------------------+-------------------------------------+
+// |            epoch/notification count |                        waiter count |
+// +-------------------------------------+-------------------------------------+
+//
+// We use the epoch to wait/wake the futex (which is 32-bits), and as such when
+// we pass the value address to the futex APIs we need to ensure we are only
+// passing the most significant 32-bit value regardless of endianness.
+//
+// We use signed addition on the full 64-bit value to increment/decrement the
+// waiter count. This means that an add of -1ll will decrement the waiter count
+// and do nothing to the epoch count.
+#if defined(IREE_ENDIANNESS_LITTLE)
+#define IREE_NOTIFICATION_EPOCH_OFFSET (/*words=*/1)
+#else
+#define IREE_NOTIFICATION_EPOCH_OFFSET (/*words=*/0)
+#endif  // IREE_ENDIANNESS_*
+#define iree_notification_epoch_address(notification) \
+  ((iree_atomic_int32_t*)(&(notification)->value) +   \
+   IREE_NOTIFICATION_EPOCH_OFFSET)
+#define IREE_NOTIFICATION_WAITER_INC 0x0000000000000001ull
+#define IREE_NOTIFICATION_WAITER_DEC 0xFFFFFFFFFFFFFFFFull
+#define IREE_NOTIFICATION_WAITER_MASK 0x00000000FFFFFFFFull
+#define IREE_NOTIFICATION_EPOCH_SHIFT 32
+#define IREE_NOTIFICATION_EPOCH_INC \
+  (0x00000001ull << IREE_NOTIFICATION_EPOCH_SHIFT)
+
+// All-zeros means epoch 0 with no registered waiters.
+void iree_notification_initialize(iree_notification_t* out_notification) {
+  memset(out_notification, 0, sizeof(*out_notification));
+}
+
+void iree_notification_deinitialize(iree_notification_t* notification) {
+  // Assert no more waiters (callers must tear down waiters first).
+  SYNC_ASSERT(
+      (iree_atomic_load_int64(&notification->value, iree_memory_order_seq_cst) &
+       IREE_NOTIFICATION_WAITER_MASK) == 0);
+}
+
+// Advances the epoch (upper 32 bits) and wakes up to |count| waiters.
+void iree_notification_post(iree_notification_t* notification, int32_t count) {
+  uint64_t previous_value = iree_atomic_fetch_add_int64(
+      &notification->value, IREE_NOTIFICATION_EPOCH_INC,
+      iree_memory_order_acq_rel);
+  // Only issue the futex syscall if at least one waiter was registered at the
+  // time of the post; otherwise the wake would be wasted work.
+  if (IREE_UNLIKELY(previous_value & IREE_NOTIFICATION_WAITER_MASK)) {
+    iree_futex_wake(iree_notification_epoch_address(notification), count);
+  }
+}
+
+// Registers the caller as a waiter (low 32 bits) and returns the epoch
+// (high 32 bits) observed at registration time as the wait token.
+iree_wait_token_t iree_notification_prepare_wait(
+    iree_notification_t* notification) {
+  uint64_t previous_value = iree_atomic_fetch_add_int64(
+      &notification->value, IREE_NOTIFICATION_WAITER_INC,
+      iree_memory_order_acq_rel);
+  return (iree_wait_token_t)(previous_value >> IREE_NOTIFICATION_EPOCH_SHIFT);
+}
+
+// Blocks on the epoch word until it advances past |wait_token| or the
+// deadline is reached; returns true if notified and false on timeout.
+bool iree_notification_commit_wait(iree_notification_t* notification,
+                                   iree_wait_token_t wait_token,
+                                   iree_time_t deadline_ns) {
+  bool result = true;
+
+  // Spin until notified and the epoch increments from what we captured during
+  // iree_notification_prepare_wait.
+  while ((iree_atomic_load_int64(&notification->value,
+                                 iree_memory_order_acquire) >>
+          IREE_NOTIFICATION_EPOCH_SHIFT) == wait_token) {
+    // NOTE: we do an abs->rel conversion within the loop so that we can account
+    // for spurious wakes that may cause us to loop several times with waits of
+    // various time inbetween.
+    uint32_t timeout_ms = iree_absolute_deadline_to_timeout_ms(deadline_ns);
+    iree_status_code_t status_code = iree_futex_wait(
+        iree_notification_epoch_address(notification), wait_token, timeout_ms);
+    if (status_code != IREE_STATUS_OK) {
+      result = false;
+      break;
+    }
+  }
+
+  // TODO(benvanik): benchmark under real workloads.
+  // iree_memory_order_relaxed would suffice for correctness but the faster
+  // the waiter count gets to 0 the less likely we'll wake on the futex.
+  uint64_t previous_value = iree_atomic_fetch_add_int64(
+      &notification->value, IREE_NOTIFICATION_WAITER_DEC,
+      iree_memory_order_seq_cst);
+  SYNC_ASSERT((previous_value & IREE_NOTIFICATION_WAITER_MASK) != 0);
+
+  return result;
+}
+
+// Deregisters a waiter that decided not to commit its wait (e.g. the
+// condition became true between prepare and commit).
+void iree_notification_cancel_wait(iree_notification_t* notification) {
+  // TODO(benvanik): benchmark under real workloads.
+  // iree_memory_order_relaxed would suffice for correctness but the faster
+  // the waiter count gets to 0 the less likely we'll wake on the futex.
+  uint64_t previous_value = iree_atomic_fetch_add_int64(
+      &notification->value, IREE_NOTIFICATION_WAITER_DEC,
+      iree_memory_order_seq_cst);
+  SYNC_ASSERT((previous_value & IREE_NOTIFICATION_WAITER_MASK) != 0);
+}
+
+#endif  // DISABLED / HAS_FUTEX
+
+// Blocks until |condition_fn| returns true or |timeout| elapses.
+// Returns true if the condition was observed true and false on timeout.
+// Works with any of the notification backends above via the
+// prepare/commit/cancel wait protocol.
+bool iree_notification_await(iree_notification_t* notification,
+                             iree_condition_fn_t condition_fn,
+                             void* condition_arg, iree_timeout_t timeout) {
+  // Fast-path: condition already met, no wait bookkeeping needed.
+  if (IREE_LIKELY(condition_fn(condition_arg))) return true;
+
+  // A poll (immediate timeout) is done after the single check above.
+  if (iree_timeout_is_immediate(timeout)) return false;
+
+  // Convert to an absolute deadline so spurious wakes don't extend the total
+  // wait time across loop iterations.
+  const iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
+
+  // Slow-path: repeatedly register a wait, re-check, and block.
+  for (;;) {
+    iree_wait_token_t wait_token = iree_notification_prepare_wait(notification);
+    if (condition_fn(condition_arg)) {
+      // Condition became true between the check and registration; unwind the
+      // registration instead of blocking.
+      iree_notification_cancel_wait(notification);
+      return true;
+    }
+    if (!iree_notification_commit_wait(notification, wait_token,
+                                       deadline_ns)) {
+      // Wait hit the deadline before the condition was met.
+      return false;
+    }
+  }
+}
diff --git a/runtime/src/iree/base/internal/synchronization.h b/runtime/src/iree/base/internal/synchronization.h
new file mode 100644
index 0000000..45f3f59
--- /dev/null
+++ b/runtime/src/iree/base/internal/synchronization.h
@@ -0,0 +1,398 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: the best kind of synchronization is no synchronization; always try to
+// design your algorithm so that you don't need anything from this file :)
+// See https://travisdowns.github.io/blog/2020/07/06/concurrency-costs.html
+
+#ifndef IREE_BASE_INTERNAL_SYNCHRONIZATION_H_
+#define IREE_BASE_INTERNAL_SYNCHRONIZATION_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+// NOTE: clang cannot support thread annotations in C code due to some
+// representational bugs... which means that we can't use it here. Boo.
+// There's some workarounds I've seen but getting TSAN working would be much
+// easier as a starting point.
+#if 0  // defined(IREE_COMPILER_CLANG)
+#define IREE_THREAD_ANNOTATION_ATTRIBUTE(x) __attribute__((x))
+#else
+#define IREE_THREAD_ANNOTATION_ATTRIBUTE(x)
+#endif  // IREE_COMPILER_CLANG
+
+#ifdef __cplusplus
+// Documents if a shared field or global variable needs to be protected by a
+// mutex. IREE_GUARDED_BY() allows the user to specify a particular mutex that
+// should be held when accessing the annotated variable.
+#define IREE_GUARDED_BY(x) IREE_THREAD_ANNOTATION_ATTRIBUTE(guarded_by(x))
+#else
+#define IREE_GUARDED_BY(x)
+#endif  // __cplusplus
+
+#ifdef __cplusplus
+// Like IREE_GUARDED_BY but specifies that the contents of a pointer are guarded
+// by a mutex instead of the pointer itself.
+#define IREE_PTR_GUARDED_BY(x) \
+  IREE_THREAD_ANNOTATION_ATTRIBUTE(pt_guarded_by(x))
+#else
+#define IREE_PTR_GUARDED_BY(x)
+#endif  // __cplusplus
+
+// Allow users to fully disable all synchronization for systems that are known
+// to never need it. This removes our dependency on pthreads.
+#if !IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+// NOTE: we only support futex when not using tsan as we need to add annotations
+// for tsan to understand what we are doing.
+// https://github.com/llvm-mirror/compiler-rt/blob/master/include/sanitizer/tsan_interface.h
+#if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_EMSCRIPTEN) || \
+    defined(IREE_PLATFORM_LINUX) || defined(IREE_PLATFORM_WINDOWS)
+#if !defined(IREE_SANITIZER_THREAD)
+#define IREE_PLATFORM_HAS_FUTEX 1
+#endif  // !IREE_SANITIZER_THREAD
+#endif  // IREE_PLATFORM_*
+
+#if defined(IREE_PLATFORM_APPLE)
+#include <os/lock.h>
+#endif  // IREE_PLATFORM_APPLE
+
+#if !defined(IREE_PLATFORM_WINDOWS)
+#include <pthread.h>
+#endif  // !IREE_PLATFORM_WINDOWS
+
+// We have the CRITICAL_SECTION path for now but Slim Reader/Writer lock (SRW)
+// is much better (and what std::mutex uses). SRW doesn't spin, though, and has
+// some other implications that don't quite line up with pthread_mutex_t on most
+// platforms. Once we have larger end-to-end benchmarks we should choose based
+// on workloads.
+#define IREE_MUTEX_USE_WIN32_SRW 1
+
+#endif  // !IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define IREE_ALL_WAITERS INT32_MAX
+#define IREE_INFINITE_TIMEOUT_MS UINT32_MAX
+
+//==============================================================================
+// iree_mutex_t
+//==============================================================================
+
+// A normal fat mutex (ala std::mutex).
+// This may be implemented as a slim mutex on certain platforms but in the worst
+// case will be the native platform primitive (like pthread_mutex_t) and as such
+// should not be embedded in structures meant to be kept small.
+//
+// Windows: Slim Reader/Writer (SRW) Locks
+// All others: pthread_mutex_t
+typedef struct iree_mutex_t IREE_THREAD_ANNOTATION_ATTRIBUTE(
+    capability("mutex")) {
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+  // Synchronization compiled out; placeholder so the struct is non-empty.
+  int reserved;
+#elif defined(IREE_PLATFORM_WINDOWS) && defined(IREE_MUTEX_USE_WIN32_SRW)
+  SRWLOCK value;
+#elif defined(IREE_PLATFORM_WINDOWS)
+  CRITICAL_SECTION value;
+#else
+  pthread_mutex_t value;
+#endif  // IREE_PLATFORM_*
+#if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_SLOW_LOCKS)
+  // ID used by the lock-tracing integration to identify this mutex
+  // (presumably assigned in iree_mutex_initialize_impl — see above).
+  uint32_t lock_id;
+#endif  // IREE_TRACING_FEATURE_SLOW_LOCKS
+} iree_mutex_t;
+
+#if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_SLOW_LOCKS)
+// Initializes |out_mutex| to the well-defined unlocked contents.
+// Must be called prior to using any other iree_mutex_* method.
+#define iree_mutex_initialize(out_mutex)                                      \
+  static const iree_tracing_location_t TracyConcat(                           \
+      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__,     \
+                                            (uint32_t)__LINE__, 0};           \
+  iree_mutex_initialize_impl(&TracyConcat(__tracy_source_location, __LINE__), \
+                             out_mutex);
+void iree_mutex_initialize_impl(const iree_tracing_location_t* src_loc,
+                                iree_mutex_t* out_mutex);
+#else
+// Initializes |out_mutex| to the well-defined unlocked contents.
+// Must be called prior to using any other iree_mutex_* method.
+void iree_mutex_initialize(iree_mutex_t* out_mutex);
+#endif  // IREE_TRACING_FEATURE_SLOW_LOCKS
+
+// Deinitializes |mutex| (after a prior call to iree_mutex_initialize).
+// The mutex must not be held by any thread.
+void iree_mutex_deinitialize(iree_mutex_t* mutex)
+    IREE_THREAD_ANNOTATION_ATTRIBUTE(locks_excluded(mutex));
+
+// Locks the |mutex| and returns when held by the caller.
+void iree_mutex_lock(iree_mutex_t* mutex)
+    IREE_THREAD_ANNOTATION_ATTRIBUTE(acquire_capability(mutex));
+
+// Tries to lock the |mutex| and returns true if the caller holds the lock.
+bool iree_mutex_try_lock(iree_mutex_t* mutex)
+    IREE_THREAD_ANNOTATION_ATTRIBUTE(try_acquire_capability(true, mutex));
+
+// Unlocks the |mutex|, which must be held by the caller.
+void iree_mutex_unlock(iree_mutex_t* mutex)
+    IREE_THREAD_ANNOTATION_ATTRIBUTE(release_capability(mutex));
+
+//==============================================================================
+// iree_slim_mutex_t
+//==============================================================================
+
+// TODO(benvanik): instrument with tracy; need to capture source location on
+// init and add storage for ID.
+
+// A lightweight unfair lock.
+// Depending on platform this is significantly smaller than a mutex (4-8 bytes
+// vs 64+ bytes), can always be statically initialized/requires no allocations,
+// and performs the minimal amount of work possible while still playing nicely
+// with the OS thread scheduler.
+//
+// Unlike a full mutex these don't have the ability to be shared across
+// processes (not something we care about), don't have a way to define timeouts,
+// and have only a binary held/unheld state. They are often an order of
+// magnitude faster in uncontended/lightly-contended code and the same
+// performance in highly-contended code, though, so it's worth it for locks that
+// be guarding small data structures (queue pointers, etc) and touched from many
+// threads. Since they are so lightweight it's possible to embed them per-object
+// instead of per-manager and change from a single highly-contended lock to
+// thousands of almost completely uncontended slim locks.
+//
+// Though these locks support spinning they always have a fallback path that
+// ends up calling into the kernel to properly wait the thread. This is critical
+// to avoid pathological cases under contention and allowing for thread priority
+// inheritance when there are multiple threads competing that may otherwise be
+// scheduled in a potentially livelocking order.
+//
+// The "unfair" here comes from the fact that it's possible on certain platforms
+// for certain threads to never be able to acquire the lock in cases of
+// extremely high contention or widely disparate thread priority levels. This is
+// mitigated by ensuring only very small regions of code are guarded and that
+// there's enough work happening outside of the lock on any particular thread to
+// ensure that there's some chance of other threads being able to acquire it.
+//
+// MacOS/iOS: os_unfair_lock
+//   Spins and after a short backoff drops to a futex-like behavior of waiting
+//   in the kernel. Unfortunately real futexes aren't supported.
+// See:
+//   https://developer.apple.com/documentation/os/synchronization
+//   https://opensource.apple.com/source/libplatform/libplatform-125/src/os/lock.c.auto.html
+//
+// Emscripten: emscripten_futex_wait/emscripten_futex_wake
+//   Spins and after a short backoff drops to a futex-like behavior of waiting
+//   in the kernel.
+// See:
+//   https://github.com/emscripten-core/emscripten/blob/b43474f55aeb49083b9df74fdd0e52ec8decf788/system/include/emscripten/threading.h#L114-L120
+//   https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Atomics/wait
+//   https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Atomics/notify
+//
+// Windows: WaitOnAddress/WakeByAddress*
+//   Spins and after a short backoff drops to a futex and waits in the kernel.
+// See:
+//   https://docs.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-waitonaddress
+//   https://devblogs.microsoft.com/oldnewthing/20170601-00/?p=96265
+//
+// Linux/Android/others: futex
+//   Spins and after a short backoff drops to a futex and waits in the kernel.
+// See:
+//   http://locklessinc.com/articles/futex_cheat_sheet/
+//   https://man7.org/linux/man-pages/man2/futex.2.html
+//   https://eli.thegreenplace.net/2018/basics-of-futexes/
+//   https://bartoszmilewski.com/2008/09/01/thin-lock-vs-futex/
+typedef struct iree_slim_mutex_t IREE_THREAD_ANNOTATION_ATTRIBUTE(
+    capability("mutex")) {
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+  // Synchronization compiled out; placeholder so the struct is non-empty.
+  int reserved;
+#elif (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS)
+  iree_mutex_t impl;  // re-route to slow mutex
+#elif defined(IREE_PLATFORM_APPLE)
+  os_unfair_lock value;
+#elif defined(IREE_PLATFORM_WINDOWS) && defined(IREE_MUTEX_USE_WIN32_SRW)
+  SRWLOCK value;
+#elif defined(IREE_PLATFORM_HAS_FUTEX)
+  // Futex path: lock state word manipulated entirely with atomics.
+  iree_atomic_int32_t value;
+#else
+  iree_mutex_t impl;  // fallback
+#endif  // IREE_PLATFORM_*
+} iree_slim_mutex_t;
+
+#if (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS)
+// Initializes |out_mutex| to the well-defined unlocked contents.
+// Must be called prior to using any other iree_slim_mutex_* method.
+#define iree_slim_mutex_initialize(out_mutex)                             \
+  static const iree_tracing_location_t TracyConcat(                       \
+      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \
+                                            (uint32_t)__LINE__, 0};       \
+  iree_slim_mutex_initialize_impl(                                        \
+      &TracyConcat(__tracy_source_location, __LINE__), out_mutex);
+void iree_slim_mutex_initialize_impl(const iree_tracing_location_t* src_loc,
+                                     iree_slim_mutex_t* out_mutex);
+#else
+// Initializes |out_mutex| to the well-defined unlocked contents.
+// Must be called prior to using any other iree_slim_mutex_* method.
+//
+// Though optional (static initialization is fine) this is required to support
+// lock tracing. Assume it's (mostly) free and always call it if possible. This
+// also allows us to swap in a non-slim lock for enhanced debugging if we run
+// into threading issues.
+void iree_slim_mutex_initialize(iree_slim_mutex_t* out_mutex);
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FAST_LOCKS
+
+// Deinitializes |mutex| (after a prior call to iree_slim_mutex_initialize).
+// The mutex must not be held by any thread.
+void iree_slim_mutex_deinitialize(iree_slim_mutex_t* mutex)
+    IREE_THREAD_ANNOTATION_ATTRIBUTE(locks_excluded(mutex));
+
+// Locks the |mutex| and returns when held by the caller.
+void iree_slim_mutex_lock(iree_slim_mutex_t* mutex)
+    IREE_THREAD_ANNOTATION_ATTRIBUTE(acquire_capability(mutex));
+
+// Tries to lock the |mutex| and returns true if the caller holds the lock.
+bool iree_slim_mutex_try_lock(iree_slim_mutex_t* mutex)
+    IREE_THREAD_ANNOTATION_ATTRIBUTE(try_acquire_capability(true, mutex));
+
+// Unlocks the |mutex|, which must be held by the caller.
+void iree_slim_mutex_unlock(iree_slim_mutex_t* mutex)
+    IREE_THREAD_ANNOTATION_ATTRIBUTE(release_capability(mutex));
+
+//==============================================================================
+// iree_notification_t
+//==============================================================================
+
+// TODO(benvanik): add tracy support for watching the waits.
+
+// A lightweight wait-free cross-thread notification mechanism.
+// Classically called an 'event counter', these replace the use of condvars in
+// lock-free code where you wouldn't want to guard a lock-free data structure
+// with a lock.
+//
+// See:
+// http://www.1024cores.net/home/lock-free-algorithms/eventcounts
+// https://software.intel.com/en-us/forums/intel-threading-building-blocks/topic/299245
+// https://github.com/r10a/Event-Counts
+// https://github.com/facebook/folly/blob/master/folly/experimental/EventCount.h
+// https://github.com/concurrencykit/ck/blob/master/include/ck_ec.h
+typedef struct iree_notification_t {
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+  // Nothing required. Unused field to make compilers happy.
+  int reserved;
+#elif !defined(IREE_PLATFORM_HAS_FUTEX)
+  // No futex on darwin/when using TSAN, so use mutex/condvar instead.
+  pthread_mutex_t mutex;
+  pthread_cond_t cond;
+  uint32_t epoch;
+  uint32_t waiters;
+#else
+  // Futex path: packed word carrying the epoch and pending-waiter count (see
+  // the IREE_NOTIFICATION_WAITER_* constants in synchronization.c).
+  iree_atomic_int64_t value;
+#endif  // IREE_PLATFORM_*
+} iree_notification_t;
+
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+// NOTE(review): in this configuration the struct holds a plain `int reserved`,
+// so this relies on IREE_ATOMIC_VAR_INIT(0) expanding to something usable as
+// an int initializer — confirm against the atomics header.
+#define IREE_NOTIFICATION_INIT \
+  { IREE_ATOMIC_VAR_INIT(0) }
+#elif !defined(IREE_PLATFORM_HAS_FUTEX)
+// Mutex/condvar fallback: epoch and waiter count both start at 0.
+#define IREE_NOTIFICATION_INIT \
+  { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 }
+#else
+// Futex path: an all-zero word (presumably epoch 0, no waiters).
+#define IREE_NOTIFICATION_INIT \
+  { IREE_ATOMIC_VAR_INIT(0) }
+#endif  // notification type
+
+// Initializes a notification to no waiters and an initial epoch of 0.
+void iree_notification_initialize(iree_notification_t* out_notification);
+
+// Deinitializes |notification| (after a prior call to
+// iree_notification_initialize). No threads may be waiting on the notification.
+void iree_notification_deinitialize(iree_notification_t* notification);
+
+// Notifies up to |count| waiters of a change. Each waiter will wake and can
+// check to see if they need to do any additional work.
+// To notify all potential waiters pass IREE_ALL_WAITERS.
+//
+// Acts as (at least) a memory_order_release barrier:
+//   A store operation with this memory order performs the release operation: no
+//   reads or writes in the current thread can be reordered after this store.
+//   All writes in the current thread are visible in other threads that acquire
+//   the same atomic variable and writes that carry a dependency into the atomic
+//   variable become visible in other threads that consume the same atomic.
+void iree_notification_post(iree_notification_t* notification, int32_t count);
+
+typedef uint32_t iree_wait_token_t;  // opaque
+
+// Prepares for a wait operation, returning a token that must be passed to
+// iree_notification_commit_wait to perform the actual wait.
+//
+// Acts as a memory_order_acq_rel barrier:
+//   A read-modify-write operation with this memory order is both an acquire
+//   operation and a release operation. No memory reads or writes in the current
+//   thread can be reordered before or after this store. All writes in other
+//   threads that release the same atomic variable are visible before the
+//   modification and the modification is visible in other threads that acquire
+//   the same atomic variable.
+iree_wait_token_t iree_notification_prepare_wait(
+    iree_notification_t* notification);
+
+// Commits a pending wait operation when the caller has ensured it must wait.
+// Waiting will continue until a notification has been posted or |deadline_ns|
+// is reached. Returns false if the deadline is reached before a notification is
+// posted.
+//
+// Acts as (at least) a memory_order_acquire barrier:
+//   A load operation with this memory order performs the acquire operation on
+//   the affected memory location: no reads or writes in the current thread can
+//   be reordered before this load. All writes in other threads that release the
+//   same atomic variable are visible in the current thread.
+bool iree_notification_commit_wait(iree_notification_t* notification,
+                                   iree_wait_token_t wait_token,
+                                   iree_time_t deadline_ns);
+
+// Cancels a pending wait operation without blocking.
+//
+// Acts as (at least) a memory_order_relaxed barrier:
+//   Relaxed operation: there are no synchronization or ordering constraints
+//   imposed on other reads or writes, only this operation's atomicity is
+//   guaranteed.
+void iree_notification_cancel_wait(iree_notification_t* notification);
+
+// Returns true if the condition is true.
+// |arg| is the |condition_arg| passed to the await function.
+// Implementations must ensure they are coherent with their state values.
+typedef bool (*iree_condition_fn_t)(void* arg);
+
+// Blocks and waits until |condition_fn| returns true. Other threads must modify
+// state checked by the |condition_fn| and post the notification.
+// Returns true if the condition is true before |timeout| is reached. If the
+// timeout is infinite then the return will always be true.
+//
+// Example:
+//  thread 1:
+//   bool check_flag_pred(void* arg) {
+//     return iree_atomic_int32_load((iree_atomic_int32_t*)arg,
+//                                   iree_memory_order_acquire) == 1;
+//   }
+//   iree_atomic_int32_t* flag = ...;
+//   iree_notification_await(&notification, check_flag_pred, flag);
+//  thread 2:
+//   iree_atomic_int32_store(flag, 1, iree_memory_order_release);
+//   iree_notification_post(&notification, IREE_ALL_WAITERS);
+bool iree_notification_await(iree_notification_t* notification,
+                             iree_condition_fn_t condition_fn,
+                             void* condition_arg, iree_timeout_t timeout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_SYNCHRONIZATION_H_
diff --git a/runtime/src/iree/base/internal/synchronization_benchmark.cc b/runtime/src/iree/base/internal/synchronization_benchmark.cc
new file mode 100644
index 0000000..9bdc13a
--- /dev/null
+++ b/runtime/src/iree/base/internal/synchronization_benchmark.cc
@@ -0,0 +1,256 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cstddef>
+#include <mutex>
+
+#include "benchmark/benchmark.h"
+#include "iree/base/internal/synchronization.h"
+
+namespace {
+
+//==============================================================================
+// Inlined timing utils
+//==============================================================================
+
+// Spins for |count| * 10 increments of |*data| to emulate work we may be
+// doing while holding a lock (like swapping around some pointers).
+// DoNotOptimize keeps the compiler from collapsing the loop to a single add.
+void SpinDelay(int count, int* data) {
+  // Fix: the original compared a size_t induction variable against the signed
+  // expression `count * 10`, a sign-converting comparison that would spin
+  // (nearly) forever for a negative count. Keep everything signed instead and
+  // hoist the loop-invariant bound.
+  const int total_steps = count * 10;
+  for (int i = 0; i < total_steps; ++i) {
+    ++(*data);
+    benchmark::DoNotOptimize(*data);
+  }
+}
+
+//==============================================================================
+// iree_mutex_t / iree_slim_mutex_t
+//==============================================================================
+
+// Measures raw lock/unlock throughput on a single mutex shared by all
+// benchmark threads (registered below with Threads(1) and ThreadPerCpu).
+void BM_Mutex(benchmark::State& state) {
+  // Initialized exactly once via the static-local lambda and never
+  // deinitialized — leaking it avoids any thread tearing it down while other
+  // benchmark threads are still running.
+  static iree_mutex_t* mu = ([]() -> iree_mutex_t* {
+    auto mutex = new iree_mutex_t();
+    iree_mutex_initialize(mutex);
+    return mutex;
+  })();
+  for (auto _ : state) {
+    iree_mutex_lock(mu);
+    benchmark::DoNotOptimize(*mu);
+    iree_mutex_unlock(mu);
+  }
+}
+BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();
+
+template <typename MutexType>
+class RaiiLocker;
+
+// Scoped lock adapter so the benchmark templates below can treat
+// iree_mutex_t like any other mutex type: Initialize/Deinitialize wrap the C
+// lifecycle calls and the ctor/dtor hold the lock for the object's lifetime.
+// Thread-safety analysis is disabled since the C API carries the annotations.
+template <>
+class RaiiLocker<iree_mutex_t> {
+ public:
+  static void Initialize(iree_mutex_t* out_mu) {
+    iree_mutex_initialize(out_mu);
+  }
+  static void Deinitialize(iree_mutex_t* mu) { iree_mutex_deinitialize(mu); }
+  explicit RaiiLocker(iree_mutex_t* mu)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis)
+      : mu_(mu) {
+    iree_mutex_lock(mu_);
+  }
+  ~RaiiLocker() IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    iree_mutex_unlock(mu_);
+  }
+
+ private:
+  iree_mutex_t* mu_;
+};
+
+// Scoped lock adapter for iree_slim_mutex_t; mirrors the iree_mutex_t
+// specialization above so the benchmark templates stay type-agnostic.
+template <>
+class RaiiLocker<iree_slim_mutex_t> {
+ public:
+  static void Initialize(iree_slim_mutex_t* out_mu) {
+    iree_slim_mutex_initialize(out_mu);
+  }
+  static void Deinitialize(iree_slim_mutex_t* mu) {
+    iree_slim_mutex_deinitialize(mu);
+  }
+  explicit RaiiLocker(iree_slim_mutex_t* mu)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis)
+      : mu_(mu) {
+    iree_slim_mutex_lock(mu_);
+  }
+  ~RaiiLocker() IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    iree_slim_mutex_unlock(mu_);
+  }
+
+ private:
+  iree_slim_mutex_t* mu_;
+};
+
+// Scoped lock adapter for std::mutex, used as the comparison baseline in the
+// benchmarks below.
+template <>
+class RaiiLocker<std::mutex> {
+ public:
+  // std::mutex needs no explicit setup or teardown; these no-ops exist only
+  // so the benchmark templates can treat every mutex type uniformly.
+  static void Initialize(std::mutex* out_mu) {}
+  static void Deinitialize(std::mutex* mu) {}
+
+  // Acquires the lock on construction and releases it on destruction.
+  explicit RaiiLocker(std::mutex* mu) : guard_(*mu) {}
+
+ private:
+  std::lock_guard<std::mutex> guard_;
+};
+
+// Measures the cost of initializing and tearing down a mutex of the given
+// type on each benchmark iteration.
+template <typename MutexType>
+void BM_CreateDelete(benchmark::State& state) {
+  for (auto _ : state) {
+    MutexType mu;
+    RaiiLocker<MutexType>::Initialize(&mu);
+    benchmark::DoNotOptimize(mu);
+    RaiiLocker<MutexType>::Deinitialize(&mu);
+  }
+}
+
+BENCHMARK_TEMPLATE(BM_CreateDelete, iree_mutex_t)->UseRealTime()->Threads(1);
+
+BENCHMARK_TEMPLATE(BM_CreateDelete, iree_slim_mutex_t)
+    ->UseRealTime()
+    ->Threads(1);
+
+BENCHMARK_TEMPLATE(BM_CreateDelete, std::mutex)->UseRealTime()->Threads(1);
+
+// Measures lock/unlock cost on a mutex local to each benchmark thread
+// (registered below with Threads(1), so effectively uncontended).
+template <typename MutexType>
+void BM_Uncontended(benchmark::State& state) {
+  MutexType mu;
+  RaiiLocker<MutexType>::Initialize(&mu);
+  int data = 0;
+  int local = 0;
+  for (auto _ : state) {
+    // Here we model both local work outside of the critical section as well as
+    // some work inside of the critical section. The idea is to capture some
+    // more or less realistic contention levels.
+    // If contention is too low, the benchmark won't measure anything useful.
+    // If contention is unrealistically high, the benchmark will favor
+    // bad mutex implementations that block and otherwise distract threads
+    // from the mutex and shared state for as much as possible.
+    // To achieve this amount of local work is multiplied by number of threads
+    // to keep ratio between local work and critical section approximately
+    // equal regardless of number of threads.
+    SpinDelay(100 * state.threads(), &local);
+    RaiiLocker<MutexType> locker(&mu);
+    SpinDelay(static_cast<int>(state.range(0)), &data);
+  }
+  // Fix: release any OS resources held by the mutex — the original
+  // initialized it but never deinitialized it (a leak for pthread-backed
+  // mutexes).
+  RaiiLocker<MutexType>::Deinitialize(&mu);
+}
+
+BENCHMARK_TEMPLATE(BM_Uncontended, iree_mutex_t)
+    ->UseRealTime()
+    ->Threads(1)
+    ->Arg(50)
+    ->Arg(200);
+
+BENCHMARK_TEMPLATE(BM_Uncontended, iree_slim_mutex_t)
+    ->UseRealTime()
+    ->Threads(1)
+    ->Arg(50)
+    ->Arg(200);
+
+BENCHMARK_TEMPLATE(BM_Uncontended, std::mutex)
+    ->UseRealTime()
+    ->Threads(1)
+    ->Arg(50)
+    ->Arg(200);
+
+// Measures lock/unlock cost on a single mutex shared by all benchmark threads
+// (registered below with up to 96 threads).
+template <typename MutexType>
+void BM_Contended(benchmark::State& state) {
+  struct Shared {
+    MutexType mu;
+    int data = 0;
+    Shared() { RaiiLocker<MutexType>::Initialize(&mu); }
+  };
+  // Never freed: leaking the shared state avoids any thread tearing it down
+  // while other benchmark threads are still running.
+  static auto* shared = new Shared();
+  int local = 0;
+  for (auto _ : state) {
+    // Here we model both local work outside of the critical section as well as
+    // some work inside of the critical section. The idea is to capture some
+    // more or less realistic contention levels.
+    // If contention is too low, the benchmark won't measure anything useful.
+    // If contention is unrealistically high, the benchmark will favor
+    // bad mutex implementations that block and otherwise distract threads
+    // from the mutex and shared state for as much as possible.
+    // To achieve this amount of local work is multiplied by number of threads
+    // to keep ratio between local work and critical section approximately
+    // equal regardless of number of threads.
+    SpinDelay(100 * state.threads(), &local);
+    RaiiLocker<MutexType> locker(&shared->mu);
+    SpinDelay(static_cast<int>(state.range(0)), &shared->data);
+  }
+}
+
+BENCHMARK_TEMPLATE(BM_Contended, iree_mutex_t)
+    ->UseRealTime()
+    // ThreadPerCpu poorly handles non-power-of-two CPU counts.
+    ->Threads(1)
+    ->Threads(2)
+    ->Threads(4)
+    ->Threads(6)
+    ->Threads(8)
+    ->Threads(12)
+    ->Threads(16)
+    ->Threads(24)
+    ->Threads(32)
+    ->Threads(48)
+    ->Threads(64)
+    ->Threads(96)
+    // Some empirically chosen amounts of work in critical section.
+    // 1 is low contention, 200 is high contention and few values in between.
+    ->Arg(50)
+    ->Arg(200);
+
+BENCHMARK_TEMPLATE(BM_Contended, iree_slim_mutex_t)
+    ->UseRealTime()
+    // ThreadPerCpu poorly handles non-power-of-two CPU counts.
+    ->Threads(1)
+    ->Threads(2)
+    ->Threads(4)
+    ->Threads(6)
+    ->Threads(8)
+    ->Threads(12)
+    ->Threads(16)
+    ->Threads(24)
+    ->Threads(32)
+    ->Threads(48)
+    ->Threads(64)
+    ->Threads(96)
+    // Some empirically chosen amounts of work in critical section.
+    // 1 is low contention, 200 is high contention and few values in between.
+    ->Arg(50)
+    ->Arg(200);
+
+BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
+    ->UseRealTime()
+    // ThreadPerCpu poorly handles non-power-of-two CPU counts.
+    ->Threads(1)
+    ->Threads(2)
+    ->Threads(4)
+    ->Threads(6)
+    ->Threads(8)
+    ->Threads(12)
+    ->Threads(16)
+    ->Threads(24)
+    ->Threads(32)
+    ->Threads(48)
+    ->Threads(64)
+    ->Threads(96)
+    // Some empirically chosen amounts of work in critical section.
+    // 1 is low contention, 200 is high contention and few values in between.
+    ->Arg(50)
+    ->Arg(200);
+
+//==============================================================================
+// iree_notification_t
+//==============================================================================
+
+// TODO(benvanik): benchmark this; it should in the worst case be as bad as
+// mutex/futex (as that's what is used), but at the moment we don't really
+// care beyond that.
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/synchronization_test.cc b/runtime/src/iree/base/internal/synchronization_test.cc
new file mode 100644
index 0000000..a44b994
--- /dev/null
+++ b/runtime/src/iree/base/internal/synchronization_test.cc
@@ -0,0 +1,218 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/synchronization.h"
+
+#include <thread>
+
+#include "iree/testing/gtest.h"
+
+namespace {
+
+//==============================================================================
+// Test utils
+//==============================================================================
+
+template <typename T>
+class Mutex;
+
+// Static wrapper exposing the iree_mutex_t C API to the templated tests
+// below. Thread-safety analysis is disabled on the lock/unlock shims since
+// the annotations live on the C functions themselves.
+template <>
+class Mutex<iree_mutex_t> {
+ public:
+  static void Initialize(iree_mutex_t* out_mu) {
+    iree_mutex_initialize(out_mu);
+  }
+  static void Deinitialize(iree_mutex_t* mu) { iree_mutex_deinitialize(mu); }
+  static void Lock(iree_mutex_t* mu)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    iree_mutex_lock(mu);
+  }
+  static bool TryLock(iree_mutex_t* mu)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    return iree_mutex_try_lock(mu);
+  }
+  static void Unlock(iree_mutex_t* mu)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    iree_mutex_unlock(mu);
+  }
+};
+
+// Static wrapper exposing the iree_slim_mutex_t C API to the templated tests
+// below; mirrors the iree_mutex_t specialization above.
+template <>
+class Mutex<iree_slim_mutex_t> {
+ public:
+  static void Initialize(iree_slim_mutex_t* out_mu) {
+    iree_slim_mutex_initialize(out_mu);
+  }
+  static void Deinitialize(iree_slim_mutex_t* mu) {
+    iree_slim_mutex_deinitialize(mu);
+  }
+  static void Lock(iree_slim_mutex_t* mu)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    iree_slim_mutex_lock(mu);
+  }
+  static bool TryLock(iree_slim_mutex_t* mu)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    return iree_slim_mutex_try_lock(mu);
+  }
+  static void Unlock(iree_slim_mutex_t* mu)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    iree_slim_mutex_unlock(mu);
+  }
+};
+
+// Tests that a mutex allows exclusive access to a region by touching it from
+// multiple threads.
+template <typename T>
+void TestMutexExclusiveAccess() {
+  // We'll increment the counter back and forth as we touch it from multiple
+  // threads.
+  int counter = 0;
+
+  T mu;
+  Mutex<T>::Initialize(&mu);
+
+  // Hold the lock at the start. The threads should block waiting for the lock
+  // to be released so they can take it.
+  ASSERT_EQ(0, counter);
+  Mutex<T>::Lock(&mu);
+
+  // Start up a thread to ++counter (it should block since we hold the lock).
+  std::thread th1([&]() {
+    Mutex<T>::Lock(&mu);
+    ++counter;
+    Mutex<T>::Unlock(&mu);
+  });
+
+  // Unlock and wait for the thread to acquire the lock and finish its work.
+  // NOTE: the counter check must happen before the unlock — after it the
+  // thread may increment at any time.
+  ASSERT_EQ(0, counter);
+  Mutex<T>::Unlock(&mu);
+  th1.join();
+
+  // Thread should have been able to increment the counter.
+  ASSERT_EQ(1, counter);
+
+  Mutex<T>::Deinitialize(&mu);
+}
+
+// Tests that try lock bails when the lock is held by another thread.
+// Tests that TryLock fails (without blocking) while the lock is held by
+// another thread.
+template <typename T>
+void TestMutexExclusiveAccessTryLock() {
+  int counter = 0;
+  T mu;
+  Mutex<T>::Initialize(&mu);
+
+  // Hold the lock at the start. The try lock should fail and the thread should
+  // exit without changing the counter value.
+  ASSERT_EQ(0, counter);
+  Mutex<T>::Lock(&mu);
+  std::thread th1([&]() {
+    if (Mutex<T>::TryLock(&mu)) {
+      ++counter;
+      Mutex<T>::Unlock(&mu);
+    }
+  });
+
+  // Wait for the thread to try (and fail). Joining before unlocking
+  // guarantees the TryLock ran while we still held the lock.
+  th1.join();
+  Mutex<T>::Unlock(&mu);
+
+  // The thread should not have been able to change the counter.
+  ASSERT_EQ(0, counter);
+
+  Mutex<T>::Deinitialize(&mu);
+}
+
+//==============================================================================
+// iree_mutex_t
+//==============================================================================
+
+// Exercises the full single-threaded lifecycle:
+// initialize → try_lock/unlock → lock/unlock → deinitialize.
+TEST(MutexTest, Lifetime) {
+  iree_mutex_t mutex;
+  iree_mutex_initialize(&mutex);
+  // try_lock on an uncontended mutex must succeed.
+  bool did_lock = iree_mutex_try_lock(&mutex);
+  EXPECT_TRUE(did_lock);
+  if (did_lock) iree_mutex_unlock(&mutex);
+  iree_mutex_lock(&mutex);
+  iree_mutex_unlock(&mutex);
+  iree_mutex_deinitialize(&mutex);
+}
+
+TEST(MutexTest, ExclusiveAccess) { TestMutexExclusiveAccess<iree_mutex_t>(); }
+
+TEST(MutexTest, ExclusiveAccessTryLock) {
+  TestMutexExclusiveAccessTryLock<iree_mutex_t>();
+}
+
+//==============================================================================
+// iree_slim_mutex_t
+//==============================================================================
+
+// Exercises the full single-threaded lifecycle of a slim mutex; mirrors
+// MutexTest.Lifetime above.
+TEST(SlimMutexTest, Lifetime) {
+  iree_slim_mutex_t mutex;
+  iree_slim_mutex_initialize(&mutex);
+  // try_lock on an uncontended mutex must succeed.
+  bool did_lock = iree_slim_mutex_try_lock(&mutex);
+  EXPECT_TRUE(did_lock);
+  if (did_lock) iree_slim_mutex_unlock(&mutex);
+  iree_slim_mutex_lock(&mutex);
+  iree_slim_mutex_unlock(&mutex);
+  iree_slim_mutex_deinitialize(&mutex);
+}
+
+TEST(SlimMutexTest, ExclusiveAccess) {
+  TestMutexExclusiveAccess<iree_slim_mutex_t>();
+}
+
+TEST(SlimMutexTest, ExclusiveAccessTryLock) {
+  TestMutexExclusiveAccessTryLock<iree_slim_mutex_t>();
+}
+
+//==============================================================================
+// iree_notification_t
+//==============================================================================
+
+// Tested implicitly in threading_test.cc.
+
+// An immediate (polling) timeout with a never-true condition must return
+// false without blocking.
+TEST(NotificationTest, TimeoutImmediate) {
+  iree_notification_t notification;
+  iree_notification_initialize(&notification);
+
+  iree_time_t start_ns = iree_time_now();
+
+  EXPECT_FALSE(iree_notification_await(
+      &notification,
+      +[](void* entry_arg) -> bool {
+        return false;  // condition is never true
+      },
+      NULL, iree_immediate_timeout()));
+
+  // The await should have returned (nearly) instantly; 50ms is generous slop
+  // for slow/loaded CI machines.
+  iree_duration_t delta_ns = iree_time_now() - start_ns;
+  iree_duration_t delta_ms = delta_ns / 1000000;
+  EXPECT_LT(delta_ms, 50);  // slop
+
+  iree_notification_deinitialize(&notification);
+}
+
+// A real (100ms) timeout with a never-true condition must return false after
+// actually waiting.
+TEST(NotificationTest, Timeout) {
+  iree_notification_t notification;
+  iree_notification_initialize(&notification);
+
+  iree_time_t start_ns = iree_time_now();
+
+  EXPECT_FALSE(iree_notification_await(
+      &notification,
+      +[](void* entry_arg) -> bool {
+        return false;  // condition is never true
+      },
+      NULL, iree_make_timeout_ms(100)));
+
+  // Only require that *some* real waiting happened: the lower bound of 50ms
+  // (vs the requested 100ms) leaves slop for coarse timers/early wakes.
+  iree_duration_t delta_ns = iree_time_now() - start_ns;
+  iree_duration_t delta_ms = delta_ns / 1000000;
+  EXPECT_GE(delta_ms, 50);  // slop
+
+  iree_notification_deinitialize(&notification);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/threading.c b/runtime/src/iree/base/internal/threading.c
new file mode 100644
index 0000000..699941d
--- /dev/null
+++ b/runtime/src/iree/base/internal/threading.c
@@ -0,0 +1,167 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/threading.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/threading_impl.h"
+
+// Bounded string copy: copies at most |count| characters of |src| into |dest|
+// (capacity |destsz| including the terminator) and NUL-terminates on success.
+// Returns 0 on success, EINVAL on bad arguments, ERANGE when the source would
+// not fit. On MSVC / C11 Annex K systems this forwards to the native
+// strncpy_s; otherwise a portable fallback with approximately the same
+// semantics is used — NOTE(review): exact Annex K edge-case parity has not
+// been verified; confirm if this gains more callers.
+int iree_strncpy_s(char* IREE_RESTRICT dest, size_t destsz,
+                   const char* IREE_RESTRICT src, size_t count) {
+#if defined(IREE_COMPILER_MSVC) || defined(__STDC_LIB_EXT1__)
+  return strncpy_s(dest, destsz, src, count);
+#else
+  // Invalid arguments: NULL pointers or a zero-capacity destination.
+  if (!src || !dest || !destsz) return EINVAL;
+  // Source length clamped to the buffer capacity; strnlen never reads more
+  // than destsz characters.
+  size_t src_len = strnlen(src, destsz);
+  // The requested copy cannot fit with a NUL terminator.
+  if (count >= destsz && destsz <= src_len) return ERANGE;
+  if (src_len > count) src_len = count;  // truncate to the requested count
+  while (*src != 0 && src_len > 0) {
+    *(dest++) = *(src++);
+    --src_len;
+  }
+  *dest = 0;  // always terminate what was copied
+  return 0;
+#endif  // IREE_COMPILER_MSVC || __STDC_LIB_EXT1__
+}
+
+//==============================================================================
+// iree_thread_affinity_t
+//==============================================================================
+
+// TODO(benvanik): add more helpers and possibly move cpuinfo usage into here.
+
+// Zeroing the struct clears the `specified` bit, which the platform
+// implementations treat as "no affinity requested" (they early-out before
+// making any syscalls).
+void iree_thread_affinity_set_any(iree_thread_affinity_t* out_thread_affinity) {
+  memset(out_thread_affinity, 0x00, sizeof(*out_thread_affinity));
+}
+
+//==============================================================================
+// iree_thread_override_list_t
+//==============================================================================
+// This is shared by multiple platform implementations and gets stripped in LTO
+// when unused.
+
+// A single active priority override. Doubly-linked into its owning list so
+// that removal from the middle of the list is O(1).
+struct iree_thread_override_t {
+  iree_thread_override_list_t* list;  // owning list, used on removal
+  iree_thread_override_t* next;       // next override in the list (or NULL)
+  iree_thread_override_t* prev;       // previous override in the list (or NULL)
+  iree_thread_t* thread;              // thread whose priority is overridden
+  iree_thread_priority_class_t priority_class;  // class requested by this override
+};
+
+// Initializes |out_list| with an empty override chain; the effective priority
+// starts at |base_priority_class| and is recomputed as overrides are
+// added/removed.
+void iree_thread_override_list_initialize(
+    iree_thread_set_priority_fn_t set_priority_fn,
+    iree_thread_priority_class_t base_priority_class,
+    iree_allocator_t allocator, iree_thread_override_list_t* out_list) {
+  memset(out_list, 0, sizeof(*out_list));
+  out_list->set_priority_fn = set_priority_fn;
+  out_list->base_priority_class = base_priority_class;
+  out_list->allocator = allocator;
+  iree_slim_mutex_initialize(&out_list->mutex);
+  // No overrides yet: the effective class is just the base class.
+  out_list->current_priority_class = base_priority_class;
+}
+
+// Deinitializes |list|; callers must have removed (and thereby freed) all
+// overrides first — outstanding overrides would leak and dangle.
+void iree_thread_override_list_deinitialize(iree_thread_override_list_t* list) {
+#if !defined(NDEBUG)
+  // Assert that all overrides have been removed (and properly freed).
+  iree_slim_mutex_lock(&list->mutex);
+  assert(!list->head);
+  iree_slim_mutex_unlock(&list->mutex);
+#endif  // !NDEBUG
+
+  iree_slim_mutex_deinitialize(&list->mutex);
+}
+
+// Updates the priority class of the thread to the maximum across all overrides
+// and the base thread priority class.
+//
+// NOTE: assumes the lock is held so the list can be safely walked.
+// Updates the priority class of the thread to the maximum across all overrides
+// and the base thread priority class.
+//
+// NOTE: assumes the lock is held so the list can be safely walked.
+static void iree_thread_override_list_update_priority_class(
+    iree_thread_override_list_t* list, iree_thread_t* thread) {
+  // Compute the new maximum priority class with the override now added.
+  // Relies on iree_thread_priority_class_t values being ordered numerically.
+  iree_thread_priority_class_t max_priority_class = list->base_priority_class;
+  for (iree_thread_override_t* override = list->head; override != NULL;
+       override = override->next) {
+    max_priority_class = iree_max(max_priority_class, override->priority_class);
+  }
+  bool needs_update = max_priority_class != list->current_priority_class;
+  list->current_priority_class = max_priority_class;
+
+  // Change priority if needed (this way we are avoiding syscalls if we get a
+  // wave of overrides at the same priority class).
+  //
+  // NOTE: we do this inside the lock so that we don't lose priorities. It'd be
+  // nice to do this outside the lock if we could so we aren't holding it during
+  // a syscall. Overrides should (hopefully) be infrequent enough that this is
+  // rarely called.
+  if (needs_update) {
+    list->set_priority_fn(thread, max_priority_class);
+  }
+}
+
+// Adds a new override for |thread| at |priority_class| and raises the thread
+// priority if the new override increases the effective maximum. Returns the
+// allocated override token (to be released via
+// iree_thread_override_remove_self) or NULL if allocation failed.
+iree_thread_override_t* iree_thread_override_list_add(
+    iree_thread_override_list_t* list, iree_thread_t* thread,
+    iree_thread_priority_class_t priority_class) {
+  // Allocate the override struct we'll pass back to the caller.
+  iree_thread_override_t* override = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      list->allocator, sizeof(*override), (void**)&override);
+  // Allocation failure is surfaced as NULL rather than a status; the status
+  // payload (if any) is consumed so it does not leak.
+  if (IREE_UNLIKELY(!iree_status_is_ok(iree_status_consume_code(status)))) {
+    return NULL;
+  }
+  override->list = list;
+  override->next = NULL;
+  override->prev = NULL;
+  override->thread = thread;
+  override->priority_class = priority_class;
+
+  iree_slim_mutex_lock(&list->mutex);
+
+  // Add the override to the list (push at head).
+  override->next = list->head;
+  if (list->head) {
+    list->head->prev = override;
+  }
+  list->head = override;
+
+  // Update and change priority if needed.
+  // NOTE: the lock must be held.
+  iree_thread_override_list_update_priority_class(list, thread);
+
+  iree_slim_mutex_unlock(&list->mutex);
+
+  return override;
+}
+
+// Unlinks |override| from its owning list, lowers the thread priority if this
+// was the highest remaining override, and frees the override memory.
+void iree_thread_override_remove_self(iree_thread_override_t* override) {
+  iree_thread_override_list_t* list = override->list;
+  iree_slim_mutex_lock(&list->mutex);
+
+  // Remove the override from the list (handles head/middle/tail positions).
+  if (override->prev) {
+    override->prev->next = override->next;
+  }
+  if (override->next) {
+    override->next->prev = override->prev;
+  }
+  if (list->head == override) {
+    list->head = override->next;
+  }
+
+  // Update and change priority if needed.
+  // NOTE: the lock must be held.
+  iree_thread_t* thread = override->thread;
+  iree_thread_override_list_update_priority_class(list, thread);
+
+  iree_slim_mutex_unlock(&list->mutex);
+
+  // Deallocate the override outside of the lock as no one should be using it
+  // anymore.
+  iree_allocator_free(list->allocator, override);
+}
diff --git a/runtime/src/iree/base/internal/threading.h b/runtime/src/iree/base/internal/threading.h
new file mode 100644
index 0000000..1518fd0
--- /dev/null
+++ b/runtime/src/iree/base/internal/threading.h
@@ -0,0 +1,179 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_THREADING_H_
+#define IREE_BASE_INTERNAL_THREADING_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//==============================================================================
+// iree_thread_t
+//==============================================================================
+
+typedef struct iree_thread_t iree_thread_t;
+
+// Specifies a thread's priority class.
+// These translate roughly to the same thing across all platforms, though they
+// are just a hint and the schedulers on various platforms may behave very
+// differently. When in doubt prefer to write code that works at the extremes
+// of the classes.
+// NOTE: values are ordered numerically from lowest to highest priority so
+// that the numeric max of two classes is the higher priority (the override
+// list folds overrides together this way).
+typedef enum iree_thread_priority_class_e {
+  // Lowest possible priority used for background/idle work.
+  // Maps to QOS_CLASS_BACKGROUND.
+  IREE_THREAD_PRIORITY_CLASS_LOWEST = -2,
+  // Low priority work but still something the user expects to complete soon.
+  // Maps to QOS_CLASS_UTILITY.
+  IREE_THREAD_PRIORITY_CLASS_LOW = -1,
+  // Normal/default priority for the system.
+  // Maps to QOS_CLASS_DEFAULT.
+  IREE_THREAD_PRIORITY_CLASS_NORMAL = 0,
+  // High priority work for operations the user is waiting on.
+  // Maps to QOS_CLASS_USER_INITIATED.
+  IREE_THREAD_PRIORITY_CLASS_HIGH = 1,
+  // Highest possible priority used for interactive work.
+  // Maps to QOS_CLASS_USER_INTERACTIVE.
+  IREE_THREAD_PRIORITY_CLASS_HIGHEST = 2,
+} iree_thread_priority_class_t;
+
+// Specifies the processor affinity for a particular thread.
+// Each platform handles this differently (if at all).
+//
+// macOS/iOS:
+//   Only affinity tags are supported; the ID will be used by the kernel to
+//   group threads that having matching values together and (hopefully) schedule
+//   them on cores that may share some level of the cache hierarchy. The API is
+//   effectively just asking nicely and hoping the kernel is on the same
+//   wavelength.
+//
+// Linux/Android:
+//   sched_setaffinity is used to pin the thread to the core with the given ID.
+//   There are, naturally, issues on Android where if the governor has turned
+//   off some cores (such as powering down big cores in an ARM big.LITTLE
+//   configuration) the affinity request will be dropped on the floor even if
+//   the cores are later enabled. This is one of the reasons why we note in
+//   iree_thread_request_affinity that requests may need to be made at
+//   ¯\_(ツ)_/¯ intervals. In the future we can try to hook into power
+//   management infra to see if we can tell when we need to do this.
+//
+// Windows:
+//   Stuff just works. Love it.
+typedef struct iree_thread_affinity_t {
+  // 1 if an affinity has been explicitly specified; 0 means "any processor"
+  // (see iree_thread_affinity_set_any) and implementations ignore the rest.
+  uint32_t specified : 1;
+  // Presumably selects the SMT/hyper-threading sibling of the target core —
+  // NOTE(review): not read by the platform code visible here; confirm.
+  uint32_t smt : 1;
+  // Presumably a processor-group index (e.g. Windows processor groups) —
+  // NOTE(review): not read by the platform code visible here; confirm.
+  uint32_t group : 7;
+  // Platform-defined processor ID/affinity tag (used as a mach affinity tag
+  // on darwin).
+  uint32_t id : 23;
+} iree_thread_affinity_t;
+
+// Sets |thread_affinity| to match with any processor in the system.
+void iree_thread_affinity_set_any(iree_thread_affinity_t* out_thread_affinity);
+
+// Thread creation parameters.
+// All are optional and the entire struct can safely be zero-initialized.
+typedef struct iree_thread_create_params_t {
+  // Developer-visible name for the thread displayed in tooling.
+  // May be omitted for the system-default name (usually thread ID).
+  // The name is copied (and truncated to a small fixed size) into the thread
+  // handle at creation time; the caller's storage need not outlive the call.
+  iree_string_view_t name;
+
+  // Stack size of the new thread, in bytes. If omitted a platform-defined
+  // default system stack size will be used.
+  size_t stack_size;
+
+  // Whether to create the thread in a suspended state. The thread will be
+  // initialized but not call the entry routine until it is resumed with
+  // iree_thread_resume. This can be useful to avoid a thundering herd upon
+  // creation of many threads.
+  bool create_suspended;
+
+  // Initial priority class.
+  // This may be changed later via iree_thread_priority_class_override_begin;
+  // see that for more information.
+  iree_thread_priority_class_t priority_class;
+
+  // Initial thread affinity.
+  // This may be changed later via iree_thread_request_affinity; see that for
+  // more information.
+  iree_thread_affinity_t initial_affinity;
+} iree_thread_create_params_t;
+
+typedef int (*iree_thread_entry_t)(void* entry_arg);
+
+// Creates a new thread and calls |entry| with |entry_arg|.
+// |params| can be used to specify additional thread creation parameters but can
+// also be zero-initialized to use defaults.
+//
+// The thread will be created and configured prior to returning from the
+// function. If the create_suspended parameter is set the thread will be
+// suspended and must be resumed with iree_thread_resume. Otherwise, the thread
+// may already be inside of the |entry| function by the time the function
+// returns.
+//
+// |entry_arg| lifetime is not managed and unless the caller is waiting for the
+// thread to start must not be stack-allocated.
+iree_status_t iree_thread_create(iree_thread_entry_t entry, void* entry_arg,
+                                 iree_thread_create_params_t params,
+                                 iree_allocator_t allocator,
+                                 iree_thread_t** out_thread);
+
+// Retains the given |thread| for the caller.
+void iree_thread_retain(iree_thread_t* thread);
+
+// Releases the given |thread| from the caller.
+void iree_thread_release(iree_thread_t* thread);
+
+// Returns a platform-defined thread ID for the given |thread|.
+uintptr_t iree_thread_id(iree_thread_t* thread);
+
+typedef struct iree_thread_override_t iree_thread_override_t;
+
+// Begins overriding the priority class of the given |thread|.
+// The priority of the thread will be the max of the base priority and the
+// overridden priority. Callers must pass the returned override token to
+// iree_thread_override_end.
+iree_thread_override_t* iree_thread_priority_class_override_begin(
+    iree_thread_t* thread, iree_thread_priority_class_t priority_class);
+
+// Ends a priority class override that was began for a thread with
+// iree_thread_priority_class_override_begin.
+void iree_thread_override_end(iree_thread_override_t* override_token);
+
+// Updates the thread affinity of the given |thread|.
+// Affinities are not sticky and may need to be refreshed over time as CPUs are
+// enabled/disabled by the OS (such as power mode changes, governor adjustments,
+// etc). Users wanting to ensure threads have specific affinities may want to
+// request updates whenever new large amounts of work are about to be performed.
+//
+// NOTE: thread affinities are just a hint. The OS scheduler is free to do
+// whatever it wants up to and including entirely ignoring the specified
+// affinity. In many cases where cores are oversubscribed setting an affinity
+// mask can pessimize battery/thermals/performance as the OS will sometimes try
+// to shuffle around threads to disable physical cores/etc.
+//
+// Compatibility warning: Apple/darwin only support affinity groups, with each
+// unique affinity sharing time with all others of the same value. This means
+// that trying to get clever with several thread sets with overlapping
+// affinities will likely not work as expected. Try to stick with threads that
+// run only on a single processor.
+void iree_thread_request_affinity(iree_thread_t* thread,
+                                  iree_thread_affinity_t affinity);
+
+// Resumes |thread| if it was created suspended.
+// This has no effect if the thread is not suspended.
+void iree_thread_resume(iree_thread_t* thread);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_THREADING_H_
diff --git a/runtime/src/iree/base/internal/threading_darwin.c b/runtime/src/iree/base/internal/threading_darwin.c
new file mode 100644
index 0000000..7f3bd00
--- /dev/null
+++ b/runtime/src/iree/base/internal/threading_darwin.c
@@ -0,0 +1,250 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: must be first to ensure that we can define settings for all includes.
+#include "iree/base/internal/threading_impl.h"
+
+#if defined(IREE_PLATFORM_APPLE)
+
+#include <errno.h>
+#include <mach/mach.h>
+#include <mach/thread_act.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/threading.h"
+#include "iree/base/tracing.h"
+
+// Useful to see how pthreads is implemented on (old) darwin:
+// https://opensource.apple.com/source/Libc/Libc-825.40.1/pthreads/pthread.c.auto.html
+
+// Darwin thread handle; reference-counted so the creating thread and the
+// thread itself can independently release it.
+struct iree_thread_t {
+  iree_atomic_ref_count_t ref_count;  // held by both creator and the thread
+  iree_allocator_t allocator;         // used to free this struct on delete
+
+  char name[16];          // truncated copy of the requested thread name
+  pthread_t handle;       // pthread handle from pthread_create*
+  mach_port_t mach_port;  // mach port for thread_policy_set/thread_resume
+
+  iree_thread_entry_t entry;  // user entry point; NULLed once consumed
+  void* entry_arg;            // user argument for |entry|
+
+  iree_atomic_int32_t is_suspended;  // 1 while created-suspended, 0 once resumed
+};
+
+static qos_class_t iree_thread_qos_class_for_priority_class(
+    iree_thread_priority_class_t priority_class);
+
+// Sets the calling thread's name for debuggers and tracing tooling.
+// NOTE: darwin's pthread_setname_np only operates on the current thread,
+// which is why this is called from the thread's own start routine.
+static void iree_thread_set_name(const char* name) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  pthread_setname_np(name);
+  IREE_TRACE_SET_THREAD_NAME(name);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// pthread start routine trampoline: applies the thread name, consumes the
+// entry info from the handle, drops the thread's self-reference, and then
+// tail-calls the user entry point.
+static void* iree_thread_start_routine(void* param) {
+  // NOTE: we own a reference to the thread handle so that the creation
+  // thread can't delete this out from under us.
+  iree_thread_t* thread = (iree_thread_t*)param;
+
+  // Set the thread name used by debuggers and tracy (which must be called on
+  // the thread).
+  iree_thread_set_name(thread->name);
+
+  // "Consume" the entry info so that we don't see it again (as we don't own
+  // its lifetime).
+  iree_thread_entry_t entry = thread->entry;
+  void* entry_arg = thread->entry_arg;
+  thread->entry = NULL;
+  thread->entry_arg = NULL;
+
+  // Release our ownership of the thread handle. If the creating thread doesn't
+  // want it this will free the memory and fully detach the thread.
+  iree_thread_release(thread);
+
+  // Call the user thread entry point function.
+  // Note that this can be a tail-call which saves a stack frame in all threads
+  // (which is really just to make call stacks in debuggers much cleaner).
+  return (void*)((uintptr_t)entry(entry_arg));
+}
+
+// Creates a darwin thread with the requested params. On success the caller
+// receives a retained handle in |out_thread|; the thread itself also holds a
+// reference that is released when the start routine runs.
+iree_status_t iree_thread_create(iree_thread_entry_t entry, void* entry_arg,
+                                 iree_thread_create_params_t params,
+                                 iree_allocator_t allocator,
+                                 iree_thread_t** out_thread) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate our thread struct; we'll use it to shuttle params into the thread
+  // (including the user-specified entry_arg).
+  iree_thread_t* thread = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(allocator, sizeof(*thread), (void**)&thread);
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+  iree_atomic_ref_count_init(&thread->ref_count);
+  thread->allocator = allocator;
+  thread->entry = entry;
+  thread->entry_arg = entry_arg;
+  // Copy (and truncate) the requested name into fixed-size storage.
+  // NOTE(review): if params.name is empty the portable iree_strncpy_s path
+  // returns EINVAL without writing |name| — this relies on the allocation
+  // being zero-initialized for a clean empty name; confirm allocator behavior.
+  iree_strncpy_s(thread->name, IREE_ARRAYSIZE(thread->name), params.name.data,
+                 iree_min(params.name.size, IREE_ARRAYSIZE(thread->name) - 1));
+  iree_atomic_store_int32(&thread->is_suspended,
+                          params.create_suspended ? 1 : 0,
+                          iree_memory_order_relaxed);
+
+  pthread_attr_t thread_attr;
+  pthread_attr_init(&thread_attr);
+  pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+  if (params.stack_size) {
+    pthread_attr_setstacksize(&thread_attr, params.stack_size);
+  }
+
+  // Ensure we start with the right QoS class.
+  qos_class_t qos_class =
+      iree_thread_qos_class_for_priority_class(params.priority_class);
+  pthread_attr_set_qos_class_np(&thread_attr, qos_class, 0);
+
+  // Retain the thread for the thread itself; this way if the caller immediately
+  // releases the iree_thread_t handle the thread won't explode.
+  iree_thread_retain(thread);
+  *out_thread = thread;
+
+  // Create the thread either suspended or running as the user requested.
+  int rc;
+  if (params.create_suspended) {
+    IREE_TRACE_ZONE_BEGIN_NAMED(z1, "pthread_create_suspended_np");
+    rc = pthread_create_suspended_np(&thread->handle, &thread_attr,
+                                     &iree_thread_start_routine, thread);
+    IREE_TRACE_ZONE_END(z1);
+  } else {
+    IREE_TRACE_ZONE_BEGIN_NAMED(z1, "pthread_create");
+    rc = pthread_create(&thread->handle, &thread_attr,
+                        &iree_thread_start_routine, thread);
+    IREE_TRACE_ZONE_END(z1);
+  }
+  pthread_attr_destroy(&thread_attr);
+  if (rc != 0) {
+    // Both references (the thread's and the caller's) must be dropped as the
+    // start routine will never run to release its own.
+    iree_thread_release(thread);  // for self
+    iree_thread_release(thread);  // for caller
+    *out_thread = NULL;
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "thread creation failed with %d", rc);
+  }
+
+  // The mach port is needed for affinity/resume syscalls later on.
+  thread->mach_port = pthread_mach_thread_np(thread->handle);
+  if (params.initial_affinity.specified) {
+    iree_thread_request_affinity(thread, params.initial_affinity);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Destroys |thread| once the last reference is released: resumes it if it was
+// never started (so pthread_join can complete), joins, and frees the struct.
+static void iree_thread_delete(iree_thread_t* thread) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_thread_resume(thread);
+  pthread_join(thread->handle, NULL);
+
+  iree_allocator_free(thread->allocator, thread);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Adds a reference for the caller; NULL-safe.
+void iree_thread_retain(iree_thread_t* thread) {
+  if (thread) {
+    iree_atomic_ref_count_inc(&thread->ref_count);
+  }
+}
+
+// Drops a reference and deletes the thread when the last one is released
+// (iree_atomic_ref_count_dec returns the previous count); NULL-safe.
+void iree_thread_release(iree_thread_t* thread) {
+  if (thread && iree_atomic_ref_count_dec(&thread->ref_count) == 1) {
+    iree_thread_delete(thread);
+  }
+}
+
+// Returns the pthread handle reinterpreted as an opaque platform thread ID.
+uintptr_t iree_thread_id(iree_thread_t* thread) {
+  return (uintptr_t)thread->handle;
+}
+
+// Maps an IREE iree_thread_priority_class_t value to a QoS type.
+// https://developer.apple.com/library/archive/documentation/Performance/Conceptual/EnergyGuide-iOS/PrioritizeWorkWithQoS.html
+static qos_class_t iree_thread_qos_class_for_priority_class(
+    iree_thread_priority_class_t priority_class) {
+  // Any unknown value falls through to the default QoS class.
+  switch (priority_class) {
+    case IREE_THREAD_PRIORITY_CLASS_LOWEST:
+      return QOS_CLASS_BACKGROUND;
+    case IREE_THREAD_PRIORITY_CLASS_LOW:
+      return QOS_CLASS_UTILITY;
+    default:
+    case IREE_THREAD_PRIORITY_CLASS_NORMAL:
+      return QOS_CLASS_DEFAULT;
+    case IREE_THREAD_PRIORITY_CLASS_HIGH:
+      return QOS_CLASS_USER_INITIATED;
+    case IREE_THREAD_PRIORITY_CLASS_HIGHEST:
+      return QOS_CLASS_USER_INTERACTIVE;
+  }
+}
+
+// Begins a QoS override on |thread|; darwin tracks overrides natively so the
+// pthread override token is returned directly (no iree-side override list).
+iree_thread_override_t* iree_thread_priority_class_override_begin(
+    iree_thread_t* thread, iree_thread_priority_class_t priority_class) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  qos_class_t qos_class =
+      iree_thread_qos_class_for_priority_class(priority_class);
+  pthread_override_t override =
+      pthread_override_qos_class_start_np(thread->handle, qos_class, 0);
+
+  IREE_TRACE_ZONE_END(z0);
+  return (iree_thread_override_t*)override;
+}
+
+// Ends a QoS override begun with iree_thread_priority_class_override_begin;
+// the token is the pthread override handle cast back to its native type.
+void iree_thread_override_end(iree_thread_override_t* override) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  pthread_override_qos_class_end_np((pthread_override_t)override);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Requests (but cannot guarantee) affinity via a mach affinity tag: threads
+// sharing the same |affinity.id| tag may be co-scheduled by the kernel.
+// No-op when the affinity is unspecified. Return value of thread_policy_set
+// is intentionally ignored — affinity is best-effort only.
+void iree_thread_request_affinity(iree_thread_t* thread,
+                                  iree_thread_affinity_t affinity) {
+  if (!affinity.specified) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // See:
+  // https://gist.github.com/Coneko/4234842
+  // https://fergofrog.com/code/cbowser/xnu/osfmk/mach/thread_policy.h.html
+  // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
+  thread_affinity_policy_data_t policy_data = {affinity.id};
+  thread_policy_set(thread->mach_port, THREAD_AFFINITY_POLICY,
+                    (thread_policy_t)(&policy_data),
+                    THREAD_AFFINITY_POLICY_COUNT);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Resumes a thread created with create_suspended=true; no-op otherwise. The
+// compare-exchange ensures the mach resume call happens at most once even if
+// multiple callers race.
+void iree_thread_resume(iree_thread_t* thread) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // NOTE: we don't track the suspend/resume depth here because we don't
+  // expose suspend as an operation (yet). If we did we'd want to make sure we
+  // always balance suspend/resume or else we'll mess with any
+  // debuggers/profilers that may be suspending threads for their own uses.
+  int32_t expected = 1;
+  if (iree_atomic_compare_exchange_strong_int32(
+          &thread->is_suspended, &expected, 0, iree_memory_order_seq_cst,
+          iree_memory_order_seq_cst)) {
+    thread_resume(thread->mach_port);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+#endif  // IREE_PLATFORM_APPLE
diff --git a/runtime/src/iree/base/internal/threading_impl.h b/runtime/src/iree/base/internal/threading_impl.h
new file mode 100644
index 0000000..0fdbbd6
--- /dev/null
+++ b/runtime/src/iree/base/internal/threading_impl.h
@@ -0,0 +1,77 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_THREADING_IMPL_H_
+#define IREE_BASE_INTERNAL_THREADING_IMPL_H_
+
+// Ensure that any posix header we include exposes GNU stuff. Ignored on
+// platforms where we either don't have the GNU stuff or don't have posix
+// headers at all.
+//
+// Note that this does not need to be the same for all compilation units, only
+// those we want to access the non-portable features in. It *must* be defined
+// prior to including any of the files, though, as otherwise header-guards will
+// cause the setting at the time of first inclusion to win.
+//
+// https://stackoverflow.com/a/5583764
+#define _GNU_SOURCE 1
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/threading.h"
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// strncpy_s shall copy the first N characters of src to dst, where N is the
+// lesser of MaxCount and the length of src.
+//
+// We have this here patching over GNU being stubborn about supporting this.
+// If we start using it other places we can move it into a helper file.
+int iree_strncpy_s(char* dest, size_t destsz, const char* src, size_t count);
+
+// Platform-specific callback that applies a priority class to a thread; the
+// override list invokes it whenever the effective maximum changes.
+typedef void (*iree_thread_set_priority_fn_t)(
+    iree_thread_t* thread, iree_thread_priority_class_t priority_class);
+
+// Tracks active priority-class overrides for one thread. Thread-safe: all
+// list mutation and priority recomputation happens under |mutex|.
+typedef struct iree_thread_override_list_t {
+  iree_thread_set_priority_fn_t set_priority_fn;  // applies class changes
+  iree_thread_priority_class_t base_priority_class;  // floor with no overrides
+  iree_allocator_t allocator;  // used for override allocations
+  iree_slim_mutex_t mutex;     // guards |head| and |current_priority_class|
+  iree_thread_priority_class_t current_priority_class;  // last applied class
+  iree_thread_override_t* head;  // doubly-linked list of active overrides
+} iree_thread_override_list_t;
+
+// Initializes the override list for a thread with |base_priority_class|.
+// |set_priority_fn| will be used to update the thread priority when needed.
+void iree_thread_override_list_initialize(
+    iree_thread_set_priority_fn_t set_priority_fn,
+    iree_thread_priority_class_t base_priority_class,
+    iree_allocator_t allocator, iree_thread_override_list_t* out_list);
+
+// Deinitializes an override list; expects that all overrides have been removed.
+void iree_thread_override_list_deinitialize(iree_thread_override_list_t* list);
+
+// Adds a new override to the list and returns an allocated handle.
+iree_thread_override_t* iree_thread_override_list_add(
+    iree_thread_override_list_t* list, iree_thread_t* thread,
+    iree_thread_priority_class_t priority_class);
+
+// Removes an override from its parent list and deallocates it.
+void iree_thread_override_remove_self(iree_thread_override_t* override);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // IREE_BASE_INTERNAL_THREADING_IMPL_H_
diff --git a/runtime/src/iree/base/internal/threading_pthreads.c b/runtime/src/iree/base/internal/threading_pthreads.c
new file mode 100644
index 0000000..a197f50
--- /dev/null
+++ b/runtime/src/iree/base/internal/threading_pthreads.c
@@ -0,0 +1,356 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: must be first to ensure that we can define settings for all includes.
+#include "iree/base/internal/threading_impl.h"
+
+#if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_EMSCRIPTEN) || \
+    defined(IREE_PLATFORM_LINUX)
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <pthread.h>
+#include <sched.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/call_once.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/threading.h"
+#include "iree/base/tracing.h"
+
+#if defined(IREE_PLATFORM_EMSCRIPTEN)
+#include <emscripten/threading.h>
+#endif  // IREE_PLATFORM_EMSCRIPTEN
+
+// Older glibc doesn't have a gettid wrapper:
+// https://stackoverflow.com/a/63494768
+#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
+#define gettid() syscall(SYS_gettid)
+#endif
+
+// Thread handle for the pthreads-based implementation.
+// Reference counted: freed when the last reference is released.
+struct iree_thread_t {
+  iree_atomic_ref_count_t ref_count;  // retain/release count
+  iree_allocator_t allocator;  // used to free this struct on deletion
+
+  char name[16];  // NUL-terminated name (truncated to fit), for debuggers/tracy
+  pthread_t handle;  // valid once pthread_create has returned successfully
+
+  iree_thread_entry_t entry;  // user entry fn; NULLed once the thread starts
+  void* entry_arg;  // user entry arg; NULLed once the thread starts
+
+  iree_atomic_int32_t suspend_count;  // >0 while created-suspended, not resumed
+  iree_notification_t suspend_barrier;  // posted when suspend_count reaches 0
+
+  // Thread-safe (has its own synchronization).
+  iree_thread_override_list_t qos_override_list;
+};
+
+static void iree_thread_set_priority_class(
+    iree_thread_t* thread, iree_thread_priority_class_t priority_class);
+
+// Notification predicate: true once the thread's suspend count has dropped to
+// zero, i.e. the thread has been resumed and may proceed.
+static bool iree_thread_resumed_predicate(void* arg) {
+  iree_thread_t* self = (iree_thread_t*)arg;
+  int32_t suspend_count =
+      iree_atomic_load_int32(&self->suspend_count, iree_memory_order_seq_cst);
+  return suspend_count == 0;
+}
+
+#if defined(IREE_PLATFORM_EMSCRIPTEN)
+
+// Emscripten provides a dedicated API for naming pthreads.
+static int iree_thread_set_name(pthread_t handle, const char* name) {
+  emscripten_set_thread_name(handle, name);
+  return 0;  // emscripten_set_thread_name has no failure result to propagate
+}
+
+#else
+
+// pthread_setname_np is a non-portable extension that may be missing at
+// runtime, so it is resolved lazily via dlsym instead of being linked
+// directly.
+typedef int (*pthread_setname_np_fn_t)(pthread_t thread, const char* name);
+
+static pthread_setname_np_fn_t iree_pthread_setname_np_fn = NULL;
+static void iree_thread_try_query_setname_fn(void) {
+  iree_pthread_setname_np_fn =
+      (pthread_setname_np_fn_t)dlsym(RTLD_DEFAULT, "pthread_setname_np");
+}
+
+// Sets the debugger-visible thread name. Returns 0 on success or an errno
+// value (EINVAL when pthread_setname_np is unavailable).
+static int iree_thread_set_name(pthread_t handle, const char* name) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // Resolve pthread_setname_np at most once per process.
+  static iree_once_flag fn_query_flag = IREE_ONCE_FLAG_INIT;
+  iree_call_once(&fn_query_flag, iree_thread_try_query_setname_fn);
+  int rc;
+  if (iree_pthread_setname_np_fn) {
+    rc = iree_pthread_setname_np_fn(handle, name);
+  } else {
+    rc = EINVAL;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return rc;
+}
+
+#endif  // IREE_PLATFORM_EMSCRIPTEN
+
+// Trampoline passed to pthread_create; runs on the newly created thread.
+static void* iree_thread_start_routine(void* param) {
+  // NOTE: we own a reference to the thread handle so that the creation
+  // thread can't delete this out from under us.
+  iree_thread_t* thread = (iree_thread_t*)param;
+
+  // Set the thread name used by debuggers and tracy (which must be called on
+  // the thread).
+  iree_thread_set_name(thread->handle, thread->name);
+  IREE_TRACE_SET_THREAD_NAME(thread->name);
+
+  // Wait until we resume if we were created suspended.
+  // The loop guards against spurious wakeups: the predicate is re-checked on
+  // every iteration until the suspend count is observed at zero.
+  while (iree_atomic_load_int32(&thread->suspend_count,
+                                iree_memory_order_seq_cst) > 0) {
+    iree_notification_await(&thread->suspend_barrier,
+                            iree_thread_resumed_predicate, thread,
+                            iree_infinite_timeout());
+  }
+
+  // "Consume" the entry info so that we don't see it again (as we don't own
+  // its lifetime).
+  iree_thread_entry_t entry = thread->entry;
+  void* entry_arg = thread->entry_arg;
+  thread->entry = NULL;
+  thread->entry_arg = NULL;
+
+  // Release our ownership of the thread handle. If the creating thread doesn't
+  // want it this will free the memory and fully detach the thread.
+  iree_thread_release(thread);
+
+  // Call the user thread entry point function.
+  // Note that this can be a tail-call which saves a stack frame in all threads
+  // (which is really just to make call stacks in debuggers much cleaner).
+  return (void*)((uintptr_t)entry(entry_arg));
+}
+
+// Creates a new thread running |entry| and returns it in |out_thread| with a
+// reference owned by the caller; release with iree_thread_release.
+iree_status_t iree_thread_create(iree_thread_entry_t entry, void* entry_arg,
+                                 iree_thread_create_params_t params,
+                                 iree_allocator_t allocator,
+                                 iree_thread_t** out_thread) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate our thread struct; we'll use it to shuttle params into the thread
+  // (including the user-specified entry_arg).
+  iree_thread_t* thread = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(allocator, sizeof(*thread), (void**)&thread);
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+  iree_atomic_ref_count_init(&thread->ref_count);
+  thread->allocator = allocator;
+  thread->entry = entry;
+  thread->entry_arg = entry_arg;
+  // Copy (and truncate) the name; applied by the thread itself on startup.
+  iree_strncpy_s(thread->name, IREE_ARRAYSIZE(thread->name), params.name.data,
+                 iree_min(params.name.size, IREE_ARRAYSIZE(thread->name) - 1));
+  // A non-zero suspend count parks iree_thread_start_routine until resumed.
+  thread->suspend_count = IREE_ATOMIC_VAR_INIT(params.create_suspended ? 1 : 0);
+  iree_notification_initialize(&thread->suspend_barrier);
+  iree_thread_override_list_initialize(iree_thread_set_priority_class,
+                                       params.priority_class, thread->allocator,
+                                       &thread->qos_override_list);
+
+  pthread_attr_t thread_attr;
+  pthread_attr_init(&thread_attr);
+  pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+  if (params.stack_size) {
+    pthread_attr_setstacksize(&thread_attr, params.stack_size);
+  }
+
+  // Retain the thread for the thread itself; this way if the caller immediately
+  // releases the iree_thread_t handle the thread won't explode.
+  iree_thread_retain(thread);
+  *out_thread = thread;
+
+  // Unfortunately we can't create the thread suspended (no API). This means
+  // that we are likely to incur some thrashing here as the thread gets spun up
+  // immediately. We emulate the create_suspended behavior by waiting in the
+  // thread until iree_thread_resume is called which at least gives us the same
+  // execution order guarantee across all platforms.
+  int rc;
+  {
+    IREE_TRACE_ZONE_BEGIN_NAMED(z1, "pthread_create");
+    rc = pthread_create(&thread->handle, &thread_attr,
+                        &iree_thread_start_routine, thread);
+    IREE_TRACE_ZONE_END(z1);
+  }
+  pthread_attr_destroy(&thread_attr);
+  if (rc != 0) {
+    iree_thread_release(thread);  // for self
+    iree_thread_release(thread);  // for caller
+    *out_thread = NULL;
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "thread creation failed with %d", rc);
+  }
+
+  // Apply initial scheduling properties; these may race with the thread
+  // beginning execution (see the comment above about suspension emulation).
+  if (params.priority_class != IREE_THREAD_PRIORITY_CLASS_NORMAL) {
+    iree_thread_set_priority_class(thread, params.priority_class);
+  }
+  if (params.initial_affinity.specified) {
+    iree_thread_request_affinity(thread, params.initial_affinity);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Joins and frees the thread; called when the last reference is released.
+static void iree_thread_delete(iree_thread_t* thread) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Ensure the thread is not parked on the suspend barrier, then wait for it
+  // to exit before tearing down its resources.
+  iree_thread_resume(thread);
+  pthread_join(thread->handle, NULL);
+
+  iree_notification_deinitialize(&thread->suspend_barrier);
+  iree_thread_override_list_deinitialize(&thread->qos_override_list);
+  iree_allocator_free(thread->allocator, thread);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Retains |thread| for the caller; each retain must be balanced by a release.
+void iree_thread_retain(iree_thread_t* thread) {
+  if (!thread) return;
+  iree_atomic_ref_count_inc(&thread->ref_count);
+}
+
+// Releases the caller's reference; deletes the thread when the count hits 0.
+void iree_thread_release(iree_thread_t* thread) {
+  if (!thread) return;
+  if (iree_atomic_ref_count_dec(&thread->ref_count) == 1) {
+    iree_thread_delete(thread);
+  }
+}
+
+// Returns an opaque ID for the thread derived from its pthread handle.
+uintptr_t iree_thread_id(iree_thread_t* thread) {
+  pthread_t handle = thread->handle;
+  return (uintptr_t)handle;
+}
+
+// Maps an IREE iree_thread_priority_class_t value to a pthreads priority param.
+// The min/max ranges of the priority are implementation dependent so we need to
+// do this at runtime.
+static struct sched_param iree_thread_sched_param_for_priority_class(
+    int policy, iree_thread_priority_class_t priority_class) {
+  struct sched_param param;
+  memset(&param, 0, sizeof(param));
+  int min_priority = sched_get_priority_min(policy);
+  int max_priority = sched_get_priority_max(policy);
+  int normal_priority = (max_priority - min_priority) / 2 + min_priority;
+  switch (priority_class) {
+    case IREE_THREAD_PRIORITY_CLASS_LOWEST:
+      param.sched_priority = min_priority;
+      break;
+    case IREE_THREAD_PRIORITY_CLASS_LOW:
+      param.sched_priority =
+          (normal_priority - min_priority) / 2 + min_priority;
+      break;
+    case IREE_THREAD_PRIORITY_CLASS_HIGH:
+      param.sched_priority =
+          (max_priority - normal_priority) / 2 + normal_priority;
+      break;
+    case IREE_THREAD_PRIORITY_CLASS_HIGHEST:
+      param.sched_priority = max_priority;
+      break;
+    case IREE_THREAD_PRIORITY_CLASS_NORMAL:
+    default:
+      // Fall back to the midpoint for NORMAL and any unrecognized class so an
+      // out-of-range value does not silently leave sched_priority at 0, which
+      // may be below sched_get_priority_min(policy).
+      param.sched_priority = normal_priority;
+      break;
+  }
+  return param;
+}
+
+// Sets the thread priority to the given |priority_class|, resetting any
+// previous value.
+//
+// NOTE: this probably doesn't work on Android, because Android.
+// They seem to use linux LWPs and setpriority/nice on the tid will actually
+// change the priority. It doesn't seem possible to elevate priority above
+// normal (without root), but it would at least be useful to be able to
+// indicate background threads.
+//
+// See:
+// https://stackoverflow.com/questions/17398075/change-native-thread-priority-on-android-in-c-c
+// https://android.googlesource.com/platform/frameworks/native/+/android-4.2.2_r1/include/utils/ThreadDefs.h
+//
+// TODO(benvanik): try this from filament:
+// https://github.com/google/filament/blob/56682794d398236c4caa5be40d80acdb73a13bc8/libs/utils/src/JobSystem.cpp
+static void iree_thread_set_priority_class(
+    iree_thread_t* thread, iree_thread_priority_class_t priority_class) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+#if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_EMSCRIPTEN)
+  // TODO(benvanik): Some sort of solution on Android, if possible (see above)
+  // TODO(benvanik): Some sort of solution on Emscripten, if possible
+#else
+  // Query the current policy so the new priority is computed within that
+  // policy's valid min/max range; failures here are best-effort and ignored.
+  int policy = 0;
+  struct sched_param param;
+  pthread_getschedparam(thread->handle, &policy, &param);
+  param = iree_thread_sched_param_for_priority_class(policy, priority_class);
+  pthread_setschedparam(thread->handle, policy, &param);
+#endif  // IREE_PLATFORM_ANDROID || IREE_PLATFORM_EMSCRIPTEN
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Pushes a priority-class override onto the thread's override list; the
+// effective priority is recomputed by the list as overrides come and go.
+iree_thread_override_t* iree_thread_priority_class_override_begin(
+    iree_thread_t* thread, iree_thread_priority_class_t priority_class) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_thread_override_t* new_override = iree_thread_override_list_add(
+      &thread->qos_override_list, thread, priority_class);
+  IREE_TRACE_ZONE_END(z0);
+  return new_override;
+}
+
+// Pops a previously-begun priority override; a NULL override is a no-op.
+void iree_thread_override_end(iree_thread_override_t* override) {
+  if (override) {
+    IREE_TRACE_ZONE_BEGIN(z0);
+    iree_thread_override_remove_self(override);
+    IREE_TRACE_ZONE_END(z0);
+  }
+}
+
+// Requests that the thread be scheduled on the processor named by |affinity|
+// (and its SMT sibling when requested). Best-effort; failures are ignored.
+void iree_thread_request_affinity(iree_thread_t* thread,
+                                  iree_thread_affinity_t affinity) {
+  if (!affinity.specified) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  cpu_set_t cpu_set;
+  CPU_ZERO(&cpu_set);
+  CPU_SET(affinity.id, &cpu_set);
+  if (affinity.smt) {
+    // NOTE(review): assumes the SMT sibling of |id| is |id + 1| — confirm this
+    // matches the processor topology the affinity IDs are derived from.
+    CPU_SET(affinity.id + 1, &cpu_set);
+  }
+
+#if defined(IREE_PLATFORM_ANDROID)
+  // `pthread_gettid_np` is only available on API 21+ and it is needed to set
+  // affinity so skip it for older API versions.
+#if __ANDROID_API__ >= 21
+  // Android doesn't have pthread_setaffinity_np but that's usually just
+  // implemented as this sequence anyway:
+  pid_t tid = pthread_gettid_np(thread->handle);
+  sched_setaffinity(tid, sizeof(cpu_set), &cpu_set);
+#endif  // __ANDROID_API__ >= 21
+#elif defined(IREE_PLATFORM_EMSCRIPTEN)
+  // TODO(benvanik): Some sort of solution on Emscripten, if possible
+#else
+  pthread_setaffinity_np(thread->handle, sizeof(cpu_set), &cpu_set);
+#endif  // IREE_PLATFORM_*
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Resumes a thread created suspended; no-op if the thread is already running.
+void iree_thread_resume(iree_thread_t* thread) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Transition suspend_count -> 0 and wake the start routine only on the
+  // 1 -> 0 transition, so redundant resumes do not re-post the notification.
+  if (iree_atomic_exchange_int32(&thread->suspend_count, 0,
+                                 iree_memory_order_seq_cst) == 1) {
+    iree_notification_post(&thread->suspend_barrier, IREE_ALL_WAITERS);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+#endif  // IREE_PLATFORM_*
diff --git a/runtime/src/iree/base/internal/threading_test.cc b/runtime/src/iree/base/internal/threading_test.cc
new file mode 100644
index 0000000..18f4f5e
--- /dev/null
+++ b/runtime/src/iree/base/internal/threading_test.cc
@@ -0,0 +1,223 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/threading.h"
+
+#include <chrono>
+#include <cstring>
+#include <thread>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/threading_impl.h"  // to test the override list
+#include "iree/base/status_cc.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+using iree::Status;
+
+//==============================================================================
+// iree_thread_t
+//==============================================================================
+
+// Creates a running thread, waits for its entry point to execute, then tears
+// everything down from the main thread.
+TEST(ThreadTest, Lifetime) {
+  // Default parameters:
+  iree_thread_create_params_t params;
+  memset(&params, 0, sizeof(params));
+
+  // Our thread: do a bit of math and notify the main test thread when done.
+  struct entry_data_t {
+    iree_atomic_int32_t value;
+    iree_notification_t barrier;
+  } entry_data;
+  iree_atomic_store_int32(&entry_data.value, 123, iree_memory_order_relaxed);
+  iree_notification_initialize(&entry_data.barrier);
+  iree_thread_entry_t entry_fn = +[](void* entry_arg) -> int {
+    auto* entry_data = reinterpret_cast<struct entry_data_t*>(entry_arg);
+    iree_atomic_fetch_add_int32(&entry_data->value, 1,
+                                iree_memory_order_acq_rel);
+    iree_notification_post(&entry_data->barrier, IREE_ALL_WAITERS);
+    return 0;
+  };
+
+  // Create the thread and immediately begin running it.
+  iree_thread_t* thread = nullptr;
+  IREE_ASSERT_OK(iree_thread_create(entry_fn, &entry_data, params,
+                                    iree_allocator_system(), &thread));
+  EXPECT_NE(0, iree_thread_id(thread));
+
+  // Wait for the thread to finish.
+  iree_notification_await(
+      &entry_data.barrier,
+      +[](void* entry_arg) -> bool {
+        auto* entry_data = reinterpret_cast<struct entry_data_t*>(entry_arg);
+        return iree_atomic_load_int32(&entry_data->value,
+                                      iree_memory_order_relaxed) == (123 + 1);
+      },
+      &entry_data, iree_infinite_timeout());
+
+  // By holding on to the thread object and releasing it here after the thread
+  // has finished, we ensure that destruction occurs on the main thread,
+  // avoiding data races reported by TSan.
+  iree_thread_release(thread);
+  iree_notification_deinitialize(&entry_data.barrier);
+}
+
+// Verifies that a thread created suspended does not run its entry point until
+// iree_thread_resume is called.
+TEST(ThreadTest, CreateSuspended) {
+  iree_thread_create_params_t params;
+  memset(&params, 0, sizeof(params));
+  params.create_suspended = true;
+
+  struct entry_data_t {
+    iree_atomic_int32_t value;
+    iree_notification_t barrier;
+  } entry_data;
+  iree_atomic_store_int32(&entry_data.value, 123, iree_memory_order_relaxed);
+  iree_notification_initialize(&entry_data.barrier);
+  iree_thread_entry_t entry_fn = +[](void* entry_arg) -> int {
+    auto* entry_data = reinterpret_cast<struct entry_data_t*>(entry_arg);
+    iree_atomic_fetch_add_int32(&entry_data->value, 1,
+                                iree_memory_order_acq_rel);
+    iree_notification_post(&entry_data->barrier, IREE_ALL_WAITERS);
+    return 0;
+  };
+
+  iree_thread_t* thread = nullptr;
+  IREE_ASSERT_OK(iree_thread_create(entry_fn, &entry_data, params,
+                                    iree_allocator_system(), &thread));
+  EXPECT_NE(0, iree_thread_id(thread));
+
+  // NOTE: the thread will not be running and we should not expect a change in
+  // the value. I can't think of a good way to test this, though, so we'll just
+  // wait a moment here and assume that if the thread was able to run it would
+  // have during this wait.
+  ASSERT_EQ(123, iree_atomic_load_int32(&entry_data.value,
+                                        iree_memory_order_seq_cst));
+  std::this_thread::sleep_for(std::chrono::milliseconds(150));
+  ASSERT_EQ(123, iree_atomic_load_int32(&entry_data.value,
+                                        iree_memory_order_seq_cst));
+
+  // Resume the thread and wait for it to finish its work.
+  iree_thread_resume(thread);
+  iree_notification_await(
+      &entry_data.barrier,
+      +[](void* entry_arg) -> bool {
+        auto* entry_data = reinterpret_cast<struct entry_data_t*>(entry_arg);
+        return iree_atomic_load_int32(&entry_data->value,
+                                      iree_memory_order_relaxed) == (123 + 1);
+      },
+      &entry_data, iree_infinite_timeout());
+  iree_notification_deinitialize(&entry_data.barrier);
+  iree_thread_release(thread);
+}
+
+// NOTE: testing whether priority took effect is really hard given that on
+// certain platforms the priority may not be respected or may be clamped by
+// the system. This is here to test the mechanics of the priority override code
+// on our side and assumes that if we tell the OS something it respects it.
+TEST(ThreadTest, PriorityOverride) {
+  iree_thread_create_params_t params;
+  memset(&params, 0, sizeof(params));
+
+  struct entry_data_t {
+    iree_atomic_int32_t value;
+    iree_notification_t barrier;
+  } entry_data;
+  iree_atomic_store_int32(&entry_data.value, 0, iree_memory_order_relaxed);
+  iree_notification_initialize(&entry_data.barrier);
+  iree_thread_entry_t entry_fn = +[](void* entry_arg) -> int {
+    auto* entry_data = reinterpret_cast<struct entry_data_t*>(entry_arg);
+    iree_atomic_fetch_add_int32(&entry_data->value, 1,
+                                iree_memory_order_acq_rel);
+    iree_notification_post(&entry_data->barrier, IREE_ALL_WAITERS);
+    return 0;
+  };
+
+  iree_thread_t* thread = nullptr;
+  IREE_ASSERT_OK(iree_thread_create(entry_fn, &entry_data, params,
+                                    iree_allocator_system(), &thread));
+  EXPECT_NE(0, iree_thread_id(thread));
+
+  // Push a few overrides.
+  iree_thread_override_t* override0 = iree_thread_priority_class_override_begin(
+      thread, IREE_THREAD_PRIORITY_CLASS_HIGH);
+  EXPECT_NE(nullptr, override0);
+  iree_thread_override_t* override1 = iree_thread_priority_class_override_begin(
+      thread, IREE_THREAD_PRIORITY_CLASS_HIGHEST);
+  EXPECT_NE(nullptr, override1);
+  iree_thread_override_t* override2 = iree_thread_priority_class_override_begin(
+      thread, IREE_THREAD_PRIORITY_CLASS_LOWEST);
+  EXPECT_NE(nullptr, override2);
+
+  // Wait for the thread to finish.
+  iree_notification_await(
+      &entry_data.barrier,
+      +[](void* entry_arg) -> bool {
+        auto* entry_data = reinterpret_cast<struct entry_data_t*>(entry_arg);
+        return iree_atomic_load_int32(&entry_data->value,
+                                      iree_memory_order_relaxed) == 1;
+      },
+      &entry_data, iree_infinite_timeout());
+  iree_notification_deinitialize(&entry_data.barrier);
+
+  // Pop overrides (in opposite order intentionally).
+  iree_thread_override_end(override0);
+  iree_thread_override_end(override1);
+  iree_thread_override_end(override2);
+
+  iree_thread_release(thread);
+}
+
+//==============================================================================
+// iree_thread_override_list_t
+//==============================================================================
+// This is an implementation detail but useful to test on its own as it's shared
+// across several platform implementations.
+
+// Exercises push/pop of priority overrides directly on the list, verifying the
+// effective class reported through the set_priority_fn after each mutation.
+TEST(ThreadOverrideListTest, PriorityClass) {
+  static iree_thread_t* kThreadSentinel =
+      reinterpret_cast<iree_thread_t*>(0x123);
+  static iree_thread_priority_class_t current_priority_class =
+      IREE_THREAD_PRIORITY_CLASS_NORMAL;
+  iree_thread_override_list_t list;
+  iree_thread_override_list_initialize(
+      +[](iree_thread_t* thread, iree_thread_priority_class_t priority_class) {
+        EXPECT_EQ(kThreadSentinel, thread);
+        // The list should only invoke the callback on actual changes.
+        EXPECT_NE(current_priority_class, priority_class);
+        current_priority_class = priority_class;
+      },
+      current_priority_class, iree_allocator_system(), &list);
+
+  // (NORMAL) -> HIGH -> [ignored LOW] -> HIGHEST
+  ASSERT_EQ(IREE_THREAD_PRIORITY_CLASS_NORMAL, current_priority_class);
+  iree_thread_override_t* override0 = iree_thread_override_list_add(
+      &list, kThreadSentinel, IREE_THREAD_PRIORITY_CLASS_HIGH);
+  EXPECT_NE(nullptr, override0);
+  ASSERT_EQ(IREE_THREAD_PRIORITY_CLASS_HIGH, current_priority_class);
+  iree_thread_override_t* override1 = iree_thread_override_list_add(
+      &list, kThreadSentinel, IREE_THREAD_PRIORITY_CLASS_LOW);
+  EXPECT_NE(nullptr, override1);
+  ASSERT_EQ(IREE_THREAD_PRIORITY_CLASS_HIGH, current_priority_class);
+  iree_thread_override_t* override2 = iree_thread_override_list_add(
+      &list, kThreadSentinel, IREE_THREAD_PRIORITY_CLASS_HIGHEST);
+  EXPECT_NE(nullptr, override2);
+  ASSERT_EQ(IREE_THREAD_PRIORITY_CLASS_HIGHEST, current_priority_class);
+
+  // Out of order to ensure highest bit sticks:
+  ASSERT_EQ(IREE_THREAD_PRIORITY_CLASS_HIGHEST, current_priority_class);
+  iree_thread_override_remove_self(override1);
+  ASSERT_EQ(IREE_THREAD_PRIORITY_CLASS_HIGHEST, current_priority_class);
+  iree_thread_override_remove_self(override0);
+  ASSERT_EQ(IREE_THREAD_PRIORITY_CLASS_HIGHEST, current_priority_class);
+  iree_thread_override_remove_self(override2);
+  ASSERT_EQ(IREE_THREAD_PRIORITY_CLASS_NORMAL, current_priority_class);
+
+  iree_thread_override_list_deinitialize(&list);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/base/internal/threading_win32.c b/runtime/src/iree/base/internal/threading_win32.c
new file mode 100644
index 0000000..6e550e3
--- /dev/null
+++ b/runtime/src/iree/base/internal/threading_win32.c
@@ -0,0 +1,328 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// clang-format off: must be included before all other headers.
+#include "iree/base/internal/threading_impl.h"
+// clang-format on
+
+#if defined(IREE_PLATFORM_WINDOWS)
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/threading.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+// Great documentation:
+// https://www.microsoftpressstore.com/articles/article.aspx?p=2233328
+
+// Thread handle for the Win32 implementation.
+// Reference counted: freed when the last reference is released.
+struct iree_thread_t {
+  iree_atomic_ref_count_t ref_count;  // retain/release count
+  iree_allocator_t allocator;  // used to free this struct on deletion
+
+  char name[16];  // NUL-terminated name (truncated to fit), for debuggers/tracy
+  HANDLE handle;  // owned; closed in iree_thread_delete
+  DWORD id;  // OS thread ID returned by CreateThread
+
+  iree_thread_entry_t entry;  // user entry fn; NULLed once the thread starts
+  void* entry_arg;  // user entry arg; NULLed once the thread starts
+
+  iree_atomic_int32_t is_suspended;  // 1 while CREATE_SUSPENDED and not resumed
+
+  // Thread-safe (has its own synchronization).
+  iree_thread_override_list_t qos_override_list;
+};
+
+static void iree_thread_set_priority_class(
+    iree_thread_t* thread, iree_thread_priority_class_t priority_class);
+
+// Sets the thread's name to the given NUL-terminated string.
+//
+// See:
+// https://docs.microsoft.com/en-us/visualstudio/debugger/how-to-set-a-thread-name-in-native-code
+static void iree_thread_set_name(HANDLE handle, const char* name) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Try first to use the modern SetThreadDescription API.
+  // This will work even if a debugger is not attached meaning that tools that
+  // don't use the debugger API can still query thread names. It's only
+  // available on Win10+.
+  typedef HRESULT(WINAPI * SetThreadDescriptionFn)(HANDLE hThread,
+                                                   PCWSTR lpThreadDescription);
+  SetThreadDescriptionFn pSetThreadDescription =
+      (SetThreadDescriptionFn)GetProcAddress(GetModuleHandleW(L"Kernel32.dll"),
+                                             "SetThreadDescription");
+  if (pSetThreadDescription) {
+    // SetThreadDescription takes UTF-16; convert, leaving room for the NUL.
+    wchar_t name_wide[16] = {0};
+    MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, name, -1, name_wide,
+                        IREE_ARRAYSIZE(name_wide) - 1);
+    pSetThreadDescription(handle, name_wide);
+    IREE_TRACE_ZONE_END(z0);
+    return;
+  }
+
+  if (!IsDebuggerPresent()) {
+    // The name is only captured if a debugger is attached so we can avoid
+    // doing any of the work if none is present. This means that a debugger
+    // attached to the process after thread creation won't see thread names but
+    // that's a rare case anyway.
+    IREE_TRACE_ZONE_END(z0);
+    return;
+  }
+
+  // Fallback: raise the MSVC "set thread name" SEH exception (0x406D1388),
+  // which attached debuggers intercept to record the name (see the doc link
+  // above).
+#pragma pack(push, 8)
+  struct THREADNAME_INFO {
+    DWORD dwType;      // Must be 0x1000.
+    LPCSTR szName;     // Pointer to name (in user addr space).
+    DWORD dwThreadID;  // Thread ID (-1=caller thread).
+    DWORD dwFlags;     // Reserved for future use, must be zero.
+  };
+#pragma pack(pop)
+
+#pragma warning(push)
+#pragma warning(disable : 6320 6322)
+  struct THREADNAME_INFO info;
+  info.dwType = 0x1000;
+  info.szName = name;
+  info.dwThreadID = GetThreadId(handle);
+  info.dwFlags = 0;
+  __try {
+    RaiseException(0x406D1388u, 0, sizeof(info) / sizeof(ULONG_PTR),
+                   (ULONG_PTR*)(&info));
+  } __except (EXCEPTION_EXECUTE_HANDLER) {
+  }
+#pragma warning(pop)
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Trampoline passed to CreateThread; runs on the newly created thread.
+static DWORD WINAPI iree_thread_start_routine(LPVOID param) {
+  // NOTE: we own a reference to the thread handle so that the creation
+  // thread can't delete this out from under us.
+  iree_thread_t* thread = (iree_thread_t*)param;
+
+  // Set the thread name used by tracy (which must be called on the thread).
+  IREE_TRACE_SET_THREAD_NAME(thread->name);
+
+  // "Consume" the entry info so that we don't see it again (as we don't own
+  // its lifetime).
+  iree_thread_entry_t entry = thread->entry;
+  void* entry_arg = thread->entry_arg;
+  thread->entry = NULL;
+  thread->entry_arg = NULL;
+
+  // Release our ownership of the thread handle. If the creating thread doesn't
+  // want it this will free the memory and fully detach the thread.
+  iree_thread_release(thread);
+
+  // Call the user thread entry point function.
+  // Note that this can be a tail-call which saves a stack frame in all threads
+  // (which is really just to make call stacks in debuggers much cleaner).
+  return (DWORD)entry(entry_arg);
+}
+
+// Creates a new thread running |entry| and returns it in |out_thread| with a
+// reference owned by the caller; release with iree_thread_release.
+iree_status_t iree_thread_create(iree_thread_entry_t entry, void* entry_arg,
+                                 iree_thread_create_params_t params,
+                                 iree_allocator_t allocator,
+                                 iree_thread_t** out_thread) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate our thread struct; we'll use it to shuttle params into the thread
+  // (including the user-specified entry_arg).
+  iree_thread_t* thread = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(allocator, sizeof(*thread), (void**)&thread);
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+  iree_atomic_ref_count_init(&thread->ref_count);
+  thread->allocator = allocator;
+  thread->entry = entry;
+  thread->entry_arg = entry_arg;
+  strncpy_s(thread->name, IREE_ARRAYSIZE(thread->name), params.name.data,
+            min(params.name.size, IREE_ARRAYSIZE(thread->name) - 1));
+  iree_atomic_store_int32(&thread->is_suspended,
+                          params.create_suspended ? 1 : 0,
+                          iree_memory_order_relaxed);
+  iree_thread_override_list_initialize(iree_thread_set_priority_class,
+                                       params.priority_class, thread->allocator,
+                                       &thread->qos_override_list);
+
+  // Retain the thread for the thread itself; this way if the caller immediately
+  // releases the iree_thread_t handle the thread won't explode.
+  iree_thread_retain(thread);
+  *out_thread = thread;
+
+  // Create the thread either suspended or running as the user requested.
+  {
+    IREE_TRACE_ZONE_BEGIN_NAMED(z1, "CreateThread");
+    thread->handle = CreateThread(
+        NULL, params.stack_size, iree_thread_start_routine, thread,
+        params.create_suspended ? CREATE_SUSPENDED : 0, &thread->id);
+    IREE_TRACE_ZONE_END(z1);
+  }
+  // NOTE: CreateThread returns NULL on failure (INVALID_HANDLE_VALUE is only
+  // used by file APIs such as CreateFile); comparing against
+  // INVALID_HANDLE_VALUE would let failures fall through as success.
+  if (thread->handle == NULL) {
+    iree_thread_release(thread);  // for self
+    iree_thread_release(thread);  // for caller
+    *out_thread = NULL;
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "thread creation failed with %lu", GetLastError());
+  }
+
+  // Immediately set thread properties before resuming (so that we don't
+  // start on the wrong core/at the wrong priority).
+  if (!iree_string_view_is_empty(params.name)) {
+    iree_thread_set_name(thread->handle, thread->name);
+  }
+  if (params.priority_class != IREE_THREAD_PRIORITY_CLASS_NORMAL) {
+    iree_thread_set_priority_class(thread, params.priority_class);
+  }
+  if (params.initial_affinity.specified) {
+    iree_thread_request_affinity(thread, params.initial_affinity);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Joins (when safe) and frees the thread; called on last reference release.
+static void iree_thread_delete(iree_thread_t* thread) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_thread_resume(thread);
+
+  if (thread->id != GetCurrentThreadId()) {
+    // Join with the thread. Since threads can delete themselves we must ensure
+    // they don't try to join with themselves and deadlock.
+    WaitForSingleObject(thread->handle, INFINITE);
+  }
+  CloseHandle(thread->handle);
+  iree_thread_override_list_deinitialize(&thread->qos_override_list);
+  iree_allocator_free(thread->allocator, thread);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Retains |thread| for the caller; each retain must be balanced by a release.
+void iree_thread_retain(iree_thread_t* thread) {
+  if (!thread) return;
+  iree_atomic_ref_count_inc(&thread->ref_count);
+}
+
+// Releases the caller's reference; deletes the thread when the count hits 0.
+void iree_thread_release(iree_thread_t* thread) {
+  if (!thread) return;
+  if (iree_atomic_ref_count_dec(&thread->ref_count) == 1) {
+    iree_thread_delete(thread);
+  }
+}
+
+// Returns the OS-assigned thread ID captured at creation time.
+uintptr_t iree_thread_id(iree_thread_t* thread) {
+  DWORD thread_id = thread->id;
+  return (uintptr_t)thread_id;
+}
+
+// Sets the thread priority to the given |priority_class| immediately.
+static void iree_thread_set_priority_class(
+    iree_thread_t* thread, iree_thread_priority_class_t priority_class) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Translate the IREE priority class to the closest Win32 thread priority;
+  // anything unrecognized maps to THREAD_PRIORITY_NORMAL.
+  DWORD priority;
+  if (priority_class == IREE_THREAD_PRIORITY_CLASS_LOWEST) {
+    priority = THREAD_PRIORITY_LOWEST;
+  } else if (priority_class == IREE_THREAD_PRIORITY_CLASS_LOW) {
+    priority = THREAD_PRIORITY_BELOW_NORMAL;
+  } else if (priority_class == IREE_THREAD_PRIORITY_CLASS_HIGH) {
+    priority = THREAD_PRIORITY_ABOVE_NORMAL;
+  } else if (priority_class == IREE_THREAD_PRIORITY_CLASS_HIGHEST) {
+    priority = THREAD_PRIORITY_HIGHEST;
+  } else {
+    priority = THREAD_PRIORITY_NORMAL;
+  }
+  SetThreadPriority(thread->handle, priority);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Pushes a priority-class override onto the thread's override list; the
+// effective priority is recomputed by the list as overrides come and go.
+iree_thread_override_t* iree_thread_priority_class_override_begin(
+    iree_thread_t* thread, iree_thread_priority_class_t priority_class) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_thread_override_t* new_override = iree_thread_override_list_add(
+      &thread->qos_override_list, thread, priority_class);
+  IREE_TRACE_ZONE_END(z0);
+  return new_override;
+}
+
+// Pops a previously-begun priority override; a NULL override is a no-op.
+void iree_thread_override_end(iree_thread_override_t* override) {
+  if (override) {
+    IREE_TRACE_ZONE_BEGIN(z0);
+    iree_thread_override_remove_self(override);
+    IREE_TRACE_ZONE_END(z0);
+  }
+}
+
+// Requests that the thread be scheduled on the processor named by |affinity|
+// (and its SMT sibling when requested). Best-effort; failures are ignored.
+void iree_thread_request_affinity(iree_thread_t* thread,
+                                  iree_thread_affinity_t affinity) {
+  if (!affinity.specified) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  char affinity_desc[32];
+  int affinity_desc_length = snprintf(
+      affinity_desc, IREE_ARRAYSIZE(affinity_desc), "group=%d, id=%d, smt=%d",
+      affinity.group, affinity.id, affinity.smt);
+  IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, affinity_desc,
+                                          affinity_desc_length);
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+  GROUP_AFFINITY group_affinity;
+  memset(&group_affinity, 0, sizeof(group_affinity));
+  group_affinity.Group = affinity.group;
+  KAFFINITY affinity_mask = 1ull << affinity.id;
+  if (affinity.smt) {
+    // NOTE(review): assumes the SMT sibling of |id| is |id + 1| — confirm this
+    // matches the processor topology the affinity IDs are derived from.
+    affinity_mask |= 1ull << (affinity.id + 1);
+  }
+  group_affinity.Mask = affinity_mask;
+  SetThreadGroupAffinity(thread->handle, &group_affinity, NULL);
+
+  // TODO(benvanik): figure out of this is a bad thing; sometimes it can result
+  // in the scheduler alternating cores within the affinity mask; in theory it's
+  // just an SMT ID change and doesn't have any impact on caches but it'd be
+  // good to check.
+  PROCESSOR_NUMBER ideal_processor;
+  memset(&ideal_processor, 0, sizeof(ideal_processor));
+  ideal_processor.Group = affinity.group;
+  ideal_processor.Number = affinity.id;
+  SetThreadIdealProcessorEx(thread->handle, &ideal_processor, NULL);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Resumes a thread created with CREATE_SUSPENDED; no-op if already running.
+void iree_thread_resume(iree_thread_t* thread) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // NOTE: we don't track the suspend/resume depth here because we don't
+  // expose suspend as an operation (yet). If we did we'd want to make sure we
+  // always balance suspend/resume or else we'll mess with any
+  // debuggers/profilers that may be suspending threads for their own uses.
+  // The CAS ensures only the first resume call performs the OS ResumeThread.
+  int32_t expected = 1;
+  if (iree_atomic_compare_exchange_strong_int32(
+          &thread->is_suspended, &expected, 0, iree_memory_order_seq_cst,
+          iree_memory_order_seq_cst)) {
+    ResumeThread(thread->handle);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+#endif  // IREE_PLATFORM_WINDOWS
diff --git a/runtime/src/iree/base/internal/wait_handle.c b/runtime/src/iree/base/internal/wait_handle.c
new file mode 100644
index 0000000..b3e1ed3
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle.c
@@ -0,0 +1,102 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/wait_handle.h"
+
+#include <string.h>
+
+//===----------------------------------------------------------------------===//
+// iree_wait_handle_t
+//===----------------------------------------------------------------------===//
+
+// Wraps an existing primitive in a wait handle without taking ownership;
+// the caller must keep the primitive valid while the handle references it.
+void iree_wait_handle_wrap_primitive(
+    iree_wait_primitive_type_t primitive_type,
+    iree_wait_primitive_value_t primitive_value,
+    iree_wait_handle_t* out_handle) {
+  // Zero the whole handle first so the set_internal bookkeeping fields
+  // (dupe_count/index) start cleared.
+  memset(out_handle, 0, sizeof(*out_handle));
+  out_handle->type = primitive_type;
+  out_handle->value = primitive_value;
+}
+
+// Resets |handle| to empty. Does not close the underlying primitive as wait
+// handles never own the primitives they reference.
+void iree_wait_handle_deinitialize(iree_wait_handle_t* handle) {
+  memset(handle, 0, sizeof(*handle));
+}
+
+// iree_wait_source_t control function for wait sources backed by an
+// iree_wait_handle_t stored inline in the wait source storage.
+iree_status_t iree_wait_handle_ctl(iree_wait_source_t wait_source,
+                                   iree_wait_source_command_t command,
+                                   const void* params, void** inout_ptr) {
+  iree_wait_handle_t* handle = iree_wait_handle_from_source(&wait_source);
+  if (command == IREE_WAIT_SOURCE_COMMAND_QUERY) {
+    iree_status_code_t* out_status_code = (iree_status_code_t*)inout_ptr;
+    if (iree_wait_handle_is_immediate(*handle)) {
+      // Empty handles are immediately resolved.
+      *out_status_code = IREE_STATUS_OK;
+      return iree_ok_status();
+    }
+    // Poll the handle; deadline-exceeded indicates it is still unresolved.
+    iree_status_t poll_status = iree_wait_one(handle, IREE_TIME_INFINITE_PAST);
+    if (!iree_status_is_deadline_exceeded(poll_status)) return poll_status;
+    *out_status_code = IREE_STATUS_DEFERRED;
+    return iree_status_ignore(poll_status);
+  } else if (command == IREE_WAIT_SOURCE_COMMAND_WAIT_ONE) {
+    // Block (up to the requested timeout) until the handle resolves.
+    const iree_wait_source_wait_params_t* wait_params =
+        (const iree_wait_source_wait_params_t*)params;
+    return iree_wait_one(handle,
+                         iree_timeout_as_deadline_ns(wait_params->timeout));
+  } else if (command == IREE_WAIT_SOURCE_COMMAND_EXPORT) {
+    const iree_wait_source_export_params_t* export_params =
+        (const iree_wait_source_export_params_t*)params;
+    iree_wait_primitive_type_t target_type = export_params->target_type;
+    if (target_type != IREE_WAIT_PRIMITIVE_TYPE_ANY &&
+        target_type != handle->type) {
+      return iree_make_status(
+          IREE_STATUS_UNAVAILABLE,
+          "requested wait primitive type %d is unavailable; have %d",
+          (int)target_type, (int)handle->type);
+    }
+    // Export is a non-owning copy of the wrapped primitive.
+    iree_wait_primitive_t* out_primitive = (iree_wait_primitive_t*)inout_ptr;
+    out_primitive->type = handle->type;
+    out_primitive->value = handle->value;
+    return iree_ok_status();
+  }
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "unimplemented wait_source command");
+}
+
+// Imports a raw wait primitive as a wait source.
+// Immediate (empty) primitives become an immediate wait source; all others
+// are wrapped in a wait handle stored inline in the wait source storage.
+IREE_API_EXPORT iree_status_t iree_wait_source_import(
+    iree_wait_primitive_t wait_primitive, iree_wait_source_t* out_wait_source) {
+  if (iree_wait_primitive_is_immediate(wait_primitive)) {
+    *out_wait_source = iree_wait_source_immediate();
+    return iree_ok_status();
+  }
+  iree_wait_handle_wrap_primitive(
+      wait_primitive.type, wait_primitive.value,
+      (iree_wait_handle_t*)out_wait_source->storage);
+  out_wait_source->ctl = iree_wait_handle_ctl;
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_event_t
+//===----------------------------------------------------------------------===//
+
+// Returns a wait source referencing |event| via iree_wait_handle_ctl.
+// The event (an iree_wait_handle_t) is copied by value into the wait source
+// storage; the underlying primitive must stay valid while the source is live.
+iree_wait_source_t iree_event_await(iree_event_t* event) {
+  iree_wait_source_t wait_source;
+  memcpy(wait_source.storage, event, sizeof(*event));
+  wait_source.ctl = iree_wait_handle_ctl;
+  return wait_source;
+}
diff --git a/runtime/src/iree/base/internal/wait_handle.h b/runtime/src/iree/base/internal/wait_handle.h
new file mode 100644
index 0000000..b173e7c
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle.h
@@ -0,0 +1,243 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_WAIT_HANDLE_H_
+#define IREE_BASE_INTERNAL_WAIT_HANDLE_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_wait_handle_t
+//===----------------------------------------------------------------------===//
+
+// Non-owning handle reference to a waitable object.
+// The set_internal union is scratch storage for iree_wait_set_t bookkeeping
+// and is only meaningful while the handle is stored in a set.
+// TODO(benvanik): packing to ensure we are getting the expected alignments.
+typedef struct iree_wait_handle_t {
+  union {
+    // Used by iree_wait_set_t storage to track the number of duplicate
+    // instances of a particular handle within the set to avoid needing to store
+    // them all separately. A dupe_count of 0 means there is one unique handle.
+    uint32_t dupe_count : 16;
+    // Used by iree_wait_any and iree_wait_set_erase to optimize the
+    // wait-wake-erase pattern by avoiding the need to scan the internal storage
+    // list to erase a handle.
+    uint32_t index : 16;
+    // (3 bytes total available)
+    uint8_t storage[3];
+  } set_internal;
+  // Inlined iree_wait_primitive_t to get better packing:
+  iree_wait_primitive_type_t type;  // uint8_t
+  iree_wait_primitive_value_t value;
+} iree_wait_handle_t;
+// Handles are copied by value into wait sets and wait source storage, so the
+// struct must stay small.
+static_assert(sizeof(iree_wait_handle_t) <= sizeof(uint64_t) * 2,
+              "iree_wait_handle_t must fit in 16-bytes so it can be stored in "
+              "other data structures");
+
+// Returns a wait handle that is immediately resolved.
+static inline iree_wait_handle_t iree_wait_handle_immediate(void) {
+  iree_wait_handle_t wait_handle;
+  // NOTE(review): assumes IREE_WAIT_PRIMITIVE_TYPE_NONE is the zero value so
+  // that a zeroed handle reads as immediate below — confirm against the enum.
+  memset(&wait_handle, 0, sizeof(wait_handle));
+  return wait_handle;
+}
+
+// Returns true if the wait |handle| is resolved immediately (empty).
+static inline bool iree_wait_handle_is_immediate(iree_wait_handle_t handle) {
+  return handle.type == IREE_WAIT_PRIMITIVE_TYPE_NONE;
+}
+
+// Initializes a wait handle with the given primitive type and value.
+// Wait handles do not retain the provided primitives and they must be kept
+// valid (allocated and open) for the duration any wait handle references them.
+void iree_wait_handle_wrap_primitive(
+    iree_wait_primitive_type_t primitive_type,
+    iree_wait_primitive_value_t primitive_value,
+    iree_wait_handle_t* out_handle);
+
+// Deinitializes a wait handle.
+// Note that wait handles do not retain the underlying wait primitive and
+// deinitializing a handle will not close the resource.
+void iree_wait_handle_deinitialize(iree_wait_handle_t* handle);
+
+// Closes a wait handle and resets |handle|.
+void iree_wait_handle_close(iree_wait_handle_t* handle);
+
+// iree_wait_source_t control function.
+iree_status_t iree_wait_handle_ctl(iree_wait_source_t wait_source,
+                                   iree_wait_source_command_t command,
+                                   const void* params, void** inout_ptr);
+
+// Returns a pointer to the wait handle in |wait_source| if it is using
+// iree_wait_handle_ctl and otherwise NULL.
+static inline iree_wait_handle_t* iree_wait_handle_from_source(
+    iree_wait_source_t* wait_source) {
+  if (wait_source->ctl != iree_wait_handle_ctl) return NULL;
+  // The handle lives inline in the wait source storage.
+  return (iree_wait_handle_t*)wait_source->storage;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_wait_set_t
+//===----------------------------------------------------------------------===//
+
+// A platform-specific cache of wait handles that can be multi-waited.
+// By caching callers don't need to build the list each wait and implementations
+// can store acceleration information or kernel API data structures and either
+// optimize or make compliant sets such as by deduplicating or sorting by
+// primitive type to perform a multi-api multi-wait.
+//
+// Certain handle types may also gain benefits: when syncfile is used we can use
+// sync_merge to coalesce wait handles when performing a wait-all on multiple
+// handles.
+//
+// This cache shines when handles are persistent (such as sockets/eventfds/etc)
+// and the set will rarely be changing relative to how many times it will be
+// waited on. It's not as optimal in the cases of one-shot waits on small
+// numbers of handles but those are also the cases where the set overhead is
+// small (2 set insertions all touching hot cache lines is fine) and we gain
+// the benefits of a unified code path and nice error handling/validation.
+//
+// Thread-compatible; only one thread may be manipulating or waiting on a
+// particular set at any time.
+typedef struct iree_wait_set_t iree_wait_set_t;
+
+// Allocates a wait set with the maximum |capacity| of unique handles.
+iree_status_t iree_wait_set_allocate(iree_host_size_t capacity,
+                                     iree_allocator_t allocator,
+                                     iree_wait_set_t** out_set);
+
+// Frees a wait set. The wait set must not be being waited on.
+void iree_wait_set_free(iree_wait_set_t* set);
+
+// Returns true if there are no handles registered with the set.
+bool iree_wait_set_is_empty(const iree_wait_set_t* set);
+
+// Inserts a wait handle into the set.
+// If the handle is already in the set it will be reference counted such that a
+// matching number of iree_wait_set_erase calls are required.
+iree_status_t iree_wait_set_insert(iree_wait_set_t* set,
+                                   iree_wait_handle_t handle);
+
+// Erases a single instance of a wait handle from the set.
+// Decrements the reference count; if the same handle was inserted multiple
+// times then it may still remain in the set after the call returns.
+void iree_wait_set_erase(iree_wait_set_t* set, iree_wait_handle_t handle);
+
+// Clears all handles from the wait set.
+void iree_wait_set_clear(iree_wait_set_t* set);
+
+// TODO(benvanik): signal/interrupt API to make a wait set wake up.
+// Can be implemented with signals/QueueUserAPC/etc. The workaround is that the
+// caller will need to create their own events to add to the set where for
+// transient wakes we could avoid that extra overhead.
+
+// Blocks the caller until all of the passed wait handles are signaled or the
+// |deadline_ns| elapses.
+//
+// A deadline of IREE_DURATION_ZERO will act as a poll and not block the caller.
+// IREE_DURATION_INFINITE can be used to block until signaled.
+//
+// Returns success if all handles were signaled either prior to the call or
+// during the wait.
+//
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if the deadline elapses without all
+// handles having been signaled. Note that zero or more handles may have
+// actually signaled even if the deadline is exceeded (such as if they signal
+// while the waiting thread is resuming from the failed wait).
+//
+// iree_wait_set_t is thread-compatible; only one thread may be manipulating or
+// waiting on a set at any time.
+iree_status_t iree_wait_all(iree_wait_set_t* set, iree_time_t deadline_ns);
+
+// Blocks the caller until at least one of the handles is signaled or the
+// |deadline_ns| elapses.
+//
+// A deadline of IREE_TIME_INFINITE_PAST will act as a poll and not block the
+// caller. IREE_TIME_INFINITE_FUTURE can be used to block until signaled.
+//
+// Returns success if all handles were signaled either prior to the call or
+// during the wait. A handle of one of the signaled handles will be returned in
+// the optional |out_wake_handle| argument; note however that one or more
+// handles may have signaled and which handle is returned is unspecified.
+// Callers are expected to use the handle to short-circuit scanning the handles
+// list but if a full scan is going to happen regardless it can be ignored.
+//
+// |out_wake_handle| contains an optimization for wait-wake-erase set
+// operations; it is cheap to pass the woken handle to iree_wait_set_erase if
+// there are no interleaving operations that change the set layout.
+//
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if the deadline elapses without any
+// handle having been signaled.
+//
+// iree_wait_set_t is thread-compatible; only one thread may be manipulating or
+// waiting on a set at any time.
+iree_status_t iree_wait_any(iree_wait_set_t* set, iree_time_t deadline_ns,
+                            iree_wait_handle_t* out_wake_handle);
+
+// Blocks the caller until the given wait handle is signaled or |deadline_ns|
+// elapses. This is functionally equivalent to iree_wait_any/iree_wait_all used
+// on a set with a single handle in it but depending on the implementation may
+// not require additional allocations/state tracking.
+//
+// A deadline of IREE_TIME_INFINITE_PAST will act as a poll and not block the
+// caller. IREE_TIME_INFINITE_FUTURE can be used to block until signaled.
+//
+// Returns success if the handle was signaled either prior to the call or
+// during the wait.
+//
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if the deadline elapses without the
+// handle having been signaled.
+iree_status_t iree_wait_one(iree_wait_handle_t* handle,
+                            iree_time_t deadline_ns);
+
+//===----------------------------------------------------------------------===//
+// iree_event_t
+//===----------------------------------------------------------------------===//
+
+// A manual reset event (aka binary semaphore).
+// https://docs.microsoft.com/en-us/windows/win32/sync/event-objects
+//
+// Events are much heavier than iree_notification_t but are waitable objects
+// that can be passed to iree_wait_all/iree_wait_any. Prefer iree_notification_t
+// when multiwaiting is not required.
+//
+// Which primitive is used will depend on the current platform.
+typedef iree_wait_handle_t iree_event_t;
+
+// Initializes an event in either the signaled or unsignaled state.
+// The event must be closed with iree_event_deinitialize.
+iree_status_t iree_event_initialize(bool initial_state,
+                                    iree_event_t* out_event);
+
+// Deinitializes an event.
+void iree_event_deinitialize(iree_event_t* event);
+
+// Sets the event object to the signaled state.
+// The event stays signaled until iree_event_reset is called. Multiple waiters
+// will be woken and attempted waits while the event is set will succeed
+// immediately.
+void iree_event_set(iree_event_t* event);
+
+// Resets the event object to the unsignaled state.
+// Resetting an event that is already reset has no effect.
+void iree_event_reset(iree_event_t* event);
+
+// Returns a wait_source reference to |event|.
+// The event must be kept live for as long as the reference is live.
+iree_wait_source_t iree_event_await(iree_event_t* event);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_INTERNAL_WAIT_HANDLE_H_
diff --git a/runtime/src/iree/base/internal/wait_handle_epoll.c b/runtime/src/iree/base/internal/wait_handle_epoll.c
new file mode 100644
index 0000000..0bd08d6
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_epoll.c
@@ -0,0 +1,66 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: must be first to ensure that we can define settings for all includes.
+#include "iree/base/internal/wait_handle_impl.h"
+
+#if IREE_WAIT_API == IREE_WAIT_API_EPOLL
+
+#include "iree/base/internal/wait_handle_posix.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_wait_set_t
+//===----------------------------------------------------------------------===//
+
+// TODO(benvanik): iree_wait_set_s using an epoll fd.
+// epoll lets us route the wait set operations right to kernel and not need our
+// own duplicate data structure. epoll is great, just not available on mac/ios
+// so we still need poll for that. linux/android/bsd all have epoll, though.
+struct iree_wait_set_t {
+  // NOTE: we could in theory use the epoll handle directly (iree_wait_set_s
+  // then is just a pointer). Then allocate/free just go straight to the system.
+  int reserved;
+};
+
+// NOTE: this epoll backend is an unimplemented placeholder. The stubs below
+// return an explicit UNIMPLEMENTED status instead of falling off the end of a
+// non-void function (undefined behavior when the caller reads the result).
+
+iree_status_t iree_wait_set_allocate(iree_host_size_t capacity,
+                                     iree_allocator_t allocator,
+                                     iree_wait_set_t** out_set) {
+  // TODO(benvanik): epoll_create()
+  (void)capacity;
+  (void)allocator;
+  *out_set = NULL;
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "epoll wait backend not yet implemented");
+}
+
+void iree_wait_set_free(iree_wait_set_t* set) {
+  // TODO(benvanik): close()
+  (void)set;
+}
+
+bool iree_wait_set_is_empty(const iree_wait_set_t* set) {
+  // TODO(benvanik): query the epoll interest list once implemented.
+  // Declared in wait_handle.h; defined here so this backend links completely.
+  (void)set;
+  return true;
+}
+
+iree_status_t iree_wait_set_insert(iree_wait_set_t* set,
+                                   iree_wait_handle_t handle) {
+  // TODO(benvanik): epoll_ctl(EPOLL_CTL_ADD)
+  (void)set;
+  (void)handle;
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "epoll wait backend not yet implemented");
+}
+
+void iree_wait_set_erase(iree_wait_set_t* set, iree_wait_handle_t handle) {
+  // TODO(benvanik): epoll_ctl(EPOLL_CTL_DEL)
+  (void)set;
+  (void)handle;
+}
+
+void iree_wait_set_clear(iree_wait_set_t* set) {
+  // TODO(benvanik): close and reopen?
+  (void)set;
+}
+
+iree_status_t iree_wait_all(iree_wait_set_t* set, iree_time_t deadline_ns) {
+  // TODO(benvanik): epoll_wait
+  (void)set;
+  (void)deadline_ns;
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "epoll wait backend not yet implemented");
+}
+
+iree_status_t iree_wait_any(iree_wait_set_t* set, iree_time_t deadline_ns,
+                            iree_wait_handle_t* out_wake_handle) {
+  // TODO(benvanik): epoll_wait
+  (void)set;
+  (void)deadline_ns;
+  (void)out_wake_handle;
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "epoll wait backend not yet implemented");
+}
+
+iree_status_t iree_wait_one(iree_wait_handle_t* handle,
+                            iree_time_t deadline_ns) {
+  // TODO(benvanik): just use poll?
+  (void)handle;
+  (void)deadline_ns;
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "epoll wait backend not yet implemented");
+}
+
+#endif  // IREE_WAIT_API == IREE_WAIT_API_EPOLL
diff --git a/runtime/src/iree/base/internal/wait_handle_impl.h b/runtime/src/iree/base/internal/wait_handle_impl.h
new file mode 100644
index 0000000..b22ba78
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_impl.h
@@ -0,0 +1,86 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_WAIT_HANDLE_IMPL_H_
+#define IREE_BASE_INTERNAL_WAIT_HANDLE_IMPL_H_
+
+//===----------------------------------------------------------------------===//
+// Platform overrides
+//===----------------------------------------------------------------------===//
+// NOTE: this must come first prior to any local/system includes!
+
+// Ensure that any posix header we include exposes GNU stuff. Ignored on
+// platforms where we either don't have the GNU stuff or don't have posix
+// headers at all.
+//
+// Note that this does not need to be the same for all compilation units, only
+// those we want to access the non-portable features in. It *must* be defined
+// prior to including any of the files, though, as otherwise header-guards will
+// cause the setting at the time of first inclusion to win.
+//
+// https://stackoverflow.com/a/5583764
+#define _GNU_SOURCE 1
+
+//===----------------------------------------------------------------------===//
+// Active wait API implementation selection (wait_handle_*.c)
+//===----------------------------------------------------------------------===//
+
+#include "iree/base/config.h"
+#include "iree/base/target_platform.h"
+
+// NOTE: order matters; priorities are (kqueue|epoll) > ppoll > poll.
+// When overridden with NULL (no platform primitives) or on Win32 we always use
+// those implementations (today).
+#define IREE_WAIT_API_NULL 0
+#define IREE_WAIT_API_INPROC 1
+#define IREE_WAIT_API_WIN32 2
+#define IREE_WAIT_API_POLL 3
+#define IREE_WAIT_API_PPOLL 4
+#define IREE_WAIT_API_EPOLL 5
+#define IREE_WAIT_API_KQUEUE 6
+
+// We allow overriding the wait API via command line flags. If unspecified we
+// try to guess based on the target platform.
+#if !defined(IREE_WAIT_API)
+
+// NOTE: we could be tighter here, but we today only have win32 or not-win32.
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+#define IREE_WAIT_API IREE_WAIT_API_NULL
+#elif defined(IREE_PLATFORM_GENERIC) || defined(IREE_PLATFORM_EMSCRIPTEN)
+#define IREE_WAIT_API IREE_WAIT_API_INPROC
+#elif defined(IREE_PLATFORM_WINDOWS)
+#define IREE_WAIT_API IREE_WAIT_API_WIN32  // WFMO used in wait_handle_win32.c
+#else
+// TODO(benvanik): EPOLL on android/linux/bsd/etc.
+// TODO(benvanik): KQUEUE on mac/ios.
+// KQUEUE is not implemented yet. Use POLL for mac/ios
+// Android ppoll requires API version >= 21
+#if !defined(IREE_PLATFORM_APPLE) && \
+    (!defined(__ANDROID_API__) || __ANDROID_API__ >= 21)
+#define IREE_WAIT_API IREE_WAIT_API_PPOLL
+#else
+#define IREE_WAIT_API IREE_WAIT_API_POLL
+#endif  // insanity
+#endif  // IREE_SYNCHRONIZATION_DISABLE_UNSAFE / IREE_PLATFORM_WINDOWS
+
+#endif  // !IREE_WAIT_API
+
+// Many implementations share the same posix-like nature (file descriptors/etc)
+// and can share most of their code.
+#if (IREE_WAIT_API == IREE_WAIT_API_POLL) ||  \
+    (IREE_WAIT_API == IREE_WAIT_API_PPOLL) || \
+    (IREE_WAIT_API == IREE_WAIT_API_EPOLL) || \
+    (IREE_WAIT_API == IREE_WAIT_API_KQUEUE)
+#define IREE_WAIT_API_POSIX_LIKE 1
+#endif  // IREE_WAIT_API = posix-like
+
+//===----------------------------------------------------------------------===//
+// Wait handle included with options set
+//===----------------------------------------------------------------------===//
+
+#include "iree/base/internal/wait_handle.h"
+
+#endif  // IREE_BASE_INTERNAL_WAIT_HANDLE_IMPL_H_
diff --git a/runtime/src/iree/base/internal/wait_handle_inproc.c b/runtime/src/iree/base/internal/wait_handle_inproc.c
new file mode 100644
index 0000000..eff64cb
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_inproc.c
@@ -0,0 +1,378 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// clang-format off: must be included before all other headers.
+#include "iree/base/internal/wait_handle_impl.h"
+// clang-format on
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/wait_handle.h"
+#include "iree/base/target_platform.h"
+
+// This implementation uses iree_notification_t - backed by a futex in most
+// cases - to simulate system wait handles. When using a single handle such as
+// an iree_event_t and waiting on it with iree_wait_one things behave just as
+// the base iree_notification_t: threads can block and wait for the event to
+// be signaled. Multi-wait, however, requires some trickery as we need to be
+// able to wake when one or more events are signaled and unfortunately there are
+// no multi-wait futex APIs. To get around this we have a shared notification
+// that is posted every time an event is signaled and multi-waits await that.
+// This can lead to spurious wakes when under heavy load as disparate events may
+// wake unrelated multi-waiters, however by design in IREE we tend to avoid that
+// and centralize waits via things like the task system poller such that this
+// isn't so bad. The cases that are likely to suffer are heavy multi-tenant
+// workloads in the same process but those should be using a real wait handle
+// implementation instead of this bare-metal friendly one anyway.
+#if IREE_WAIT_API == IREE_WAIT_API_INPROC
+
+//===----------------------------------------------------------------------===//
+// iree_wait_primitive_* raw calls
+//===----------------------------------------------------------------------===//
+
+// In-process stand-in for a kernel wait primitive.
+typedef struct iree_futex_handle_t {
+  // Non-zero when signaled; multi-waiters poll this in their condition check
+  // (see iree_wait_set_check).
+  iree_atomic_int64_t value;
+  // Presumably used to block/wake dedicated single-handle waiters; only its
+  // deinitialization is visible here (see iree_wait_handle_close) — confirm.
+  iree_notification_t notification;
+} iree_futex_handle_t;
+
+// Returns true if |lhs| and |rhs| reference the same underlying primitive
+// (same type and bitwise-identical value).
+static bool iree_wait_primitive_compare_identical(iree_wait_handle_t* lhs,
+                                                  iree_wait_handle_t* rhs) {
+  if (lhs->type != rhs->type) return false;
+  return memcmp(&lhs->value, &rhs->value, sizeof(lhs->value)) == 0;
+}
+
+// Closes a wait handle and releases any resources this inproc backend
+// allocated for it, then resets |handle| to empty.
+void iree_wait_handle_close(iree_wait_handle_t* handle) {
+  switch (handle->type) {
+#if defined(IREE_HAVE_WAIT_TYPE_LOCAL_FUTEX)
+    case IREE_WAIT_PRIMITIVE_TYPE_LOCAL_FUTEX: {
+      iree_futex_handle_t* futex =
+          (iree_futex_handle_t*)handle->value.local_futex;
+      iree_notification_deinitialize(&futex->notification);
+      // Assumes the futex storage was allocated with the system allocator
+      // (allocation site not in view) — freed with the matching allocator.
+      iree_allocator_free(iree_allocator_system(), futex);
+      break;
+    }
+#endif  // IREE_HAVE_WAIT_TYPE_LOCAL_FUTEX
+    default:
+      // Other primitive types are non-owning; nothing to release.
+      break;
+  }
+  iree_wait_handle_deinitialize(handle);
+}
+
+//===----------------------------------------------------------------------===//
+// Multi-wait emulation
+//===----------------------------------------------------------------------===//
+
+// Returns a notification that is shared with all waiters in the process.
+// Waiting on the notification will cause a wake whenever any event is set;
+// unrelated multi-waiters may wake spuriously and re-check their conditions
+// (see the file-level comment above).
+static iree_notification_t* iree_wait_multi_notification(void) {
+  // Process-wide singleton; IREE_NOTIFICATION_INIT permits static init.
+  static iree_notification_t shared_notification = IREE_NOTIFICATION_INIT;
+  return &shared_notification;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_wait_set_t
+//===----------------------------------------------------------------------===//
+
+struct iree_wait_set_t {
+  // Allocator used to create (and later free) this set.
+  iree_allocator_t allocator;
+
+  // Total capacity of handles in the set (including duplicates).
+  // This defines the capacity of handles to ensure that we don't get insanely
+  // hard to debug behavioral differences when some handles happen to be
+  // duplicates vs all being unique.
+  //
+  // If you added 1000 duplicate handles to the set you'd need a capacity
+  // of 1000 even though handle_count (excluding duplicates) would be 1.
+  iree_host_size_t capacity;
+
+  // Total number of handles in the set (including duplicates).
+  // We use this to ensure that we provide consistent capacity errors.
+  iree_host_size_t total_handle_count;
+
+  // Number of handles in the set (excluding duplicates), defining the valid
+  // size of the dense handles list.
+  iree_host_size_t handle_count;
+
+  // De-duped user-provided handles. iree_wait_handle_t::set_internal.dupe_count
+  // is used to indicate how many additional duplicates there are of a
+  // particular handle. For example, dupe_count=0 means that there are no
+  // duplicates.
+  iree_wait_handle_t handles[];
+};
+
+// Allocates a wait set able to hold |capacity| handles (duplicates included).
+// The dense handle list is tail-allocated in the same block as the set.
+iree_status_t iree_wait_set_allocate(iree_host_size_t capacity,
+                                     iree_allocator_t allocator,
+                                     iree_wait_set_t** out_set) {
+  // Be reasonable; 64K objects is too high.
+  if (capacity >= UINT16_MAX) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "wait set capacity of %zu is unreasonably large",
+                            capacity);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)capacity);
+  *out_set = NULL;
+
+  iree_wait_set_t* new_set = NULL;
+  const iree_host_size_t total_size =
+      sizeof(*new_set) + capacity * sizeof(new_set->handles[0]);
+  iree_status_t status =
+      iree_allocator_malloc(allocator, total_size, (void**)&new_set);
+  if (iree_status_is_ok(status)) {
+    new_set->allocator = allocator;
+    new_set->capacity = capacity;
+    iree_wait_set_clear(new_set);
+  }
+
+  *out_set = new_set;
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees a wait set previously created with iree_wait_set_allocate.
+// Safe to call with NULL.
+void iree_wait_set_free(iree_wait_set_t* set) {
+  if (set) {
+    IREE_TRACE_ZONE_BEGIN(z0);
+    // Capture the allocator first as freeing |set| invalidates its storage.
+    iree_allocator_t set_allocator = set->allocator;
+    iree_allocator_free(set_allocator, set);
+    IREE_TRACE_ZONE_END(z0);
+  }
+}
+
+// Returns true if there are no handles registered with the set.
+// NOTE: this previously returned handle_count != 0, inverting the contract
+// documented in wait_handle.h; fixed to test for zero handles.
+bool iree_wait_set_is_empty(const iree_wait_set_t* set) {
+  return set->handle_count == 0;
+}
+
+// Registers one instance of |handle| with the set.
+// Duplicate inserts are reference counted via set_internal.dupe_count rather
+// than stored separately; only LOCAL_FUTEX primitives are supported here.
+iree_status_t iree_wait_set_insert(iree_wait_set_t* set,
+                                   iree_wait_handle_t handle) {
+  // Capacity counts every insertion (duplicates included) so behavior does
+  // not change based on whether callers happen to insert duplicates.
+  if (set->total_handle_count + 1 > set->capacity) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "wait set capacity %" PRIhsz
+                            " reached; no more wait handles available",
+                            set->capacity);
+  } else if (handle.type != IREE_WAIT_PRIMITIVE_TYPE_LOCAL_FUTEX) {
+    return iree_make_status(
+        IREE_STATUS_UNIMPLEMENTED,
+        "unimplemented primitive type %d (expected LOCAL_FUTEX)",
+        (int)handle.type);
+  }
+
+  // First check to see if we already have the handle in the set; most native
+  // system APIs don't allow duplicates so we match that behavior here to be
+  // consistent. It also helps in cases where the same event is waited on
+  // multiple times (such as when joining on a semaphore) as they can be routed
+  // to the much more efficient iree_wait_one.
+  for (iree_host_size_t i = 0; i < set->handle_count; ++i) {
+    iree_wait_handle_t* existing_handle = &set->handles[i];
+    if (iree_wait_primitive_compare_identical(existing_handle, &handle)) {
+      // Handle already exists in the set; just increment the reference count.
+      ++existing_handle->set_internal.dupe_count;
+      ++set->total_handle_count;
+      return iree_ok_status();
+    }
+  }
+
+  // Append to the dense list; re-wrap to clear any stale set_internal state
+  // carried in from the caller's copy of the handle.
+  ++set->total_handle_count;
+  iree_host_size_t index = set->handle_count++;
+  iree_wait_handle_t* stored_handle = &set->handles[index];
+  iree_wait_handle_wrap_primitive(handle.type, handle.value, stored_handle);
+  stored_handle->set_internal.dupe_count = 0;  // just us so far
+
+  return iree_ok_status();
+}
+
+// Removes one reference to |handle| from the set; the handle only leaves the
+// dense list once all duplicate references have been erased.
+// NOTE(review): assumes |handle| is currently in the set — if it is not, the
+// linear scan leaves |index| at the (possibly out-of-range) hint taken from
+// set_internal.index and the accesses below would be out of bounds. Confirm
+// callers uphold this precondition.
+void iree_wait_set_erase(iree_wait_set_t* set, iree_wait_handle_t handle) {
+  // Find the user handle in the set. This either requires a linear scan to
+  // find the matching user handle or - if valid - we can use the native index
+  // set after an iree_wait_any wake to do a quick lookup.
+  iree_host_size_t index = handle.set_internal.index;
+  if (IREE_UNLIKELY(index >= set->handle_count) ||
+      IREE_UNLIKELY(!iree_wait_primitive_compare_identical(&set->handles[index],
+                                                           &handle))) {
+    // Fallback to a linear scan of (hopefully) a small list.
+    for (iree_host_size_t i = 0; i < set->handle_count; ++i) {
+      if (iree_wait_primitive_compare_identical(&set->handles[i], &handle)) {
+        index = i;
+        break;
+      }
+    }
+  }
+
+  // Decrement reference count.
+  iree_wait_handle_t* existing_handle = &set->handles[index];
+  if (existing_handle->set_internal.dupe_count-- > 0) {
+    // Still one or more remaining in the set; leave it in the handle list.
+    --set->total_handle_count;
+    return;
+  }
+
+  // No more references remaining; remove from both handle lists.
+  // Since we make no guarantees about the order of the lists we can just swap
+  // with the last value.
+  int tail_index = (int)set->handle_count - 1;
+  if (tail_index > index) {
+    memcpy(&set->handles[index], &set->handles[tail_index],
+           sizeof(*set->handles));
+  }
+  --set->total_handle_count;
+  --set->handle_count;
+}
+
+// Removes all handles from the set.
+void iree_wait_set_clear(iree_wait_set_t* set) {
+  // Capture the in-use extent before resetting the counts, then zero that
+  // portion of the dense list so stale set_internal state cannot leak into
+  // future insertions.
+  const iree_host_size_t used_count = set->handle_count;
+  set->total_handle_count = 0;
+  set->handle_count = 0;
+  memset(set->handles, 0, used_count * sizeof(set->handles[0]));
+}
+
+// Parameters for iree_wait_set_check, passed through the notification await.
+typedef struct {
+  iree_wait_set_t* set;
+  // Non-NULL selects wait-any semantics; receives the first signaled handle.
+  iree_wait_handle_t* wake_handle;  // if set then wait-any
+} iree_wait_set_check_params_t;
+
+// Condition callback evaluated under the shared multi-wait notification.
+// Wait-any (wake_handle != NULL): true as soon as one handle's futex value is
+// non-zero, reporting that handle. Wait-all: true only once every handle in
+// the set has a non-zero futex value.
+static bool iree_wait_set_check(const iree_wait_set_check_params_t* params) {
+  iree_host_size_t ready_count = 0;
+  for (iree_host_size_t i = 0; i < params->set->handle_count; ++i) {
+    iree_wait_handle_t* wait_handle = &params->set->handles[i];
+    iree_futex_handle_t* futex =
+        (iree_futex_handle_t*)wait_handle->value.local_futex;
+    if (iree_atomic_load_int64(&futex->value, iree_memory_order_acquire) != 0) {
+      ++ready_count;
+      if (params->wake_handle) {
+        *params->wake_handle = *wait_handle;
+        return true;
+      }
+    }
+  }
+  return ready_count == params->set->handle_count;
+}
+
+// Waits on the handles in |set| until |deadline_ns| is reached.
+// When |out_wake_handle| is NULL this is a wait-all; otherwise it is a
+// wait-any and the first signaled handle found is stored into it.
+static iree_status_t iree_wait_multi(iree_wait_set_t* set,
+                                     iree_time_t deadline_ns,
+                                     iree_wait_handle_t* out_wake_handle) {
+  if (set->handle_count == 0) return iree_ok_status();  // no-op
+  if (set->handle_count == 1) {
+    // It's much more efficient to use a wait-one as then we will only wake if
+    // the specific handle is signaled; otherwise we will use the multi-wait
+    // notification and potentially wake many times.
+    return iree_wait_one(&set->handles[0], deadline_ns);
+  }
+
+  iree_wait_set_check_params_t params = {
+      .set = set,
+      .wake_handle = out_wake_handle,
+  };
+  // Block on the process-wide multi-wait notification; the condition check
+  // runs on each wake and decides whether the all/any condition holds yet.
+  if (!iree_notification_await(iree_wait_multi_notification(),
+                               (iree_condition_fn_t)iree_wait_set_check,
+                               &params, iree_make_deadline(deadline_ns))) {
+    return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  }
+  return iree_ok_status();
+}
+
+iree_status_t iree_wait_all(iree_wait_set_t* set, iree_time_t deadline_ns) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // A NULL wake handle selects wait-all semantics in iree_wait_multi.
+  iree_status_t wait_status =
+      iree_wait_multi(set, deadline_ns, /*out_wake_handle=*/NULL);
+  IREE_TRACE_ZONE_END(z0);
+  return wait_status;
+}
+
+iree_status_t iree_wait_any(iree_wait_set_t* set, iree_time_t deadline_ns,
+                            iree_wait_handle_t* out_wake_handle) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // Clear the output up front so a timeout/failure never leaves stale bits.
+  memset(out_wake_handle, 0, sizeof(*out_wake_handle));
+  iree_status_t wait_status =
+      iree_wait_multi(set, deadline_ns, out_wake_handle);
+  IREE_TRACE_ZONE_END(z0);
+  return wait_status;
+}
+
+// Condition used with iree_notification_await: true once the futex value has
+// been set non-zero by a signaler.
+static bool iree_futex_handle_check(iree_futex_handle_t* futex) {
+  int64_t current =
+      iree_atomic_load_int64(&futex->value, iree_memory_order_acquire);
+  return current != 0;
+}
+
+// Waits on a single handle until it is signaled or |deadline_ns| is reached.
+iree_status_t iree_wait_one(iree_wait_handle_t* handle,
+                            iree_time_t deadline_ns) {
+  if (handle->type == IREE_WAIT_PRIMITIVE_TYPE_NONE) {
+    return iree_ok_status();  // nothing to wait on
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_ok_status();
+  if (handle->type == IREE_WAIT_PRIMITIVE_TYPE_LOCAL_FUTEX) {
+    iree_futex_handle_t* futex =
+        (iree_futex_handle_t*)handle->value.local_futex;
+    if (!iree_notification_await(&futex->notification,
+                                 (iree_condition_fn_t)iree_futex_handle_check,
+                                 futex, iree_make_deadline(deadline_ns))) {
+      status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+    }
+  } else {
+    // NOTE: previously this path returned directly, skipping
+    // IREE_TRACE_ZONE_END and leaking the trace zone; route it through the
+    // shared exit instead.
+    status = iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "unhandled primitive type");
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_event_t
+//===----------------------------------------------------------------------===//
+
+iree_status_t iree_event_initialize(bool initial_state,
+                                    iree_event_t* out_event) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  memset(out_event, 0, sizeof(*out_event));
+
+  // Events are backed by a heap-allocated futex + notification pair.
+  iree_futex_handle_t* futex = NULL;
+  iree_status_t status = iree_allocator_malloc(iree_allocator_system(),
+                                               sizeof(*futex), (void**)&futex);
+  if (iree_status_is_ok(status)) {
+    iree_notification_initialize(&futex->notification);
+    iree_atomic_store_int64(&futex->value, initial_state ? 1 : 0,
+                            iree_memory_order_release);
+    out_event->type = IREE_WAIT_PRIMITIVE_TYPE_LOCAL_FUTEX;
+    out_event->value.local_futex = (void*)futex;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases the resources backing |event| by delegating to the generic wait
+// handle close path.
+void iree_event_deinitialize(iree_event_t* event) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_wait_handle_close(event);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_event_set(iree_event_t* event) {
+  if (!event) return;
+  iree_futex_handle_t* futex = (iree_futex_handle_t*)event->value.local_futex;
+  if (!futex) return;
+
+  // Unset -> set transition: only the thread that flips 0 -> 1 performs the
+  // wakeups; if the event was already set this is a no-op.
+  int64_t previous =
+      iree_atomic_exchange_int64(&futex->value, 1, iree_memory_order_release);
+  if (previous != 0) return;
+
+  // Notify those waiting on just this event.
+  iree_notification_post(&futex->notification, IREE_ALL_WAITERS);
+  // Notify any multi-waits that may have this event as part of their set.
+  iree_notification_post(iree_wait_multi_notification(), IREE_ALL_WAITERS);
+}
+
+void iree_event_reset(iree_event_t* event) {
+  if (!event || !event->value.local_futex) return;
+  iree_futex_handle_t* futex = (iree_futex_handle_t*)event->value.local_futex;
+  // Transition set -> unset; resets never notify waiters.
+  iree_atomic_store_int64(&futex->value, 0, iree_memory_order_release);
+}
+
+#endif  // IREE_WAIT_API == IREE_WAIT_API_INPROC
diff --git a/runtime/src/iree/base/internal/wait_handle_kqueue.c b/runtime/src/iree/base/internal/wait_handle_kqueue.c
new file mode 100644
index 0000000..826ce51
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_kqueue.c
@@ -0,0 +1,63 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: must be first to ensure that we can define settings for all includes.
+#include "iree/base/internal/wait_handle_impl.h"
+
+#if IREE_WAIT_API == IREE_WAIT_API_KQUEUE
+
+#include "iree/base/internal/wait_handle_posix.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_wait_set_t
+//===----------------------------------------------------------------------===//
+
+// TODO(benvanik): iree_wait_set_s using a kqueue.
+// Could just cast the kqueue() fd to iree_wait_set_s* to avoid allocs.
+// https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/kqueue.2.html
+// Placeholder set storage until kqueue support is implemented (see TODO
+// above); no fields are meaningful yet.
+struct iree_wait_set_t {
+  int reserved;
+};
+
+iree_status_t iree_wait_set_allocate(iree_host_size_t capacity,
+                                     iree_allocator_t allocator,
+                                     iree_wait_set_t** out_set) {
+  // TODO(benvanik): kqueue support.
+  // Falling off the end of a non-void function is undefined behavior in C;
+  // return a real status so this stub is well-defined if ever compiled.
+  (void)capacity;
+  (void)allocator;
+  *out_set = NULL;
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "kqueue wait sets not yet implemented");
+}
+
+void iree_wait_set_free(iree_wait_set_t* set) {
+  // TODO(benvanik): close() the kqueue fd backing |set|.
+}
+
+iree_status_t iree_wait_set_insert(iree_wait_set_t* set,
+                                   iree_wait_handle_t handle) {
+  // TODO(benvanik): kqueue support.
+  // Return a status instead of falling off the end (undefined behavior).
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "kqueue wait sets not yet implemented");
+}
+
+void iree_wait_set_erase(iree_wait_set_t* set, iree_wait_handle_t handle) {
+  // TODO(benvanik): kqueue support (EV_DELETE on the tracked fd).
+}
+
+void iree_wait_set_clear(iree_wait_set_t* set) {
+  // TODO(benvanik): kqueue support (remove all tracked events).
+}
+
+iree_status_t iree_wait_all(iree_wait_set_t* set, iree_time_t deadline_ns) {
+  // TODO(benvanik): kqueue support.
+  // Return a status instead of falling off the end (undefined behavior).
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "kqueue wait sets not yet implemented");
+}
+
+iree_status_t iree_wait_any(iree_wait_set_t* set, iree_time_t deadline_ns,
+                            iree_wait_handle_t* out_wake_handle) {
+  // TODO(benvanik): kqueue support.
+  // Return a status instead of falling off the end (undefined behavior).
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "kqueue wait sets not yet implemented");
+}
+
+iree_status_t iree_wait_one(iree_wait_handle_t* handle,
+                            iree_time_t deadline_ns) {
+  // TODO(benvanik): kqueue support.
+  // Return a status instead of falling off the end (undefined behavior).
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "kqueue waits not yet implemented");
+}
+
+#endif  // IREE_WAIT_API == IREE_WAIT_API_KQUEUE
diff --git a/runtime/src/iree/base/internal/wait_handle_null.c b/runtime/src/iree/base/internal/wait_handle_null.c
new file mode 100644
index 0000000..0dd8614
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_null.c
@@ -0,0 +1,92 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// clang-format off: must be included before all other headers.
+#include "iree/base/internal/wait_handle_impl.h"
+// clang-format on
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/wait_handle.h"
+#include "iree/base/target_platform.h"
+
+#if IREE_WAIT_API == IREE_WAIT_API_NULL
+
+//===----------------------------------------------------------------------===//
+// iree_wait_primitive_* raw calls
+//===----------------------------------------------------------------------===//
+
+// Closes |handle| by delegating to the generic deinitialize path; the null
+// implementation has no OS resources to release.
+void iree_wait_handle_close(iree_wait_handle_t* handle) {
+  iree_wait_handle_deinitialize(handle);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_wait_set_t
+//===----------------------------------------------------------------------===//
+
+// Placeholder: sets are never successfully allocated in the null
+// implementation so this struct carries no state.
+struct iree_wait_set_t {
+  int reserved;
+};
+
+iree_status_t iree_wait_set_allocate(iree_host_size_t capacity,
+                                     iree_allocator_t allocator,
+                                     iree_wait_set_t** out_set) {
+  // No waiting is possible on this platform so no set is ever allocated.
+  (void)capacity;
+  (void)allocator;
+  *out_set = NULL;
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "wait primitives not available on this platform");
+}
+
+// No-op: allocate never succeeds so there is nothing to free.
+void iree_wait_set_free(iree_wait_set_t* set) {}
+
+// Unreachable in practice: no set can be allocated on this platform.
+iree_status_t iree_wait_set_insert(iree_wait_set_t* set,
+                                   iree_wait_handle_t handle) {
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "wait primitives not available on this platform");
+}
+
+// No-op: nothing can ever be inserted into a set on this platform.
+void iree_wait_set_erase(iree_wait_set_t* set, iree_wait_handle_t handle) {}
+
+// No-op: sets are always empty on this platform.
+void iree_wait_set_clear(iree_wait_set_t* set) {}
+
+// NOTE(review): returns DEADLINE_EXCEEDED (not UNAVAILABLE like the other
+// stubs) — waits behave as if they always time out; confirm this asymmetry
+// is intentional.
+iree_status_t iree_wait_all(iree_wait_set_t* set, iree_time_t deadline_ns) {
+  return iree_make_status(IREE_STATUS_DEADLINE_EXCEEDED,
+                          "wait primitives not available on this platform");
+}
+
+// See the note on iree_wait_all above: waits behave as immediate timeouts.
+// NOTE(review): |out_wake_handle| is left untouched on this path — callers
+// should not read it on failure; confirm against caller expectations.
+iree_status_t iree_wait_any(iree_wait_set_t* set, iree_time_t deadline_ns,
+                            iree_wait_handle_t* out_wake_handle) {
+  return iree_make_status(IREE_STATUS_DEADLINE_EXCEEDED,
+                          "wait primitives not available on this platform");
+}
+
+// See the note on iree_wait_all above: waits behave as immediate timeouts.
+iree_status_t iree_wait_one(iree_wait_handle_t* handle,
+                            iree_time_t deadline_ns) {
+  return iree_make_status(IREE_STATUS_DEADLINE_EXCEEDED,
+                          "wait primitives not available on this platform");
+}
+
+//===----------------------------------------------------------------------===//
+// iree_event_t
+//===----------------------------------------------------------------------===//
+
+iree_status_t iree_event_initialize(bool initial_state,
+                                    iree_event_t* out_event) {
+  // Zero the output so callers always observe a valid (empty) handle even
+  // though initialization always fails on this platform.
+  memset(out_event, 0, sizeof(*out_event));
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "events not available on this platform");
+}
+
+// No-op: events can never be successfully initialized on this platform.
+void iree_event_deinitialize(iree_event_t* event) {}
+
+// No-op: there are no waiters to signal on this platform.
+void iree_event_set(iree_event_t* event) {}
+
+// No-op: events have no state to reset on this platform.
+void iree_event_reset(iree_event_t* event) {}
+
+#endif  // IREE_WAIT_API == IREE_WAIT_API_NULL
diff --git a/runtime/src/iree/base/internal/wait_handle_poll.c b/runtime/src/iree/base/internal/wait_handle_poll.c
new file mode 100644
index 0000000..5eba4e7
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_poll.c
@@ -0,0 +1,406 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: must be first to ensure that we can define settings for all includes.
+#include "iree/base/internal/wait_handle_impl.h"
+
+#if IREE_WAIT_API == IREE_WAIT_API_POLL || IREE_WAIT_API == IREE_WAIT_API_PPOLL
+
+#include <errno.h>
+#include <poll.h>
+#include <time.h>
+
+#include "iree/base/internal/wait_handle_posix.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// Platform utilities
+//===----------------------------------------------------------------------===//
+
+// ppoll is preferred as it has a much better timing mechanism; poll can have a
+// large slop on the deadline as not only is it at ms timeout granularity but
+// in general tends to round more.
+//
+// poll/ppoll may spuriously wake with an EINTR. We don't do anything with that
+// opportunity (no fancy signal stuff), but we do need to retry the poll and
+// ensure that we do so with an updated timeout based on the deadline.
+//
+// Documentation: https://linux.die.net/man/2/poll
+
+#if IREE_WAIT_API == IREE_WAIT_API_POLL
+// Issues poll() with a millisecond timeout derived from |deadline_ns|,
+// retrying on EINTR. On success *out_signaled_count receives the number of
+// fds that have revents set; a zero result (timeout) maps to
+// IREE_STATUS_DEADLINE_EXCEEDED.
+static iree_status_t iree_syscall_poll(struct pollfd* fds, nfds_t nfds,
+                                       iree_time_t deadline_ns,
+                                       int* out_signaled_count) {
+  *out_signaled_count = 0;
+  int rv = -1;
+  do {
+    // Recompute the timeout on each EINTR retry so spurious wakes don't
+    // extend the total wait beyond the caller's deadline.
+    uint32_t timeout_ms = iree_absolute_deadline_to_timeout_ms(deadline_ns);
+    rv = poll(fds, nfds, (int)timeout_ms);
+  } while (rv < 0 && errno == EINTR);
+  if (rv > 0) {
+    // One or more events set.
+    *out_signaled_count = rv;
+    return iree_ok_status();
+  } else if (IREE_UNLIKELY(rv < 0)) {
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "poll failure %d", errno);
+  }
+  // rv == 0
+  // Timeout; no events set.
+  return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+}
+#elif IREE_WAIT_API == IREE_WAIT_API_PPOLL
+// Issues ppoll() with a nanosecond-precision timeout derived from
+// |deadline_ns|, retrying on EINTR. On success *out_signaled_count receives
+// the number of fds that have revents set; a zero result (timeout) maps to
+// IREE_STATUS_DEADLINE_EXCEEDED.
+static iree_status_t iree_syscall_poll(struct pollfd* fds, nfds_t nfds,
+                                       iree_time_t deadline_ns,
+                                       int* out_signaled_count) {
+  *out_signaled_count = 0;
+  int rv = -1;
+  do {
+    // Convert the deadline into a tmo_p struct for ppoll that controls whether
+    // the call is blocking or non-blocking. Note that we must do this every
+    // iteration of the loop as a previous ppoll may have taken some of the
+    // time.
+    //
+    // See the ppoll docs for more information as to what the expected value is:
+    // http://man7.org/linux/man-pages/man2/poll.2.html
+    struct timespec timeout_ts;
+    struct timespec* tmo_p = &timeout_ts;
+    if (deadline_ns == IREE_TIME_INFINITE_PAST) {
+      // Block never.
+      memset(&timeout_ts, 0, sizeof(timeout_ts));
+    } else if (deadline_ns == IREE_TIME_INFINITE_FUTURE) {
+      // Block forever (NULL timeout to ppoll).
+      tmo_p = NULL;
+    } else {
+      // Wait only for as much time as we have before the deadline is exceeded.
+      iree_duration_t timeout_ns = deadline_ns - iree_time_now();
+      if (timeout_ns < 0) {
+        // We've reached the deadline; we'll still perform the poll though as
+        // the caller is likely expecting that behavior (intentional context
+        // switch/thread yield/etc).
+        memset(&timeout_ts, 0, sizeof(timeout_ts));
+      } else {
+        timeout_ts.tv_sec = (time_t)(timeout_ns / 1000000000ull);
+        timeout_ts.tv_nsec = (long)(timeout_ns % 1000000000ull);
+      }
+    }
+    rv = ppoll(fds, nfds, tmo_p, NULL);
+  } while (rv < 0 && errno == EINTR);
+  if (rv > 0) {
+    // One or more events set.
+    *out_signaled_count = rv;
+    return iree_ok_status();
+  } else if (rv < 0) {
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "ppoll failure %d", errno);
+  }
+  // rv == 0
+  // Timeout; no events set.
+  return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+}
+#else
+#error "unsupported IREE_WAIT_API value"
+#endif  // IREE_WAIT_API
+
+//===----------------------------------------------------------------------===//
+// iree_wait_set_t
+//===----------------------------------------------------------------------===//
+
+// Wait set backed by a single allocation laid out as:
+//   [iree_wait_set_t][user_handles x capacity][poll_fds x capacity]
+// (see iree_wait_set_allocate for the pointer fixup).
+struct iree_wait_set_t {
+  iree_allocator_t allocator;
+
+  // Total capacity of each handle list.
+  iree_host_size_t handle_capacity;
+
+  // Total number of valid user_handles/poll_fds.
+  iree_host_size_t handle_count;
+
+  // User-provided handles.
+  // We only really need to track these so that we can preserve the handle
+  // types; we could either just do that (a few bytes) or keep them here as-is
+  // where they are a bit easier to debug.
+  iree_wait_handle_t* user_handles;
+
+  // Native list of fds+req we can pass to poll/ppoll/etc and that will receive
+  // the output information like which events were triggered during the wait.
+  //
+  // pollfd::events is specified when the fds are added to the set and then each
+  // wait pollfd::revents is modified during the poll syscall.
+  struct pollfd* poll_fds;
+};
+
+// Allocates a wait set with a fixed |capacity| as one slab: the set struct
+// followed by the user handle list and the pollfd list.
+iree_status_t iree_wait_set_allocate(iree_host_size_t capacity,
+                                     iree_allocator_t allocator,
+                                     iree_wait_set_t** out_set) {
+  IREE_ASSERT_ARGUMENT(out_set);
+
+  // Be reasonable; 64K objects is too high (even if poll supports it, which is
+  // hard to tell if it does).
+  if (capacity >= UINT16_MAX) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "wait set capacity of %zu is unreasonably large",
+                            capacity);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_host_size_t user_handle_list_size =
+      capacity * iree_sizeof_struct(iree_wait_handle_t);
+  iree_host_size_t poll_fd_list_size = capacity * sizeof(struct pollfd);
+  iree_host_size_t total_size = iree_sizeof_struct(iree_wait_set_t) +
+                                user_handle_list_size + poll_fd_list_size;
+
+  iree_wait_set_t* set = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(allocator, total_size, (void**)&set));
+  set->allocator = allocator;
+  set->handle_capacity = capacity;
+  // NOTE: safe before the trailing array pointers are wired up below because
+  // this implementation's clear only resets handle_count.
+  iree_wait_set_clear(set);
+
+  // Point the lists at the trailing storage of the single slab allocation.
+  set->user_handles =
+      (iree_wait_handle_t*)((uint8_t*)set +
+                            iree_sizeof_struct(iree_wait_set_t));
+  set->poll_fds =
+      (struct pollfd*)((uint8_t*)set->user_handles + user_handle_list_size);
+
+  *out_set = set;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Frees |set| (a single allocation; the trailing handle/pollfd lists go with
+// it). Safe to call with NULL.
+void iree_wait_set_free(iree_wait_set_t* set) {
+  if (!set) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_allocator_free(set->allocator, set);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns true if |set| contains no handles.
+bool iree_wait_set_is_empty(const iree_wait_set_t* set) {
+  // NOTE: previously `!= 0`, which inverted the predicate and reported
+  // non-empty sets as empty (and vice versa).
+  return set->handle_count == 0;
+}
+
+// Adds |handle| to |set|, recording both the user handle (to preserve type
+// information) and the pollfd entry passed to the poll syscall.
+iree_status_t iree_wait_set_insert(iree_wait_set_t* set,
+                                   iree_wait_handle_t handle) {
+  if (set->handle_count + 1 > set->handle_capacity) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "wait set capacity reached");
+  }
+
+  iree_host_size_t index = set->handle_count++;
+
+  iree_wait_handle_t* user_handle = &set->user_handles[index];
+  iree_wait_handle_wrap_primitive(handle.type, handle.value, user_handle);
+
+  // NOTE: poll will ignore any negative fds.
+  struct pollfd* poll_fd = &set->poll_fds[index];
+  poll_fd->fd = iree_wait_primitive_get_read_fd(&handle);
+  poll_fd->events = POLLIN | POLLPRI;  // implicit POLLERR | POLLHUP | POLLNVAL
+  poll_fd->revents = 0;
+
+  return iree_ok_status();
+}
+
+// Removes |handle| from |set| by swapping it with the last element (list
+// order is not guaranteed to callers).
+void iree_wait_set_erase(iree_wait_set_t* set, iree_wait_handle_t handle) {
+  // Find the user handle in the set. This either requires a linear scan to
+  // find the matching user handle or - if valid - we can use the native index
+  // set after an iree_wait_any wake to do a quick lookup.
+  iree_host_size_t index = handle.set_internal.index;
+  if (IREE_UNLIKELY(index >= set->handle_count) ||
+      IREE_UNLIKELY(!iree_wait_primitive_compare_identical(
+          &set->user_handles[index], &handle))) {
+    // Fallback to a linear scan of (hopefully) a small list.
+    for (iree_host_size_t i = 0; i < set->handle_count; ++i) {
+      if (iree_wait_primitive_compare_identical(&set->user_handles[i],
+                                                &handle)) {
+        index = i;
+        break;
+      }
+    }
+  }
+
+  // Remove from both handle lists.
+  // Since we make no guarantees about the order of the lists we can just swap
+  // with the last value.
+  // NOTE(review): if |handle| is not actually present, |index| may be stale
+  // from set_internal — assumes callers only erase previously-inserted
+  // handles; TODO confirm. tail_index (signed) vs index (unsigned) compare is
+  // fine while handle_count >= 1.
+  int tail_index = (int)set->handle_count - 1;
+  if (tail_index > index) {
+    memcpy(&set->poll_fds[index], &set->poll_fds[tail_index],
+           sizeof(*set->poll_fds));
+    memcpy(&set->user_handles[index], &set->user_handles[tail_index],
+           sizeof(*set->user_handles));
+  }
+  --set->handle_count;
+}
+
+// Drops all handles; the backing storage is reused by subsequent inserts.
+void iree_wait_set_clear(iree_wait_set_t* set) { set->handle_count = 0; }
+
+// Maps a poll revent bitfield result to a status (on failure) and an indicator
+// of whether the event was signaled. |out_signaled| is only written on the
+// success path.
+static iree_status_t iree_wait_set_resolve_poll_events(short revents,
+                                                       bool* out_signaled) {
+  // Error bits are checked in priority order; any one of them fails the wait.
+  if (revents & POLLERR) {
+    return iree_make_status(IREE_STATUS_INTERNAL, "POLLERR on fd");
+  }
+  if (revents & POLLHUP) {
+    return iree_make_status(IREE_STATUS_CANCELLED, "POLLHUP on fd");
+  }
+  if (revents & POLLNVAL) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "POLLNVAL on fd");
+  }
+  *out_signaled = (revents & POLLIN) != 0;
+  return iree_ok_status();
+}
+
+iree_status_t iree_wait_all(iree_wait_set_t* set, iree_time_t deadline_ns) {
+  // Make the syscall only when we have at least one valid fd.
+  // Don't use this as a sleep.
+  if (set->handle_count <= 0) {
+    return iree_ok_status();
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(benvanik): see if we can use tracy's mutex tracking to make waits
+  // nicer (at least showing signal->wait relations).
+
+  // Certain poll implementations have a nasty behavior where they allow
+  // negative fds to ignore entries... except for at [0]. To avoid any
+  // additional tracking here we manage a local pollfd list that we keep offset
+  // to the first non-negative fd.
+  //
+  // Gotcha is buried in here (and various spooky bug reports on the web):
+  // https://manpages.debian.org/buster/manpages-dev/poll.2.en.html
+  //   This provides an easy way of ignoring a file descriptor for a single
+  //   poll() call: simply negate the fd field. Note, however, that this
+  //   technique can't be used to ignore file descriptor 0.
+  struct pollfd* poll_fd_base = set->poll_fds;
+  nfds_t poll_fd_count = set->handle_count;
+
+  // Wait-all requires that we repeatedly poll until all handles have been
+  // signaled. To reduce overhead (and not miss events) we mark any handle we
+  // have successfully polled as invalid (fd<0) so that the kernel ignores it.
+  // Only when all handles are invalid does it mean that we've actually waited
+  // for all of them.
+  iree_status_t status = iree_ok_status();
+  int unsignaled_count = (int)poll_fd_count;
+  do {
+    // Eat any negative handles at the start to avoid the mentioned fd[0] bug.
+    while (poll_fd_base[0].fd < 0) {
+      ++poll_fd_base;
+      --poll_fd_count;
+    }
+
+    int signaled_count = 0;
+    status = iree_syscall_poll(poll_fd_base, poll_fd_count, deadline_ns,
+                               &signaled_count);
+    if (!iree_status_is_ok(status)) {
+      // Failed during the poll itself (including deadline exceeded); bail and
+      // fall through to restore the poll_fds handle list.
+      break;
+    }
+    unsignaled_count -= signaled_count;
+
+    // Neuter any that have successfully resolved.
+    for (nfds_t i = 0; i < poll_fd_count; ++i) {
+      if (poll_fd_base[i].fd < 0) continue;
+      bool signaled = false;
+      status =
+          iree_wait_set_resolve_poll_events(poll_fd_base[i].revents, &signaled);
+      if (!iree_status_is_ok(status)) {
+        // One (or more) fds had an issue (POLLERR/POLLHUP/POLLNVAL); stop
+        // scanning and exit the wait loop below.
+        break;
+      }
+      if (signaled) {
+        // Negate fd so that we ignore it in the next poll.
+        poll_fd_base[i].fd = -poll_fd_base[i].fd;
+      }
+    }
+    // NOTE: the status check is required here: the break above only exits the
+    // inner scan loop and an erroring fd is never neutered, so without it a
+    // persistently-erroring fd would spin this loop forever.
+  } while (unsignaled_count > 0 && iree_status_is_ok(status));
+
+  // Restore any fds we neutered during the wait so the next wait can happen.
+  // Only flip back fds that are actually negative: unconditionally negating
+  // would corrupt still-positive fds on early exits (e.g. deadline exceeded),
+  // permanently hiding those handles from all future polls on this set.
+  for (nfds_t i = 0; i < set->handle_count; ++i) {
+    if (set->poll_fds[i].fd < 0) {
+      set->poll_fds[i].fd = -set->poll_fds[i].fd;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Waits until any handle in |set| is signaled (or the deadline is reached).
+// On success the first signaled handle found is stored into |out_wake_handle|
+// with set_internal.index populated to accelerate a subsequent erase.
+iree_status_t iree_wait_any(iree_wait_set_t* set, iree_time_t deadline_ns,
+                            iree_wait_handle_t* out_wake_handle) {
+  // Make the syscall only when we have at least one valid fd.
+  // Don't use this as a sleep.
+  if (set->handle_count <= 0) {
+    memset(out_wake_handle, 0, sizeof(*out_wake_handle));
+    return iree_ok_status();
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(benvanik): see if we can use tracy's mutex tracking to make waits
+  // nicer (at least showing signal->wait relations).
+
+  // Wait-any lets us just poll all the handles we have without needing to worry
+  // about whether all of them were signaled.
+  int signaled_count = 0;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_syscall_poll(set->poll_fds, set->handle_count, deadline_ns,
+                            &signaled_count));
+
+  // Find at least one signaled handle.
+  memset(out_wake_handle, 0, sizeof(*out_wake_handle));
+  if (signaled_count > 0) {
+    for (iree_host_size_t i = 0; i < set->handle_count; ++i) {
+      bool signaled = false;
+      IREE_RETURN_AND_END_ZONE_IF_ERROR(
+          z0, iree_wait_set_resolve_poll_events(set->poll_fds[i].revents,
+                                                &signaled));
+      if (signaled) {
+        // Preserve the handle type and stash the index for fast erases.
+        memcpy(out_wake_handle, &set->user_handles[i],
+               sizeof(*out_wake_handle));
+        out_wake_handle->set_internal.index = i;
+        break;
+      }
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Waits on a single handle until it is signaled or |deadline_ns| is reached.
+iree_status_t iree_wait_one(iree_wait_handle_t* handle,
+                            iree_time_t deadline_ns) {
+  struct pollfd poll_fds;
+  poll_fds.fd = iree_wait_primitive_get_read_fd(handle);
+  if (poll_fds.fd == -1) {
+    // No underlying fd: nothing to wait on.
+    // NOTE: was `return false;` — an int in an iree_status_t-returning
+    // function that only happened to act as OK via the null pointer constant;
+    // return a real status value instead.
+    return iree_ok_status();
+  }
+  poll_fds.events = POLLIN;
+  poll_fds.revents = 0;
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(benvanik): see if we can use tracy's mutex tracking to make waits
+  // nicer (at least showing signal->wait relations).
+
+  // Just check for our single handle/event.
+  // The benefit of this is that we didn't need to heap alloc the pollfds and
+  // the cache should all stay hot. Reusing the same iree_syscall_pool as the
+  // multi-wait variants ensures consistent handling (and the same syscall
+  // showing in strace/tracy/etc).
+  int signaled_count = 0;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_syscall_poll(&poll_fds, 1, deadline_ns, &signaled_count));
+
+  IREE_TRACE_ZONE_END(z0);
+  return signaled_count ? iree_ok_status()
+                        : iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+}
+
+#endif  // IREE_WAIT_API == IREE_WAIT_API_POLL ||
+        // IREE_WAIT_API == IREE_WAIT_API_PPOLL
diff --git a/runtime/src/iree/base/internal/wait_handle_posix.c b/runtime/src/iree/base/internal/wait_handle_posix.c
new file mode 100644
index 0000000..fcec4b8
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_posix.c
@@ -0,0 +1,288 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/wait_handle_posix.h"
+
+#include "iree/base/tracing.h"
+
+#if defined(IREE_WAIT_API_POSIX_LIKE)
+
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#if defined(IREE_HAVE_WAIT_TYPE_EVENTFD)
+#include <sys/eventfd.h>
+#endif  // IREE_HAVE_WAIT_TYPE_EVENTFD
+#if defined(IREE_HAVE_WAIT_TYPE_SYNC_FILE)
+#include <android/sync.h>
+#endif  // IREE_HAVE_WAIT_TYPE_SYNC_FILE
+
+//===----------------------------------------------------------------------===//
+// iree_wait_primitive_* raw calls
+//===----------------------------------------------------------------------===//
+
+#if defined(IREE_HAVE_WAIT_TYPE_EVENTFD)
+// Creates an eventfd-backed wait handle, optionally starting signaled (the
+// eventfd counter begins at 1 when |initial_state| is true).
+static iree_status_t iree_wait_primitive_create_eventfd(
+    bool initial_state, iree_wait_handle_t* out_handle) {
+  memset(out_handle, 0, sizeof(*out_handle));
+  out_handle->type = IREE_WAIT_PRIMITIVE_TYPE_EVENT_FD;
+
+  // Non-blocking + close-on-exec eventfd:
+  // https://man7.org/linux/man-pages/man2/eventfd.2.html
+  int fd = eventfd(initial_state ? 1 : 0, EFD_CLOEXEC | EFD_NONBLOCK);
+  out_handle->value.event.fd = fd;
+  if (IREE_UNLIKELY(fd == -1)) {
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "failed to create eventfd (%d)", errno);
+  }
+  return iree_ok_status();
+}
+#endif  // IREE_HAVE_WAIT_TYPE_EVENTFD
+
+#if defined(IREE_HAVE_WAIT_TYPE_PIPE)
+// Creates a pipe-backed wait handle (read fd is waited on, write fd is
+// signaled), optionally pre-signaled by writing one byte to the pipe.
+static iree_status_t iree_wait_primitive_create_pipe(
+    bool initial_state, iree_wait_handle_t* out_handle) {
+  memset(out_handle, 0, sizeof(*out_handle));
+  out_handle->type = IREE_WAIT_PRIMITIVE_TYPE_PIPE;
+
+  // Create read (fds[0]) and write (fds[1]) handles.
+  // https://man7.org/linux/man-pages/man2/pipe.2.html
+  if (IREE_UNLIKELY(pipe(out_handle->value.pipe.fds) < 0)) {
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "failed to create pipe (%d)", errno);
+  }
+
+  // Set both fds to non-blocking.
+  // NOTE: we could use pipe2 when available on linux to avoid the need for the
+  // fcntl, but BSD/darwin/etc don't have it so we'd still need a fallback. This
+  // is effectively the same as passing O_NONBLOCK to pipe2.
+  for (int i = 0; i < 2; ++i) {
+    if (IREE_UNLIKELY(
+            fcntl(out_handle->value.pipe.fds[i], F_SETFL, O_NONBLOCK) < 0)) {
+      // BUGFIX: close both pipe fds before returning so they don't leak on
+      // failure (matching the initial_state failure path below); save errno
+      // first as the close syscalls may clobber it.
+      int fcntl_errno = errno;
+      iree_wait_handle_close(out_handle);
+      return iree_make_status(iree_status_code_from_errno(fcntl_errno),
+                              "failed to set pipe fd %d to non-blocking (%d)",
+                              i, fcntl_errno);
+    }
+  }
+
+  // Initially triggered means we just write once to the pipe.
+  // This write must not fail as if the caller requested the state they would
+  // likely deadlock if the first read would block.
+  if (initial_state) {
+    iree_status_t status = iree_wait_primitive_write(out_handle);
+    if (!iree_status_is_ok(status)) {
+      iree_wait_handle_close(out_handle);
+      return status;
+    }
+  }
+
+  return iree_ok_status();
+}
+#endif  // IREE_HAVE_WAIT_TYPE_PIPE
+
+// Creates whichever native wait primitive this build supports, preferring
+// eventfd over pipe; fails with UNIMPLEMENTED when neither type is compiled
+// in. The returned handle must be released with iree_wait_handle_close.
+iree_status_t iree_wait_primitive_create_native(
+    bool initial_state, iree_wait_handle_t* out_handle) {
+  memset(out_handle, 0, sizeof(*out_handle));
+#if defined(IREE_HAVE_WAIT_TYPE_EVENTFD)
+  // Always prefer eventfd when present; they rock.
+  return iree_wait_primitive_create_eventfd(initial_state, out_handle);
+#elif defined(IREE_HAVE_WAIT_TYPE_PIPE)
+  // Pipes are fine but much heavier than eventfds.
+  return iree_wait_primitive_create_pipe(initial_state, out_handle);
+#else
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "no native wait handle type supported");
+#endif  // IREE_HAVE_WAIT_TYPE_*
+}
+
+// Closes a single fd, retrying on EINTR and intentionally ignoring failures.
+static void iree_wait_handle_close_fd(int fd) {
+  int rv;
+  IREE_SYSCALL(rv, close(fd));
+  // NOTE: we could fail to close if the handle is invalid/already closed/etc.
+  // As Windows has undefined behavior when handles are closed while there are
+  // active waits we don't use fd closes as load-bearing operations and it's
+  // fine to ignore the error.
+  (void)rv;  // deliberately unused per the note above; silences
+             // -Wunused-but-set-variable.
+}
+
+// Closes the platform fd(s) backing |handle| (if any) and then deinitializes
+// it. Handle types that are not compiled in (or have no fds) fall through to
+// the no-op default and are only deinitialized.
+void iree_wait_handle_close(iree_wait_handle_t* handle) {
+  switch (handle->type) {
+#if defined(IREE_HAVE_WAIT_TYPE_EVENTFD)
+    case IREE_WAIT_PRIMITIVE_TYPE_EVENT_FD: {
+      iree_wait_handle_close_fd(handle->value.event.fd);
+      break;
+    }
+#endif  // IREE_HAVE_WAIT_TYPE_EVENTFD
+#if defined(IREE_HAVE_WAIT_TYPE_SYNC_FILE)
+    case IREE_WAIT_PRIMITIVE_TYPE_SYNC_FILE:
+      iree_wait_handle_close_fd(handle->value.sync_file.fd);
+      break;
+#endif  // IREE_HAVE_WAIT_TYPE_SYNC_FILE
+#if defined(IREE_HAVE_WAIT_TYPE_PIPE)
+    case IREE_WAIT_PRIMITIVE_TYPE_PIPE: {
+      // Pipes own two fds (read + write); both must be closed.
+      iree_wait_handle_close_fd(handle->value.pipe.read_fd);
+      iree_wait_handle_close_fd(handle->value.pipe.write_fd);
+      break;
+    }
+#endif  // IREE_HAVE_WAIT_TYPE_PIPE
+    default:
+      break;
+  }
+  iree_wait_handle_deinitialize(handle);
+}
+
+// Two handles are identical only when both the type tag and the raw primitive
+// value bits match exactly; clones of the same primitive compare identical.
+bool iree_wait_primitive_compare_identical(const iree_wait_handle_t* lhs,
+                                           const iree_wait_handle_t* rhs) {
+  if (lhs->type != rhs->type) return false;
+  return memcmp(&lhs->value, &rhs->value, sizeof(lhs->value)) == 0;
+}
+
+// Returns the fd that can be polled/read to observe |handle| signaling, or -1
+// when the handle's type has no pollable fd (or was not compiled in).
+int iree_wait_primitive_get_read_fd(const iree_wait_handle_t* handle) {
+  switch (handle->type) {
+#if defined(IREE_HAVE_WAIT_TYPE_EVENTFD)
+    case IREE_WAIT_PRIMITIVE_TYPE_EVENT_FD:
+      return handle->value.event.fd;
+#endif  // IREE_HAVE_WAIT_TYPE_EVENTFD
+#if defined(IREE_HAVE_WAIT_TYPE_SYNC_FILE)
+    case IREE_WAIT_PRIMITIVE_TYPE_SYNC_FILE:
+      return handle->value.sync_file.fd;
+#endif  // IREE_HAVE_WAIT_TYPE_SYNC_FILE
+#if defined(IREE_HAVE_WAIT_TYPE_PIPE)
+    case IREE_WAIT_PRIMITIVE_TYPE_PIPE:
+      // Only the read end is pollable; the write end is for signaling.
+      return handle->value.pipe.read_fd;
+#endif  // IREE_HAVE_WAIT_TYPE_PIPE
+    default:
+      return -1;
+  }
+}
+
+// Consumes one signal value from |handle|. Only non-blocking polls
+// (|deadline_ns| == IREE_TIME_INFINITE_PAST) are implemented today; a read
+// that would block returns a bare DEADLINE_EXCEEDED code (no status payload).
+iree_status_t iree_wait_primitive_read(iree_wait_handle_t* handle,
+                                       iree_time_t deadline_ns) {
+  // Until we need it this does not support anything but polling.
+  // If we want to support auto reset events we'd want to implement blocking.
+  if (deadline_ns != IREE_TIME_INFINITE_PAST) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "reads are just polls today");
+  }
+
+  int rv = -1;
+  switch (handle->type) {
+    case IREE_WAIT_PRIMITIVE_TYPE_NONE:
+      return iree_ok_status();  // no-op
+#if defined(IREE_HAVE_WAIT_TYPE_EVENTFD)
+    case IREE_WAIT_PRIMITIVE_TYPE_EVENT_FD: {
+      // Reads (and resets, in the default mode) the eventfd counter.
+      eventfd_t val = 0;
+      IREE_SYSCALL(rv, eventfd_read(handle->value.event.fd, &val));
+      break;
+    }
+#endif  // IREE_HAVE_WAIT_TYPE_EVENTFD
+#if defined(IREE_HAVE_WAIT_TYPE_SYNC_FILE)
+    case IREE_WAIT_PRIMITIVE_TYPE_SYNC_FILE:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "sync files not yet implemented");
+#endif  // IREE_HAVE_WAIT_TYPE_SYNC_FILE
+#if defined(IREE_HAVE_WAIT_TYPE_PIPE)
+    case IREE_WAIT_PRIMITIVE_TYPE_PIPE: {
+      // One signal == one byte down the pipe.
+      char buf;
+      IREE_SYSCALL(rv, read(handle->value.pipe.read_fd, &buf, 1));
+      break;
+    }
+#endif  // IREE_HAVE_WAIT_TYPE_PIPE
+    default:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "unhandled wait type %d", (int)handle->type);
+  }
+  if (rv >= 0) {
+    // Read completed successfully.
+    return iree_ok_status();
+  } else if (errno == EWOULDBLOCK) {
+    // Would have blocked meaning that there's no data waiting.
+    // NOTE: we purposefully avoid a full status result here as this is a
+    // non-exceptional result.
+    return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  } else {
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "fd read failure %d", errno);
+  }
+}
+
+// Writes one signal value to |handle|, waking any waiters. The value written
+// is primitive-specific (eventfd counter increment vs one byte into the pipe).
+iree_status_t iree_wait_primitive_write(iree_wait_handle_t* handle) {
+  int rv = -1;
+  switch (handle->type) {
+    case IREE_WAIT_PRIMITIVE_TYPE_NONE:
+      return iree_ok_status();  // no-op
+#if defined(IREE_HAVE_WAIT_TYPE_EVENTFD)
+    case IREE_WAIT_PRIMITIVE_TYPE_EVENT_FD: {
+      IREE_SYSCALL(rv, eventfd_write(handle->value.event.fd, 1ull));
+      break;
+    }
+#endif  // IREE_HAVE_WAIT_TYPE_EVENTFD
+#if defined(IREE_HAVE_WAIT_TYPE_SYNC_FILE)
+    case IREE_WAIT_PRIMITIVE_TYPE_SYNC_FILE:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "sync files not yet implemented");
+#endif  // IREE_HAVE_WAIT_TYPE_SYNC_FILE
+#if defined(IREE_HAVE_WAIT_TYPE_PIPE)
+    case IREE_WAIT_PRIMITIVE_TYPE_PIPE: {
+      // The byte value is arbitrary; only its arrival matters.
+      char buf = '\n';
+      IREE_SYSCALL(rv, write(handle->value.pipe.write_fd, &buf, 1));
+      break;
+    }
+#endif  // IREE_HAVE_WAIT_TYPE_PIPE
+    default:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unhandled wait type");
+  }
+  if (rv >= 0) {
+    // Write completed successfully.
+    return iree_ok_status();
+  } else {
+    return iree_make_status(iree_status_code_from_errno(errno),
+                            "fd write failure %d", errno);
+  }
+}
+
+// Drains |handle| of all pending signal values without ever blocking.
+// Depending on how the fd was configured a single read may reset the entire
+// handle (default eventfd mode) or multiple reads may be required (semaphore
+// mode), so we poll reads until one reports it would block.
+iree_status_t iree_wait_primitive_clear(iree_wait_handle_t* handle) {
+  // Null handles have nothing to clear.
+  if (handle->type == IREE_WAIT_PRIMITIVE_TYPE_NONE) return iree_ok_status();
+
+  for (;;) {
+    iree_status_t status =
+        iree_wait_primitive_read(handle, IREE_TIME_INFINITE_PAST);
+    if (iree_status_is_ok(status)) continue;  // consumed a value; keep going
+    if (iree_status_is_deadline_exceeded(status)) {
+      // Would have blocked reading which means we've cleared the fd.
+      return iree_ok_status();
+    }
+    return status;  // real read failure
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// iree_event_t
+//===----------------------------------------------------------------------===//
+
+// Initializes |out_event| with the platform's preferred native primitive;
+// the event starts signaled when |initial_state| is true.
+iree_status_t iree_event_initialize(bool initial_state,
+                                    iree_event_t* out_event) {
+  return iree_wait_primitive_create_native(initial_state, out_event);
+}
+
+// Releases the primitive backing |event| (closing its fd(s), if any).
+void iree_event_deinitialize(iree_event_t* event) {
+  iree_wait_handle_close(event);
+}
+
+// Signals |event|; write failures are intentionally swallowed
+// (IREE_IGNORE_ERROR) as set has no way to report them to callers.
+void iree_event_set(iree_event_t* event) {
+  IREE_IGNORE_ERROR(iree_wait_primitive_write(event));
+}
+
+// Unsignals |event| by draining all pending values; failures are ignored for
+// the same reason as iree_event_set.
+void iree_event_reset(iree_event_t* event) {
+  IREE_IGNORE_ERROR(iree_wait_primitive_clear(event));
+}
+
+#endif  // IREE_WAIT_API_POSIX_LIKE
diff --git a/runtime/src/iree/base/internal/wait_handle_posix.h b/runtime/src/iree/base/internal/wait_handle_posix.h
new file mode 100644
index 0000000..bf77093
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_posix.h
@@ -0,0 +1,77 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: must be first to ensure that we can define settings for all includes.
+#include "iree/base/internal/wait_handle_impl.h"
+
+#ifndef IREE_BASE_INTERNAL_WAIT_HANDLE_POSIX_H_
+#define IREE_BASE_INTERNAL_WAIT_HANDLE_POSIX_H_
+
+#if defined(IREE_WAIT_API_POSIX_LIKE)
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Perform a syscall with a retry on EINTR (spurious wake/signal/etc).
+//
+// Usage:
+//  int rv;
+//  IREE_SYSCALL(rv, fcntl(...));
+//  if (rv < 0) { /* failure */ }
+//
+// NOTE: no trailing semicolon — the do/while(0)-style wrapper exists so the
+// caller supplies it, keeping `if (...) IREE_SYSCALL(...); else ...` valid
+// and avoiding stray empty statements after expansion.
+#define IREE_SYSCALL(result_value, expr) \
+  do {                                   \
+    result_value = expr;                 \
+  } while (result_value < 0 && errno == EINTR)
+
+// NOTE: these are intended for low-level signaling and may expose various
+// platform quirks to the caller. Always prefer using a higher level type such
+// as iree_event_t when possible.
+
+// Creates a wait primitive of the type native to the current platform.
+// May fail if resources are exhausted or wait handles are not supported.
+// The handle must be closed with iree_wait_handle_close to release its
+// resources.
+iree_status_t iree_wait_primitive_create_native(bool initial_state,
+                                                iree_wait_handle_t* out_handle);
+
+// Closes an existing handle from iree_wait_primitive_create_native or
+// iree_wait_primitive_clone. Must not be called while there are any waiters on
+// the handle.
+void iree_wait_handle_close(iree_wait_handle_t* handle);
+
+// Returns true if the two handles are identical in representation.
+// Note that two unique handles may point to the same underlying primitive
+// object (such as when they have been cloned).
+bool iree_wait_primitive_compare_identical(const iree_wait_handle_t* lhs,
+                                           const iree_wait_handle_t* rhs);
+
+// Returns an fd that can be used to read/wait on the handle.
+// Returns -1 if the handle is invalid.
+int iree_wait_primitive_get_read_fd(const iree_wait_handle_t* handle);
+
+// Reads a nonce from the given handle and blocks the caller if none are
+// available. IREE_TIME_INFINITE_PAST can be used to poll (the call will never
+// block) and IREE_TIME_INFINITE_FUTURE can be used to block until the primitive
+// is written.
+iree_status_t iree_wait_primitive_read(iree_wait_handle_t* handle,
+                                       iree_time_t deadline_ns);
+
+// Writes a nonce to the given handle causing it to signal any waiters.
+// The exact value written is platform/primitive specific.
+iree_status_t iree_wait_primitive_write(iree_wait_handle_t* handle);
+
+// Clears the wait primitive by repeatedly reading values until no more remain.
+// Never blocks the caller.
+iree_status_t iree_wait_primitive_clear(iree_wait_handle_t* handle);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_WAIT_API_POSIX_LIKE
+
+#endif  // IREE_BASE_INTERNAL_WAIT_HANDLE_POSIX_H_
diff --git a/runtime/src/iree/base/internal/wait_handle_test.cc b/runtime/src/iree/base/internal/wait_handle_test.cc
new file mode 100644
index 0000000..c022aee
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_test.cc
@@ -0,0 +1,857 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/wait_handle.h"
+
+#if !defined(IREE_WAIT_HANDLE_DISABLED)
+
+#include <atomic>
+#include <chrono>
+#include <cstddef>
+#include <cstring>
+#include <thread>
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace {
+
+// We don't want to wait too long in here but when we are testing that timeouts
+// work as expected we do have to sometimes wait. These are set to hopefully
+// reduce flakes and not hang a build bot forever if something is broken :)
+constexpr iree_duration_t kShortTimeoutNS = 1000000ull;     // 1ms
+constexpr iree_duration_t kLongTimeoutNS = 60000000000ull;  // 1min
+
+//===----------------------------------------------------------------------===//
+// IREE_WAIT_PRIMITIVE_TYPE_EVENT_FD
+//===----------------------------------------------------------------------===//
+
+#if defined(IREE_HAVE_WAIT_TYPE_EVENTFD)
+
+// TODO(benvanik): tests wrapping external eventfds.
+
+#endif  // IREE_HAVE_WAIT_TYPE_EVENTFD
+
+//===----------------------------------------------------------------------===//
+// IREE_WAIT_PRIMITIVE_TYPE_SYNC_FILE
+//===----------------------------------------------------------------------===//
+
+#if defined(IREE_HAVE_WAIT_TYPE_SYNC_FILE)
+
+// TODO(benvanik): tests wrapping external sync files.
+
+#endif  // IREE_HAVE_WAIT_TYPE_SYNC_FILE
+
+//===----------------------------------------------------------------------===//
+// IREE_WAIT_PRIMITIVE_TYPE_PIPE
+//===----------------------------------------------------------------------===//
+
+#if defined(IREE_HAVE_WAIT_TYPE_PIPE)
+
+// TODO(benvanik): tests wrapping external pipes.
+
+#endif  // IREE_HAVE_WAIT_TYPE_PIPE
+
+//===----------------------------------------------------------------------===//
+// IREE_WAIT_PRIMITIVE_TYPE_WIN32_HANDLE
+//===----------------------------------------------------------------------===//
+
+#if defined(IREE_HAVE_WAIT_TYPE_WIN32_HANDLE)
+
+// TODO(benvanik): tests wrapping external win32 handles.
+
+#endif  // IREE_HAVE_WAIT_TYPE_WIN32_HANDLE
+
+//===----------------------------------------------------------------------===//
+// iree_event_t
+//===----------------------------------------------------------------------===//
+// NOTE: this is testing the user-visible behavior of iree_event_t and the use
+// of functions like iree_wait_one is not exhaustive as that is tested
+// elsewhere.
+
+// Tests that we don't leak: create an initially-signaled event and release it
+// immediately without ever waiting on it.
+TEST(Event, Lifetime) {
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &event));
+  iree_event_deinitialize(&event);
+}
+
+// An event created unsignaled must not satisfy an immediate poll.
+TEST(Event, WaitOneInitialFalse) {
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED,
+                        iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+  iree_event_deinitialize(&event);
+}
+
+// An event created signaled must satisfy an immediate poll.
+TEST(Event, WaitOneInitialTrue) {
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &event));
+  IREE_EXPECT_OK(iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+  iree_event_deinitialize(&event);
+}
+
+// Tests an event that was wrapped from an immediate primitive.
+// These are used to neuter events in lists/sets and should be no-ops:
+// both set and reset have no effect and polls always succeed immediately.
+TEST(Event, ImmediateEvent) {
+  iree_event_t event;
+  iree_wait_handle_wrap_primitive(IREE_WAIT_PRIMITIVE_TYPE_NONE, {0}, &event);
+  iree_event_set(&event);
+  IREE_EXPECT_OK(iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+  iree_event_reset(&event);
+  IREE_EXPECT_OK(iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+}
+
+// Exercises the full unsignaled -> set -> wait -> reset cycle with polls.
+TEST(Event, SetWait) {
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+
+  // Initially unset.
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED,
+                        iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+
+  // Set and wait.
+  iree_event_set(&event);
+  IREE_EXPECT_OK(iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+
+  // Set should be sticky until reset manually.
+  IREE_EXPECT_OK(iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+
+  // Resetting should unsignal the event.
+  iree_event_reset(&event);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED,
+                        iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+
+  iree_event_deinitialize(&event);
+}
+
+// Tests that we can use set/reset and that certain behavior (such as sets
+// without intervening resets) is allowed. Note that this does not wait and is
+// just testing the client behavior; it's possible to implement these such that
+// a set while another set is pending fails and we want to verify that here.
+TEST(Event, SetReset) {
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED,
+                        iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+
+  // Double-set without an intervening reset must stay signaled.
+  iree_event_set(&event);
+  IREE_EXPECT_OK(iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+  iree_event_set(&event);
+  IREE_EXPECT_OK(iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+
+  // Double-reset must stay unsignaled.
+  iree_event_reset(&event);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED,
+                        iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+  iree_event_reset(&event);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED,
+                        iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+
+  // The event must remain reusable after the resets.
+  iree_event_set(&event);
+  IREE_EXPECT_OK(iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+  iree_event_set(&event);
+  IREE_EXPECT_OK(iree_wait_one(&event, IREE_TIME_INFINITE_PAST));
+
+  iree_event_deinitialize(&event);
+}
+
+// Verifies that an infinite-deadline wait actually blocks until another
+// thread signals the event (and is not just a poll).
+TEST(Event, BlockingBehavior) {
+  iree_event_t main_to_thread;
+  IREE_ASSERT_OK(
+      iree_event_initialize(/*initial_state=*/false, &main_to_thread));
+  iree_event_t thread_to_main;
+  IREE_ASSERT_OK(
+      iree_event_initialize(/*initial_state=*/false, &thread_to_main));
+
+  // Spinup a thread to signal the event.
+  // Note that it waits on the main_to_thread event until we get further along.
+  std::atomic<bool> did_run_thread{false};
+  std::thread thread([&]() {
+    // Wait for main thread to signal (below).
+    IREE_ASSERT_OK(iree_wait_one(&main_to_thread, IREE_TIME_INFINITE_FUTURE));
+
+    // Set something so we know this ran at all.
+    did_run_thread.store(true);
+
+    // Notify the caller thread.
+    iree_event_set(&thread_to_main);
+  });
+
+  // The thread may take some time to spin up; it must wait for us to allow it
+  // to run its body though so we should be fine here.
+  std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  ASSERT_FALSE(did_run_thread.load());
+
+  // Allow the thread to continue and wait for it to exit.
+  iree_event_set(&main_to_thread);
+  IREE_ASSERT_OK(iree_wait_one(&thread_to_main, IREE_TIME_INFINITE_FUTURE));
+  ASSERT_TRUE(did_run_thread.load());
+
+  thread.join();
+  iree_event_deinitialize(&main_to_thread);
+  iree_event_deinitialize(&thread_to_main);
+}
+
+// Tests using an iree_event_t as a wait source for waiting.
+// Mirrors SetWait above but through the iree_wait_source_t view of the event.
+TEST(Event, WaitSourceBlocking) {
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+  iree_wait_source_t wait_source = iree_event_await(&event);
+
+  // Initially unset.
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_source_wait_one(wait_source, iree_immediate_timeout()));
+
+  // Set and wait.
+  iree_event_set(&event);
+  IREE_EXPECT_OK(
+      iree_wait_source_wait_one(wait_source, iree_immediate_timeout()));
+
+  // Set should be sticky until reset manually.
+  IREE_EXPECT_OK(
+      iree_wait_source_wait_one(wait_source, iree_immediate_timeout()));
+
+  // Resetting should unsignal the event.
+  iree_event_reset(&event);
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_source_wait_one(wait_source, iree_immediate_timeout()));
+
+  iree_event_deinitialize(&event);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_wait_set_t
+//===----------------------------------------------------------------------===//
+
+// Tests basic usage of the wait set API without waiting.
+TEST(WaitSet, Lifetime) {
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+  // Insert the same event twice, erase one reference, then clear the rest.
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, event));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, event));
+  iree_wait_set_erase(wait_set, event);
+  iree_wait_set_clear(wait_set);
+  iree_wait_set_free(wait_set);
+
+  iree_event_deinitialize(&event);
+}
+
+// Absurdly large capacities must be rejected up-front with INVALID_ARGUMENT
+// instead of attempting the allocation.
+TEST(WaitSet, UnreasonableCapacity) {
+  iree_wait_set_t* wait_set = NULL;
+  iree_status_t status = iree_wait_set_allocate(
+      1 * 1024 * 1024, iree_allocator_system(), &wait_set);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_INVALID_ARGUMENT, status);
+  iree_status_free(status);
+}
+
+// Tests that inserting the same handles multiple times is tracked correctly.
+TEST(WaitSet, Deduplication) {
+  iree_event_t ev_unset, ev_dupe;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_dupe));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // We want to test for duplication on ev_dupe here so ensure it's added.
+  // (ev_dupe is inserted 3 times, so 3 erases are required below.)
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_dupe));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_dupe));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_dupe));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset));
+
+  // Wait should succeed immediately because ev_dupe is set (and our wake handle
+  // should be ev_dupe).
+  iree_wait_handle_t wake_handle;
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_EQ(0,
+            memcmp(&ev_dupe.value, &wake_handle.value, sizeof(ev_dupe.value)));
+
+  // Erase the events one at a time and ensure we still get the expected number
+  // of waits on ev_dupe.
+  iree_wait_set_erase(wait_set, wake_handle);
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_EQ(0,
+            memcmp(&ev_dupe.value, &wake_handle.value, sizeof(ev_dupe.value)));
+  iree_wait_set_erase(wait_set, wake_handle);
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_EQ(0,
+            memcmp(&ev_dupe.value, &wake_handle.value, sizeof(ev_dupe.value)));
+  iree_wait_set_erase(wait_set, wake_handle);
+
+  // Now there should just be ev_unset present in the set and a poll will fail.
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset);
+  iree_event_deinitialize(&ev_dupe);
+}
+
+// Tests that clear handles things right in the face of dupes.
+TEST(WaitSet, Clear) {
+  iree_event_t ev_unset, ev_dupe;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_dupe));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // We want to test for duplication on ev_dupe here.
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_dupe));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_dupe));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_dupe));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset));
+
+  // Wait should succeed immediately because ev_dupe is set (and our wake handle
+  // should be ev_dupe).
+  iree_wait_handle_t wake_handle;
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_EQ(0,
+            memcmp(&ev_dupe.value, &wake_handle.value, sizeof(ev_dupe.value)));
+
+  // Erase all events from the set (including all dupe references at once).
+  iree_wait_set_clear(wait_set);
+
+  // No more events remaining; should pass immediately.
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset);
+  iree_event_deinitialize(&ev_dupe);
+}
+
+// Tests iree_wait_all when polling (deadline_ns = IREE_TIME_INFINITE_PAST).
+TEST(WaitSet, WaitAllPolling) {
+  iree_event_t ev_unset_0, ev_unset_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_1));
+  iree_event_t ev_set_0, ev_set_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_1));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // Polls when empty should never block.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_all(wait_set, IREE_TIME_INFINITE_PAST));
+
+  // Polls with only unset handles should never block.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED,
+                        iree_wait_all(wait_set, IREE_TIME_INFINITE_PAST));
+
+  // Polls with only set handles should return immediately.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_1));
+  IREE_ASSERT_OK(iree_wait_all(wait_set, IREE_TIME_INFINITE_PAST));
+
+  // Polls with mixed set/unset should never succeed (wait-all requires every
+  // handle to be signaled).
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_1));
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED,
+                        iree_wait_all(wait_set, IREE_TIME_INFINITE_PAST));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset_0);
+  iree_event_deinitialize(&ev_unset_1);
+  iree_event_deinitialize(&ev_set_0);
+  iree_event_deinitialize(&ev_set_1);
+}
+
+// Tests iree_wait_all with timeouts (deadline_ns = non-zero).
+TEST(WaitSet, WaitAllTimeout) {
+  iree_event_t ev_unset_0, ev_unset_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_1));
+  iree_event_t ev_set_0, ev_set_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_1));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // Timeouts when empty should never block.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_all(wait_set, iree_time_now() + kShortTimeoutNS));
+
+  // Timeouts with only unset handles should block (and then expire).
+  // NOTE: uses the file-scope kShortTimeoutNS; a redundant local redeclaration
+  // that shadowed it with the same value has been removed.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_all(wait_set, iree_time_now() + kShortTimeoutNS));
+
+  // Timeouts with only set handles should return immediately.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_1));
+  IREE_ASSERT_OK(iree_wait_all(wait_set, iree_time_now() + kShortTimeoutNS));
+
+  // Timeouts with mixed set/unset should never succeed.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_1));
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_all(wait_set, iree_time_now() + kShortTimeoutNS));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset_0);
+  iree_event_deinitialize(&ev_unset_1);
+  iree_event_deinitialize(&ev_set_0);
+  iree_event_deinitialize(&ev_set_1);
+}
+
+// Tests iree_wait_all when blocking (deadline_ns = IREE_TIME_INFINITE_FUTURE).
+TEST(WaitSet, WaitAllBlocking) {
+  iree_event_t thread_to_main;
+  IREE_ASSERT_OK(
+      iree_event_initialize(/*initial_state=*/false, &thread_to_main));
+  iree_event_t ev_set_0, ev_set_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_1));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // Throw in some other set handles so that we are multi-waiting for just the
+  // thread_to_main event to be set.
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_1));
+
+  // Wait forever (no timeout).
+  // We approximate that by forking off a thread to signal our local event. We
+  // can assume that a moderate wait is enough to verify the forever behavior as
+  // otherwise we are probably just messing up the math and will timeout.
+  std::thread thread([&]() {
+    // Notify the caller thread after sleeping (to ensure it's not polling).
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    iree_event_set(&thread_to_main);
+  });
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, thread_to_main));
+  IREE_ASSERT_OK(iree_wait_all(wait_set, IREE_TIME_INFINITE_FUTURE));
+
+  thread.join();
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&thread_to_main);
+  iree_event_deinitialize(&ev_set_0);
+  iree_event_deinitialize(&ev_set_1);
+}
+
+// Tests iree_wait_all when one or more handles are duplicated.
+TEST(WaitSet, WaitAllDuplicates) {
+  iree_event_t ev_set;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // Inserting the same handle repeatedly must succeed; the set is expected to
+  // deduplicate internally (native wait APIs disallow duplicate handles).
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+
+  // Wait should succeed immediately because ev_set is set.
+  IREE_ASSERT_OK(iree_wait_all(wait_set, IREE_TIME_INFINITE_PAST));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_set);
+}
+
+// Tests iree_wait_any; note that this is only focused on testing the wait.
+TEST(WaitSet, WaitAny) {
+  iree_event_t ev_unset, ev_set;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+
+  // Wait should succeed immediately because ev_set is set (and our wake handle
+  // should be ev_set).
+  iree_wait_handle_t wake_handle;
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  // Compare by primitive value: the wake handle must reference ev_set.
+  EXPECT_EQ(0, memcmp(&ev_set.value, &wake_handle.value, sizeof(ev_set.value)));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset);
+  iree_event_deinitialize(&ev_set);
+}
+
+// Tests iree_wait_any when polling (deadline_ns = IREE_TIME_INFINITE_PAST).
+TEST(WaitSet, WaitAnyPolling) {
+  iree_event_t ev_unset_0, ev_unset_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_1));
+  iree_event_t ev_set_0, ev_set_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_1));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // Zeroed handle used as the expected "no wake handle" output below.
+  iree_wait_handle_t empty_handle;
+  memset(&empty_handle, 0, sizeof(empty_handle));
+
+  // Polls when empty should never block and return an empty wake handle.
+  // This is so that if the caller touches the wake_handle they at least have
+  // initialized memory.
+  iree_wait_set_clear(wait_set);
+  iree_wait_handle_t wake_handle;
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_EQ(0, memcmp(&empty_handle, &wake_handle, sizeof(empty_handle)));
+
+  // Polls with only unset handles should never block.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_EQ(0, memcmp(&empty_handle, &wake_handle, sizeof(empty_handle)));
+
+  // Polls with only set handles should return immediately.
+  // Note that which handle is returned is not specified.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_1));
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  // Either set handle is an acceptable wake handle.
+  EXPECT_TRUE(
+      0 ==
+          memcmp(&ev_set_0.value, &wake_handle.value, sizeof(ev_set_0.value)) ||
+      0 == memcmp(&ev_set_1.value, &wake_handle.value, sizeof(ev_set_1.value)));
+
+  // Polls with mixed set/unset should return immediately.
+  // Note that which handle is returned is not specified but we know it should
+  // at least be one of the signaled ones.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_1));
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_TRUE(
+      0 ==
+          memcmp(&ev_set_0.value, &wake_handle.value, sizeof(ev_set_0.value)) ||
+      0 == memcmp(&ev_set_1.value, &wake_handle.value, sizeof(ev_set_1.value)));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset_0);
+  iree_event_deinitialize(&ev_unset_1);
+  iree_event_deinitialize(&ev_set_0);
+  iree_event_deinitialize(&ev_set_1);
+}
+
+// Tests iree_wait_any with timeouts (deadline_ns = non-zero).
+// Uses the file-scope kShortTimeoutNS constant (shared with the other timeout
+// tests such as WaitOneTimeout below).
+TEST(WaitSet, WaitAnyTimeout) {
+  iree_event_t ev_unset_0, ev_unset_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_1));
+  iree_event_t ev_set_0, ev_set_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set_1));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // Zeroed handle used as the expected "no wake handle" output below.
+  iree_wait_handle_t empty_handle;
+  memset(&empty_handle, 0, sizeof(empty_handle));
+
+  // Timeouts when empty should never block.
+  iree_wait_set_clear(wait_set);
+  iree_wait_handle_t wake_handle;
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, iree_time_now() + kShortTimeoutNS, &wake_handle));
+  EXPECT_EQ(0, memcmp(&empty_handle, &wake_handle, sizeof(empty_handle)));
+
+  // Timeouts with only unset handles should block (and then expire).
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_any(wait_set, iree_time_now() + kShortTimeoutNS, &wake_handle));
+  EXPECT_EQ(0, memcmp(&empty_handle, &wake_handle, sizeof(empty_handle)));
+
+  // Timeouts with only set handles should return immediately and have one of
+  // the set handles as the wake handle.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_1));
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, iree_time_now() + kShortTimeoutNS, &wake_handle));
+  EXPECT_TRUE(
+      0 ==
+          memcmp(&ev_set_0.value, &wake_handle.value, sizeof(ev_set_0.value)) ||
+      0 == memcmp(&ev_set_1.value, &wake_handle.value, sizeof(ev_set_1.value)));
+
+  // Timeouts with mixed set/unset should return immediately and have one of the
+  // set handles as the wake handle.
+  iree_wait_set_clear(wait_set);
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set_1));
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, iree_time_now() + kShortTimeoutNS, &wake_handle));
+  EXPECT_TRUE(
+      0 ==
+          memcmp(&ev_set_0.value, &wake_handle.value, sizeof(ev_set_0.value)) ||
+      0 == memcmp(&ev_set_1.value, &wake_handle.value, sizeof(ev_set_1.value)));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset_0);
+  iree_event_deinitialize(&ev_unset_1);
+  iree_event_deinitialize(&ev_set_0);
+  iree_event_deinitialize(&ev_set_1);
+}
+
+// Tests iree_wait_any when blocking (deadline_ns = IREE_TIME_INFINITE_FUTURE).
+TEST(WaitSet, WaitAnyBlocking) {
+  iree_event_t thread_to_main;
+  IREE_ASSERT_OK(
+      iree_event_initialize(/*initial_state=*/false, &thread_to_main));
+  iree_event_t ev_unset_0, ev_unset_1;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_1));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // Throw in some unset handles so that we are multi-waiting for just the
+  // thread_to_main event to be set.
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+
+  // Wait forever (no timeout).
+  // We approximate that by forking off a thread to signal our local event. We
+  // can assume that a moderate wait is enough to verify the forever behavior as
+  // otherwise we are probably just messing up the math and will timeout.
+  std::thread thread([&]() {
+    // Notify the caller thread after sleeping (to ensure it's not polling).
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    iree_event_set(&thread_to_main);
+  });
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, thread_to_main));
+  iree_wait_handle_t wake_handle;
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_FUTURE, &wake_handle));
+  // Only thread_to_main can have woken the wait (the others stay unset).
+  EXPECT_EQ(0, memcmp(&thread_to_main.value, &wake_handle.value,
+                      sizeof(thread_to_main.value)));
+
+  // Join before teardown so the thread is not touching thread_to_main while we
+  // deinitialize it below.
+  thread.join();
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&thread_to_main);
+  iree_event_deinitialize(&ev_unset_0);
+  iree_event_deinitialize(&ev_unset_1);
+}
+
+// Tests that an iree_wait_any followed by an iree_wait_set_erase properly
+// chooses the right handle to erase.
+TEST(WaitSet, WaitAnyErase) {
+  iree_event_t ev_unset_0, ev_unset_1;
+  iree_event_t ev_set;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_0));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset_1));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // ev_set is deliberately inserted in the middle of the list to exercise the
+  // non-tail erase path.
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_0));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset_1));
+
+  // Wait should succeed immediately because ev_set is set (and our wake handle
+  // should be ev_set).
+  iree_wait_handle_t wake_handle;
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_EQ(0, memcmp(&ev_set.value, &wake_handle.value, sizeof(ev_set.value)));
+
+  // Erase the woken handle.
+  // NOTE: to get the behavior we want to test we must pass wake_handle here and
+  // not the ev_set value.
+  iree_wait_set_erase(wait_set, wake_handle);
+
+  // Try to wait again; this time we should timeout because only ev_unset_*
+  // remains in the set.
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset_0);
+  iree_event_deinitialize(&ev_unset_1);
+  iree_event_deinitialize(&ev_set);
+}
+
+// Tests that an iree_wait_any followed by an iree_wait_set_erase properly
+// chooses the right handle to erase (the tail one).
+TEST(WaitSet, WaitAnyEraseTail) {
+  iree_event_t ev_unset, ev_set;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  // ev_set is the last handle in the list so erase hits the tail path.
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+
+  // Wait should succeed immediately because ev_set is set (and our wake handle
+  // should be ev_set).
+  iree_wait_handle_t wake_handle;
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_EQ(0, memcmp(&ev_set.value, &wake_handle.value, sizeof(ev_set.value)));
+
+  // Erase the woken handle.
+  // NOTE: to get the behavior we want to test we must pass wake_handle here and
+  // not the ev_set value.
+  iree_wait_set_erase(wait_set, wake_handle);
+
+  // Try to wait again; this time we should timeout because only ev_unset
+  // remains in the set.
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset);
+  iree_event_deinitialize(&ev_set);
+}
+
+// Tests that an iree_wait_any followed by an iree_wait_set_erase without using
+// the wake_handle still erases the correct handle.
+TEST(WaitSet, WaitAnyEraseSplit) {
+  iree_event_t ev_unset, ev_set;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set));
+  iree_wait_set_t* wait_set = NULL;
+  IREE_ASSERT_OK(
+      iree_wait_set_allocate(128, iree_allocator_system(), &wait_set));
+
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_unset));
+  IREE_ASSERT_OK(iree_wait_set_insert(wait_set, ev_set));
+
+  // Wait should succeed immediately because ev_set is set (and our wake handle
+  // should be ev_set).
+  iree_wait_handle_t wake_handle;
+  IREE_ASSERT_OK(
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+  EXPECT_EQ(0, memcmp(&ev_set.value, &wake_handle.value, sizeof(ev_set.value)));
+
+  // Erase the woken handle *WITHOUT* using the wake_handle.
+  // This exercises the erase fallback path that scans for the handle instead
+  // of using the index hint stored in wake_handle.
+  iree_wait_set_erase(wait_set, ev_set);
+
+  // Try to wait again; this time we should timeout because only ev_unset
+  // remains in the set.
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_any(wait_set, IREE_TIME_INFINITE_PAST, &wake_handle));
+
+  iree_wait_set_free(wait_set);
+  iree_event_deinitialize(&ev_unset);
+  iree_event_deinitialize(&ev_set);
+}
+
+// Tests iree_wait_one when polling (deadline_ns = IREE_TIME_INFINITE_PAST).
+TEST(WaitSet, WaitOnePolling) {
+  iree_event_t ev_unset, ev_set;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set));
+
+  // Polling (don't block even if unset).
+  // An unset event polls as a deadline-exceeded; a set event succeeds.
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED,
+                        iree_wait_one(&ev_unset, IREE_TIME_INFINITE_PAST));
+  IREE_ASSERT_OK(iree_wait_one(&ev_set, IREE_TIME_INFINITE_PAST));
+
+  iree_event_deinitialize(&ev_unset);
+  iree_event_deinitialize(&ev_set);
+}
+
+// Tests iree_wait_one with timeouts (deadline_ns = non-zero).
+// Relies on the file-scope kShortTimeoutNS/kLongTimeoutNS constants.
+TEST(WaitSet, WaitOneTimeout) {
+  iree_event_t ev_unset, ev_set;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &ev_unset));
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &ev_set));
+
+  // Force a timeout by waiting on an event that'll never get set.
+  IREE_EXPECT_STATUS_IS(
+      IREE_STATUS_DEADLINE_EXCEEDED,
+      iree_wait_one(&ev_unset, iree_time_now() + kShortTimeoutNS));
+
+  // Ensure we return immediately when waiting on a set value (and not wait
+  // 100 years because we messed up our math).
+  IREE_ASSERT_OK(iree_wait_one(&ev_set, iree_time_now() + kLongTimeoutNS));
+
+  iree_event_deinitialize(&ev_unset);
+  iree_event_deinitialize(&ev_set);
+}
+
+// Tests iree_wait_one when blocking (deadline_ns = IREE_TIME_INFINITE_FUTURE).
+TEST(WaitSet, WaitOneBlocking) {
+  iree_event_t thread_to_main;
+  IREE_ASSERT_OK(
+      iree_event_initialize(/*initial_state=*/false, &thread_to_main));
+
+  // Wait forever (no timeout).
+  // We approximate that by forking off a thread to signal our local event. We
+  // can assume that a moderate wait is enough to verify the forever behavior as
+  // otherwise we are probably just messing up the math and will timeout.
+  std::thread thread([&]() {
+    // Notify the caller thread after sleeping (to ensure it's not polling).
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    iree_event_set(&thread_to_main);
+  });
+  IREE_ASSERT_OK(iree_wait_one(&thread_to_main, IREE_TIME_INFINITE_FUTURE));
+
+  // Join before teardown so the thread is not touching the event during
+  // deinitialization.
+  thread.join();
+  iree_event_deinitialize(&thread_to_main);
+}
+
+}  // namespace
+}  // namespace iree
+
+#endif  // !IREE_WAIT_HANDLE_DISABLED
diff --git a/runtime/src/iree/base/internal/wait_handle_win32.c b/runtime/src/iree/base/internal/wait_handle_win32.c
new file mode 100644
index 0000000..b583a36
--- /dev/null
+++ b/runtime/src/iree/base/internal/wait_handle_win32.c
@@ -0,0 +1,468 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// clang-format off: must be included before all other headers.
+#include "iree/base/internal/wait_handle_impl.h"
+// clang-format on
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/wait_handle.h"
+#include "iree/base/target_platform.h"
+
+#if IREE_WAIT_API == IREE_WAIT_API_WIN32
+
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// Platform utilities
+//===----------------------------------------------------------------------===//
+
+static_assert(
+    sizeof(iree_wait_primitive_value_t) == sizeof(HANDLE),
+    "win32 HANDLE type must match uintptr size in wait primitive struct");
+
+//===----------------------------------------------------------------------===//
+// iree_wait_primitive_* raw calls
+//===----------------------------------------------------------------------===//
+
+// Clones a wait handle such that both the |source_handle| and new
+// |out_target_handle| both reference the same wait primitive. The handle must
+// be closed with iree_wait_handle_close as if it had been created.
+// Returns INVALID_ARGUMENT for non-win32 handle types and a win32-derived
+// status if DuplicateHandle fails.
+static iree_status_t iree_wait_primitive_clone(
+    iree_wait_handle_t* source_handle, iree_wait_handle_t* out_target_handle) {
+  if (source_handle->type != IREE_WAIT_PRIMITIVE_TYPE_WIN32_HANDLE) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "source wait handle must be a win32 HANDLE");
+  }
+
+  // Duplicate within the current process only; the new HANDLE aliases the
+  // same underlying kernel object with the same access rights.
+  iree_wait_primitive_value_t value;
+  memset(&value, 0, sizeof(value));
+  HANDLE process = GetCurrentProcess();
+  if (!DuplicateHandle(process, (HANDLE)source_handle->value.win32.handle,
+                       process, (LPHANDLE)&value.win32.handle, 0, FALSE,
+                       DUPLICATE_SAME_ACCESS)) {
+    return iree_make_status(
+        iree_status_code_from_win32_error(GetLastError()),
+        "unable to duplicate HANDLE; possibly out of process handles");
+  }
+  iree_wait_handle_wrap_primitive(IREE_WAIT_PRIMITIVE_TYPE_WIN32_HANDLE, value,
+                                  out_target_handle);
+  return iree_ok_status();
+}
+
+// Closes an existing handle that was either created manually or via
+// iree_wait_primitive_clone. Must not be called while there are any waiters on
+// the handle.
+void iree_wait_handle_close(iree_wait_handle_t* handle) {
+  // A zero value indicates no backing HANDLE; skip the OS call in that case.
+  if (IREE_LIKELY(handle->value.win32.handle != 0)) {
+    CloseHandle((HANDLE)handle->value.win32.handle);
+  }
+  iree_wait_handle_deinitialize(handle);
+}
+
+// Returns true if the two handles share the same underlying primitive object.
+// For win32 HANDLEs this also consults the OS, as two distinct HANDLE values
+// may alias the same kernel object (e.g. after a clone).
+static bool iree_wait_primitive_compare(const iree_wait_handle_t* lhs,
+                                        const iree_wait_handle_t* rhs) {
+  if (lhs->type != rhs->type) return false;
+  const bool values_equal =
+      memcmp(&lhs->value, &rhs->value, sizeof(lhs->value)) == 0;
+  if (lhs->type != IREE_WAIT_PRIMITIVE_TYPE_WIN32_HANDLE) {
+    // Non-win32 primitives are compared purely by value.
+    return values_equal;
+  }
+  // Fast path on identical values; otherwise ask the OS whether the two
+  // HANDLEs reference the same underlying object.
+  return values_equal ||
+         CompareObjectHandles((HANDLE)lhs->value.win32.handle,
+                              (HANDLE)rhs->value.win32.handle) != 0;
+}
+
+// Returns true if the two handles are identical in representation.
+// Note that two unique handles may point to the same underlying primitive
+// object (such as when they have been cloned); if testing for duplicate
+// primitives prefer iree_wait_primitive_compare.
+static bool iree_wait_primitive_compare_identical(
+    const iree_wait_handle_t* lhs, const iree_wait_handle_t* rhs) {
+  // Pure bitwise comparison of type + value; no OS calls involved.
+  return lhs->type == rhs->type &&
+         memcmp(&lhs->value, &rhs->value, sizeof(lhs->value)) == 0;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_wait_set_t
+//===----------------------------------------------------------------------===//
+
+// A set of wait handles that can be waited on together via WFMO.
+// Handles are deduplicated on insertion; duplicates are tracked by a
+// per-handle dupe_count instead of additional list entries.
+struct iree_wait_set_t {
+  // Allocator used to create (and later free) this set.
+  iree_allocator_t allocator;
+
+  // Total capacity of handles in the set (including duplicates).
+  // This defines the capacity of user_handles and native_handles and to ensure
+  // that we don't get insanely hard to debug behavioral differences when some
+  // handles happen to be duplicates we track the total count against this total
+  // capacity including duplicates.
+  //
+  // If you added 1000 duplicate handles to the set you'd need a handle_capacity
+  // of 1000 even though handle_count (excluding duplicates) would be 1.
+  iree_host_size_t handle_capacity;
+
+  // Total number of handles in the set (including duplicates).
+  // We use this to ensure that we provide consistent capacity errors.
+  iree_host_size_t total_handle_count;
+
+  // Number of handles in the set (excluding duplicates), defining the valid
+  // size of both user_handles and native_handles.
+  iree_host_size_t handle_count;
+
+  // De-duped user-provided handles. iree_wait_handle_t::set_internal.dupe_count
+  // is used to indicate how many additional duplicates there are of a
+  // particular handle. For example, dupe_count=0 means that there are no
+  // duplicates.
+  iree_wait_handle_t* user_handles;
+
+  // Native list of win32 HANDLE we will pass directly to WFMO.
+  // This list may be smaller than the total_handle_count if handles have been
+  // deduplicated.
+  HANDLE* native_handles;
+};
+
+// Allocates a wait set with storage for up to |capacity| handles (including
+// duplicates) as a single allocation: [struct][user_handles][native_handles].
+iree_status_t iree_wait_set_allocate(iree_host_size_t capacity,
+                                     iree_allocator_t allocator,
+                                     iree_wait_set_t** out_set) {
+  IREE_ASSERT_ARGUMENT(out_set);
+
+  // Be reasonable; 64 MAXIMUM_WAIT_OBJECTS is low, but 64K objects is too high.
+  if (capacity >= UINT16_MAX) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "wait set capacity of %zu is unreasonably large",
+                            capacity);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Compute the combined size of the struct plus both trailing handle lists.
+  iree_host_size_t user_handle_list_size =
+      capacity * sizeof(iree_wait_handle_t);
+  iree_host_size_t native_handle_list_size = capacity * sizeof(HANDLE);
+  iree_host_size_t total_size = iree_sizeof_struct(iree_wait_set_t) +
+                                user_handle_list_size + native_handle_list_size;
+
+  iree_wait_set_t* set = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(allocator, total_size, (void**)&set));
+  set->allocator = allocator;
+  set->handle_capacity = capacity;
+  // Clear resets the counts only; safe to call before the list pointers below
+  // are assigned.
+  iree_wait_set_clear(set);
+
+  // Both lists live in the tail of the single allocation.
+  set->user_handles =
+      (iree_wait_handle_t*)((uint8_t*)set +
+                            iree_sizeof_struct(iree_wait_set_t));
+  set->native_handles =
+      (HANDLE*)((uint8_t*)set->user_handles + user_handle_list_size);
+
+  *out_set = set;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Frees a wait set previously created with iree_wait_set_allocate.
+// Safe to call with NULL. Does not close the handles contained in the set.
+void iree_wait_set_free(iree_wait_set_t* set) {
+  if (!set) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // The lists share the set's single allocation so one free releases all.
+  iree_allocator_free(set->allocator, set);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns true if the wait set contains no handles.
+bool iree_wait_set_is_empty(const iree_wait_set_t* set) {
+  // A set is empty when its deduplicated handle list has no entries; the
+  // original `!= 0` comparison was inverted and reported the opposite.
+  return set->handle_count == 0;
+}
+
+// Inserts |handle| into |set|, deduplicating against existing entries.
+// Fails with RESOURCE_EXHAUSTED when the set capacity or the WFMO native
+// handle limit would be exceeded and UNIMPLEMENTED for non-win32 handle types.
+iree_status_t iree_wait_set_insert(iree_wait_set_t* set,
+                                   iree_wait_handle_t handle) {
+  // Capacity is checked against the total (duplicate-inclusive) count so that
+  // callers see consistent errors regardless of deduplication.
+  if (set->total_handle_count + 1 > set->handle_capacity) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "wait set capacity %" PRIhsz
+                            " reached; no more wait handles available",
+                            set->handle_capacity);
+  }
+
+  // First check to see if we already have the handle in the set; since APIs
+  // like WFMO don't allow duplicate handles in their arguments this is our
+  // workaround (with the benefit of also reducing the native handle count).
+  for (iree_host_size_t i = 0; i < set->handle_count; ++i) {
+    iree_wait_handle_t* existing_handle = &set->user_handles[i];
+    if (iree_wait_primitive_compare_identical(existing_handle, &handle)) {
+      // Handle already exists in the set; just increment the reference count.
+      ++existing_handle->set_internal.dupe_count;
+      ++set->total_handle_count;
+      return iree_ok_status();
+    }
+  }
+
+  HANDLE native_handle = NULL;
+  if (IREE_LIKELY(handle.type == IREE_WAIT_PRIMITIVE_TYPE_WIN32_HANDLE)) {
+    // Our normal handle type; pass-through below.
+    native_handle = (HANDLE)handle.value.win32.handle;
+  } else {
+    return iree_make_status(
+        IREE_STATUS_UNIMPLEMENTED,
+        "unimplemented primitive type %d (expected PERMANENT/WIN32_HANDLE)",
+        (int)handle.type);
+  }
+
+  // There's a max of 64 waitable handles. If we want to support more than that
+  // we can spawn threads to wait on 64 objects and then wait on all those
+  // threads. For example:
+  //   iree_wait_multi(...180 handles...):
+  //     -> spawn th0 and wait on handles 0-63 (64 handles)
+  //     -> spawn th1 and wait on handles 64-127 (64 handles)
+  //     wait on [th0, th1, handles 128-179] (threads + 52 remaining handles)
+  //
+  // At the point you're multiwaiting on that many things, though, it indicates
+  // that there may be higher level coalescing that can be done by the
+  // application itself (by, say, multiplexing sockets onto a single fd instead
+  // of trying to wait on every unique socket handle via this API).
+  if (native_handle &&
+      IREE_UNLIKELY(set->handle_count + 1 > MAXIMUM_WAIT_OBJECTS)) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "max wait objects exceeded; only up to %d native "
+                            "wait handles are supported in WFMO",
+                            (int)MAXIMUM_WAIT_OBJECTS);
+  }
+
+  // Append to both parallel lists; the indices must stay in sync as WFMO wake
+  // results are mapped back through native_handles.
+  ++set->total_handle_count;
+  iree_host_size_t index = set->handle_count++;
+  iree_wait_handle_t* user_handle = &set->user_handles[index];
+  iree_wait_handle_wrap_primitive(handle.type, handle.value, user_handle);
+  user_handle->set_internal.dupe_count = 0;  // just us so far
+  set->native_handles[index] = native_handle;
+
+  return iree_ok_status();
+}
+
+// Removes one reference to |handle| from |set|. Duplicated handles remain in
+// the lists until their last reference is erased.
+void iree_wait_set_erase(iree_wait_set_t* set, iree_wait_handle_t handle) {
+  // Find the user handle in the set. This either requires a linear scan to
+  // find the matching user handle or - if valid - we can use the native index
+  // set after an iree_wait_any wake to do a quick lookup.
+  iree_host_size_t index = handle.set_internal.index;
+  if (IREE_UNLIKELY(index >= set->handle_count) ||
+      IREE_UNLIKELY(!iree_wait_primitive_compare_identical(
+          &set->user_handles[index], &handle))) {
+    // Fallback to a linear scan of (hopefully) a small list.
+    for (iree_host_size_t i = 0; i < set->handle_count; ++i) {
+      if (iree_wait_primitive_compare_identical(&set->user_handles[i],
+                                                &handle)) {
+        index = i;
+        break;
+      }
+    }
+  }
+
+  // Decrement reference count.
+  // dupe_count > 0 means other duplicates of this handle remain; the
+  // post-decrement drops one of them while keeping the list entry.
+  iree_wait_handle_t* existing_handle = &set->user_handles[index];
+  if (existing_handle->set_internal.dupe_count-- > 0) {
+    // Still one or more remaining in the set; leave it in the handle list.
+    --set->total_handle_count;
+    return;
+  }
+
+  // No more references remaining; remove from both handle lists.
+  // Since we make no guarantees about the order of the lists we can just swap
+  // with the last value.
+  int tail_index = (int)set->handle_count - 1;
+  if (tail_index > index) {
+    memcpy(&set->native_handles[index], &set->native_handles[tail_index],
+           sizeof(*set->native_handles));
+    memcpy(&set->user_handles[index], &set->user_handles[tail_index],
+           sizeof(*set->user_handles));
+  }
+  --set->total_handle_count;
+  --set->handle_count;
+}
+
+// Removes all handles from the set; capacity and storage are retained.
+void iree_wait_set_clear(iree_wait_set_t* set) {
+  // The handle lists need not be zeroed; only the counts define validity.
+  set->total_handle_count = 0;
+  set->handle_count = 0;
+}
+
+// Shared implementation for iree_wait_all/iree_wait_any on top of
+// WaitForMultipleObjectsEx. |require_all| selects all-vs-any semantics;
+// |out_wake_handle| (optional) receives the woken handle on an any-wait.
+static iree_status_t iree_wait_multi(iree_wait_set_t* set, bool require_all,
+                                     iree_time_t deadline_ns,
+                                     iree_wait_handle_t* out_wake_handle) {
+  // TODO(benvanik): see if we can use tracy's mutex tracking to make waits
+  // nicer (at least showing signal->wait relations).
+
+  // Early-exit when there's nothing to wait on.
+  if (set->handle_count == 0) {
+    if (out_wake_handle) memset(out_wake_handle, 0, sizeof(*out_wake_handle));
+    return iree_ok_status();
+  }
+
+  // Remap absolute timeout to relative timeout, handling special values as
+  // needed.
+  DWORD timeout_ms = iree_absolute_deadline_to_timeout_ms(deadline_ns);
+
+  // Perform the wait; this is allowed to yield the calling thread even if the
+  // timeout_ms is 0 to indicate a poll.
+  DWORD result =
+      WaitForMultipleObjectsEx(set->handle_count, set->native_handles,
+                               /*bWaitAll=*/(require_all ? TRUE : FALSE),
+                               timeout_ms, /*bAlertable=*/FALSE);
+
+  if (result == WAIT_TIMEOUT) {
+    // Timeout elapsed while waiting; note that the timeout may have been 0 to
+    // force a poll and be an expected result. We avoid a full status object
+    // here as we don't want to track all that in non-exceptional cases.
+    return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  } else if (result >= WAIT_OBJECT_0 &&
+             result < WAIT_OBJECT_0 + set->handle_count) {
+    // One (or more) handles were signaled successfully.
+    if (out_wake_handle) {
+      DWORD wake_index = result - WAIT_OBJECT_0;
+      iree_wait_primitive_value_t wake_value;
+      memset(&wake_value, 0, sizeof(wake_value));
+      wake_value.win32.handle = (uintptr_t)set->native_handles[wake_index];
+      iree_wait_handle_wrap_primitive(IREE_WAIT_PRIMITIVE_TYPE_WIN32_HANDLE,
+                                      wake_value, out_wake_handle);
+
+      // Optimization for wait-wake-erase; this lets us avoid scanning the
+      // native handle list (the kernel already did that for us!).
+      out_wake_handle->set_internal.index = wake_index;
+    }
+    return iree_ok_status();
+  } else if (result >= WAIT_ABANDONED_0 &&
+             result < WAIT_ABANDONED_0 + set->handle_count) {
+    // One (or more) mutex handles were abandoned during the wait.
+    // This happens when a thread holding the mutex dies without releasing it.
+    // This is less common in-process and more for the cross-process situations
+    // where we have duped/opened a remote handle and the remote process dies.
+    // That's an ugly situation but not quite unheard of in sandboxing impls
+    // where death is a feature.
+    //
+    // NOTE: we shouldn't get abandoned handles in regular cases - both because
+    // we don't really use mutex handles (though users may provide them) and
+    // that mutex abandonment is exceptional. If you see this you are probably
+    // going to want to look for thread exit messages or zombie processes.
+    DWORD wake_index = result - WAIT_ABANDONED_0;
+    return iree_make_status(
+        IREE_STATUS_DATA_LOSS,
+        "mutex native handle %lu abandoned; shared state is "
+        "(likely) inconsistent",
+        wake_index);
+  } else if (result == WAIT_FAILED) {
+    return iree_make_status(iree_status_code_from_win32_error(GetLastError()),
+                            "WFMO failed");
+  } else {
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "WFMO internal error (unimplemented APC?)");
+  }
+}
+
+iree_status_t iree_wait_all(iree_wait_set_t* set, iree_time_t deadline_ns) {
+  // Blocks until every handle in |set| is signaled (or the deadline elapses);
+  // no wake handle is reported since all handles must have fired.
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t wait_status = iree_wait_multi(
+      set, /*require_all=*/true, deadline_ns, /*out_wake_handle=*/NULL);
+  IREE_TRACE_ZONE_END(z0);
+  return wait_status;
+}
+
+iree_status_t iree_wait_any(iree_wait_set_t* set, iree_time_t deadline_ns,
+                            iree_wait_handle_t* out_wake_handle) {
+  // Blocks until at least one handle in |set| is signaled; the woken handle
+  // is reported via |out_wake_handle|.
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t wait_status = iree_wait_multi(set, /*require_all=*/false,
+                                              deadline_ns, out_wake_handle);
+  IREE_TRACE_ZONE_END(z0);
+  return wait_status;
+}
+
+// Waits on a single |handle| until it is signaled or |deadline_ns| elapses.
+iree_status_t iree_wait_one(iree_wait_handle_t* handle,
+                            iree_time_t deadline_ns) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Remap absolute timeout to relative timeout, handling special values as
+  // needed.
+  DWORD timeout_ms = iree_absolute_deadline_to_timeout_ms(deadline_ns);
+
+  // Perform the wait; this is allowed to yield the calling thread even if the
+  // timeout_ms is 0 to indicate a poll.
+  DWORD result =
+      WaitForSingleObjectEx((HANDLE)handle->value.win32.handle, timeout_ms,
+                            /*bAlertable=*/FALSE);
+
+  iree_status_t status;
+  if (result == WAIT_TIMEOUT) {
+    // Timeout elapsed while waiting; note that the timeout may have been 0 to
+    // force a poll and be an expected result. We avoid a full status object
+    // here as we don't want to track all that in non-exceptional cases.
+    status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  } else if (result == WAIT_OBJECT_0) {
+    // Handle was signaled successfully.
+    status = iree_ok_status();
+  } else if (result == WAIT_ABANDONED_0) {
+    // The mutex handle was abandoned during the wait.
+    // This happens when a thread holding the mutex dies without releasing it.
+    // This is less common in-process and more for the cross-process situations
+    // where we have duped/opened a remote handle and the remote process dies.
+    // That's a pretty rare situation but not quite unheard of in sandboxing
+    // impls where death is a feature.
+    //
+    // NOTE: we shouldn't get abandoned handles in regular cases - both because
+    // we don't really use mutex handles (though users may provide them) and
+    // that mutex abandonment is exceptional. If you see this you are probably
+    // going to want to look for thread exit messages or zombie processes.
+    status = iree_make_status(IREE_STATUS_DATA_LOSS,
+                              "mutex native handle abandoned; shared state is "
+                              "(likely) inconsistent");
+  } else if (result == WAIT_FAILED) {
+    status = iree_make_status(iree_status_code_from_win32_error(GetLastError()),
+                              "WFSO failed");
+  } else {
+    status = iree_make_status(IREE_STATUS_INTERNAL,
+                              "WFSO internal error (unimplemented APC?)");
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_event_t
+//===----------------------------------------------------------------------===//
+
+iree_status_t iree_event_initialize(bool initial_state,
+                                    iree_event_t* out_event) {
+  memset(out_event, 0, sizeof(*out_event));
+  // Manual-reset event; waiters remain released until an explicit reset.
+  HANDLE event_handle =
+      CreateEvent(/*lpEventAttributes=*/NULL, /*bManualReset=*/TRUE,
+                  /*bInitialState=*/initial_state ? TRUE : FALSE,
+                  /*lpName=*/NULL);
+  if (!event_handle) {
+    return iree_make_status(iree_status_code_from_win32_error(GetLastError()),
+                            "unable to create event");
+  }
+  iree_wait_primitive_value_t value;
+  memset(&value, 0, sizeof(value));
+  value.win32.handle = (uintptr_t)event_handle;
+  iree_wait_handle_wrap_primitive(IREE_WAIT_PRIMITIVE_TYPE_WIN32_HANDLE, value,
+                                  out_event);
+  return iree_ok_status();
+}
+
+// Releases the OS event backing |event| by closing its wait handle.
+void iree_event_deinitialize(iree_event_t* event) {
+  iree_wait_handle_close(event);
+}
+
+void iree_event_set(iree_event_t* event) {
+  // No-op when the event was never initialized (handle is 0).
+  HANDLE native_handle = (HANDLE)event->value.win32.handle;
+  if (!native_handle) return;
+  SetEvent(native_handle);
+}
+
+void iree_event_reset(iree_event_t* event) {
+  // No-op when the event was never initialized (handle is 0).
+  HANDLE native_handle = (HANDLE)event->value.win32.handle;
+  if (!native_handle) return;
+  ResetEvent(native_handle);
+}
+
+#endif  // IREE_WAIT_API == IREE_WAIT_API_WIN32
diff --git a/runtime/src/iree/base/logging.cc b/runtime/src/iree/base/logging.cc
new file mode 100644
index 0000000..b27342f
--- /dev/null
+++ b/runtime/src/iree/base/logging.cc
@@ -0,0 +1,189 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/logging.h"
+
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+
+#ifdef __ANDROID__
+#include <android/log.h>
+#endif
+
+#include "iree/base/internal/flags.h"
+#include "iree/base/tracing.h"
+
+IREE_FLAG(int32_t, iree_minloglevel, 0,
+          "Minimum logging level. 0 = INFO and above.");
+IREE_FLAG(int32_t, iree_v, 0,
+          "Verbosity level maximum. 1 = IREE_VLOG(0-1), 2 = IREE_VLOG(0-2).");
+
+namespace iree {
+namespace internal {
+
+namespace {
+
+// Parses an int64_t log level from an environment variable value (char*).
+// Returns true only when |iree_env_var_val| was present and parsed cleanly;
+// on failure *out_level is left at the default of 0.
+bool LogLevelStrToInt(const char* iree_env_var_val, int64_t* out_level) {
+  *out_level = 0;
+  if (!iree_env_var_val) return false;
+
+  std::istringstream level_stream{std::string(iree_env_var_val)};
+  int64_t parsed_level = 0;
+  if (!(level_stream >> parsed_level)) {
+    // Unparseable value; report failure so callers fall back to flags.
+    return false;
+  }
+
+  *out_level = parsed_level;
+  return true;
+}
+
+// Returns the minimum IREE_LOG severity, preferring the IREE_MIN_LOG_LEVEL
+// environment variable and falling back to the --iree_minloglevel flag.
+int64_t MinLogLevelFromEnv() {
+  int64_t env_level = 0;
+  if (LogLevelStrToInt(getenv("IREE_MIN_LOG_LEVEL"), &env_level)) {
+    return env_level;
+  }
+  return FLAG_iree_minloglevel;
+}
+
+// Returns the maximum enabled IREE_VLOG verbosity, preferring the
+// IREE_MIN_VLOG_LEVEL environment variable and falling back to --iree_v.
+int64_t MinVLogLevelFromEnv() {
+  int64_t env_level = 0;
+  if (LogLevelStrToInt(getenv("IREE_MIN_VLOG_LEVEL"), &env_level)) {
+    return env_level;
+  }
+  return FLAG_iree_v;
+}
+
+}  // namespace
+
+// Captures the log site; the message body is accumulated via operator<< on
+// the ostringstream base and emitted (severity permitting) by the destructor.
+LogMessage::LogMessage(const char* file_name, int line, int severity)
+    : file_name_(file_name), line_(line), severity_(severity) {}
+
+// Emits the accumulated message if it meets the minimum severity.
+LogMessage::~LogMessage() {
+  // Read the min log level once during the first call to logging; function
+  // local static initialization is thread-safe in C++11 and later.
+  static int64_t min_log_level = MinLogLevelFromEnv();
+  if (IREE_LIKELY(severity_ >= min_log_level)) {
+    EmitLogMessage();
+  }
+}
+
+// Returns the cached maximum enabled verbosity level; the environment/flag is
+// only consulted on the first call.
+int64_t LogMessage::MinVLogLevel() {
+  static int64_t min_vlog_level = MinVLogLevelFromEnv();
+  return min_vlog_level;
+}
+
+// Writes the formatted message to every active sink: stderr always, Android
+// logcat when building for Android, and the tracing backend when enabled.
+void LogMessage::EmitLogMessage() {
+  // TODO(scotttodd): Include current system time
+  fprintf(stderr, "%c %s:%d] %s\n", "IWEF"[severity_], file_name_, line_,
+          str().c_str());
+
+#if defined(__ANDROID__)
+  // Define equivalent android log levels to map to IREE.
+  constexpr int kStatusToAndroidLevel[4] = {
+      4,  // Android info
+      5,  // Android warning
+      6,  // Android error
+      6   // Android fatal (doesn't exist, so reusing error)
+  };
+
+  int android_severity = kStatusToAndroidLevel[severity_];
+  {
+    // NOTE: this truncates. That's fine for now and stderr is still usable.
+    char str_buffer[512];
+    snprintf(str_buffer, sizeof(str_buffer), "%s:%d] %s\n", file_name_, line_,
+             str().c_str());
+    __android_log_write(android_severity, "native", str_buffer);
+  }
+#endif  // defined(__ANDROID__)
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_LOG_MESSAGES
+  constexpr int kLevelColors[4] = {
+      IREE_TRACING_MESSAGE_LEVEL_INFO,     // INFO
+      IREE_TRACING_MESSAGE_LEVEL_WARNING,  // WARNING
+      IREE_TRACING_MESSAGE_LEVEL_ERROR,    // ERROR
+      IREE_TRACING_MESSAGE_LEVEL_ERROR,    // FATAL
+  };
+  {
+    // NOTE: this truncates. That's fine for now and stderr is still usable.
+    char str_buffer[512];
+    int str_length = snprintf(str_buffer, sizeof(str_buffer), "%s:%d] %s\n",
+                              file_name_, line_, str().c_str());
+    IREE_TRACE_MESSAGE_DYNAMIC_COLORED(kLevelColors[severity_], str_buffer,
+                                       str_length);
+  }
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_LOG_MESSAGES
+}
+
+// Fatal messages always log at FATAL severity; the severity filter is
+// bypassed by emitting directly from the destructor below.
+LogMessageFatal::LogMessageFatal(const char* file, int line)
+    : LogMessage(file, line, FATAL) {}
+
+LogMessageFatal::~LogMessageFatal() {
+  EmitLogMessage();
+
+  // abort() ensures we don't return (as promised via ATTRIBUTE_NORETURN).
+  abort();
+}
+
+// Char-typed values: printable ASCII (32-126) is rendered as a quoted
+// literal; everything else is rendered numerically so unprintable bytes stay
+// readable in check-failure messages.
+template <>
+void MakeCheckOpValueString(std::ostream* os, const char& v) {
+  if (v < 32 || v > 126) {
+    (*os) << "char value " << static_cast<int16_t>(v);
+  } else {
+    (*os) << "'" << v << "'";
+  }
+}
+
+template <>
+void MakeCheckOpValueString(std::ostream* os, const int8_t& v) {
+  if (v < 32 || v > 126) {
+    (*os) << "signed char value " << static_cast<int16_t>(v);
+  } else {
+    (*os) << "'" << v << "'";
+  }
+}
+
+template <>
+void MakeCheckOpValueString(std::ostream* os, const uint8_t& v) {
+  if (v < 32 || v > 126) {
+    (*os) << "unsigned char value " << static_cast<uint16_t>(v);
+  } else {
+    (*os) << "'" << v << "'";
+  }
+}
+
+// nullptr has no streaming operator; spell it out explicitly.
+template <>
+void MakeCheckOpValueString(std::ostream* os, const std::nullptr_t& v) {
+  (*os) << "nullptr";
+}
+
+// Builds the "Check failed: expr (v1 vs. v2)" message used by IREE_CHECK_XX.
+CheckOpMessageBuilder::CheckOpMessageBuilder(const char* exprtext)
+    : stream_(new std::ostringstream) {
+  *stream_ << "Check failed: " << exprtext << " (";
+}
+
+CheckOpMessageBuilder::~CheckOpMessageBuilder() { delete stream_; }
+
+std::ostream* CheckOpMessageBuilder::ForVar2() {
+  *stream_ << " vs. ";
+  return stream_;
+}
+
+// Returns a heap-allocated copy of the message; the caller is about to
+// IREE_LOG(FATAL) so the allocation is never reclaimed (see CheckOpString).
+std::string* CheckOpMessageBuilder::NewString() {
+  *stream_ << ")";
+  return new std::string(stream_->str());
+}
+
+}  // namespace internal
+}  // namespace iree
diff --git a/runtime/src/iree/base/logging.h b/runtime/src/iree/base/logging.h
new file mode 100644
index 0000000..3051dc3
--- /dev/null
+++ b/runtime/src/iree/base/logging.h
@@ -0,0 +1,374 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//===----------------------------------------------------------------------===//
+//                                                                            //
+//  (             (      (                                             (      //
+//  )\ )          )\ )   )\ )           (       (        *   )         )\ )   //
+// (()/(    (    (()/(  (()/(   (       )\      )\     ` )  /(   (    (()/(   //
+//  /(_))   )\    /(_))  /(_))  )\    (((_)  ((((_)(    ( )(_))  )\    /(_))  //
+// (_))_   ((_)  (_))   (_))   ((_)   )\___   )\ _ )\  (_(_())  ((_)  (_))_   //
+//  |   \  | __| | _ \  | _ \  | __| ((/ __|  (_)_\(_) |_   _|  | __|  |   \  //
+//  | |) | | _|  |  _/  |   /  | _|   | (__    / _ \     | |    | _|   | |) | //
+//  |___/  |___| |_|    |_|_\  |___|   \___|  /_/ \_\    |_|    |___|  |___/  //
+//                                                                            //
+//===----------------------------------------------------------------------===//
+// TODO(#2843): replace this file with a C sink API. IREE itself should not
+// perform any logging by default and instead route all logging through a
+// pluggable interface (similar to how we have iree_allocator_t to plug in
+// allocators). This will allow applications to scope their logging (critical
+// in multi-tenant situations where logs need to route back to clients), bring
+// their own logging libraries, and support logging on platforms we otherwise
+// cannot. The code in this file is currently C++ only and not great.
+
+#ifndef IREE_BASE_LOGGING_H_
+#define IREE_BASE_LOGGING_H_
+
+// IREE_LOG(severity) << ...;
+//   Logs a message at the given severity.
+//   Severity:
+//     INFO    Logs information text.
+//     WARNING Logs a warning.
+//     ERROR   Logs an error.
+//     FATAL   Logs an error and exit(1).
+//
+// IREE_DLOG(severity) << ...;
+//   Behaves like `IREE_LOG` in debug mode (i.e. `#ifndef NDEBUG`).
+//   Otherwise, it compiles away and does nothing.
+//
+// IREE_VLOG(level) << ...;
+//   Logs a verbose message at the given verbosity level.
+//
+// IREE_DVLOG(level) << ...;
+//   Behaves like `IREE_VLOG` in debug mode (i.e. `#ifndef NDEBUG`).
+//   Otherwise, it compiles away and does nothing.
+//
+// IREE_CHECK(condition) << ...;
+//   Runtime asserts that the given condition is true even in release builds.
+//   It's recommended that IREE_DCHECK is used instead as too many CHECKs
+//   can impact performance.
+//
+// IREE_CHECK_EQ|NE|LT|GT|LE|GE(val1, val2) << ...;
+//   Runtime assert the specified operation with the given values.
+//
+// IREE_DCHECK(condition) << ...;
+//   Runtime asserts that the given condition is true only in non-opt builds.
+//
+// IREE_DCHECK_EQ|NE|LT|GT|LE|GE(val1, val2) << ...;
+//   Runtime assert the specified operation with the given values in non-opt
+//   builds.
+//
+// IREE_QCHECK(condition) << ...;
+// IREE_QCHECK_EQ|NE|LT|GT|LE|GE(val1, val2) << ...;
+//   These behave like `IREE_CHECK` but do not print a full stack trace.
+//   They are useful when problems are definitely unrelated to program flow,
+//   e.g. when validating user input.
+
+#include <cstddef>
+#include <cstdint>
+#include <ios>
+#include <limits>
+#include <sstream>
+#include <string>
+
+#include "iree/base/attributes.h"
+
+namespace iree {
+
+// ------------------------------------------------------------------------- //
+// |                               IREE_LOG                                | //
+// ------------------------------------------------------------------------- //
+
+// Severity levels for IREE_LOG().
+// NOTE(review): these live in the iree namespace; unscoped names like ERROR
+// may collide with macros from platform headers (e.g. windows.h) — confirm
+// include order in translation units that use both.
+const int INFO = 0;
+const int WARNING = 1;
+const int ERROR = 2;
+const int FATAL = 3;
+
+namespace internal {
+
+// Accumulates a single log statement via operator<< and, severity permitting,
+// emits it when the temporary is destroyed at the end of the statement.
+class LogMessage : public std::basic_ostringstream<char> {
+ public:
+  LogMessage(const char* file_name, int line, int severity);
+  ~LogMessage();
+
+  const char* file_name() const { return file_name_; }
+  int line() const { return line_; }
+  int severity() const { return severity_; }
+
+  // Returns the minimum log level for IREE_VLOG statements.
+  // E.g., if MinVLogLevel() is 2, then IREE_VLOG(2) statements will produce
+  // output, but IREE_VLOG(3) will not. Defaults to 0.
+  static int64_t MinVLogLevel();
+
+ protected:
+  void EmitLogMessage();
+
+ private:
+  // Source location of the log statement (from __FILE__/__LINE__).
+  const char* file_name_;
+  int line_;
+  int severity_;
+};
+
+// LogMessageFatal ensures the process exits in failure after logging a message.
+class LogMessageFatal : public LogMessage {
+ public:
+  LogMessageFatal(const char* file, int line) IREE_ATTRIBUTE_COLD;
+  IREE_ATTRIBUTE_NORETURN ~LogMessageFatal();
+};
+
+// NullStream implements operator<< but does nothing.
+class NullStream {
+ public:
+  NullStream& stream() { return *this; }
+};
+template <typename T>
+inline NullStream& operator<<(NullStream& str, const T&) {
+  return str;
+}
+inline NullStream& operator<<(NullStream& str,
+                              std::ostream& (*)(std::ostream& os)) {
+  return str;
+}
+inline NullStream& operator<<(NullStream& str,
+                              std::ios_base& (*)(std::ios_base& os)) {
+  return str;
+}
+
+// Severity-specific helpers selected by IREE_LOG(severity) via token pasting.
+#define _IREE_LOG_INFO \
+  ::iree::internal::LogMessage(__FILE__, __LINE__, ::iree::INFO)
+#define _IREE_LOG_WARNING \
+  ::iree::internal::LogMessage(__FILE__, __LINE__, ::iree::WARNING)
+#define _IREE_LOG_ERROR \
+  ::iree::internal::LogMessage(__FILE__, __LINE__, ::iree::ERROR)
+#define _IREE_LOG_FATAL ::iree::internal::LogMessageFatal(__FILE__, __LINE__)
+
+#define IREE_LOG(severity) _IREE_LOG_##severity
+
+// IREE_DLOG compiles to a no-op stream in NDEBUG builds while still parsing
+// (but not evaluating side effects of) the streamed expression.
+#ifndef NDEBUG
+#define IREE_DLOG IREE_LOG
+#else
+#define IREE_DLOG(severity) \
+  switch (0)                \
+  default:                  \
+    ::iree::internal::NullStream().stream()
+#endif
+
+// True when verbose logging at |lvl| is enabled (see LogMessage::MinVLogLevel).
+#define IREE_VLOG_IS_ON(lvl) \
+  ((lvl) <= ::iree::internal::LogMessage::MinVLogLevel())
+
+#define IREE_VLOG(lvl)                     \
+  if (IREE_UNLIKELY(IREE_VLOG_IS_ON(lvl))) \
+  ::iree::internal::LogMessage(__FILE__, __LINE__, ::iree::INFO)
+
+// `IREE_DVLOG` behaves like `IREE_VLOG` in debug mode (i.e. `#ifndef NDEBUG`).
+// Otherwise, it compiles away and does nothing.
+#ifndef NDEBUG
+#define IREE_DVLOG IREE_VLOG
+#else
+#define IREE_DVLOG(verbose_level) \
+  while (false && (verbose_level) > 0) ::iree::internal::NullStream().stream()
+#endif  // !NDEBUG
+
+// ------------------------------------------------------------------------- //
+// |                              IREE_CHECK                               | //
+// ------------------------------------------------------------------------- //
+
+// IREE_CHECK dies with a fatal error if condition is not true.  It is *not*
+// controlled by NDEBUG, so the check will be executed regardless of
+// compilation mode.  Therefore, it is safe to do things like:
+//    IREE_CHECK(fp->Write(x) == 4)
+#define IREE_CHECK(condition)      \
+  if (IREE_UNLIKELY(!(condition))) \
+  IREE_LOG(FATAL) << "Check failed: " #condition " "
+
+// Function is overloaded for integral types to allow static const
+// integrals declared in classes and not defined to be used as arguments to
+// IREE_CHECK* macros. It's not encouraged though.
+template <typename T>
+inline const T& GetReferenceableValue(const T& t) {
+  return t;
+}
+// By-value overloads for the built-in integral types.
+inline char GetReferenceableValue(char t) { return t; }
+inline int8_t GetReferenceableValue(int8_t t) { return t; }
+inline uint8_t GetReferenceableValue(uint8_t t) { return t; }
+inline int16_t GetReferenceableValue(int16_t t) { return t; }
+inline uint16_t GetReferenceableValue(uint16_t t) { return t; }
+inline int32_t GetReferenceableValue(int32_t t) { return t; }
+inline uint32_t GetReferenceableValue(uint32_t t) { return t; }
+inline int64_t GetReferenceableValue(int64_t t) { return t; }
+inline uint64_t GetReferenceableValue(uint64_t t) { return t; }
+
+// This formats a value for a failing IREE_CHECK_XX statement.  Ordinarily,
+// it uses the definition for operator<<, with a few special cases below.
+template <typename T>
+inline void MakeCheckOpValueString(std::ostream* os, const T& v) {
+  (*os) << v;
+}
+
+// Overrides for char types provide readable values for unprintable
+// characters. Defined in logging.cc.
+template <>
+void MakeCheckOpValueString(std::ostream* os, const char& v);
+template <>
+void MakeCheckOpValueString(std::ostream* os, const int8_t& v);
+template <>
+void MakeCheckOpValueString(std::ostream* os, const uint8_t& v);
+// We need an explicit specialization for std::nullptr_t.
+template <>
+void MakeCheckOpValueString(std::ostream* os, const std::nullptr_t& v);
+
+// A container for a string pointer which can be evaluated to a bool -
+// true iff the pointer is non-NULL.
+struct CheckOpString {
+  CheckOpString(std::string* str) : str_(str) {}  // NOLINT
+  // No destructor: if str_ is non-NULL, we're about to IREE_LOG(FATAL),
+  // so there's no point in cleaning up str_.
+  operator bool() const { return IREE_UNLIKELY(str_ != NULL); }
+  std::string* str_;
+};
+
+// Build the error message string. Specify no inlining for code size.
+template <typename T1, typename T2>
+std::string* MakeCheckOpString(const T1& v1, const T2& v2,
+                               const char* exprtext) IREE_ATTRIBUTE_NOINLINE;
+
+// A helper class for formatting "expr (V1 vs. V2)" in a IREE_CHECK_XX
+// statement. See MakeCheckOpString for sample usage.
+class CheckOpMessageBuilder {
+ public:
+  // Inserts "exprtext" and " (" to the stream.
+  explicit CheckOpMessageBuilder(const char* exprtext);
+  // Deletes "stream_".
+  ~CheckOpMessageBuilder();
+  // For inserting the first variable.
+  std::ostream* ForVar1() { return stream_; }
+  // For inserting the second variable (adds an intermediate " vs. ").
+  std::ostream* ForVar2();
+  // Get the result (inserts the closing ")").
+  std::string* NewString();
+
+ private:
+  std::ostringstream* stream_;
+};
+
+// Formats both values into a freshly allocated failure-message string.
+template <typename T1, typename T2>
+std::string* MakeCheckOpString(const T1& v1, const T2& v2,
+                               const char* exprtext) {
+  CheckOpMessageBuilder comb(exprtext);
+  MakeCheckOpValueString(comb.ForVar1(), v1);
+  MakeCheckOpValueString(comb.ForVar2(), v2);
+  return comb.NewString();
+}
+
+// Helper functions for IREE_CHECK_OP macro.
+// The (int, int) specialization works around the issue that the compiler
+// will not instantiate the template version of the function on values of
+// unnamed enum type - see comment below.
+// The (size_t, int) and (int, size_t) specialization are to handle unsigned
+// comparison errors while still being thorough with the comparison.
+// Each Check_XXImpl returns NULL on success or a heap-allocated message on
+// failure (consumed by IREE_CHECK_OP_LOG below).
+#define _IREE_DEFINE_CHECK_OP_IMPL(name, op)                             \
+  template <typename T1, typename T2>                                    \
+  inline std::string* name##Impl(const T1& v1, const T2& v2,             \
+                                 const char* exprtext) {                 \
+    if (IREE_LIKELY(v1 op v2))                                           \
+      return NULL;                                                       \
+    else                                                                 \
+      return ::iree::internal::MakeCheckOpString(v1, v2, exprtext);      \
+  }                                                                      \
+  inline std::string* name##Impl(int v1, int v2, const char* exprtext) { \
+    return name##Impl<int, int>(v1, v2, exprtext);                       \
+  }                                                                      \
+  inline std::string* name##Impl(const size_t v1, const int v2,          \
+                                 const char* exprtext) {                 \
+    if (IREE_UNLIKELY(v2 < 0)) {                                         \
+      return ::iree::internal::MakeCheckOpString(v1, v2, exprtext);      \
+    }                                                                    \
+    const size_t uval = (size_t)((unsigned)v1);                          \
+    return name##Impl<size_t, size_t>(uval, v2, exprtext);               \
+  }                                                                      \
+  inline std::string* name##Impl(const int v1, const size_t v2,          \
+                                 const char* exprtext) {                 \
+    if (IREE_UNLIKELY(v2 >= std::numeric_limits<int>::max())) {          \
+      return ::iree::internal::MakeCheckOpString(v1, v2, exprtext);      \
+    }                                                                    \
+    const size_t uval = (size_t)((unsigned)v2);                          \
+    return name##Impl<size_t, size_t>(v1, uval, exprtext);               \
+  }
+
+_IREE_DEFINE_CHECK_OP_IMPL(Check_EQ, ==)
+_IREE_DEFINE_CHECK_OP_IMPL(Check_NE, !=)
+_IREE_DEFINE_CHECK_OP_IMPL(Check_LE, <=)
+_IREE_DEFINE_CHECK_OP_IMPL(Check_LT, <)
+_IREE_DEFINE_CHECK_OP_IMPL(Check_GE, >=)
+_IREE_DEFINE_CHECK_OP_IMPL(Check_GT, >)
+#undef _IREE_DEFINE_CHECK_OP_IMPL
+
+// In optimized mode, use CheckOpString to hint to compiler that
+// the while condition is unlikely.
+#define IREE_CHECK_OP_LOG(name, op, val1, val2)                 \
+  while (::iree::internal::CheckOpString _result =              \
+             ::iree::internal::name##Impl(                      \
+                 ::iree::internal::GetReferenceableValue(val1), \
+                 ::iree::internal::GetReferenceableValue(val2), \
+                 #val1 " " #op " " #val2))                      \
+  ::iree::internal::LogMessageFatal(__FILE__, __LINE__) << *(_result.str_)
+
+#define IREE_CHECK_OP(name, op, val1, val2) \
+  IREE_CHECK_OP_LOG(name, op, val1, val2)
+
+// IREE_CHECK_EQ/NE/...
+#define IREE_CHECK_EQ(val1, val2) IREE_CHECK_OP(Check_EQ, ==, val1, val2)
+#define IREE_CHECK_NE(val1, val2) IREE_CHECK_OP(Check_NE, !=, val1, val2)
+#define IREE_CHECK_LE(val1, val2) IREE_CHECK_OP(Check_LE, <=, val1, val2)
+#define IREE_CHECK_LT(val1, val2) IREE_CHECK_OP(Check_LT, <, val1, val2)
+#define IREE_CHECK_GE(val1, val2) IREE_CHECK_OP(Check_GE, >=, val1, val2)
+#define IREE_CHECK_GT(val1, val2) IREE_CHECK_OP(Check_GT, >, val1, val2)
+
+// IREE_DCHECK* map to the IREE_CHECK* forms in debug builds only.
+#ifndef NDEBUG
+#define IREE_DCHECK(condition) IREE_CHECK(condition)
+#define IREE_DCHECK_EQ(val1, val2) IREE_CHECK_EQ(val1, val2)
+#define IREE_DCHECK_NE(val1, val2) IREE_CHECK_NE(val1, val2)
+#define IREE_DCHECK_LE(val1, val2) IREE_CHECK_LE(val1, val2)
+#define IREE_DCHECK_LT(val1, val2) IREE_CHECK_LT(val1, val2)
+#define IREE_DCHECK_GE(val1, val2) IREE_CHECK_GE(val1, val2)
+#define IREE_DCHECK_GT(val1, val2) IREE_CHECK_GT(val1, val2)
+
+#else
+
+#define IREE_DCHECK(condition) \
+  while (false && (condition)) IREE_LOG(FATAL)
+
+// NDEBUG is defined, so IREE_DCHECK_EQ(x, y) and so on do nothing.
+// However, we still want the compiler to parse x and y, because
+// we don't want to lose potentially useful errors and warnings.
+// _IREE_DCHECK_NOP is a helper, and should not be used outside of this file.
+#define _IREE_DCHECK_NOP(x, y) \
+  while (false && ((void)(x), (void)(y), 0)) IREE_LOG(FATAL)
+
+#define IREE_DCHECK_EQ(x, y) _IREE_DCHECK_NOP(x, y)
+#define IREE_DCHECK_NE(x, y) _IREE_DCHECK_NOP(x, y)
+#define IREE_DCHECK_LE(x, y) _IREE_DCHECK_NOP(x, y)
+#define IREE_DCHECK_LT(x, y) _IREE_DCHECK_NOP(x, y)
+#define IREE_DCHECK_GE(x, y) _IREE_DCHECK_NOP(x, y)
+#define IREE_DCHECK_GT(x, y) _IREE_DCHECK_NOP(x, y)
+
+#endif  // !NDEBUG
+
+// These are for when you don't want a IREE_CHECK failure to print a verbose
+// stack trace.  The implementation of IREE_CHECK* in this file already doesn't.
+#define IREE_QCHECK(condition) IREE_CHECK(condition)
+#define IREE_QCHECK_EQ(x, y) IREE_CHECK_EQ(x, y)
+#define IREE_QCHECK_NE(x, y) IREE_CHECK_NE(x, y)
+#define IREE_QCHECK_LE(x, y) IREE_CHECK_LE(x, y)
+#define IREE_QCHECK_LT(x, y) IREE_CHECK_LT(x, y)
+#define IREE_QCHECK_GE(x, y) IREE_CHECK_GE(x, y)
+#define IREE_QCHECK_GT(x, y) IREE_CHECK_GT(x, y)
+
+}  // namespace internal
+}  // namespace iree
+
+#endif  // IREE_BASE_LOGGING_H_
diff --git a/runtime/src/iree/base/loop.c b/runtime/src/iree/base/loop.c
new file mode 100644
index 0000000..00dd83d
--- /dev/null
+++ b/runtime/src/iree/base/loop.c
@@ -0,0 +1,203 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/loop.h"
+
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_loop_t
+//===----------------------------------------------------------------------===//
+
+IREE_API_EXPORT iree_status_t iree_loop_call(iree_loop_t loop,
+                                             iree_loop_priority_t priority,
+                                             iree_loop_callback_fn_t callback,
+                                             void* user_data) {
+  // A loop with no control function cannot service any commands.
+  if (IREE_UNLIKELY(!loop.ctl)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "null loop");
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Package the callback for the loop backend to run at |priority|.
+  const iree_loop_call_params_t call_params = {
+      .callback = {.fn = callback, .user_data = user_data},
+      .priority = priority,
+  };
+  iree_status_t ctl_status =
+      loop.ctl(loop.self, IREE_LOOP_COMMAND_CALL, &call_params, NULL);
+
+  IREE_TRACE_ZONE_END(z0);
+  return ctl_status;
+}
+
+IREE_API_EXPORT iree_status_t iree_loop_dispatch(
+    iree_loop_t loop, const uint32_t workgroup_count_xyz[3],
+    iree_loop_workgroup_fn_t workgroup_callback,
+    iree_loop_callback_fn_t completion_callback, void* user_data) {
+  // A loop with no control function cannot service any commands.
+  if (IREE_UNLIKELY(!loop.ctl)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "null loop");
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)workgroup_count_xyz[0]);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)workgroup_count_xyz[1]);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)workgroup_count_xyz[2]);
+
+  // Package the grid dimensions and completion callback for the backend.
+  const iree_loop_dispatch_params_t dispatch_params = {
+      .callback = {.fn = completion_callback, .user_data = user_data},
+      .workgroup_fn = workgroup_callback,
+      .workgroup_count_xyz = {workgroup_count_xyz[0], workgroup_count_xyz[1],
+                              workgroup_count_xyz[2]},
+  };
+  iree_status_t ctl_status =
+      loop.ctl(loop.self, IREE_LOOP_COMMAND_DISPATCH, &dispatch_params, NULL);
+
+  IREE_TRACE_ZONE_END(z0);
+  return ctl_status;
+}
+
+IREE_API_EXPORT iree_status_t
+iree_loop_wait_until(iree_loop_t loop, iree_timeout_t timeout,
+                     iree_loop_callback_fn_t callback, void* user_data) {
+  // A loop with no control function cannot service any commands.
+  if (IREE_UNLIKELY(!loop.ctl)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "null loop");
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Convert to an absolute deadline now as we don't know when the loop will
+  // actually process the command.
+  const iree_loop_wait_until_params_t wait_params = {
+      .callback = {.fn = callback, .user_data = user_data},
+      .deadline_ns = iree_timeout_as_deadline_ns(timeout),
+  };
+  iree_status_t ctl_status =
+      loop.ctl(loop.self, IREE_LOOP_COMMAND_WAIT_UNTIL, &wait_params, NULL);
+
+  IREE_TRACE_ZONE_END(z0);
+  return ctl_status;
+}
+
+IREE_API_EXPORT iree_status_t iree_loop_wait_one(
+    iree_loop_t loop, iree_wait_source_t wait_source, iree_timeout_t timeout,
+    iree_loop_callback_fn_t callback, void* user_data) {
+  // A loop with no control function cannot service any commands.
+  if (IREE_UNLIKELY(!loop.ctl)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "null loop");
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Convert to an absolute deadline now as we don't know when the loop will
+  // actually process the command.
+  const iree_loop_wait_one_params_t wait_params = {
+      .callback = {.fn = callback, .user_data = user_data},
+      .deadline_ns = iree_timeout_as_deadline_ns(timeout),
+      .wait_source = wait_source,
+  };
+  iree_status_t ctl_status =
+      loop.ctl(loop.self, IREE_LOOP_COMMAND_WAIT_ONE, &wait_params, NULL);
+
+  IREE_TRACE_ZONE_END(z0);
+  return ctl_status;
+}
+
+// Shared implementation for wait-any/wait-all commands over |wait_sources|.
+static iree_status_t iree_loop_wait_multi(
+    iree_loop_command_t command, iree_loop_t loop, iree_host_size_t count,
+    iree_wait_source_t* wait_sources, iree_timeout_t timeout,
+    iree_loop_callback_fn_t callback, void* user_data) {
+  // Fast paths: zero sources completes immediately (via an async-style
+  // callback) and a single source maps to the dedicated wait-one command.
+  if (count == 0) {
+    return iree_loop_call(loop, IREE_LOOP_PRIORITY_DEFAULT, callback,
+                          user_data);
+  }
+  if (count == 1) {
+    return iree_loop_wait_one(loop, wait_sources[0], timeout, callback,
+                              user_data);
+  }
+
+  // Convert to an absolute deadline now as we don't know when the loop will
+  // actually process the command.
+  const iree_loop_wait_multi_params_t wait_params = {
+      .callback = {.fn = callback, .user_data = user_data},
+      .deadline_ns = iree_timeout_as_deadline_ns(timeout),
+      .count = count,
+      .wait_sources = wait_sources,
+  };
+  return loop.ctl(loop.self, command, &wait_params, NULL);
+}
+
+IREE_API_EXPORT iree_status_t iree_loop_wait_any(
+    iree_loop_t loop, iree_host_size_t count, iree_wait_source_t* wait_sources,
+    iree_timeout_t timeout, iree_loop_callback_fn_t callback, void* user_data) {
+  // A loop with no control function cannot service any commands.
+  if (IREE_UNLIKELY(!loop.ctl)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "null loop");
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)count);
+  // Wakes when the first of |wait_sources| resolves.
+  iree_status_t wait_status =
+      iree_loop_wait_multi(IREE_LOOP_COMMAND_WAIT_ANY, loop, count,
+                           wait_sources, timeout, callback, user_data);
+  IREE_TRACE_ZONE_END(z0);
+  return wait_status;
+}
+
+IREE_API_EXPORT iree_status_t iree_loop_wait_all(
+    iree_loop_t loop, iree_host_size_t count, iree_wait_source_t* wait_sources,
+    iree_timeout_t timeout, iree_loop_callback_fn_t callback, void* user_data) {
+  // A loop with no control function cannot service any commands.
+  if (IREE_UNLIKELY(!loop.ctl)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "null loop");
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)count);
+  // Wakes only when every one of |wait_sources| has resolved.
+  iree_status_t wait_status =
+      iree_loop_wait_multi(IREE_LOOP_COMMAND_WAIT_ALL, loop, count,
+                           wait_sources, timeout, callback, user_data);
+  IREE_TRACE_ZONE_END(z0);
+  return wait_status;
+}
+
+IREE_API_EXPORT iree_status_t iree_loop_drain(iree_loop_t loop,
+                                              iree_timeout_t timeout) {
+  // A loop with no control function cannot service any commands.
+  if (IREE_UNLIKELY(!loop.ctl)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "null loop");
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Convert to an absolute deadline now as we don't know when the loop will
+  // begin draining.
+  const iree_loop_drain_params_t drain_params = {
+      .deadline_ns = iree_timeout_as_deadline_ns(timeout),
+  };
+  iree_status_t ctl_status =
+      loop.ctl(loop.self, IREE_LOOP_COMMAND_DRAIN, &drain_params, NULL);
+
+  IREE_TRACE_ZONE_END(z0);
+  return ctl_status;
+}
diff --git a/runtime/src/iree/base/loop.h b/runtime/src/iree/base/loop.h
new file mode 100644
index 0000000..da2cbd2
--- /dev/null
+++ b/runtime/src/iree/base/loop.h
@@ -0,0 +1,337 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_LOOP_H_
+#define IREE_BASE_LOOP_H_
+
+#include <inttypes.h>
+
+#include "iree/base/allocator.h"
+#include "iree/base/attributes.h"
+#include "iree/base/status.h"
+#include "iree/base/time.h"
+#include "iree/base/wait_source.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_loop_t public API
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_loop_t iree_loop_t;
+typedef uint32_t iree_loop_command_t;
+
+// TODO(benvanik): define prioritization. This is useful for ensuring fast
+// coroutine switching by avoiding the current coroutine being set to the back
+// of the loop. It's easy to shoot yourself in the foot, though: cooperative
+// scheduling can be tricky.
+typedef enum iree_loop_priority_e {
+  IREE_LOOP_PRIORITY_DEFAULT = 0u,
+} iree_loop_priority_t;
+
+// Callback to execute user code used by the loop.
+// |user_data| contains the value provided to the callback when enqueuing the
+// operation and must remain live until the callback is made.
+//
+// If the callback is to be executed as normal |status| will be OK.
+// A non-fatal error case of IREE_STATUS_DEADLINE_EXCEEDED can occur if the
+// operation had a deadline specified and it elapsed prior to the condition
+// being met.
+//
+// |status| otherwise indicates that the operation failed (such as a failed wait
+// or a failed workgroup callback).
+//
+// Callbacks may reentrantly queue work on the |loop| _unless_ the passed
+// |status| is IREE_STATUS_ABORTED indicating that the loop is shutting down or
+// the operation is being aborted because of a prior failure.
+//
+// Any non-OK result will be routed to a loop-global error handler (depending on
+// implementation) or otherwise ignored; users must set their own exit bits.
+typedef iree_status_t(IREE_API_PTR* iree_loop_callback_fn_t)(
+    void* user_data, iree_loop_t loop, iree_status_t status);
+
+// Callback to execute a single workgroup in a grid dispatch.
+// Each call receives the XYZ location in the grid and may run concurrently with
+// any other workgroup call.
+//
+// Any non-OK result will be routed to the completion callback of the dispatch
+// operation but not otherwise trigger loop failure. Other workgroups may
+// continue to run up until the completion callback is issued.
+typedef iree_status_t(IREE_API_PTR* iree_loop_workgroup_fn_t)(
+    void* user_data, iree_loop_t loop, uint32_t workgroup_x,
+    uint32_t workgroup_y, uint32_t workgroup_z);
+
+// Function pointer for an iree_loop_t control function.
+// |command| provides the operation to perform. Commands may use |params| to
+// pass additional operation-specific parameters. |inout_ptr| usage is defined
+// by each operation.
+typedef iree_status_t(IREE_API_PTR* iree_loop_ctl_fn_t)(
+    void* self, iree_loop_command_t command, const void* params,
+    void** inout_ptr);
+
+// An event system for executing queued asynchronous work.
+// Implementations are allowed to execute operations in any order but generally
+// runs FIFO and will only ever execute one operation at a time. The thread used
+// for execution may change from operation to operation. Usage that has order
+// requirements is required to perform the ordering themselves.
+//
+// This is a form of cooperative scheduling and the loop _may_ not make forward
+// progress if a callback issues a blocking operation. All blocking operations
+// should either be done on user-controlled threads or via the loop primitives
+// such as iree_loop_wait_one. Callbacks may enqueue zero or more operations
+// with 2+ performing a conceptual fork. The iree_loop_dispatch operation allows
+// for a constrained style of concurrency matching a GPU grid dispatch and can
+// be used as a primitive to implement other kinds of parallel loops.
+//
+// User data passed to callbacks is unowned and must be kept live by the
+// requester. All callbacks are guaranteed to be issued even on failure and
+// allocations made when enqueuing operations are safe to free in the callbacks.
+//
+// The rough behavior of the loop matches that of the web event loop
+// dispatching events/promises/timeouts/etc. It's a stackless design where the
+// owner of the primary control loop is hidden from the users of the loop. This
+// allows implementations to integrate into existing scheduling mechanisms
+// (ALooper, libuv, io_uring, the browser main event loop, etc) in a generic
+// way. The design of the API here is meant to make it easy to put the
+// implementation in external code (python/javascript/rust/java/etc) as only a
+// single method with a fixed interface is used to cross the boundaries.
+//
+// Note that by default this implementation is only intended for host-level
+// synchronization and scheduling: fairly coarse events performed fairly
+// infrequently. Optimized multi-threaded workloads are intended to execute on
+// the iree/task/ system via command buffers.
+typedef struct iree_loop_t {
+  // Control function data.
+  void* self;
+  // ioctl-style control function servicing all loop-related commands.
+  // See iree_loop_command_t for more information.
+  iree_loop_ctl_fn_t ctl;
+} iree_loop_t;
+
+// A no-op loop: it carries no control function, so any attempt to enqueue
+// work on it fails (the public API entry points reject a NULL ctl).
+static inline iree_loop_t iree_loop_null() {
+  iree_loop_t loop;
+  loop.self = NULL;
+  loop.ctl = NULL;
+  return loop;
+}
+
+// Executes |callback| from the loop at some point in the future.
+//
+// The callback is guaranteed to be issued but in an undefined order.
+// |user_data| is not retained and must be live until the callback is issued.
+IREE_API_EXPORT iree_status_t iree_loop_call(iree_loop_t loop,
+                                             iree_loop_priority_t priority,
+                                             iree_loop_callback_fn_t callback,
+                                             void* user_data);
+
+// Executes |workgroup_callback| from the loop at some point in the future
+// with grid dispatch of |workgroup_count_xyz| workgroups. Each
+// |workgroup_callback| will receive its XYZ location in the grid and
+// |completion_callback| will be issued upon completion (or failure).
+// The dispatched workgroups are not guaranteed to run concurrently and must
+// not perform blocking operations.
+//
+// The completion callback is guaranteed to be issued but in an undefined order.
+// The workgroup callback runs serially or concurrently from multiple threads.
+// |user_data| is not retained and must be live until the callback is issued.
+IREE_API_EXPORT iree_status_t iree_loop_dispatch(
+    iree_loop_t loop, const uint32_t workgroup_count_xyz[3],
+    iree_loop_workgroup_fn_t workgroup_callback,
+    iree_loop_callback_fn_t completion_callback, void* user_data);
+
+// Waits until |timeout| is reached and then issues |callback|.
+// There may be a significant latency between |timeout| and when the |callback|
+// is executed.
+//
+// The callback is guaranteed to be issued.
+// |user_data| is not retained and must be live until the callback is issued.
+IREE_API_EXPORT iree_status_t
+iree_loop_wait_until(iree_loop_t loop, iree_timeout_t timeout,
+                     iree_loop_callback_fn_t callback, void* user_data);
+
+// Waits until the |wait_source| is satisfied or |timeout| is reached and then
+// issues |callback|.
+//
+// The callback is guaranteed to be issued.
+// |user_data| is not retained and must be live until the callback is issued.
+IREE_API_EXPORT iree_status_t iree_loop_wait_one(
+    iree_loop_t loop, iree_wait_source_t wait_source, iree_timeout_t timeout,
+    iree_loop_callback_fn_t callback, void* user_data);
+
+// Waits until one or more of the |wait_sources| is satisfied or |timeout| is
+// reached and then issues |callback|.
+//
+// The callback is guaranteed to be issued.
+// |wait_sources| and |user_data| are not retained and must remain live until
+// the callback is issued.
+IREE_API_EXPORT iree_status_t iree_loop_wait_any(
+    iree_loop_t loop, iree_host_size_t count, iree_wait_source_t* wait_sources,
+    iree_timeout_t timeout, iree_loop_callback_fn_t callback, void* user_data);
+
+// Waits until all of the |wait_sources| are satisfied or |timeout| is reached
+// and then issues |callback|.
+//
+// The callback is guaranteed to be issued.
+// |wait_sources| and |user_data| are not retained and must remain live until
+// the callback is issued.
+IREE_API_EXPORT iree_status_t iree_loop_wait_all(
+    iree_loop_t loop, iree_host_size_t count, iree_wait_source_t* wait_sources,
+    iree_timeout_t timeout, iree_loop_callback_fn_t callback, void* user_data);
+
+// Blocks the caller and waits until the loop is idle or |timeout| is reached.
+//
+// Not all implementations support this and may return
+// IREE_STATUS_DEADLINE_EXCEEDED immediately when work is still pending.
+// |user_data| is not retained and must be live until the callback is issued.
+IREE_API_EXPORT iree_status_t iree_loop_drain(iree_loop_t loop,
+                                              iree_timeout_t timeout);
+
+//===----------------------------------------------------------------------===//
+// iree_loop_t implementation details
+//===----------------------------------------------------------------------===//
+// These are exposed so that user applications can implement their own loops and
+// are otherwise private to the API.
+
+// Controls the behavior of an iree_loop_ctl_fn_t callback function.
+enum iree_loop_command_e {
+  // Issues the callback from the loop at some point in the future.
+  // The callback will always be called (including when aborted).
+  //
+  // iree_loop_ctl_fn_t:
+  //   params: iree_loop_call_params_t
+  //   inout_ptr: unused
+  IREE_LOOP_COMMAND_CALL = 0u,
+
+  // Issues a workgroup callback across a grid and then issues the callback.
+  // The completion callback will always be called (including when aborted).
+  //
+  // iree_loop_ctl_fn_t:
+  //   params: iree_loop_dispatch_params_t
+  //   inout_ptr: unused
+  IREE_LOOP_COMMAND_DISPATCH,
+
+  // TODO(benvanik): open/read/write/close/etc with iovecs.
+  // Our iree_byte_span_t matches with `struct iovec` and if we share that we
+  // can do scatter/gather I/O with io_uring.
+  // Want something with an fd, flags, count, and iree_byte_span_t's.
+
+  // TODO(benvanik): IREE_LOOP_COMMAND_WAIT_IDLE to get idle callbacks.
+
+  // Sleeps until the timeout is reached then issues the callback.
+  // The callback will always be called (including when aborted).
+  //
+  // iree_loop_ctl_fn_t:
+  //   params: iree_loop_wait_until_params_t
+  //   inout_ptr: unused
+  IREE_LOOP_COMMAND_WAIT_UNTIL,
+
+  // Waits until the wait source has resolved then issues the callback.
+  // The callback will always be called (including when aborted).
+  //
+  // iree_loop_ctl_fn_t:
+  //   params: iree_loop_wait_one_params_t
+  //   inout_ptr: unused
+  IREE_LOOP_COMMAND_WAIT_ONE,
+
+  // Waits until one or more wait sources have resolved then issues the
+  // callback. The callback will always be called (including when aborted).
+  //
+  // iree_loop_ctl_fn_t:
+  //   params: iree_loop_wait_multi_params_t
+  //   inout_ptr: unused
+  IREE_LOOP_COMMAND_WAIT_ANY,
+
+  // Waits until all of the wait sources have resolved then issues the
+  // callback. The callback will always be called (including when aborted).
+  //
+  // iree_loop_ctl_fn_t:
+  //   params: iree_loop_wait_multi_params_t
+  //   inout_ptr: unused
+  IREE_LOOP_COMMAND_WAIT_ALL,
+
+  // Waits until the loop has no more pending work.
+  // Resolves early with IREE_STATUS_DEADLINE_EXCEEDED if the timeout is reached
+  // before the loop is idle or if the platform does not support the operation.
+  //
+  // iree_loop_ctl_fn_t:
+  //   params: iree_loop_drain_params_t
+  //   inout_ptr: unused
+  IREE_LOOP_COMMAND_DRAIN,
+
+  IREE_LOOP_COMMAND_MAX = IREE_LOOP_COMMAND_DRAIN,
+};
+
+typedef struct iree_loop_callback_t {
+  // Callback function pointer.
+  iree_loop_callback_fn_t fn;
+  // User data passed to the callback function. Unowned.
+  void* user_data;
+} iree_loop_callback_t;
+
+// Parameters for IREE_LOOP_COMMAND_CALL.
+typedef struct iree_loop_call_params_t {
+  // Callback issued to perform the call.
+  iree_loop_callback_t callback;
+  // Controls the scheduling of the call.
+  iree_loop_priority_t priority;
+} iree_loop_call_params_t;
+
+// Parameters for IREE_LOOP_COMMAND_DISPATCH.
+typedef struct iree_loop_dispatch_params_t {
+  // Callback issued when the call completes (successfully or otherwise).
+  iree_loop_callback_t callback;
+  // Callback issued for each workgroup.
+  iree_loop_workgroup_fn_t workgroup_fn;
+  // 3D workgroup count.
+  uint32_t workgroup_count_xyz[3];
+} iree_loop_dispatch_params_t;
+
+// Parameters for IREE_LOOP_COMMAND_WAIT_UNTIL.
+typedef struct iree_loop_wait_until_params_t {
+  // Callback issued after the wait condition is satisfied.
+  iree_loop_callback_t callback;
+  // Maximum time to wait before failing the wait with
+  // IREE_STATUS_DEADLINE_EXCEEDED.
+  iree_time_t deadline_ns;
+} iree_loop_wait_until_params_t;
+
+// Parameters for IREE_LOOP_COMMAND_WAIT_ONE.
+typedef struct iree_loop_wait_one_params_t {
+  // Callback issued after the wait condition is satisfied.
+  iree_loop_callback_t callback;
+  // Maximum time to wait before failing the wait with
+  // IREE_STATUS_DEADLINE_EXCEEDED.
+  iree_time_t deadline_ns;
+  // Wait source to wait on.
+  iree_wait_source_t wait_source;
+} iree_loop_wait_one_params_t;
+
+// Parameters for IREE_LOOP_COMMAND_WAIT_ANY / IREE_LOOP_COMMAND_WAIT_ALL.
+typedef struct iree_loop_wait_multi_params_t {
+  // Callback issued after any/all wait conditions are satisfied.
+  iree_loop_callback_t callback;
+  // Maximum time to wait before failing the wait with
+  // IREE_STATUS_DEADLINE_EXCEEDED.
+  iree_time_t deadline_ns;
+  // Total number of wait sources.
+  iree_host_size_t count;
+  // List of wait sources to wait on.
+  // Ownership remains with the issuer and must remain live until the callback.
+  iree_wait_source_t* wait_sources;
+} iree_loop_wait_multi_params_t;
+
+// Parameters for IREE_LOOP_COMMAND_DRAIN.
+typedef struct iree_loop_drain_params_t {
+  // Time when the wait will abort.
+  iree_time_t deadline_ns;
+} iree_loop_drain_params_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_LOOP_H_
diff --git a/runtime/src/iree/base/loop_inline.c b/runtime/src/iree/base/loop_inline.c
new file mode 100644
index 0000000..3c19b9e
--- /dev/null
+++ b/runtime/src/iree/base/loop_inline.c
@@ -0,0 +1,514 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/loop_inline.h"
+
+#include "iree/base/assert.h"
+#include "iree/base/tracing.h"
+
+static iree_status_t iree_loop_inline_reentrant_ctl(void* self,
+                                                    iree_loop_command_t command,
+                                                    const void* params,
+                                                    void** inout_ptr);
+
+static void iree_loop_inline_emit_error(iree_loop_t loop, iree_status_t status);
+
+//===----------------------------------------------------------------------===//
+// Inline execution of operations
+//===----------------------------------------------------------------------===//
+
+// IREE_LOOP_COMMAND_CALL
+// Executes a queued user callback immediately with an OK status.
+static void iree_loop_inline_run_call(iree_loop_t loop,
+                                      iree_loop_call_params_t params) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Ideally a tail call (when not tracing).
+  iree_status_t status =
+      params.callback.fn(params.callback.user_data, loop, iree_ok_status());
+  if (!iree_status_is_ok(status)) {
+    // A failing callback poisons the loop: the error is recorded and all
+    // remaining pending operations are aborted.
+    iree_loop_inline_emit_error(loop, status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// IREE_LOOP_COMMAND_DISPATCH
+// Runs an entire grid dispatch synchronously on the calling thread.
+static void iree_loop_inline_run_dispatch(iree_loop_t loop,
+                                          iree_loop_dispatch_params_t params) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_ok_status();
+
+  // We run all workgroups before issuing the completion callback.
+  // If any workgroup fails we exit early and pass the failing status back to
+  // the completion handler exactly once.
+  uint32_t workgroup_count_x = params.workgroup_count_xyz[0];
+  uint32_t workgroup_count_y = params.workgroup_count_xyz[1];
+  uint32_t workgroup_count_z = params.workgroup_count_xyz[2];
+  iree_status_t workgroup_status = iree_ok_status();
+  // Inline execution is serial: workgroups run one at a time in x-fastest
+  // order on the caller's stack. Both the workgroup and completion callbacks
+  // share the same user_data.
+  for (uint32_t z = 0; z < workgroup_count_z; ++z) {
+    for (uint32_t y = 0; y < workgroup_count_y; ++y) {
+      for (uint32_t x = 0; x < workgroup_count_x; ++x) {
+        workgroup_status =
+            params.workgroup_fn(params.callback.user_data, loop, x, y, z);
+        if (!iree_status_is_ok(workgroup_status)) goto workgroup_failed;
+      }
+    }
+  }
+workgroup_failed:
+
+  // Fire the completion callback with either success or the first error hit by
+  // a workgroup.
+  // Ideally a tail call (when not tracing).
+  status =
+      params.callback.fn(params.callback.user_data, loop, workgroup_status);
+  if (!iree_status_is_ok(status)) {
+    iree_loop_inline_emit_error(loop, status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// IREE_LOOP_COMMAND_WAIT_UNTIL
+// Sleeps the calling thread until the absolute deadline is reached.
+static void iree_loop_inline_run_wait_until(
+    iree_loop_t loop, iree_loop_wait_until_params_t params) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // false indicates the sleep ended early (interrupted), mapped to ABORTED.
+  bool did_wait = iree_wait_until(params.deadline_ns);
+
+  iree_status_t status = params.callback.fn(
+      params.callback.user_data, loop,
+      did_wait ? iree_ok_status()
+               : iree_make_status(IREE_STATUS_ABORTED,
+                                  "sleep was aborted by a signal/alert"));
+  if (!iree_status_is_ok(status)) {
+    iree_loop_inline_emit_error(loop, status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// IREE_LOOP_COMMAND_WAIT_ONE
+// Blocks the calling thread on a single wait source (inline loops cannot
+// defer waits) and then issues the callback with the wait result.
+static void iree_loop_inline_run_wait_one(iree_loop_t loop,
+                                          iree_loop_wait_one_params_t params) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_timeout_t timeout = iree_make_deadline(params.deadline_ns);
+
+  // Try waiting on the wait source directly; this is usually the most optimal
+  // implementation when available and for others may drop down to a system
+  // wait primitive.
+  iree_status_t wait_status =
+      iree_wait_source_wait_one(params.wait_source, timeout);
+
+  // Callback after wait, whether it succeeded or failed.
+  iree_status_t status =
+      params.callback.fn(params.callback.user_data, loop, wait_status);
+  if (!iree_status_is_ok(status)) {
+    iree_loop_inline_emit_error(loop, status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// IREE_LOOP_COMMAND_WAIT_ANY
+// Satisfies an any-wait inline: scans for an already-resolved source and only
+// commits a blocking wait when none has resolved yet. The callback is always
+// issued with the resulting status.
+static void iree_loop_inline_run_wait_any(
+    iree_loop_t loop, iree_loop_wait_multi_params_t params) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_timeout_t timeout = iree_make_deadline(params.deadline_ns);
+
+  // Do a scan down the wait sources to see if any are already set - if so we
+  // can bail early. Otherwise we need to wait on any one.
+  // iree_wait_any is a much more efficient (and fair) way but this keeps the
+  // code working on bare-metal.
+  iree_status_t wait_status = iree_status_from_code(IREE_STATUS_DEFERRED);
+  for (iree_host_size_t i = 0; i < params.count; ++i) {
+    iree_status_code_t wait_status_code = IREE_STATUS_OK;
+    iree_status_t query_status =
+        iree_wait_source_query(params.wait_sources[i], &wait_status_code);
+    if (iree_status_is_ok(query_status)) {
+      if (wait_status_code == IREE_STATUS_OK) {
+        // Signaled - the any-wait is satisfied and we can bail early.
+        // NOTE: fixed a bug here: previously wait_status was left DEFERRED,
+        // which made the code below commit a blocking wait on wait_sources[0]
+        // even though a source was already signaled.
+        wait_status = iree_ok_status();
+        break;
+      } else if (wait_status_code == IREE_STATUS_DEFERRED) {
+        // Not signaled yet - keep scanning.
+        continue;
+      } else {
+        // Wait failed - can bail early.
+        wait_status = iree_status_from_code(wait_status_code);
+        break;
+      }
+    } else {
+      // Failed to perform the query, which we treat the same as a wait error.
+      wait_status = query_status;
+      break;
+    }
+  }
+  if (iree_status_is_deferred(wait_status)) {
+    // No queries resolved/failed - commit a real wait.
+    // We choose the first one to be (somewhat) deterministic but really it
+    // should be randomized... or if the user cares they should use a real loop.
+    wait_status = iree_wait_source_wait_one(params.wait_sources[0], timeout);
+  }
+
+  // Callback after wait, whether it succeeded or failed.
+  iree_status_t status =
+      params.callback.fn(params.callback.user_data, loop, wait_status);
+  if (!iree_status_is_ok(status)) {
+    iree_loop_inline_emit_error(loop, status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// IREE_LOOP_COMMAND_WAIT_ALL
+// Blocks on each source in turn; the shared absolute deadline bounds the
+// total wall-clock time across all of the serial waits.
+static void iree_loop_inline_run_wait_all(
+    iree_loop_t loop, iree_loop_wait_multi_params_t params) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_timeout_t timeout = iree_make_deadline(params.deadline_ns);
+
+  // Run down the list waiting on each source.
+  // iree_wait_all is a much more efficient way but this keeps the code working
+  // on bare-metal.
+  iree_status_t wait_status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < params.count; ++i) {
+    wait_status = iree_wait_source_wait_one(params.wait_sources[i], timeout);
+    // First failed/timed-out wait ends the sequence; its status is reported.
+    if (!iree_status_is_ok(wait_status)) break;
+  }
+
+  // Callback after wait, whether it succeeded or failed.
+  iree_status_t status =
+      params.callback.fn(params.callback.user_data, loop, wait_status);
+  if (!iree_status_is_ok(status)) {
+    iree_loop_inline_emit_error(loop, status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_inline_ring_t
+//===----------------------------------------------------------------------===//
+
+// Total capacity of the ringbuffer in operations pending.
+// The usable capacity is always 1 less than this as we mask it off,
+// unfortunately wasting a slot but keeping this all stupid simple. If we wanted
+// to drop another ~32B of stack space we could make this do the right thing.
+#define IREE_LOOP_INLINE_RING_CAPACITY ((uint8_t)8)
+static_assert((IREE_LOOP_INLINE_RING_CAPACITY &
+               (IREE_LOOP_INLINE_RING_CAPACITY - 1)) == 0,
+              "ringbuffer capacity must be a power of two");
+
+// Bitmask used to perform a quick mod of the ringbuffer indices.
+// This must always be ANDed with the indices before use:
+//   uint8_t physical_idx = logical_idx % IREE_LOOP_INLINE_RING_CAPACITY;
+// or this, way better (though the compiler can usually figure it out):
+//   uint8_t physical_idx = logical_idx & IREE_LOOP_INLINE_RING_MASK;
+#define IREE_LOOP_INLINE_RING_MASK (IREE_LOOP_INLINE_RING_CAPACITY - 1)
+
+// An operation in the inline loop ringbuffer containing all the information
+// required to replay it at a future time. All pointers are unowned.
+typedef struct iree_loop_inline_op_t {
+  iree_loop_command_t command;
+  union {
+    iree_loop_callback_t callback;
+    union {
+      iree_loop_call_params_t call;
+      iree_loop_dispatch_params_t dispatch;
+      iree_loop_wait_until_params_t wait_until;
+      iree_loop_wait_one_params_t wait_one;
+      iree_loop_wait_multi_params_t wait_multi;
+    } params;
+  };
+} iree_loop_inline_op_t;
+
+// Returns the size of the parameters required by |command|.
+// Returns 0 for commands that cannot be queued inline; callers treat 0 as
+// "unimplemented" (see iree_loop_inline_enqueue).
+static inline uint8_t iree_loop_params_size(iree_loop_command_t command) {
+  // Keep this a tail call switch; compilers can work magic here.
+  switch (command) {
+    case IREE_LOOP_COMMAND_CALL:
+      return sizeof(iree_loop_call_params_t);
+    case IREE_LOOP_COMMAND_DISPATCH:
+      return sizeof(iree_loop_dispatch_params_t);
+    case IREE_LOOP_COMMAND_WAIT_UNTIL:
+      return sizeof(iree_loop_wait_until_params_t);
+    case IREE_LOOP_COMMAND_WAIT_ONE:
+      return sizeof(iree_loop_wait_one_params_t);
+    case IREE_LOOP_COMMAND_WAIT_ANY:
+    case IREE_LOOP_COMMAND_WAIT_ALL:
+      // ANY and ALL share the same multi-wait parameter struct.
+      return sizeof(iree_loop_wait_multi_params_t);
+    default:
+      return 0;
+  }
+}
+
+// Fixed-size ringbuffer of commands enqueued reentrantly.
+// We ensure the size stays small so we don't blow the stack of tiny systems.
+// The inline loop is explicitly not designed for multi-program cooperative
+// scheduling and well-formed programs shouldn't hit the limit.
+//
+// NOTE: this structure must be in an initialized state if zeroed.
+typedef struct iree_loop_inline_ring_t {
+  iree_loop_inline_op_t ops[IREE_LOOP_INLINE_RING_CAPACITY];
+  uint8_t read_head;
+  uint8_t write_head;
+  iree_status_t* status_ptr;
+} iree_loop_inline_ring_t;
+static_assert(
+    sizeof(iree_loop_inline_ring_t) <= IREE_LOOP_INLINE_STORAGE_SIZE,
+    "iree_loop_inline_ring_t needs to be tiny as it's allocated on the stack");
+
+// Returns a loop that references the current ringbuffer for reentrant usage.
+// Commands enqueued on the returned loop are deferred into |ring| and run by
+// the top-level drain loop rather than recursing on the caller's stack.
+static inline iree_loop_t iree_loop_inline_reentrant(
+    iree_loop_inline_ring_t* ring) {
+  iree_loop_t loop = {
+      .self = ring,
+      .ctl = iree_loop_inline_reentrant_ctl,
+  };
+  return loop;
+}
+
+// Initializes |out_ring| for use.
+// We don't clear the ops as we (hopefully) don't use them unless they are valid
+// as defined by the ringbuffer parameters.
+// |status_ptr| may be NULL; when set it receives the first sticky error
+// recorded by iree_loop_inline_emit_error.
+static inline void iree_loop_inline_ring_initialize(
+    iree_status_t* status_ptr, iree_loop_inline_ring_t* out_ring) {
+  out_ring->read_head = 0;
+  out_ring->write_head = 0;
+  out_ring->status_ptr = status_ptr;
+}
+
+// Returns true if the ringbuffer is empty (read has caught up to write).
+// Head equality is unambiguous because the ring reserves one slot and thus
+// never lets write wrap fully around to read.
+static inline bool iree_loop_inline_ring_is_empty(
+    const iree_loop_inline_ring_t* ring) {
+  return ring->read_head == ring->write_head;
+}
+
+// Returns true if the ringbuffer has no free slot remaining.
+// One slot is always reserved so at most IREE_LOOP_INLINE_RING_MASK
+// (capacity - 1) operations can be live at once.
+static inline bool iree_loop_inline_ring_is_full(
+    const iree_loop_inline_ring_t* ring) {
+  uint8_t used_slots =
+      (uint8_t)(ring->write_head - ring->read_head) & IREE_LOOP_INLINE_RING_MASK;
+  return used_slots == IREE_LOOP_INLINE_RING_MASK;
+}
+
+// Enqueues an operation into |ring|, capacity-permitting.
+// |params| is copied into the ringbuffer and need not remain live upon return.
+//
+// Returns IREE_STATUS_UNIMPLEMENTED for unknown commands and
+// IREE_STATUS_RESOURCE_EXHAUSTED when the fixed-size ring is full.
+static iree_status_t iree_loop_inline_enqueue(iree_loop_inline_ring_t* ring,
+                                              iree_loop_command_t command,
+                                              const void* params) {
+  // The only thing we need to do here is memcpy the params into our ring.
+  // Since all the params differ in size we just effectively perform a lookup
+  // and do the copy.
+  uint8_t params_size = iree_loop_params_size(command);
+  // NOTE: fixed misplaced parenthesis - the branch hint must wrap the whole
+  // comparison (params_size == 0), not just params_size; the old form told the
+  // compiler the *common* non-zero case was unlikely.
+  if (IREE_UNLIKELY(params_size == 0)) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "unimplemented loop command");
+  }
+
+  // Ensure there's space for the new operation.
+  if (iree_loop_inline_ring_is_full(ring)) {
+    return iree_make_status(
+        IREE_STATUS_RESOURCE_EXHAUSTED,
+        "inline ringbuffer capacity exceeded; reduce the amount of concurrent "
+        "work or use a real loop implementation");
+  }
+
+  // Reserve a slot for the new operation.
+  uint8_t slot = ring->write_head;
+  ring->write_head = (ring->write_head + 1) & IREE_LOOP_INLINE_RING_MASK;
+
+  // Copy the operation in; the params are on the stack and won't be valid after
+  // the caller returns.
+  ring->ops[slot].command = command;
+  memcpy(&ring->ops[slot].params, params, params_size);
+  return iree_ok_status();
+}
+
+// Dequeues the next operation in |ring| and executes it.
+// The operation may reentrantly enqueue more operations.
+static void iree_loop_inline_dequeue_and_run_next(
+    iree_loop_inline_ring_t* ring) {
+  IREE_ASSERT(!iree_loop_inline_ring_is_empty(ring));
+
+  // Acquire the next operation.
+  uint8_t slot = ring->read_head;
+  ring->read_head = (ring->read_head + 1) & IREE_LOOP_INLINE_RING_MASK;
+
+  // Copy out the parameters; the operation we execute may overwrite them by
+  // enqueuing more work.
+  iree_loop_inline_op_t op = ring->ops[slot];
+
+  // We pass the callbacks a loop routed through the reentrant ctl so that any
+  // work they enqueue lands back in this ring and is drained by the top-level
+  // run loop instead of recursing on the stack.
+  iree_loop_t loop = iree_loop_inline_reentrant(ring);
+
+  // Tail call into the execution routine so we can hopefully tail call all the
+  // way up the stack.
+  // Ideally these are all tail calls.
+  switch (op.command) {
+    case IREE_LOOP_COMMAND_CALL:
+      iree_loop_inline_run_call(loop, op.params.call);
+      break;
+    case IREE_LOOP_COMMAND_DISPATCH:
+      iree_loop_inline_run_dispatch(loop, op.params.dispatch);
+      break;
+    case IREE_LOOP_COMMAND_WAIT_UNTIL:
+      iree_loop_inline_run_wait_until(loop, op.params.wait_until);
+      break;
+    case IREE_LOOP_COMMAND_WAIT_ONE:
+      iree_loop_inline_run_wait_one(loop, op.params.wait_one);
+      break;
+    case IREE_LOOP_COMMAND_WAIT_ANY:
+      iree_loop_inline_run_wait_any(loop, op.params.wait_multi);
+      break;
+    case IREE_LOOP_COMMAND_WAIT_ALL:
+      iree_loop_inline_run_wait_all(loop, op.params.wait_multi);
+      break;
+    default:
+      // Unreachable: enqueue rejects commands with no known params size.
+      break;
+  }
+}
+
+// Aborts all operations in the ring and resets it to its initial state.
+static void iree_loop_inline_abort_all(iree_loop_inline_ring_t* ring) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Issue the completion callback of each op to notify it of the abort.
+  // To prevent enqueuing more work while aborting we pass in a NULL loop.
+  // We can't do anything with the errors so we ignore them.
+  while (!iree_loop_inline_ring_is_empty(ring)) {
+    uint8_t slot = ring->read_head;
+    ring->read_head = (ring->read_head + 1) & IREE_LOOP_INLINE_RING_MASK;
+    // The op union aliases `callback` with the leading callback field shared
+    // by every params struct, so this read is valid for any command.
+    iree_loop_callback_t callback = ring->ops[slot].callback;
+    iree_status_ignore(callback.fn(callback.user_data, iree_loop_null(),
+                                   iree_make_status(IREE_STATUS_ABORTED)));
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Records |status| as the loop's sticky error and aborts all pending ops.
+// Only the first error is preserved (later errors are ignored) so the root
+// cause is what callers observe.
+static void iree_loop_inline_emit_error(iree_loop_t loop,
+                                        iree_status_t status) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(
+      z0, iree_status_code_string(iree_status_code(status)));
+
+  iree_loop_inline_ring_t* ring = (iree_loop_inline_ring_t*)loop.self;
+  if (ring->status_ptr && iree_status_is_ok(*ring->status_ptr)) {
+    *ring->status_ptr = status;
+  } else {
+    // No slot to store the error (or one already stored): release it so any
+    // allocated payload isn't leaked.
+    iree_status_ignore(status);
+  }
+
+  iree_loop_inline_abort_all(ring);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Runs the |ring| until it is empty or an operation fails.
+// Always returns OK: a failing operation reports through
+// iree_loop_inline_emit_error, which records into ring->status_ptr and drains
+// the ring (ending this loop).
+static iree_status_t iree_loop_inline_run_all(iree_loop_inline_ring_t* ring) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  do {
+    // Dequeue the next op and run it inline.
+    iree_loop_inline_dequeue_and_run_next(ring);
+  } while (!iree_loop_inline_ring_is_empty(ring));
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_inline_ctl functions
+//===----------------------------------------------------------------------===//
+
+// Top-level control function for inline loops.
+// |self| is an iree_status_t* acting as a sticky error slot that persists
+// across ctl invocations; the ring itself lives only on this call's stack.
+IREE_API_EXPORT iree_status_t iree_loop_inline_ctl(void* self,
+                                                   iree_loop_command_t command,
+                                                   const void* params,
+                                                   void** inout_ptr) {
+  IREE_ASSERT_ARGUMENT(self);
+
+  if (command == IREE_LOOP_COMMAND_DRAIN) {
+    // We don't really do anything with this; if called non-reentrantly then
+    // there is no work to drain.
+    return iree_ok_status();
+  }
+
+  iree_status_t* status_ptr = (iree_status_t*)self;
+
+  // Initialize a new execution context on the stack.
+  // Size is bounded by the static_assert against IREE_LOOP_INLINE_STORAGE_SIZE.
+  iree_loop_inline_ring_t stack_ring;
+  iree_loop_inline_ring_initialize(status_ptr, &stack_ring);
+
+  // Enqueue the initial command; we'll dequeue it right away but this keeps
+  // the code size smaller.
+  IREE_RETURN_IF_ERROR(iree_loop_inline_enqueue(&stack_ring, command, params));
+
+  // If the status is not OK then we bail immediately; this allows for sticky
+  // errors that mimic the abort behavior of an actual loop. Inline loops never
+  // run work from multiple scopes as they don't persist beyond the loop
+  // operation.
+  if (iree_status_is_ok(*status_ptr)) {
+    // Run until the ring is empty or we fail.
+    return iree_loop_inline_run_all(&stack_ring);  // tail
+  } else {
+    // Abort all ops.
+    iree_loop_inline_abort_all(&stack_ring);
+    return iree_ok_status();
+  }
+}
+
+// Control function variant backed by caller-provided storage.
+// |self| is an iree_loop_inline_storage_t whose opaque bytes hold the ring
+// (sized by IREE_LOOP_INLINE_STORAGE_SIZE) and whose status field is the
+// sticky error slot.
+IREE_API_EXPORT iree_status_t
+iree_loop_inline_using_storage_ctl(void* self, iree_loop_command_t command,
+                                   const void* params, void** inout_ptr) {
+  if (command == IREE_LOOP_COMMAND_DRAIN) {
+    // We don't really do anything with this; if called non-reentrantly then
+    // there is no work to drain.
+    return iree_ok_status();
+  }
+
+  iree_loop_inline_storage_t* storage = (iree_loop_inline_storage_t*)self;
+  iree_loop_inline_ring_t* ring = (iree_loop_inline_ring_t*)storage->opaque;
+
+  // Top-level call using external storage; run until the ring is empty or
+  // we fail. Note that the storage contents are undefined and we have to
+  // ensure the list is ready for use.
+  iree_loop_inline_ring_initialize(&storage->status, ring);
+
+  IREE_RETURN_IF_ERROR(iree_loop_inline_enqueue(ring, command, params));
+
+  // If the status is not OK then we bail immediately; this allows for sticky
+  // errors that mimic the abort behavior of an actual loop. Inline loops never
+  // run work from multiple scopes as they don't persist beyond the loop
+  // operation.
+  if (iree_status_is_ok(storage->status)) {
+    // Run until the ring is empty or we fail.
+    return iree_loop_inline_run_all(ring);  // tail
+  } else {
+    // Abort all ops.
+    iree_loop_inline_abort_all(ring);
+    return iree_ok_status();
+  }
+}
+
+// Control function handed to callbacks running inside the inline loop.
+// |self| is the live iree_loop_inline_ring_t owned by the top-level ctl call.
+static iree_status_t iree_loop_inline_reentrant_ctl(void* self,
+                                                    iree_loop_command_t command,
+                                                    const void* params,
+                                                    void** inout_ptr) {
+  if (command == IREE_LOOP_COMMAND_DRAIN) {
+    // We don't really do anything with this; when called reentrantly we are
+    // already draining as we drain on each top-level op.
+    return iree_ok_status();
+  }
+
+  // Enqueue the new command and return to the caller - it'll be run by
+  // the top-level control call.
+  iree_loop_inline_ring_t* ring = (iree_loop_inline_ring_t*)self;
+  return iree_loop_inline_enqueue(ring, command, params);  // tail
+}
diff --git a/runtime/src/iree/base/loop_inline.h b/runtime/src/iree/base/loop_inline.h
new file mode 100644
index 0000000..79a1dc1
--- /dev/null
+++ b/runtime/src/iree/base/loop_inline.h
@@ -0,0 +1,95 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_LOOP_INLINE_H_
+#define IREE_BASE_LOOP_INLINE_H_
+
+#include <inttypes.h>
+
+#include "iree/base/loop.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_loop_inline
+//===----------------------------------------------------------------------===//
+
+// Loop control function for inline loops that run ops on the caller's stack;
+// see iree_loop_inline below for usage.
+IREE_API_EXPORT iree_status_t iree_loop_inline_ctl(void* self,
+                                                   iree_loop_command_t command,
+                                                   const void* params,
+                                                   void** inout_ptr);
+// Loop control function for inline loops backed by caller-provided
+// iree_loop_inline_storage_t; see iree_loop_inline_initialize below.
+IREE_API_EXPORT iree_status_t
+iree_loop_inline_using_storage_ctl(void* self, iree_loop_command_t command,
+                                   const void* params, void** inout_ptr);
+
+// Returns a loop that doesn't really loop.
+// All operations are run as they are enqueued on the stack. This uses no
+// additional memory and ensures that everything completes upon return to the
+// user but does eliminate the ability for pipelining and overlapping work from
+// multiple subprograms. This approach limits the amount of work that can be
+// reentrantly scheduled and should only be used when in the tiniest of
+// environments with programs tested to be compatible with it.
+//
+// Reentrant enqueuing is possible and can be used to create tail call chains
+// (or recursion) that executes roughly in order.
+//
+// Caveats:
+// - Reentrant enqueuing of operations is limited to some small number (~4).
+// - Waits are performed as they are enqueued and the loop must be able to
+//   make forward progress on each.
+// - Execution deadlines are ignored in order to fully drain on each operation.
+// - Errors propagate immediately to the top-level caller and abort all pending
+//   operations.
+//
+// Thread-compatible: stateless and executes all work on the calling thread.
+static inline iree_loop_t iree_loop_inline(iree_status_t* out_status) {
+  // Stateless control function; |out_status| receives any sticky failure.
+  iree_loop_t inline_loop = {out_status, iree_loop_inline_ctl};
+  return inline_loop;
+}
+
+// Minimum size in bytes required for iree_loop_inline_storage_t.
+// If we wanted to shrink this size to the absolute minimum we'd just expose the
+// structures here; not the worst thing but messy (as this is a public API).
+#define IREE_LOOP_INLINE_STORAGE_SIZE 512
+
+// Storage for an inline loop.
+// May be either allocated on the stack or on the heap and only needs to remain
+// valid for the lifetime of the iree_loop_t referencing it.
+typedef iree_alignas(iree_max_align_t) struct iree_loop_inline_storage_t {
+  // Opaque space used by the implementation (pending-op ringbuffer).
+  uint8_t opaque[IREE_LOOP_INLINE_STORAGE_SIZE];
+  // Sticky status slot; reset by iree_loop_inline_initialize and must be
+  // checked (or ignored) by the caller after use to avoid leaking a status.
+  iree_status_t status;
+} iree_loop_inline_storage_t;
+
+// Returns an inline loop that uses an external |storage| instead of the stack.
+// The storage will only be used while executing and can be reused if the caller
+// knows it is safe (not reentrantly inside of a loop execution). Errors that
+// arise will be set in the storage status field and must be checked (or
+// ignored) by the caller to avoid leaks.
+//
+// See iree_loop_inline for details on the execution behavior.
+static inline iree_loop_t iree_loop_inline_initialize(
+    iree_loop_inline_storage_t* storage) {
+  // Reset the sticky status; callers check it after loop operations complete.
+  storage->status = iree_ok_status();
+  iree_loop_t initialized = {storage, iree_loop_inline_using_storage_ctl};
+  return initialized;
+}
+
+// Releases any sticky status held in |storage| and resets it for reuse.
+// Safe to call with NULL.
+static void iree_loop_inline_deinitialize(iree_loop_inline_storage_t* storage) {
+  if (storage != NULL) {
+    // Drop whatever status was left behind by the last loop operation.
+    iree_status_ignore(storage->status);
+    storage->status = iree_ok_status();
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_LOOP_INLINE_H_
diff --git a/runtime/src/iree/base/loop_inline_test.cc b/runtime/src/iree/base/loop_inline_test.cc
new file mode 100644
index 0000000..0df5c83
--- /dev/null
+++ b/runtime/src/iree/base/loop_inline_test.cc
@@ -0,0 +1,51 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+// Contains the test definitions applied to all loop implementations:
+#include "iree/base/loop_test.h"
+
+// Hook required by the shared loop_test.h suite. Inline loops need no heap
+// allocation so |allocator| is intentionally unused.
+void AllocateLoop(iree_status_t* out_status, iree_allocator_t allocator,
+                  iree_loop_t* out_loop) {
+  *out_loop = iree_loop_inline(out_status);
+}
+
+// Hook required by the shared loop_test.h suite; nothing to free here.
+void FreeLoop(iree_allocator_t allocator, iree_loop_t loop) {}
+
+// Tests usage of external storage for the inline ringbuffer.
+// The standard tests all use loop allocated stack storage while this one uses
+// the storage we control. Real applications could put that storage in .rwdata
+// somewhere or alias it with other storage (arenas/etc).
+TEST(LoopInlineTest, ExternalStorage) {
+  IREE_TRACE_SCOPE();
+
+  // Seed the opaque storage with a non-zero first byte so the loop cannot be
+  // silently relying on zero-initialized storage.
+  iree_loop_inline_storage_t storage = {{0xCD}, iree_ok_status()};
+  auto loop = iree_loop_inline_initialize(&storage);
+
+  // Issue a call that adds 1 to a counter until it reaches kCountUpTo.
+  // Each callback re-enqueues itself, exercising reentrant tail calls.
+  static const int kCountUpTo = 128;
+  struct user_data_t {
+    int counter = 0;
+  } user_data;
+  static const iree_loop_callback_fn_t callback_fn =
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        auto* user_data = reinterpret_cast<user_data_t*>(user_data_ptr);
+        if (++user_data->counter < kCountUpTo) {
+          return iree_loop_call(loop, IREE_LOOP_PRIORITY_DEFAULT, callback_fn,
+                                user_data);
+        }
+        return iree_ok_status();
+      };
+  IREE_ASSERT_OK(iree_loop_call(loop, IREE_LOOP_PRIORITY_DEFAULT, callback_fn,
+                                &user_data));
+  EXPECT_EQ(user_data.counter, kCountUpTo);
+  // The sticky status in external storage must still be OK after draining.
+  IREE_ASSERT_OK(storage.status);
+
+  iree_loop_inline_deinitialize(&storage);
+}
diff --git a/runtime/src/iree/base/loop_sync.c b/runtime/src/iree/base/loop_sync.c
new file mode 100644
index 0000000..8de715d
--- /dev/null
+++ b/runtime/src/iree/base/loop_sync.c
@@ -0,0 +1,1101 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/loop_sync.h"
+
+#include "iree/base/internal/math.h"
+#include "iree/base/internal/wait_handle.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_loop_sync_t utilities
+//===----------------------------------------------------------------------===//
+
+// Amount of time that can remain in a wait-until while still retiring.
+// This prevents additional system sleeps when the remaining time before the
+// deadline is less than the granularity the system is likely able to sleep for.
+// Some platforms may have as much as 10-15ms of potential slop and sleeping for
+// 1ms may result in 10-15ms.
+#define IREE_LOOP_SYNC_DELAY_SLOP_NS (2 /*ms*/ * 1000000)
+
+// NOTE: all callbacks should be at offset 0. This allows for easily zipping
+// through the params lists and issuing callbacks.
+static_assert(offsetof(iree_loop_call_params_t, callback) == 0,
+              "callback must be at offset 0");
+static_assert(offsetof(iree_loop_dispatch_params_t, callback) == 0,
+              "callback must be at offset 0");
+static_assert(offsetof(iree_loop_wait_until_params_t, callback) == 0,
+              "callback must be at offset 0");
+static_assert(offsetof(iree_loop_wait_one_params_t, callback) == 0,
+              "callback must be at offset 0");
+static_assert(offsetof(iree_loop_wait_multi_params_t, callback) == 0,
+              "callback must be at offset 0");
+
+// Forward declaration; defined with the loop implementation further below.
+static void iree_loop_sync_abort_scope(iree_loop_sync_t* loop_sync,
+                                       iree_loop_sync_scope_t* scope);
+
+//===----------------------------------------------------------------------===//
+// iree_loop_run_ring_t
+//===----------------------------------------------------------------------===//
+
+// Represents an operation in the loop run ringbuffer.
+// Note that the storage may be reallocated at any time and all pointers must be
+// external to the storage in order to remain valid.
+typedef struct iree_loop_run_op_t {
+  union {
+    iree_loop_callback_t callback;  // asserted at offset 0 above
+    union {
+      iree_loop_call_params_t call;
+      iree_loop_dispatch_params_t dispatch;
+    } params;
+  };
+  // Which runnable command this op represents (call/dispatch variants above).
+  iree_loop_command_t command;
+  // Owning scope; its pending_count tracks ops resident in the ring.
+  iree_loop_sync_scope_t* scope;
+
+  // Set on calls when we are issuing a callback for an operation.
+  // Unlike other pointers in the params this is owned by the ring.
+  iree_status_t status;
+} iree_loop_run_op_t;
+
+// Ringbuffer containing pending ready to run callback operations.
+//
+// Generally this works as a FIFO but we allow for head-of-ring replacement
+// for high priority tail calls. New operations are appended to the ring and
+// removed as drained; if the ringbuffer capacity is exceeded then the storage
+// will be reallocated up to the maximum capacity specified at creation time.
+typedef iree_alignas(iree_max_align_t) struct iree_loop_run_ring_t {
+  // Current storage capacity of |ops|.
+  // Must be a power of two: index wrap-around is done by masking with
+  // capacity - 1 (see iree_loop_run_ring_mask).
+  uint32_t capacity;
+  // Index into |ops| where the next operation to be dequeued is located.
+  uint32_t read_head;
+  // Index into |ops| where the last operation to be enqueued is located.
+  uint32_t write_head;
+  // Ringbuffer storage (trailing flexible array).
+  iree_loop_run_op_t ops[0];
+} iree_loop_run_ring_t;
+
+// Computes the total allocation size in bytes of a run ring sized to hold
+// |options.max_queue_depth| operations (header plus trailing op array).
+static iree_host_size_t iree_loop_run_ring_storage_size(
+    iree_loop_sync_options_t options) {
+  const iree_host_size_t ops_size =
+      options.max_queue_depth * sizeof(iree_loop_run_op_t);
+  return sizeof(iree_loop_run_ring_t) + ops_size;
+}
+
+// Returns the index mask used for ring wrap-around arithmetic; valid because
+// the ring capacity is a power of two.
+static inline uint32_t iree_loop_run_ring_mask(
+    const iree_loop_run_ring_t* run_ring) {
+  const uint32_t capacity = run_ring->capacity;
+  return capacity - 1u;
+}
+
+// Returns the number of operations currently resident in the ring.
+static iree_host_size_t iree_loop_run_ring_size(
+    const iree_loop_run_ring_t* run_ring) {
+  if (run_ring->write_head >= run_ring->read_head) {
+    return run_ring->write_head - run_ring->read_head;
+  }
+  // Write head has wrapped around past the read head.
+  return run_ring->write_head + run_ring->capacity - run_ring->read_head;
+}
+
+// Returns true if no operations are pending in the ring.
+static bool iree_loop_run_ring_is_empty(const iree_loop_run_ring_t* run_ring) {
+  return run_ring->write_head == run_ring->read_head;
+}
+
+// Returns true if the ring cannot accept another operation. One slot is kept
+// unused so a full ring remains distinguishable from an empty one.
+static bool iree_loop_run_ring_is_full(const iree_loop_run_ring_t* run_ring) {
+  const uint32_t index_mask = iree_loop_run_ring_mask(run_ring);
+  const uint32_t used_count =
+      (run_ring->write_head - run_ring->read_head) & index_mask;
+  return used_count == index_mask;
+}
+
+// Initializes |out_run_ring| in place with a capacity of
+// |options.max_queue_depth|. The caller must have allocated at least
+// iree_loop_run_ring_storage_size(options) bytes of storage.
+static void iree_loop_run_ring_initialize(iree_loop_sync_options_t options,
+                                          iree_loop_run_ring_t* out_run_ring) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  out_run_ring->capacity = (uint32_t)options.max_queue_depth;
+  out_run_ring->read_head = 0;
+  out_run_ring->write_head = 0;
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Deinitializes |run_ring|. The ring must already be empty: callers are
+// expected to have aborted (or drained) all pending ops first.
+static void iree_loop_run_ring_deinitialize(iree_loop_run_ring_t* run_ring) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Expected abort to be called.
+  IREE_ASSERT(iree_loop_run_ring_is_empty(run_ring));
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Enqueues |op| at the tail of the ring, copying it by value, and increments
+// the owning scope's pending op count. Fails with RESOURCE_EXHAUSTED if the
+// fixed-capacity ring is full (the sync loop never grows its storage).
+static iree_status_t iree_loop_run_ring_enqueue(iree_loop_run_ring_t* run_ring,
+                                                iree_loop_run_op_t op) {
+  if (iree_loop_run_ring_is_full(run_ring)) {
+    return iree_make_status(
+        IREE_STATUS_RESOURCE_EXHAUSTED,
+        "run ringbuffer capacity %u exceeded; reduce the amount of concurrent "
+        "work or use a full loop implementation",
+        run_ring->capacity);
+  }
+
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_queue_depth",
+                            iree_loop_run_ring_size(run_ring));
+
+  // Reserve a slot for the new operation.
+  uint32_t slot = run_ring->write_head;
+  run_ring->write_head =
+      (run_ring->write_head + 1) & iree_loop_run_ring_mask(run_ring);
+
+  // Copy the operation in; the params are on the stack and won't be valid after
+  // the caller returns.
+  run_ring->ops[slot] = op;
+
+  // The scope pointer in our by-value copy aliases the caller's scope; bump
+  // its pending count (balanced by the decrement in dequeue).
+  ++op.scope->pending_count;
+
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_queue_depth",
+                            iree_loop_run_ring_size(run_ring));
+  return iree_ok_status();
+}
+
+// Dequeues the next operation from the head of the ring into |out_op| and
+// decrements its scope's pending op count. Returns false if the ring is empty.
+static bool iree_loop_run_ring_dequeue(iree_loop_run_ring_t* run_ring,
+                                       iree_loop_run_op_t* out_op) {
+  if (iree_loop_run_ring_is_empty(run_ring)) return false;
+
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_queue_depth",
+                            iree_loop_run_ring_size(run_ring));
+
+  // Acquire the next operation.
+  uint32_t slot = run_ring->read_head;
+  run_ring->read_head =
+      (run_ring->read_head + 1) & iree_loop_run_ring_mask(run_ring);
+
+  // Copy out the parameters; the operation we execute may overwrite them by
+  // enqueuing more work.
+  *out_op = run_ring->ops[slot];
+
+  // Balanced against the increment in enqueue.
+  --out_op->scope->pending_count;
+
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_queue_depth",
+                            iree_loop_run_ring_size(run_ring));
+  return true;
+}
+
+// Aborts all ops that are part of |scope|.
+// A NULL |scope| indicates all work from all scopes should be aborted.
+static void iree_loop_run_ring_abort_scope(iree_loop_run_ring_t* run_ring,
+                                           iree_loop_sync_scope_t* scope) {
+  if (iree_loop_run_ring_is_empty(run_ring)) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Do a single pass over the ring and abort all ops matching the scope.
+  // To keep things simple and preserve dense ordered ops in the ringbuffer we
+  // dequeue all ops and re-enqueue any that don't match. When complete the ring
+  // may be at a different offset but will contain only those ops we didn't
+  // abort in their original order.
+  iree_host_size_t count = iree_loop_run_ring_size(run_ring);
+  for (iree_host_size_t i = 0; i < count; ++i) {
+    iree_loop_run_op_t op;
+    if (!iree_loop_run_ring_dequeue(run_ring, &op)) break;
+    if (scope && op.scope != scope) {
+      // Not part of the scope we are aborting; re-enqueue to the ring.
+      // Dequeue decremented and enqueue re-increments pending_count so the
+      // scope's count is unchanged for kept ops. Enqueue cannot fail here as
+      // we just freed a slot.
+      iree_status_ignore(iree_loop_run_ring_enqueue(run_ring, op));
+    } else {
+      // Part of the scope to abort. NOTE: the dequeue above already
+      // decremented the scope's pending_count for this op so it must not be
+      // decremented again here; we only release the ring-owned status and
+      // notify the callback of the abort.
+      iree_status_ignore(op.status);
+      iree_status_ignore(op.callback.fn(op.callback.user_data, iree_loop_null(),
+                                        iree_make_status(IREE_STATUS_ABORTED)));
+    }
+  }
+
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_queue_depth",
+                            iree_loop_run_ring_size(run_ring));
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Aborts all ops from all scopes.
+static void iree_loop_run_ring_abort_all(iree_loop_run_ring_t* run_ring) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // A NULL scope filter matches every op in the ring.
+  iree_loop_run_ring_abort_scope(run_ring, /*scope=*/NULL);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_wait_list_t
+//===----------------------------------------------------------------------===//
+
+// Represents an operation in the loop wait list.
+// Note that the storage may be reallocated at any time and all pointers must be
+// external to the storage in order to remain valid.
+typedef struct iree_loop_wait_op_t {
+  union {
+    iree_loop_callback_t callback;  // asserted at offset 0 above
+    union {
+      iree_loop_wait_until_params_t wait_until;
+      iree_loop_wait_one_params_t wait_one;
+      iree_loop_wait_multi_params_t wait_multi;
+    } params;
+  };
+  // Which wait command this op represents (selects the params variant above).
+  iree_loop_command_t command;
+  // Owning scope; its pending_count tracks ops resident in the wait list.
+  iree_loop_sync_scope_t* scope;
+} iree_loop_wait_op_t;
+
+// Dense list of pending wait operations.
+// We don't care about the order here as we put them all into a wait set for
+// multi-wait anyway. iree_wait_set_t should really be rewritten such that this
+// is not required (custom data on registered handles, etc).
+typedef iree_alignas(iree_max_align_t) struct iree_loop_wait_list_t {
+  // System wait set used to perform multi-waits.
+  iree_wait_set_t* wait_set;
+  // Current storage capacity of |ops|.
+  uint32_t capacity;
+  // Current count of valid |ops|; always < capacity (see insert).
+  uint32_t count;
+  // Pending wait operations (unordered; see notify_wake's swap-remove).
+  iree_loop_wait_op_t ops[0];
+} iree_loop_wait_list_t;
+
+// Computes the total allocation size in bytes of a wait list sized to hold
+// |options.max_wait_count| operations (header plus trailing op array).
+static iree_host_size_t iree_loop_wait_list_storage_size(
+    iree_loop_sync_options_t options) {
+  const iree_host_size_t ops_size =
+      options.max_wait_count * sizeof(iree_loop_wait_op_t);
+  return sizeof(iree_loop_wait_list_t) + ops_size;
+}
+
+// Returns true if there are no pending wait operations in the list.
+static bool iree_loop_wait_list_is_empty(iree_loop_wait_list_t* wait_list) {
+  return 0 == wait_list->count;
+}
+
+// Initializes |out_wait_list| in place with capacity |options.max_wait_count|
+// and allocates its backing system wait set from |allocator|. The caller must
+// have allocated at least iree_loop_wait_list_storage_size(options) bytes.
+static iree_status_t iree_loop_wait_list_initialize(
+    iree_loop_sync_options_t options, iree_allocator_t allocator,
+    iree_loop_wait_list_t* out_wait_list) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  out_wait_list->capacity = (uint32_t)options.max_wait_count;
+  out_wait_list->count = 0;
+
+  iree_status_t status = iree_wait_set_allocate(
+      options.max_wait_count, allocator, &out_wait_list->wait_set);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Deinitializes |wait_list| and frees its system wait set. The list must
+// already be empty: callers are expected to have aborted all waits first.
+static void iree_loop_wait_list_deinitialize(iree_loop_wait_list_t* wait_list) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Expected abort to be called.
+  IREE_ASSERT(iree_loop_wait_list_is_empty(wait_list));
+
+  iree_wait_set_free(wait_list->wait_set);
+  wait_list->wait_set = NULL;
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Registers |wait_source| with the wait list's system wait set, exporting it
+// to a wait handle if it isn't one already. On success the source is mutated
+// in place to reference the handle so later wake/unregister paths can find it.
+// Immediate sources are no-ops; delay sources are rejected (delays must be
+// expressed as wait-until ops, which are scanned instead of being waited on).
+static iree_status_t iree_loop_wait_list_register_wait_source(
+    iree_loop_wait_list_t* wait_list, iree_wait_source_t* wait_source) {
+  if (iree_wait_source_is_immediate(*wait_source)) {
+    // Task has been neutered and is treated as an immediately resolved wait.
+    return iree_ok_status();
+  } else if (iree_wait_source_is_delay(*wait_source)) {
+    // We can't easily support delays as registered wait sources; we need to be
+    // able to snoop the tasks to find the earliest sleep time and can't easily
+    // do that if we tried to put them in the wait set.
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "delays must come from wait-until ops");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = iree_ok_status();
+
+  // Acquire a wait handle and insert it into the wait set.
+  // We swap out the wait source with the handle so that we don't export it
+  // again and can find it on wake.
+  iree_wait_handle_t wait_handle = iree_wait_handle_immediate();
+  iree_wait_handle_t* wait_handle_ptr =
+      iree_wait_handle_from_source(wait_source);
+  if (wait_handle_ptr) {
+    // Already a wait handle - can directly insert it.
+    wait_handle = *wait_handle_ptr;
+  } else {
+    iree_wait_primitive_t wait_primitive = iree_wait_primitive_immediate();
+    status = iree_wait_source_export(*wait_source, IREE_WAIT_PRIMITIVE_TYPE_ANY,
+                                     iree_immediate_timeout(), &wait_primitive);
+    if (iree_status_is_ok(status)) {
+      // Swap the wait handle with the exported handle so we can wake it later.
+      // It'd be ideal if we retained the wait handle separate so that we could
+      // still do fast queries for local wait sources.
+      iree_wait_handle_wrap_primitive(wait_primitive.type, wait_primitive.value,
+                                      &wait_handle);
+      status = iree_wait_source_import(wait_primitive, wait_source);
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_wait_set_insert(wait_list->wait_set, wait_handle);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Removes |wait_source|'s handle from the wait set (if it was registered) and
+// neuters the source to immediate so repeated unregisters are harmless.
+static void iree_loop_wait_list_unregister_wait_source(
+    iree_loop_wait_list_t* wait_list, iree_wait_source_t* wait_source) {
+  if (iree_wait_source_is_immediate(*wait_source) ||
+      iree_wait_source_is_delay(*wait_source)) {
+    // Not registered or it's already been unregistered.
+    return;
+  }
+  iree_wait_handle_t* wait_handle = iree_wait_handle_from_source(wait_source);
+  if (wait_handle) {
+    iree_wait_set_erase(wait_list->wait_set, *wait_handle);
+  }
+  *wait_source = iree_wait_source_immediate();
+}
+
+// Unregisters every wait source referenced by |op| from the wait set,
+// dispatching on the op's wait command to find its source(s).
+static void iree_loop_wait_list_unregister_wait_sources(
+    iree_loop_wait_list_t* wait_list, iree_loop_wait_op_t* op) {
+  switch (op->command) {
+    case IREE_LOOP_COMMAND_WAIT_ONE:
+      iree_loop_wait_list_unregister_wait_source(
+          wait_list, &op->params.wait_one.wait_source);
+      break;
+    case IREE_LOOP_COMMAND_WAIT_ANY:
+    case IREE_LOOP_COMMAND_WAIT_ALL:
+      for (iree_host_size_t i = 0; i < op->params.wait_multi.count; ++i) {
+        iree_loop_wait_list_unregister_wait_source(
+            wait_list, &op->params.wait_multi.wait_sources[i]);
+      }
+      break;
+    default:
+    // Wait-until ops never register anything in the wait set (they are pure
+    // delays handled by scanning) so there is nothing to unregister.
+    case IREE_LOOP_COMMAND_WAIT_UNTIL:
+      break;
+  }
+}
+
+// Appends |op| to the wait list, registering its wait source(s) with the
+// system wait set, and on success increments the owning scope's pending count.
+// NOTE(review): the capacity check below leaves the final slot unused (count
+// can only reach capacity - 1) — confirm this is intended rather than `>`.
+// NOTE(review): on registration failure the slot consumed by count++ is not
+// rolled back; presumably callers treat any failure as fatal — verify.
+static iree_status_t iree_loop_wait_list_insert(
+    iree_loop_wait_list_t* wait_list, iree_loop_wait_op_t op) {
+  if (wait_list->count + 1 >= wait_list->capacity) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "wait list capacity %u reached",
+                            wait_list->capacity);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_wait_depth", wait_list->count);
+
+  uint32_t slot = wait_list->count++;
+  wait_list->ops[slot] = op;
+
+  iree_status_t status = iree_ok_status();
+  switch (op.command) {
+    case IREE_LOOP_COMMAND_WAIT_UNTIL:
+      // No entry in the wait set; we just need it in the list in order to scan.
+      break;
+    case IREE_LOOP_COMMAND_WAIT_ONE: {
+      status = iree_loop_wait_list_register_wait_source(
+          wait_list, &op.params.wait_one.wait_source);
+      break;
+    }
+    case IREE_LOOP_COMMAND_WAIT_ALL:
+    case IREE_LOOP_COMMAND_WAIT_ANY: {
+      for (iree_host_size_t i = 0;
+           i < op.params.wait_multi.count && iree_status_is_ok(status); ++i) {
+        status = iree_loop_wait_list_register_wait_source(
+            wait_list, &op.params.wait_multi.wait_sources[i]);
+      }
+      break;
+    }
+    default:
+      IREE_ASSERT_UNREACHABLE("unhandled wait list command");
+      break;
+  }
+
+  if (iree_status_is_ok(status)) {
+    ++op.scope->pending_count;
+  }
+
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_wait_depth", wait_list->count);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Retires wait op |i| with resolution |status|: unregisters its wait sources,
+// swap-removes it from the (unordered) list, transfers its scope accounting to
+// the run ring, and enqueues its callback there so it runs in order with other
+// ready work. Ownership of |status| moves into the enqueued op.
+static iree_status_t iree_loop_wait_list_notify_wake(
+    iree_loop_wait_list_t* wait_list, iree_loop_run_ring_t* run_ring,
+    iree_host_size_t i, iree_status_t status) {
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_wait_depth", wait_list->count);
+
+  // Unregister all wait handles from the wait set.
+  iree_loop_wait_list_unregister_wait_sources(wait_list, &wait_list->ops[i]);
+
+  // Since we make no guarantees about the order of the lists we can just swap
+  // with the last value. Note that we need to preserve the callback.
+  iree_loop_sync_scope_t* scope = wait_list->ops[i].scope;
+  --scope->pending_count;
+  iree_loop_callback_t callback = wait_list->ops[i].callback;
+  int tail_index = (int)wait_list->count - 1;
+  if (tail_index > i) {
+    memcpy(&wait_list->ops[i], &wait_list->ops[tail_index],
+           sizeof(*wait_list->ops));
+  }
+  --wait_list->count;
+
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_wait_depth", wait_list->count);
+
+  // Enqueue the callback on the run ring - this ensures it gets sequenced with
+  // other runnable work and keeps ordering easier to reason about.
+  return iree_loop_run_ring_enqueue(
+      run_ring, (iree_loop_run_op_t){
+                    .command = IREE_LOOP_COMMAND_CALL,
+                    .scope = scope,
+                    .params =
+                        {
+                            .call =
+                                {
+                                    .callback = callback,
+                                    // TODO(benvanik): elevate callback priority
+                                    // to reduce latency?
+                                    .priority = IREE_LOOP_PRIORITY_DEFAULT,
+                                },
+                        },
+                    .status = status,
+                });
+}
+
+// Returns DEFERRED if unresolved, OK if resolved, and an error otherwise.
+// If resolved (successful or not) the caller must erase the wait.
+static iree_status_t iree_loop_wait_list_scan_wait_until(
+    iree_loop_wait_list_t* wait_list, iree_loop_wait_until_params_t* params,
+    iree_time_t now_ns, iree_time_t* earliest_deadline_ns) {
+  // Treat deadlines within the sleep slop window as already reached so we
+  // don't ask the system for a sleep shorter than it can reliably honor.
+  const iree_time_t effective_now_ns = now_ns + IREE_LOOP_SYNC_DELAY_SLOP_NS;
+  if (params->deadline_ns > effective_now_ns) {
+    // Still waiting; fold the delay into the earliest wake time so the commit
+    // wait sleeps no longer than needed. If we wake earlier for another reason
+    // the delay may still have elapsed by the time we re-scan.
+    *earliest_deadline_ns =
+        iree_min(*earliest_deadline_ns, params->deadline_ns);
+    return iree_status_from_code(IREE_STATUS_DEFERRED);
+  }
+  // Delay satisfied; caller erases the wait.
+  return iree_ok_status();
+}
+
+// Returns DEFERRED if unresolved, OK if resolved, and an error otherwise.
+// If resolved (successful or not) the caller must erase the wait.
+static iree_status_t iree_loop_wait_list_scan_wait_one(
+    iree_loop_wait_list_t* wait_list, iree_loop_wait_one_params_t* params,
+    iree_time_t now_ns, iree_time_t* earliest_deadline_ns) {
+  // Query the status.
+  // A failing query propagates immediately via the macro; otherwise
+  // wait_status_code is OK (resolved) or a pending code such as DEFERRED.
+  iree_status_code_t wait_status_code = IREE_STATUS_OK;
+  IREE_RETURN_IF_ERROR(
+      iree_wait_source_query(params->wait_source, &wait_status_code));
+
+  if (wait_status_code != IREE_STATUS_OK) {
+    if (params->deadline_ns <= now_ns) {
+      // Deadline reached without having resolved.
+      return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+    } else {
+      // Still waiting.
+      *earliest_deadline_ns =
+          iree_min(*earliest_deadline_ns, params->deadline_ns);
+    }
+  }
+
+  // OK when resolved; otherwise the pending code (DEFERRED keeps it queued).
+  return iree_status_from_code(wait_status_code);
+}
+
+// Returns DEFERRED if unresolved, OK if resolved, and an error otherwise.
+// If resolved (successful or not) the caller must erase the wait.
+static iree_status_t iree_loop_wait_list_scan_wait_any(
+    iree_loop_wait_list_t* wait_list, iree_loop_wait_multi_params_t* params,
+    iree_time_t now_ns, iree_time_t* earliest_deadline_ns) {
+  // Query each source; the first resolved one satisfies the wait-any.
+  for (iree_host_size_t i = 0; i < params->count; ++i) {
+    iree_status_code_t wait_status_code = IREE_STATUS_OK;
+    IREE_RETURN_IF_ERROR(
+        iree_wait_source_query(params->wait_sources[i], &wait_status_code));
+    if (wait_status_code == IREE_STATUS_OK) {
+      return iree_ok_status();  // one resolved, wait-any satisfied
+    }
+  }
+  if (params->deadline_ns <= now_ns) {
+    // Deadline reached without having resolved any.
+    return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  } else {
+    // Still waiting.
+    *earliest_deadline_ns =
+        iree_min(*earliest_deadline_ns, params->deadline_ns);
+  }
+  return iree_status_from_code(IREE_STATUS_DEFERRED);  // none resolved
+}
+
+// Returns DEFERRED if unresolved, OK if resolved, and an error otherwise.
+// If resolved (successful or not) the caller must erase the wait.
+static iree_status_t iree_loop_wait_list_scan_wait_all(
+    iree_loop_wait_list_t* wait_list, iree_loop_wait_multi_params_t* params,
+    iree_time_t now_ns, iree_time_t* earliest_deadline_ns) {
+  bool any_unresolved = false;
+  for (iree_host_size_t i = 0; i < params->count; ++i) {
+    // Sources neutered to immediate on a prior scan already resolved; skip.
+    if (iree_wait_source_is_immediate(params->wait_sources[i])) continue;
+    iree_status_code_t wait_status_code = IREE_STATUS_OK;
+    IREE_RETURN_IF_ERROR(
+        iree_wait_source_query(params->wait_sources[i], &wait_status_code));
+    if (wait_status_code == IREE_STATUS_OK) {
+      // Wait resolved; remove it from the wait set so that we don't wait on it
+      // again. We do this by neutering the handle.
+      iree_wait_handle_t* wait_handle =
+          iree_wait_handle_from_source(&params->wait_sources[i]);
+      if (wait_handle) {
+        iree_wait_set_erase(wait_list->wait_set, *wait_handle);
+      }
+      params->wait_sources[i] = iree_wait_source_immediate();
+    } else {
+      // Wait not yet resolved.
+      if (params->deadline_ns <= now_ns) {
+        // Deadline reached without having resolved all.
+        return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+      } else {
+        // Still waiting.
+        *earliest_deadline_ns =
+            iree_min(*earliest_deadline_ns, params->deadline_ns);
+        any_unresolved = true;
+      }
+    }
+  }
+  // Only resolved once every source has been neutered to immediate.
+  return any_unresolved ? iree_status_from_code(IREE_STATUS_DEFERRED)
+                        : iree_ok_status();
+}
+
+// Handles a system wake on |wake_handle|.
+// Currently a stub: no tasks are routed here (woken_tasks stays 0) and the
+// next scan pass re-queries every source to discover what actually resolved,
+// costing an extra query syscall per source (see TODO below).
+static void iree_loop_wait_list_handle_wake(iree_loop_wait_list_t* wait_list,
+                                            iree_loop_run_ring_t* run_ring,
+                                            iree_wait_handle_t wake_handle) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(benvanik): scan the list. We need a way to map wake_handle back to
+  // the zero or more tasks that match it but don't currently store the
+  // handle. Ideally we'd have the wait set tell us precisely which things
+  // woke - possibly by having a bitmap of original insertions that match the
+  // handle - but for now we just eat the extra query syscall.
+  int woken_tasks = 0;
+
+  (void)woken_tasks;
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, woken_tasks);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Scans all pending wait ops once, retiring any that resolved (their callbacks
+// are moved onto |run_ring|) and computing |out_earliest_deadline_ns| as the
+// soonest time the loop must wake for the still-pending ops. Returns a failure
+// only if retiring a wait could not enqueue its callback.
+static iree_status_t iree_loop_wait_list_scan(
+    iree_loop_wait_list_t* wait_list, iree_loop_run_ring_t* run_ring,
+    iree_time_t* out_earliest_deadline_ns) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  *out_earliest_deadline_ns = IREE_TIME_INFINITE_FUTURE;
+
+  // Sample the clock once for the whole pass.
+  iree_time_t now_ns = iree_time_now();
+  iree_status_t scan_status = iree_ok_status();
+  for (iree_host_size_t i = 0;
+       i < wait_list->count && iree_status_is_ok(scan_status); ++i) {
+    // Each scan helper returns DEFERRED if unresolved, OK if resolved, and an
+    // error if the wait itself failed (e.g. deadline exceeded).
+    iree_status_t wait_status = iree_ok_status();
+    switch (wait_list->ops[i].command) {
+      case IREE_LOOP_COMMAND_WAIT_UNTIL:
+        wait_status = iree_loop_wait_list_scan_wait_until(
+            wait_list, &wait_list->ops[i].params.wait_until, now_ns,
+            out_earliest_deadline_ns);
+        break;
+      case IREE_LOOP_COMMAND_WAIT_ONE:
+        wait_status = iree_loop_wait_list_scan_wait_one(
+            wait_list, &wait_list->ops[i].params.wait_one, now_ns,
+            out_earliest_deadline_ns);
+        break;
+      case IREE_LOOP_COMMAND_WAIT_ANY:
+        wait_status = iree_loop_wait_list_scan_wait_any(
+            wait_list, &wait_list->ops[i].params.wait_multi, now_ns,
+            out_earliest_deadline_ns);
+        break;
+      case IREE_LOOP_COMMAND_WAIT_ALL:
+        wait_status = iree_loop_wait_list_scan_wait_all(
+            wait_list, &wait_list->ops[i].params.wait_multi, now_ns,
+            out_earliest_deadline_ns);
+        break;
+    }
+    if (!iree_status_is_deferred(wait_status)) {
+      // Wait completed/failed - erase from the wait set and op list.
+      scan_status =
+          iree_loop_wait_list_notify_wake(wait_list, run_ring, i, wait_status);
+      --i;  // item i removed (swap-remove); revisit the swapped-in op
+
+      // Don't commit the wait if we woke something; we want the callback to be
+      // issued ASAP and will let the main loop pump again to actually wait if
+      // needed.
+      *out_earliest_deadline_ns = IREE_TIME_INFINITE_PAST;
+    }
+  }
+
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_wait_depth", wait_list->count);
+  IREE_TRACE_ZONE_END(z0);
+  return scan_status;
+}
+
+// Performs the system wait for |wait_list|, blocking until |deadline_ns| at
+// the latest. With no registered wait handles this degrades to a pure sleep.
+// Tasks using a signaled handle are flagged so the next scan retires them
+// into |run_ring|. Always returns OK: per-op deadline/failure handling is
+// performed by the scan, and systemic wait errors are currently dropped (see
+// the TODO below).
+static iree_status_t iree_loop_wait_list_commit(
+    iree_loop_wait_list_t* wait_list, iree_loop_run_ring_t* run_ring,
+    iree_time_t deadline_ns) {
+  if (iree_wait_set_is_empty(wait_list->wait_set)) {
+    // No wait handles; this is a sleep.
+    // NOTE: the prior `== 0` comparison inverted this check and sent every
+    // populated wait set down the sleep path; the sleep must only happen when
+    // the set is actually empty.
+    IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_loop_wait_list_commit_sleep");
+    iree_status_t status =
+        iree_wait_until(deadline_ns)
+            ? iree_ok_status()
+            : iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  // Real system wait.
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)wait_list->count);
+
+  // Enter the system wait API.
+  iree_wait_handle_t wake_handle = iree_wait_handle_immediate();
+  iree_status_t status =
+      iree_wait_any(wait_list->wait_set, deadline_ns, &wake_handle);
+  if (iree_status_is_ok(status)) {
+    // One or more waiters is ready. We don't support multi-wake right now so
+    // we'll just take the one we got back and try again.
+    //
+    // To avoid extra syscalls we scan the list and mark whatever tasks were
+    // using the handle the wait set reported waking as completed. On the next
+    // scan they'll be retired immediately. Ideally we'd have the wait set be
+    // able to tell us this precise list.
+    if (iree_wait_handle_is_immediate(wake_handle)) {
+      // No-op wait - ignore.
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "nop");
+    } else {
+      // Route to zero or more tasks using this handle.
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "task(s)");
+      iree_loop_wait_list_handle_wake(wait_list, run_ring, wake_handle);
+    }
+  } else if (iree_status_is_deadline_exceeded(status)) {
+    // Indicates nothing was woken within the deadline. We gracefully bail here
+    // and let the scan check for per-op deadline exceeded events or delay
+    // completion.
+    IREE_TRACE_ZONE_APPEND_TEXT(z0, "deadline exceeded");
+  } else {
+    // (Spurious?) error during wait.
+    // TODO(#4026): propagate failure to all scopes involved.
+    // Failures during waits are serious: ignoring them could lead to live-lock
+    // as tasks further in the pipeline expect them to have completed or - even
+    // worse - user code/other processes/drivers/etc may expect them to
+    // complete.
+    // NOTE: the prior IREE_ASSERT_TRUE(iree_status_is_ok(status)) here fired
+    // unconditionally (this branch only runs when status is a failure) and
+    // has been removed.
+    IREE_TRACE_ZONE_APPEND_TEXT(z0, "failure");
+    iree_status_ignore(status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Aborts all waits that are part of |scope|.
+// A NULL |scope| indicates all work from all scopes should be aborted.
+// Each aborted op has its completion callback fired once with
+// IREE_STATUS_ABORTED and is removed from the list via swap-with-last.
+static void iree_loop_wait_list_abort_scope(iree_loop_wait_list_t* wait_list,
+                                            iree_loop_sync_scope_t* scope) {
+  if (!wait_list->count) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_wait_depth", wait_list->count);
+
+  // Issue the completion callback of each op to notify it of the abort.
+  // To prevent enqueuing more work while aborting we pass in a NULL loop.
+  // We can't do anything with the errors so we ignore them.
+  for (iree_host_size_t i = 0; i < wait_list->count; ++i) {
+    if (scope && wait_list->ops[i].scope != scope) continue;
+
+    // Drop this op from its owning scope's outstanding-op count.
+    --wait_list->ops[i].scope->pending_count;
+    iree_loop_callback_t callback = wait_list->ops[i].callback;
+    iree_status_t status = callback.fn(callback.user_data, iree_loop_null(),
+                                       iree_make_status(IREE_STATUS_ABORTED));
+    iree_status_ignore(status);
+
+    // Since we make no guarantees about the order of the lists we can just swap
+    // with the last value.
+    int tail_index = (int)wait_list->count - 1;
+    if (tail_index > i) {
+      memcpy(&wait_list->ops[i], &wait_list->ops[tail_index],
+             sizeof(*wait_list->ops));
+    }
+    --wait_list->count;
+    // Revisit slot i next iteration: it now holds the swapped-in tail element.
+    // NOTE: at i == 0 this relies on unsigned wraparound cancelling with the
+    // loop's ++i, which is well-defined for unsigned types.
+    --i;
+  }
+
+  IREE_TRACE_PLOT_VALUE_I64("iree_loop_wait_depth", wait_list->count);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Aborts all waits from all scopes.
+static void iree_loop_wait_list_abort_all(iree_loop_wait_list_t* wait_list) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // A NULL scope acts as a wildcard matching every pending wait.
+  iree_loop_wait_list_abort_scope(wait_list, /*scope=*/NULL);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_sync_scope_t
+//===----------------------------------------------------------------------===//
+
+// Initializes |out_scope| to schedule operations against |loop_sync|,
+// reporting scope-level errors to |error_fn| (which may be NULL).
+IREE_API_EXPORT void iree_loop_sync_scope_initialize(
+    iree_loop_sync_t* loop_sync, iree_loop_sync_error_fn_t error_fn,
+    void* error_user_data, iree_loop_sync_scope_t* out_scope) {
+  // Zero the whole struct first so any fields added later default to 0/NULL.
+  memset(out_scope, 0, sizeof(*out_scope));
+  out_scope->error_user_data = error_user_data;
+  out_scope->error_fn = error_fn;
+  out_scope->pending_count = 0;
+  out_scope->loop_sync = loop_sync;
+}
+
+// Deinitializes |scope|, aborting any operations still attributed to it.
+IREE_API_EXPORT void iree_loop_sync_scope_deinitialize(
+    iree_loop_sync_scope_t* scope) {
+  IREE_ASSERT_ARGUMENT(scope);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // A scope with no attached loop (loop_sync == NULL) has nothing pending.
+  iree_loop_sync_t* loop_sync = scope->loop_sync;
+  if (loop_sync) iree_loop_sync_abort_scope(loop_sync, scope);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_sync_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_loop_sync_t {
+  // Allocator that created this struct (and its trailing storage); retained
+  // so iree_loop_sync_free can release the whole block.
+  iree_allocator_t allocator;
+
+  // Ring of runnable ops; points into the trailing storage below. Set to NULL
+  // during teardown so new enqueues are rejected.
+  iree_loop_run_ring_t* run_ring;
+  // Pending wait ops; points into the trailing storage below. Set to NULL
+  // during teardown alongside run_ring.
+  iree_loop_wait_list_t* wait_list;
+
+  // Trailing data:
+  // + iree_loop_run_ring_storage_size
+  // + iree_loop_wait_list_storage_size
+} iree_loop_sync_t;
+
+// Allocates the loop as one heap block: the iree_loop_sync_t header followed
+// by run ring storage and wait list storage, each aligned to iree_max_align_t.
+IREE_API_EXPORT iree_status_t iree_loop_sync_allocate(
+    iree_loop_sync_options_t options, iree_allocator_t allocator,
+    iree_loop_sync_t** out_loop_sync) {
+  IREE_ASSERT_ARGUMENT(out_loop_sync);
+
+  // The run queue must be a power of two due to the ringbuffer masking
+  // technique we use.
+  options.max_queue_depth =
+      iree_math_round_up_to_pow2_u32((uint32_t)options.max_queue_depth);
+  if (options.max_queue_depth > UINT16_MAX) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "queue depth exceeds maximum");
+  }
+
+  // Wait sets also have a handle limit but we may want to allow more
+  // outstanding wait operations even if we can't wait on them all
+  // simultaneously.
+  if (IREE_UNLIKELY(options.max_wait_count > UINT16_MAX)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "wait list depth exceeds maximum");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Compute the sizes of the three regions of the single allocation.
+  const iree_host_size_t loop_sync_size =
+      iree_host_align(sizeof(iree_loop_sync_t), iree_max_align_t);
+  const iree_host_size_t run_ring_size = iree_host_align(
+      iree_loop_run_ring_storage_size(options), iree_max_align_t);
+  const iree_host_size_t wait_list_size = iree_host_align(
+      iree_loop_wait_list_storage_size(options), iree_max_align_t);
+  const iree_host_size_t total_storage_size =
+      loop_sync_size + run_ring_size + wait_list_size;
+
+  uint8_t* storage = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0,
+      iree_allocator_malloc(allocator, total_storage_size, (void**)&storage));
+  iree_loop_sync_t* loop_sync = (iree_loop_sync_t*)storage;
+  loop_sync->allocator = allocator;
+  // Sub-structures live in the trailing storage of the same allocation.
+  loop_sync->run_ring = (iree_loop_run_ring_t*)(storage + loop_sync_size);
+  loop_sync->wait_list =
+      (iree_loop_wait_list_t*)(storage + loop_sync_size + run_ring_size);
+
+  iree_status_t status = iree_ok_status();
+  if (iree_status_is_ok(status)) {
+    iree_loop_run_ring_initialize(options, loop_sync->run_ring);
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_loop_wait_list_initialize(options, allocator,
+                                            loop_sync->wait_list);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_loop_sync = loop_sync;
+  } else {
+    // NOTE(review): this runs the full teardown path; assumes the wait list
+    // deinitialize tolerates a failed initialize - confirm.
+    iree_loop_sync_free(loop_sync);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT void iree_loop_sync_free(iree_loop_sync_t* loop_sync) {
+  IREE_ASSERT_ARGUMENT(loop_sync);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_allocator_t allocator = loop_sync->allocator;
+
+  // Abort all pending operations.
+  // This will issue callbacks for each operation that was aborted directly
+  // with IREE_STATUS_ABORTED.
+  // To ensure we don't enqueue more work while aborting we NULL out the lists.
+  iree_loop_run_ring_t* run_ring = loop_sync->run_ring;
+  iree_loop_wait_list_t* wait_list = loop_sync->wait_list;
+  loop_sync->run_ring = NULL;
+  loop_sync->wait_list = NULL;
+  iree_loop_wait_list_abort_all(wait_list);
+  iree_loop_run_ring_abort_all(run_ring);
+
+  // After all operations are cleared we can release the data structures.
+  iree_loop_run_ring_deinitialize(run_ring);
+  iree_loop_wait_list_deinitialize(wait_list);
+  iree_allocator_free(allocator, loop_sync);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Aborts all operations in the loop attributed to |scope|.
+// A NULL |scope| aborts operations from all scopes.
+static void iree_loop_sync_abort_scope(iree_loop_sync_t* loop_sync,
+                                       iree_loop_sync_scope_t* scope) {
+  // NOTE: waits are aborted before runnable ops; preserve this order.
+  iree_loop_wait_list_abort_scope(loop_sync->wait_list, scope);
+  iree_loop_run_ring_abort_scope(loop_sync->run_ring, scope);
+}
+
+// Emits |status| to the given |loop| scope and aborts associated operations.
+// Takes ownership of |status|.
+static void iree_loop_sync_emit_error(iree_loop_t loop, iree_status_t status) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(
+      z0, iree_status_code_string(iree_status_code(status)));
+
+  iree_loop_sync_scope_t* scope = (iree_loop_sync_scope_t*)loop.self;
+  iree_loop_sync_t* loop_sync = scope->loop_sync;
+
+  // Hand the status to the scope's error handler, which takes ownership.
+  // Without a handler all we can do is drop it.
+  if (!scope->error_fn) {
+    iree_status_ignore(status);
+  } else {
+    scope->error_fn(scope->error_user_data, status);
+  }
+
+  // An error poisons the whole scope: abort everything else it has queued.
+  iree_loop_sync_abort_scope(loop_sync, scope);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Runs a queued call operation, routing any failure returned by the callback
+// to the scope's error handler.
+static void iree_loop_sync_run_call(iree_loop_sync_t* loop_sync,
+                                    iree_loop_t loop,
+                                    const iree_loop_call_params_t params,
+                                    iree_status_t op_status) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // The callback assumes ownership of |op_status|.
+  iree_status_t callback_status =
+      params.callback.fn(params.callback.user_data, loop, op_status);
+  if (!iree_status_is_ok(callback_status)) {
+    // A failing callback aborts the rest of its scope.
+    iree_loop_sync_emit_error(loop, callback_status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Runs a queued dispatch operation: executes all workgroups serially in
+// x-fastest order and then fires the completion callback exactly once with
+// either OK or the first workgroup failure.
+static void iree_loop_sync_run_dispatch(
+    iree_loop_sync_t* loop_sync, iree_loop_t loop,
+    const iree_loop_dispatch_params_t params) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_ok_status();
+
+  // We run all workgroups before issuing the completion callback.
+  // If any workgroup fails we exit early and pass the failing status back to
+  // the completion handler exactly once.
+  uint32_t workgroup_count_x = params.workgroup_count_xyz[0];
+  uint32_t workgroup_count_y = params.workgroup_count_xyz[1];
+  uint32_t workgroup_count_z = params.workgroup_count_xyz[2];
+  iree_status_t workgroup_status = iree_ok_status();
+  for (uint32_t z = 0; z < workgroup_count_z; ++z) {
+    for (uint32_t y = 0; y < workgroup_count_y; ++y) {
+      for (uint32_t x = 0; x < workgroup_count_x; ++x) {
+        workgroup_status =
+            params.workgroup_fn(params.callback.user_data, loop, x, y, z);
+        // goto is the cheapest way to break out of all three loops at once.
+        if (!iree_status_is_ok(workgroup_status)) goto workgroup_failed;
+      }
+    }
+  }
+workgroup_failed:
+
+  // Fire the completion callback with either success or the first error hit by
+  // a workgroup. The callback takes ownership of |workgroup_status|.
+  status =
+      params.callback.fn(params.callback.user_data, loop, workgroup_status);
+  if (!iree_status_is_ok(status)) {
+    iree_loop_sync_emit_error(loop, status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Drains work from the loop until all work in |scope| has completed.
+// A NULL |scope| indicates all work from all scopes should be drained.
+// Returns OK even when |deadline_ns| is reached before the drain completes;
+// failures only propagate from the wait list scan/commit paths.
+static iree_status_t iree_loop_sync_drain_scope(iree_loop_sync_t* loop_sync,
+                                                iree_loop_sync_scope_t* scope,
+                                                iree_time_t deadline_ns) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  do {
+    // If we are draining a particular scope we can bail whenever there's no
+    // more work remaining.
+    if (scope && !scope->pending_count) break;
+
+    // Run an op from the runnable queue.
+    // We dequeue operations here so that re-entrant enqueuing works.
+    // We only want to run one op at a time before checking our deadline so that
+    // we don't get into infinite loops or exceed the deadline (too much).
+    iree_loop_run_op_t run_op;
+    if (iree_loop_run_ring_dequeue(loop_sync->run_ring, &run_op)) {
+      // Ops execute against their owning scope, not the drained scope.
+      iree_loop_t loop = {
+          .self = run_op.scope,
+          .ctl = iree_loop_sync_ctl,
+      };
+      switch (run_op.command) {
+        case IREE_LOOP_COMMAND_CALL:
+          iree_loop_sync_run_call(loop_sync, loop, run_op.params.call,
+                                  run_op.status);
+          break;
+        case IREE_LOOP_COMMAND_DISPATCH:
+          iree_loop_sync_run_dispatch(loop_sync, loop, run_op.params.dispatch);
+          break;
+      }
+      continue;  // loop back around only if under the deadline
+    }
+
+    // -- if here then the run ring is currently empty --
+
+    // If there are no pending waits then the drain has completed.
+    if (iree_loop_wait_list_is_empty(loop_sync->wait_list)) {
+      break;
+    }
+
+    // Scan the wait list and check for resolved ops.
+    // If there are any waiting ops the next earliest timeout is returned. An
+    // immediate timeout indicates that there's work in the run ring and we
+    // shouldn't perform a wait operation this go around the loop.
+    iree_time_t earliest_deadline_ns = IREE_TIME_INFINITE_FUTURE;
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_loop_wait_list_scan(loop_sync->wait_list, loop_sync->run_ring,
+                                     &earliest_deadline_ns));
+    if (earliest_deadline_ns != IREE_TIME_INFINITE_PAST &&
+        earliest_deadline_ns != IREE_TIME_INFINITE_FUTURE) {
+      // Commit the wait operation, waiting up until the minimum of the user
+      // specified and wait list derived values.
+      iree_time_t wait_deadline_ns = earliest_deadline_ns < deadline_ns
+                                         ? earliest_deadline_ns
+                                         : deadline_ns;
+      IREE_RETURN_AND_END_ZONE_IF_ERROR(
+          z0, iree_loop_wait_list_commit(
+                  loop_sync->wait_list, loop_sync->run_ring, wait_deadline_ns));
+    }
+  } while (iree_time_now() < deadline_ns);
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Blocks the caller until all scopes are idle or |timeout| elapses.
+IREE_API_EXPORT iree_status_t
+iree_loop_sync_wait_idle(iree_loop_sync_t* loop_sync, iree_timeout_t timeout) {
+  IREE_ASSERT_ARGUMENT(loop_sync);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Draining with a NULL scope runs work from every scope until none remains
+  // or the deadline is reached.
+  const iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
+  iree_status_t status =
+      iree_loop_sync_drain_scope(loop_sync, /*scope=*/NULL, deadline_ns);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Control function for the synchronous loop.
+// |self| must be an iree_loop_sync_scope_t.
+// |params| points at the command-specific parameter struct and is copied
+// before returning; |inout_ptr| is unused by this implementation.
+IREE_API_EXPORT iree_status_t iree_loop_sync_ctl(void* self,
+                                                 iree_loop_command_t command,
+                                                 const void* params,
+                                                 void** inout_ptr) {
+  IREE_ASSERT_ARGUMENT(self);
+  iree_loop_sync_scope_t* scope = (iree_loop_sync_scope_t*)self;
+  iree_loop_sync_t* loop_sync = scope->loop_sync;
+
+  // The lists are NULLed out during teardown to reject re-entrant enqueues.
+  if (IREE_UNLIKELY(!loop_sync->run_ring)) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "new work cannot be enqueued while the loop is shutting down");
+  }
+
+  // NOTE: we return immediately to make this all (hopefully) tail calls.
+  switch (command) {
+    case IREE_LOOP_COMMAND_CALL:
+      return iree_loop_run_ring_enqueue(
+          loop_sync->run_ring,
+          (iree_loop_run_op_t){
+              .command = command,
+              .scope = scope,
+              .params =
+                  {
+                      .call = *(const iree_loop_call_params_t*)params,
+                  },
+          });
+    case IREE_LOOP_COMMAND_DISPATCH:
+      return iree_loop_run_ring_enqueue(
+          loop_sync->run_ring,
+          (iree_loop_run_op_t){
+              .command = command,
+              .scope = scope,
+              .params =
+                  {
+                      .dispatch = *(const iree_loop_dispatch_params_t*)params,
+                  },
+          });
+    case IREE_LOOP_COMMAND_WAIT_UNTIL:
+      return iree_loop_wait_list_insert(
+          loop_sync->wait_list,
+          (iree_loop_wait_op_t){
+              .command = command,
+              .scope = scope,
+              .params =
+                  {
+                      .wait_until =
+                          *(const iree_loop_wait_until_params_t*)params,
+                  },
+          });
+    case IREE_LOOP_COMMAND_WAIT_ONE:
+      return iree_loop_wait_list_insert(
+          loop_sync->wait_list,
+          (iree_loop_wait_op_t){
+              .command = command,
+              .scope = scope,
+              .params =
+                  {
+                      .wait_one = *(const iree_loop_wait_one_params_t*)params,
+                  },
+          });
+    // WAIT_ALL and WAIT_ANY share the same multi-wait parameter struct; the
+    // command value distinguishes them during the scan.
+    case IREE_LOOP_COMMAND_WAIT_ALL:
+    case IREE_LOOP_COMMAND_WAIT_ANY:
+      return iree_loop_wait_list_insert(
+          loop_sync->wait_list,
+          (iree_loop_wait_op_t){
+              .command = command,
+              .scope = scope,
+              .params =
+                  {
+                      .wait_multi =
+                          *(const iree_loop_wait_multi_params_t*)params,
+                  },
+          });
+    case IREE_LOOP_COMMAND_DRAIN:
+      // Drains run inline on the calling thread before returning.
+      return iree_loop_sync_drain_scope(
+          loop_sync, scope,
+          ((const iree_loop_drain_params_t*)params)->deadline_ns);
+    default:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "unimplemented loop command");
+  }
+}
diff --git a/runtime/src/iree/base/loop_sync.h b/runtime/src/iree/base/loop_sync.h
new file mode 100644
index 0000000..12811bd
--- /dev/null
+++ b/runtime/src/iree/base/loop_sync.h
@@ -0,0 +1,109 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_LOOP_SYNC_H_
+#define IREE_BASE_LOOP_SYNC_H_
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_loop_sync_t
+//===----------------------------------------------------------------------===//
+
+// Configuration options for the synchronous loop implementation.
+typedef struct iree_loop_sync_options_t {
+  // Specifies the maximum operation queue depth in number of operations.
+  // Growth is not currently supported and if the capacity is reached during
+  // execution then IREE_STATUS_RESOURCE_EXHAUSTED will be returned when new
+  // operations are enqueued.
+  iree_host_size_t max_queue_depth;
+
+  // Specifies how many pending waits are allowed at the same time.
+  // Growth is not currently supported and if the capacity is reached during
+  // execution then IREE_STATUS_RESOURCE_EXHAUSTED will be returned when new
+  // waits are enqueued.
+  iree_host_size_t max_wait_count;
+} iree_loop_sync_options_t;
+
+// A lightweight loop that greedily runs operations as they are available.
+// This does not require any system threading support and has deterministic
+// behavior unless multi-waits are used.
+//
+// Thread-compatible: the loop only performs work when iree_loop_drain is
+// called and must not be used from multiple threads concurrently.
+typedef struct iree_loop_sync_t iree_loop_sync_t;
+
+// Allocates a synchronous loop using |allocator| stored into |out_loop_sync|.
+IREE_API_EXPORT iree_status_t iree_loop_sync_allocate(
+    iree_loop_sync_options_t options, iree_allocator_t allocator,
+    iree_loop_sync_t** out_loop_sync);
+
+// Frees a synchronous |loop_sync|, aborting all pending operations.
+IREE_API_EXPORT void iree_loop_sync_free(iree_loop_sync_t* loop_sync);
+
+// Waits until the loop is idle (all operations in all scopes have retired).
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if |timeout| is reached before the
+// loop is idle.
+IREE_API_EXPORT iree_status_t
+iree_loop_sync_wait_idle(iree_loop_sync_t* loop_sync, iree_timeout_t timeout);
+
+// Handles scope errors returned from loop callback operations.
+// Ownership of |status| is passed to the handler and must be freed.
+// All operations of the same scope will be aborted.
+typedef void(IREE_API_PTR* iree_loop_sync_error_fn_t)(void* user_data,
+                                                      iree_status_t status);
+
+// A scope of execution within a loop.
+// Each scope has a dedicated error handler that is notified when an error
+// propagates from a loop operation scheduled against the scope. When an error
+// arises all other operations in the same scope will be aborted.
+typedef struct iree_loop_sync_scope_t {
+  // Target loop for execution.
+  iree_loop_sync_t* loop_sync;
+
+  // Total number of pending operations in the scope.
+  // When 0 the scope is considered idle.
+  int32_t pending_count;
+
+  // Optional function used to report errors that occur during execution.
+  iree_loop_sync_error_fn_t error_fn;
+  void* error_user_data;
+} iree_loop_sync_scope_t;
+
+// Initializes a loop scope that runs operations against |loop_sync|.
+IREE_API_EXPORT void iree_loop_sync_scope_initialize(
+    iree_loop_sync_t* loop_sync, iree_loop_sync_error_fn_t error_fn,
+    void* error_user_data, iree_loop_sync_scope_t* out_scope);
+
+// Deinitializes a loop |scope| and aborts any pending operations.
+IREE_API_EXPORT void iree_loop_sync_scope_deinitialize(
+    iree_loop_sync_scope_t* scope);
+
+// Control function implementing the iree_loop_t interface for the sync loop.
+// |self| must be an iree_loop_sync_scope_t.
+IREE_API_EXPORT iree_status_t iree_loop_sync_ctl(void* self,
+                                                 iree_loop_command_t command,
+                                                 const void* params,
+                                                 void** inout_ptr);
+
+// Returns a loop that schedules operations against |scope|.
+// The scope must remain valid until all operations scheduled against it have
+// completed.
+static inline iree_loop_t iree_loop_sync_scope(iree_loop_sync_scope_t* scope) {
+  // Zero-initialize then assign the two fields the sync loop uses.
+  iree_loop_t loop = {0};
+  loop.self = scope;
+  loop.ctl = iree_loop_sync_ctl;
+  return loop;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_LOOP_SYNC_H_
diff --git a/runtime/src/iree/base/loop_sync_test.cc b/runtime/src/iree/base/loop_sync_test.cc
new file mode 100644
index 0000000..893ed40
--- /dev/null
+++ b/runtime/src/iree/base/loop_sync_test.cc
@@ -0,0 +1,52 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/loop_sync.h"
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+// Contains the test definitions applied to all loop implementations:
+#include "iree/base/loop_test.h"
+
+// Allocates a sync loop + scope pair for the shared loop test suite.
+// |out_status| receives the first scope-level error emitted by the loop.
+void AllocateLoop(iree_status_t* out_status, iree_allocator_t allocator,
+                  iree_loop_t* out_loop) {
+  iree_loop_sync_options_t options = {0};
+  options.max_queue_depth = 128;
+  options.max_wait_count = 32;
+
+  iree_loop_sync_t* loop_sync = NULL;
+  IREE_CHECK_OK(iree_loop_sync_allocate(options, allocator, &loop_sync));
+
+  // The scope must outlive all scheduled ops; FreeLoop releases it.
+  iree_loop_sync_scope_t* scope = NULL;
+  IREE_CHECK_OK(
+      iree_allocator_malloc(allocator, sizeof(*scope), (void**)&scope));
+  iree_loop_sync_scope_initialize(
+      loop_sync,
+      // Error handler: record the first failure, drop any later ones.
+      +[](void* user_data, iree_status_t status) {
+        iree_status_t* status_ptr = (iree_status_t*)user_data;
+        if (iree_status_is_ok(*status_ptr)) {
+          *status_ptr = status;
+        } else {
+          iree_status_ignore(status);
+        }
+      },
+      out_status, scope);
+  *out_loop = iree_loop_sync_scope(scope);
+}
+
+// Tears down a loop created by AllocateLoop: aborts remaining scoped work,
+// frees the scope, then frees the loop itself.
+void FreeLoop(iree_allocator_t allocator, iree_loop_t loop) {
+  // The scope was stashed as the loop's self pointer by AllocateLoop.
+  auto* scope = reinterpret_cast<iree_loop_sync_scope_t*>(loop.self);
+  iree_loop_sync_t* loop_sync = scope->loop_sync;
+
+  iree_loop_sync_scope_deinitialize(scope);
+  iree_allocator_free(allocator, scope);
+  iree_loop_sync_free(loop_sync);
+}
+
+// TODO(benvanik): test multiple scopes and scoped abort behavior.
diff --git a/runtime/src/iree/base/loop_test.h b/runtime/src/iree/base/loop_test.h
new file mode 100644
index 0000000..66b439a
--- /dev/null
+++ b/runtime/src/iree/base/loop_test.h
@@ -0,0 +1,980 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <chrono>
+#include <thread>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/wait_handle.h"
+#include "iree/base/tracing.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+// NOTE: this file is meant to be included inside of a _test.cc source file.
+// The file must define these functions to allocate/free the loop.
+// |out_status| should receive the last global error encountered in the loop.
+void AllocateLoop(iree_status_t* out_status, iree_allocator_t allocator,
+                  iree_loop_t* out_loop);
+void FreeLoop(iree_allocator_t allocator, iree_loop_t loop);
+
+namespace iree {
+namespace testing {
+
+// Fixture shared by all loop tests: allocates a fresh loop per test in SetUp
+// and frees it in TearDown. |loop_status| captures the last scope error.
+struct LoopTest : public ::testing::Test {
+  iree_allocator_t allocator = iree_allocator_system();
+  iree_loop_t loop;
+  // Written by the loop's error handler; asserted on by individual tests.
+  iree_status_t loop_status = iree_ok_status();
+
+  void SetUp() override {
+    IREE_TRACE_SCOPE();
+    AllocateLoop(&loop_status, allocator, &loop);
+  }
+  void TearDown() override {
+    IREE_TRACE_SCOPE();
+    FreeLoop(allocator, loop);
+    // Tests expecting a failure already checked it; release the storage.
+    iree_status_ignore(loop_status);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// iree_loop_call
+//===----------------------------------------------------------------------===//
+
+// Tests the simple call interface for running work.
+TEST_F(LoopTest, Call) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    // Seeded with a sentinel error so we can tell the callback really ran.
+    iree_status_t call_status = iree_status_from_code(IREE_STATUS_DATA_LOSS);
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_call(
+      loop, IREE_LOOP_PRIORITY_DEFAULT,
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->call_status = status;
+        return iree_ok_status();
+      },
+      &user_data));
+  // Nothing runs until the loop is drained.
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_ASSERT_OK(loop_status);
+  IREE_ASSERT_OK(user_data.call_status);
+}
+
+// Tests a call that forks into two other calls.
+TEST_F(LoopTest, CallFork) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    // One flag per callback so the drain can verify all three ran.
+    bool called_a = false;
+    bool called_b = false;
+    bool called_c = false;
+  } user_data;
+
+  // A -> [B, C]
+  IREE_ASSERT_OK(iree_loop_call(
+      loop, IREE_LOOP_PRIORITY_DEFAULT,
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->called_a = true;
+
+        // B - enqueued re-entrantly from inside A's callback.
+        IREE_EXPECT_OK(iree_loop_call(
+            loop, IREE_LOOP_PRIORITY_DEFAULT,
+            +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+              IREE_TRACE_SCOPE();
+              IREE_EXPECT_OK(status);
+              auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+              user_data->called_b = true;
+              return iree_ok_status();
+            },
+            user_data));
+
+        // C
+        IREE_EXPECT_OK(iree_loop_call(
+            loop, IREE_LOOP_PRIORITY_DEFAULT,
+            +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+              IREE_TRACE_SCOPE();
+              IREE_EXPECT_OK(status);
+              auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+              user_data->called_c = true;
+              return iree_ok_status();
+            },
+            user_data));
+
+        return iree_ok_status();
+      },
+      &user_data));
+
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.called_a);
+  EXPECT_TRUE(user_data.called_b);
+  EXPECT_TRUE(user_data.called_c);
+}
+
+// Tests a repeating call - since the loops are intended to be stackless we
+// should in theory be able to issue calls forever. This test ensures we can do
+// a really large amount without blowing the native stack.
+struct CallRepeatedData {
+  // Number of chained calls still to issue; decremented once per callback.
+  int remaining = 2 * 1024;
+};
+static iree_status_t CallRepeatedFn(void* user_data_ptr, iree_loop_t loop,
+                                    iree_status_t status) {
+  IREE_TRACE_SCOPE();
+  IREE_EXPECT_OK(status);
+  auto* user_data = reinterpret_cast<CallRepeatedData*>(user_data_ptr);
+  // Re-enqueue ourselves until the budget is exhausted; the loop (not the
+  // native stack) carries the recursion.
+  if (--user_data->remaining) {
+    IREE_RETURN_IF_ERROR(iree_loop_call(loop, IREE_LOOP_PRIORITY_DEFAULT,
+                                        CallRepeatedFn, user_data));
+  }
+  return iree_ok_status();
+}
+TEST_F(LoopTest, CallRepeated) {
+  IREE_TRACE_SCOPE();
+  CallRepeatedData user_data;
+  IREE_ASSERT_OK(iree_loop_call(loop, IREE_LOOP_PRIORITY_DEFAULT,
+                                CallRepeatedFn, &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_ASSERT_OK(loop_status);
+  // Every chained call must have executed exactly once.
+  EXPECT_EQ(user_data.remaining, 0);
+}
+
+// Tests a call that results in failure.
+TEST_F(LoopTest, CallFailure) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    bool completed = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_call(
+      loop, IREE_LOOP_PRIORITY_DEFAULT,
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        EXPECT_FALSE(user_data->completed);
+        user_data->completed = true;
+        // Returning an error routes through the scope's error handler.
+        return iree_status_from_code(IREE_STATUS_DATA_LOSS);
+      },
+      &user_data));
+  // The drain itself succeeds; the failure surfaces via loop_status.
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DATA_LOSS, loop_status);
+}
+
+// Tests that a failure will abort other pending tasks.
+TEST_F(LoopTest, CallFailureAborts) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    bool did_call_callback = false;
+    bool did_wait_callback = false;
+  } user_data;
+
+  // Issue the call that will fail.
+  IREE_ASSERT_OK(iree_loop_call(
+      loop, IREE_LOOP_PRIORITY_DEFAULT,
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        EXPECT_FALSE(user_data->did_call_callback);
+        user_data->did_call_callback = true;
+        return iree_status_from_code(IREE_STATUS_DATA_LOSS);
+      },
+      &user_data));
+
+  // Enqueue a wait that will never complete - if it runs it means we didn't
+  // correctly abort it.
+  IREE_ASSERT_OK(iree_loop_wait_until(
+      loop, iree_make_timeout_ms(1 * 60 * 1000),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        // The abort delivers IREE_STATUS_ABORTED to the pending wait.
+        IREE_EXPECT_STATUS_IS(IREE_STATUS_ABORTED, status);
+        iree_status_ignore(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        EXPECT_FALSE(user_data->did_wait_callback);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DATA_LOSS, loop_status);
+  EXPECT_TRUE(user_data.did_call_callback);
+  EXPECT_TRUE(user_data.did_wait_callback);
+}
+
+// Tests that a failure will abort other pending tasks, including those enqueued
+// from within the failing call itself.
+TEST_F(LoopTest, CallFailureAbortsNested) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    bool did_call_callback = false;  // failing call ran
+    bool did_wait_callback = false;  // reentrantly-enqueued wait was aborted
+  } user_data;
+
+  // Issue the call that will fail.
+  IREE_ASSERT_OK(iree_loop_call(
+      loop, IREE_LOOP_PRIORITY_DEFAULT,
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        EXPECT_FALSE(user_data->did_call_callback);
+        user_data->did_call_callback = true;
+
+        // Enqueue a wait that will never complete - if it runs it means we
+        // didn't correctly abort it. We are enqueuing it reentrantly as a user
+        // would before we encounter the error below.
+        IREE_EXPECT_OK(iree_loop_wait_until(
+            loop, iree_make_timeout_ms(1 * 60 * 1000),
+            +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+              IREE_TRACE_SCOPE();
+              IREE_EXPECT_STATUS_IS(IREE_STATUS_ABORTED, status);
+              iree_status_ignore(status);
+              auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+              EXPECT_FALSE(user_data->did_wait_callback);
+              user_data->did_wait_callback = true;
+              return iree_ok_status();
+            },
+            user_data));  // already a UserData* here (outer scope passes &user_data)
+
+        return iree_status_from_code(IREE_STATUS_DATA_LOSS);  // triggers abort
+      },
+      &user_data));
+
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_DATA_LOSS, loop_status);
+  EXPECT_TRUE(user_data.did_call_callback);
+  EXPECT_TRUE(user_data.did_wait_callback);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_dispatch
+//===----------------------------------------------------------------------===//
+
+// Tests a grid dispatch operation with an empty grid.
+// The completion callback should still be issued but no workgroups.
+TEST_F(LoopTest, DispatchEmpty) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    std::atomic<int> workgroup_count = {0};  // atomic: workgroups may run concurrently
+    bool completed = false;
+  } user_data;
+  const uint32_t xyz[3] = {1, 0, 0};  // zero-sized Y dimension -> empty grid
+  IREE_ASSERT_OK(iree_loop_dispatch(
+      loop, xyz,
+      +[](void* user_data_ptr, iree_loop_t loop, uint32_t workgroup_x,
+          uint32_t workgroup_y, uint32_t workgroup_z) {
+        IREE_TRACE_SCOPE();
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        ++user_data->workgroup_count;
+        return iree_ok_status();
+      },
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        EXPECT_FALSE(user_data->completed);
+        user_data->completed = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_EQ(user_data.workgroup_count, 0);  // no workgroups ran
+  EXPECT_TRUE(user_data.completed);  // completion still fired
+}
+
+// Tests a grid dispatch operation and ensures all workgroups are issued.
+TEST_F(LoopTest, DispatchGrid) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    std::atomic<int> workgroup_count = {0};  // atomic: workgroups may run concurrently
+    bool completed = false;
+  } user_data;
+  const uint32_t xyz[3] = {4, 2, 1};  // 8 workgroups total
+  IREE_ASSERT_OK(iree_loop_dispatch(
+      loop, xyz,
+      +[](void* user_data_ptr, iree_loop_t loop, uint32_t workgroup_x,
+          uint32_t workgroup_y, uint32_t workgroup_z) {
+        IREE_TRACE_SCOPE();
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        ++user_data->workgroup_count;
+        return iree_ok_status();
+      },
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        EXPECT_FALSE(user_data->completed);
+        user_data->completed = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_EQ(user_data.workgroup_count, xyz[0] * xyz[1] * xyz[2]);  // 4*2*1 = 8
+  EXPECT_TRUE(user_data.completed);
+}
+
+// Tests a grid dispatch operation with a workgroup failure.
+TEST_F(LoopTest, DispatchWorkgroupFailure) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    bool completed = false;  // completion callback observed the failure
+  } user_data;
+  const uint32_t xyz[3] = {4, 2, 1};
+  IREE_ASSERT_OK(iree_loop_dispatch(
+      loop, xyz,
+      +[](void* user_data_ptr, iree_loop_t loop, uint32_t workgroup_x,
+          uint32_t workgroup_y, uint32_t workgroup_z) {
+        IREE_TRACE_SCOPE();
+        return iree_status_from_code(IREE_STATUS_DATA_LOSS);  // every workgroup fails
+      },
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_STATUS_IS(IREE_STATUS_DATA_LOSS, status);  // failure delivered here
+        iree_status_ignore(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        EXPECT_FALSE(user_data->completed);
+        user_data->completed = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_ASSERT_OK(loop_status);  // handled by completion callback; loop itself stays OK
+  EXPECT_TRUE(user_data.completed);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_wait_until
+//===----------------------------------------------------------------------===//
+
+// Tests a wait-until delay with an immediate timeout (completes right away).
+TEST_F(LoopTest, WaitUntilImmediate) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    iree_status_t wait_status = iree_status_from_code(IREE_STATUS_DATA_LOSS);  // sentinel: must be overwritten
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_until(
+      loop, iree_immediate_timeout(),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->wait_status = status;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_ASSERT_OK(loop_status);
+  IREE_ASSERT_OK(user_data.wait_status);  // callback ran and saw OK
+}
+
+// Tests a wait-until delay with an actual delay.
+TEST_F(LoopTest, WaitUntil) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    iree_time_t start_ns = iree_time_now();
+    iree_time_t end_ns = IREE_TIME_INFINITE_FUTURE;
+    iree_status_t wait_status = iree_status_from_code(IREE_STATUS_DATA_LOSS);  // sentinel: must be overwritten
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_until(
+      loop, iree_make_timeout_ms(50),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->end_ns = iree_time_now();
+        user_data->wait_status = status;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_ASSERT_OK(loop_status);
+  IREE_ASSERT_OK(user_data.wait_status);
+  // Not checking exact timing as some devices may not have clocks.
+  EXPECT_GE(user_data.end_ns, user_data.start_ns);  // time is at least monotonic
+}
+
+// Tests that multiple wait-until's can be active at once.
+// NOTE: loops are not required to wake in any particular order.
+TEST_F(LoopTest, MultiWaitUntil) {
+  IREE_TRACE_SCOPE();
+  struct UserData {
+    bool woke_a = false;  // 25ms wait fired
+    bool woke_b = false;  // 50ms wait fired
+  } user_data;
+
+  IREE_ASSERT_OK(iree_loop_wait_until(
+      loop, iree_make_timeout_ms(25),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->woke_a = true;
+        return iree_ok_status();
+      },
+      &user_data));
+
+  IREE_ASSERT_OK(iree_loop_wait_until(
+      loop, iree_make_timeout_ms(50),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->woke_b = true;
+        return iree_ok_status();
+      },
+      &user_data));
+
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.woke_a);
+  EXPECT_TRUE(user_data.woke_b);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_wait_one
+//===----------------------------------------------------------------------===//
+
+// Tests a wait-one with an immediate timeout (a poll).
+// The handle is never resolved and if we didn't bail immediately we'd hang.
+TEST_F(LoopTest, WaitOneImmediate) {
+  IREE_TRACE_SCOPE();
+
+  // An event that never resolves.
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+  iree_wait_source_t wait_source = iree_event_await(&event);
+
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_one(
+      loop, wait_source, iree_immediate_timeout(),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED, status);  // poll on unsignaled source
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);  // timeout is reported to the callback, not the loop
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  iree_event_deinitialize(&event);
+}
+
+// Tests a wait-one with a non-immediate timeout that elapses.
+TEST_F(LoopTest, WaitOneTimeout) {
+  IREE_TRACE_SCOPE();
+
+  // An event that never resolves.
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+  iree_wait_source_t wait_source = iree_event_await(&event);
+
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_one(
+      loop, wait_source, iree_make_timeout_ms(10),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED, status);  // deadline hit
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);  // timeout is reported to the callback, not the loop
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  iree_event_deinitialize(&event);
+}
+
+// Tests a wait-one that times out does not abort other loop ops.
+// The deadline exceeded status passed to the callback is sufficient.
+TEST_F(LoopTest, WaitOneTimeoutNoAbort) {
+  IREE_TRACE_SCOPE();
+
+  // An event that never resolves.
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+  iree_wait_source_t wait_source = iree_event_await(&event);
+
+  struct UserData {
+    bool did_wait_callback = false;  // wait observed DEADLINE_EXCEEDED
+    bool did_call_callback = false;  // follow-up call still ran (no abort)
+  } user_data;
+
+  // Wait that will time out.
+  IREE_ASSERT_OK(iree_loop_wait_one(
+      loop, wait_source, iree_make_timeout_ms(10),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED, status);
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+
+        // Call that should still be issued correctly.
+        // Note that we queue it here as if we did it outside the wait we'd
+        // immediately execute it on out-of-order implementations.
+        IREE_EXPECT_OK(iree_loop_call(
+            loop, IREE_LOOP_PRIORITY_DEFAULT,
+            +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+              IREE_TRACE_SCOPE();
+              IREE_EXPECT_OK(status);
+              auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+              EXPECT_FALSE(user_data->did_call_callback);
+              user_data->did_call_callback = true;
+              return iree_ok_status();
+            },
+            user_data));  // already a UserData* here (outer scope passes &user_data)
+
+        return iree_ok_status();  // returning OK keeps the loop alive despite the timeout
+      },
+      &user_data));
+
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+  EXPECT_TRUE(user_data.did_call_callback);
+
+  iree_event_deinitialize(&event);
+}
+
+// Tests a wait-one with an already signaled wait source.
+TEST_F(LoopTest, WaitOneSignaled) {
+  IREE_TRACE_SCOPE();
+
+  // An event that is resolved immediately.
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &event));
+  iree_wait_source_t wait_source = iree_event_await(&event);
+
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_one(
+      loop, wait_source, iree_make_timeout_ms(10),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);  // resolved before the 10ms deadline
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  iree_event_deinitialize(&event);
+}
+
+// Tests a wait-one on a wait handle signaled out-of-band.
+TEST_F(LoopTest, WaitOneBlocking) {
+  IREE_TRACE_SCOPE();
+
+  // Initially unsignaled.
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+  iree_wait_source_t wait_source = iree_event_await(&event);
+
+  // Spin up the thread to signal the event after a short delay.
+  // We need to do this before we issue the wait so that loops which perform the
+  // wait inline can still make forward progress even if they block.
+  std::thread thread([&]() {
+    IREE_TRACE_SCOPE();
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    iree_event_set(&event);  // out-of-band signal
+  });
+
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_one(
+      loop, wait_source, iree_make_timeout_ms(200),  // 200ms >> 50ms signal delay
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);  // signaled before the deadline
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  thread.join();
+  iree_event_deinitialize(&event);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_wait_any
+//===----------------------------------------------------------------------===//
+
+// Tests a wait-any with an immediate timeout (a poll).
+TEST_F(LoopTest, WaitAnyImmediate) {
+  IREE_TRACE_SCOPE();
+
+  // Events that are never resolved such that we time out.
+  iree_event_t event_a;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event_a));
+  iree_wait_source_t wait_source_a = iree_event_await(&event_a);
+  iree_event_t event_b;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event_b));
+  iree_wait_source_t wait_source_b = iree_event_await(&event_b);
+
+  iree_wait_source_t wait_sources[2] = {
+      wait_source_a,
+      wait_source_b,
+  };
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_any(
+      loop, IREE_ARRAYSIZE(wait_sources), wait_sources,
+      iree_immediate_timeout(),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED, status);  // neither signaled
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  iree_event_deinitialize(&event_a);
+  iree_event_deinitialize(&event_b);
+}
+
+// Tests a wait-any with a non-immediate timeout that elapses.
+TEST_F(LoopTest, WaitAnyTimeout) {
+  IREE_TRACE_SCOPE();
+
+  // Events that are never resolved such that we time out.
+  iree_event_t event_a;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event_a));
+  iree_wait_source_t wait_source_a = iree_event_await(&event_a);
+  iree_event_t event_b;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event_b));
+  iree_wait_source_t wait_source_b = iree_event_await(&event_b);
+
+  iree_wait_source_t wait_sources[2] = {
+      wait_source_a,
+      wait_source_b,
+  };
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_any(
+      loop, IREE_ARRAYSIZE(wait_sources), wait_sources,
+      iree_make_timeout_ms(10),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED, status);  // neither signaled
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  iree_event_deinitialize(&event_a);
+  iree_event_deinitialize(&event_b);
+}
+
+// Tests a wait-any with an already-resolved wait handle.
+TEST_F(LoopTest, WaitAnySignaled) {
+  IREE_TRACE_SCOPE();
+
+  // An event that is resolved immediately.
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &event));
+  iree_wait_source_t wait_source = iree_event_await(&event);
+
+  // Always unsignaled so we test the wait-any behavior.
+  iree_event_t unresolved_event;
+  IREE_ASSERT_OK(
+      iree_event_initialize(/*initial_state=*/false, &unresolved_event));
+  iree_wait_source_t unresolved_wait_source =
+      iree_event_await(&unresolved_event);
+
+  iree_wait_source_t wait_sources[2] = {
+      wait_source,
+      unresolved_wait_source,
+  };
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_any(
+      loop, IREE_ARRAYSIZE(wait_sources), wait_sources,
+      iree_make_timeout_ms(10),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);  // one signaled source satisfies wait-any
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  iree_event_deinitialize(&event);
+  iree_event_deinitialize(&unresolved_event);
+}
+
+// Tests a wait-any with a wait handle signaled out-of-band.
+TEST_F(LoopTest, WaitAnyBlocking) {
+  IREE_TRACE_SCOPE();
+
+  // Initially unsignaled.
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+  iree_wait_source_t wait_source = iree_event_await(&event);
+
+  // Always unsignaled so we test the wait-any behavior.
+  iree_event_t unresolved_event;
+  IREE_ASSERT_OK(
+      iree_event_initialize(/*initial_state=*/false, &unresolved_event));
+  iree_wait_source_t unresolved_wait_source =
+      iree_event_await(&unresolved_event);
+
+  // Spin up the thread to signal the event after a short delay.
+  // We need to do this before we issue the wait so that loops which perform the
+  // wait inline can still make forward progress even if they block.
+  std::thread thread([&]() {
+    IREE_TRACE_SCOPE();
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    iree_event_set(&event);  // out-of-band signal
+  });
+
+  iree_wait_source_t wait_sources[2] = {
+      wait_source,
+      unresolved_wait_source,
+  };
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_any(
+      loop, IREE_ARRAYSIZE(wait_sources), wait_sources,
+      iree_make_timeout_ms(200),  // 200ms >> 50ms signal delay
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);  // one signaled source satisfies wait-any
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  thread.join();
+  iree_event_deinitialize(&event);
+  iree_event_deinitialize(&unresolved_event);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_loop_wait_all
+//===----------------------------------------------------------------------===//
+
+// Tests a wait-all with an immediate timeout (a poll).
+TEST_F(LoopTest, WaitAllImmediate) {
+  IREE_TRACE_SCOPE();
+
+  // One unresolved and one resolved event (should fail the wait-all).
+  iree_event_t event_a;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event_a));
+  iree_wait_source_t wait_source_a = iree_event_await(&event_a);
+  iree_event_t event_b;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &event_b));
+  iree_wait_source_t wait_source_b = iree_event_await(&event_b);
+
+  iree_wait_source_t wait_sources[2] = {
+      wait_source_a,
+      wait_source_b,
+  };
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_all(
+      loop, IREE_ARRAYSIZE(wait_sources), wait_sources,
+      iree_immediate_timeout(),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED, status);  // one source unsignaled
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  iree_event_deinitialize(&event_a);
+  iree_event_deinitialize(&event_b);
+}
+
+// Tests a wait-all with a non-immediate timeout that elapses.
+TEST_F(LoopTest, WaitAllTimeout) {
+  IREE_TRACE_SCOPE();
+
+  // One unresolved and one resolved event (should fail the wait-all).
+  iree_event_t event_a;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event_a));
+  iree_wait_source_t wait_source_a = iree_event_await(&event_a);
+  iree_event_t event_b;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &event_b));
+  iree_wait_source_t wait_source_b = iree_event_await(&event_b);
+
+  iree_wait_source_t wait_sources[2] = {
+      wait_source_a,
+      wait_source_b,
+  };
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_all(
+      loop, IREE_ARRAYSIZE(wait_sources), wait_sources,
+      iree_make_timeout_ms(10),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_STATUS_IS(IREE_STATUS_DEADLINE_EXCEEDED, status);  // one source unsignaled
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  iree_event_deinitialize(&event_a);
+  iree_event_deinitialize(&event_b);
+}
+
+// Tests a wait-all with already-resolved wait handles.
+TEST_F(LoopTest, WaitAllSignaled) {
+  IREE_TRACE_SCOPE();
+
+  // Signaled events so the wait-all succeeds.
+  iree_event_t event_a;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &event_a));
+  iree_wait_source_t wait_source_a = iree_event_await(&event_a);
+  iree_event_t event_b;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/true, &event_b));
+  iree_wait_source_t wait_source_b = iree_event_await(&event_b);
+
+  iree_wait_source_t wait_sources[2] = {
+      wait_source_a,
+      wait_source_b,
+  };
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_all(
+      loop, IREE_ARRAYSIZE(wait_sources), wait_sources,
+      iree_make_timeout_ms(10),
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);  // all sources signaled before the deadline
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  iree_event_deinitialize(&event_a);
+  iree_event_deinitialize(&event_b);
+}
+
+// Tests a wait-all with wait handles signaled out-of-band.
+TEST_F(LoopTest, WaitAllBlocking) {
+  IREE_TRACE_SCOPE();
+
+  // Initially unsignaled.
+  iree_event_t event;
+  IREE_ASSERT_OK(iree_event_initialize(/*initial_state=*/false, &event));
+  iree_wait_source_t wait_source = iree_event_await(&event);
+
+  // Already signaled so the wait-all only blocks on |event| above.
+  iree_event_t resolved_event;
+  IREE_ASSERT_OK(
+      iree_event_initialize(/*initial_state=*/true, &resolved_event));
+  iree_wait_source_t resolved_wait_source = iree_event_await(&resolved_event);
+
+  // Spin up the thread to signal the event after a short delay.
+  // We need to do this before we issue the wait so that loops which perform the
+  // wait inline can still make forward progress even if they block.
+  std::thread thread([&]() {
+    IREE_TRACE_SCOPE();
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    iree_event_set(&event);  // out-of-band signal completing the wait-all
+  });
+
+  iree_wait_source_t wait_sources[2] = {
+      wait_source,
+      resolved_wait_source,
+  };
+  struct UserData {
+    bool did_wait_callback = false;
+  } user_data;
+  IREE_ASSERT_OK(iree_loop_wait_all(
+      loop, IREE_ARRAYSIZE(wait_sources), wait_sources,
+      iree_make_timeout_ms(200),  // 200ms >> 50ms signal delay
+      +[](void* user_data_ptr, iree_loop_t loop, iree_status_t status) {
+        IREE_TRACE_SCOPE();
+        IREE_EXPECT_OK(status);  // both sources signaled before the deadline
+        auto* user_data = reinterpret_cast<UserData*>(user_data_ptr);
+        user_data->did_wait_callback = true;
+        return iree_ok_status();
+      },
+      &user_data));
+  IREE_ASSERT_OK(iree_loop_drain(loop, iree_infinite_timeout()));
+
+  IREE_ASSERT_OK(loop_status);
+  EXPECT_TRUE(user_data.did_wait_callback);
+
+  thread.join();
+  iree_event_deinitialize(&event);
+  iree_event_deinitialize(&resolved_event);
+}
+
+}  // namespace testing
+}  // namespace iree
diff --git a/runtime/src/iree/base/status.c b/runtime/src/iree/base/status.c
new file mode 100644
index 0000000..d71ba53
--- /dev/null
+++ b/runtime/src/iree/base/status.c
@@ -0,0 +1,832 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/status.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/alignment.h"
+#include "iree/base/allocator.h"
+#include "iree/base/assert.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// C11 aligned_alloc compatibility shim
+//===----------------------------------------------------------------------===//
+
+#if defined(IREE_PLATFORM_WINDOWS)
+// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc
+// NOTE: _aligned_malloc takes (size, alignment) - reversed from C11.
+#define iree_aligned_alloc(alignment, size) _aligned_malloc(size, alignment)
+#define iree_aligned_free(p) _aligned_free(p)
+#elif defined(_ISOC11_SOURCE)
+// https://en.cppreference.com/w/c/memory/aligned_alloc
+#define iree_aligned_alloc(alignment, size) aligned_alloc(alignment, size)
+#define iree_aligned_free(p) free(p)
+#elif _POSIX_C_SOURCE >= 200112L
+// https://pubs.opengroup.org/onlinepubs/9699919799/functions/posix_memalign.html
+// posix_memalign returns an error number (not errno) and only assigns the
+// pointer on success; we fold both into a malloc-style NULL-on-failure API.
+static inline void* iree_aligned_alloc(size_t alignment, size_t size) {
+  void* ptr = NULL;
+  return posix_memalign(&ptr, alignment, size) == 0 ? ptr : NULL;
+}
+#define iree_aligned_free(p) free(p)
+#else
+// Emulates alignment with normal malloc. We overallocate by at least the
+// alignment + the size of a pointer, store the base pointer at p[-1], and
+// return the aligned pointer. This lets us easily get the base pointer in free
+// to pass back to the system.
+static inline void* iree_aligned_alloc(size_t alignment, size_t size) {
+  void* base_ptr = malloc(size + alignment + sizeof(uintptr_t));
+  if (!base_ptr) return NULL;
+  uintptr_t* aligned_ptr = (uintptr_t*)iree_host_align(
+      (uintptr_t)base_ptr + sizeof(uintptr_t), alignment);
+  aligned_ptr[-1] = (uintptr_t)base_ptr;
+  return aligned_ptr;
+}
+static inline void iree_aligned_free(void* p) {
+  if (IREE_UNLIKELY(!p)) return;
+  // Recover the original malloc pointer stashed just below the aligned one.
+  uintptr_t* aligned_ptr = (uintptr_t*)p;
+  void* base_ptr = (void*)aligned_ptr[-1];
+  free(base_ptr);
+}
+#endif  // IREE_PLATFORM_WINDOWS / _ISOC11_SOURCE / POSIX / fallback
+
+//===----------------------------------------------------------------------===//
+// iree_status_t canonical errors
+//===----------------------------------------------------------------------===//
+
+// Converts the errno value |error_number| to the closest canonical
+// iree_status_code_t, mapping unrecognized values to IREE_STATUS_UNKNOWN.
+// Codes that are not defined on all platforms are guarded with #ifdef.
+IREE_API_EXPORT iree_status_code_t
+iree_status_code_from_errno(int error_number) {
+  switch (error_number) {
+    case 0:
+      return IREE_STATUS_OK;
+    case EINVAL:        // Invalid argument
+    case ENAMETOOLONG:  // Filename too long
+    case E2BIG:         // Argument list too long
+    case EDESTADDRREQ:  // Destination address required
+    case EDOM:          // Mathematics argument out of domain of function
+    case EFAULT:        // Bad address
+    case EILSEQ:        // Illegal byte sequence
+    case ENOPROTOOPT:   // Protocol not available
+    case ENOSTR:        // Not a STREAM
+    case ENOTSOCK:      // Not a socket
+    case ENOTTY:        // Inappropriate I/O control operation
+    case EPROTOTYPE:    // Protocol wrong type for socket
+    case ESPIPE:        // Invalid seek
+      return IREE_STATUS_INVALID_ARGUMENT;
+    case ETIMEDOUT:  // Connection timed out
+    case ETIME:      // Timer expired
+      return IREE_STATUS_DEADLINE_EXCEEDED;
+    case ENODEV:  // No such device
+    case ENOENT:  // No such file or directory
+#ifdef ENOMEDIUM
+    case ENOMEDIUM:  // No medium found
+#endif
+    case ENXIO:  // No such device or address
+    case ESRCH:  // No such process
+      return IREE_STATUS_NOT_FOUND;
+    case EEXIST:         // File exists
+    case EADDRNOTAVAIL:  // Address not available
+    case EALREADY:       // Connection already in progress
+#ifdef ENOTUNIQ
+    case ENOTUNIQ:  // Name not unique on network
+#endif
+      return IREE_STATUS_ALREADY_EXISTS;
+    case EPERM:   // Operation not permitted
+    case EACCES:  // Permission denied
+#ifdef ENOKEY
+    case ENOKEY:  // Required key not available
+#endif
+    case EROFS:  // Read only file system
+      return IREE_STATUS_PERMISSION_DENIED;
+    case ENOTEMPTY:   // Directory not empty
+    case EISDIR:      // Is a directory
+    case ENOTDIR:     // Not a directory
+    case EADDRINUSE:  // Address already in use
+    case EBADF:       // Invalid file descriptor
+#ifdef EBADFD
+    case EBADFD:  // File descriptor in bad state
+#endif
+    case EBUSY:    // Device or resource busy
+    case ECHILD:   // No child processes
+    case EISCONN:  // Socket is connected
+#ifdef EISNAM
+    case EISNAM:  // Is a named type file
+#endif
+#ifdef ENOTBLK
+    case ENOTBLK:  // Block device required
+#endif
+    case ENOTCONN:  // The socket is not connected
+    case EPIPE:     // Broken pipe
+#ifdef ESHUTDOWN
+    case ESHUTDOWN:  // Cannot send after transport endpoint shutdown
+#endif
+    case ETXTBSY:  // Text file busy
+#ifdef EUNATCH
+    case EUNATCH:  // Protocol driver not attached
+#endif
+      return IREE_STATUS_FAILED_PRECONDITION;
+    case ENOSPC:  // No space left on device
+#ifdef EDQUOT
+    case EDQUOT:  // Disk quota exceeded
+#endif
+    case EMFILE:   // Too many open files
+    case EMLINK:   // Too many links
+    case ENFILE:   // Too many open files in system
+    case ENOBUFS:  // No buffer space available
+    case ENODATA:  // No message is available on the STREAM read queue
+    case ENOMEM:   // Not enough space
+    case ENOSR:    // No STREAM resources
+#ifdef EUSERS
+    case EUSERS:  // Too many users
+#endif
+      return IREE_STATUS_RESOURCE_EXHAUSTED;
+#ifdef ECHRNG
+    case ECHRNG:  // Channel number out of range
+#endif
+    case EFBIG:      // File too large
+    case EOVERFLOW:  // Value too large to be stored in data type
+    case ERANGE:     // Result too large
+      return IREE_STATUS_OUT_OF_RANGE;
+#ifdef ENOPKG
+    case ENOPKG:  // Package not installed
+#endif
+    case ENOSYS:        // Function not implemented
+    case ENOTSUP:       // Operation not supported
+    case EAFNOSUPPORT:  // Address family not supported
+#ifdef EPFNOSUPPORT
+    case EPFNOSUPPORT:  // Protocol family not supported
+#endif
+    case EPROTONOSUPPORT:  // Protocol not supported
+#ifdef ESOCKTNOSUPPORT
+    case ESOCKTNOSUPPORT:  // Socket type not supported
+#endif
+    case EXDEV:  // Improper link
+      return IREE_STATUS_UNIMPLEMENTED;
+    case EAGAIN:  // Resource temporarily unavailable
+#ifdef ECOMM
+    case ECOMM:  // Communication error on send
+#endif
+    case ECONNREFUSED:  // Connection refused
+    case ECONNABORTED:  // Connection aborted
+    case ECONNRESET:    // Connection reset
+    case EINTR:         // Interrupted function call
+#ifdef EHOSTDOWN
+    case EHOSTDOWN:  // Host is down
+#endif
+    case EHOSTUNREACH:  // Host is unreachable
+    case ENETDOWN:      // Network is down
+    case ENETRESET:     // Connection aborted by network
+    case ENETUNREACH:   // Network unreachable
+    case ENOLCK:        // No locks available
+    case ENOLINK:       // Link has been severed
+#ifdef ENONET
+    case ENONET:  // Machine is not on the network
+#endif
+      return IREE_STATUS_UNAVAILABLE;
+    case EDEADLK:  // Resource deadlock avoided
+#ifdef ESTALE
+    case ESTALE:  // Stale file handle
+#endif
+      return IREE_STATUS_ABORTED;
+    case ECANCELED:  // Operation cancelled
+      return IREE_STATUS_CANCELLED;
+    default:
+      return IREE_STATUS_UNKNOWN;
+  }
+}
+
+#if defined(IREE_PLATFORM_WINDOWS)
+// Converts a Win32 ERROR_* system error code to its closest canonical
+// iree_status_code_t, mapping unrecognized values to IREE_STATUS_UNKNOWN.
+IREE_API_EXPORT iree_status_code_t
+iree_status_code_from_win32_error(uint32_t error) {
+  switch (error) {
+    case ERROR_SUCCESS:
+      return IREE_STATUS_OK;
+    case ERROR_FILE_NOT_FOUND:
+    case ERROR_PATH_NOT_FOUND:
+      return IREE_STATUS_NOT_FOUND;
+    case ERROR_TOO_MANY_OPEN_FILES:
+    case ERROR_OUTOFMEMORY:
+    case ERROR_HANDLE_DISK_FULL:
+    case ERROR_HANDLE_EOF:
+      return IREE_STATUS_RESOURCE_EXHAUSTED;
+    case ERROR_ACCESS_DENIED:
+      return IREE_STATUS_PERMISSION_DENIED;
+    case ERROR_INVALID_HANDLE:
+      return IREE_STATUS_INVALID_ARGUMENT;
+    case ERROR_NOT_READY:
+    case ERROR_READ_FAULT:
+      return IREE_STATUS_UNAVAILABLE;
+    case ERROR_WRITE_FAULT:
+      return IREE_STATUS_DATA_LOSS;
+    case ERROR_NOT_SUPPORTED:
+      return IREE_STATUS_UNIMPLEMENTED;
+    default:
+      return IREE_STATUS_UNKNOWN;
+  }
+}
+#endif  // IREE_PLATFORM_WINDOWS
+
+//===----------------------------------------------------------------------===//
+// iree_status_t
+//===----------------------------------------------------------------------===//
+
+// Returns a stable string constant for |code| ("" for unrecognized values).
+// Keep in sync with the iree_status_code_t values.
+IREE_API_EXPORT const char* iree_status_code_string(iree_status_code_t code) {
+  const char* str = "";
+  switch (code) {
+    case IREE_STATUS_OK:
+      str = "OK";
+      break;
+    case IREE_STATUS_CANCELLED:
+      str = "CANCELLED";
+      break;
+    case IREE_STATUS_UNKNOWN:
+      str = "UNKNOWN";
+      break;
+    case IREE_STATUS_INVALID_ARGUMENT:
+      str = "INVALID_ARGUMENT";
+      break;
+    case IREE_STATUS_DEADLINE_EXCEEDED:
+      str = "DEADLINE_EXCEEDED";
+      break;
+    case IREE_STATUS_NOT_FOUND:
+      str = "NOT_FOUND";
+      break;
+    case IREE_STATUS_ALREADY_EXISTS:
+      str = "ALREADY_EXISTS";
+      break;
+    case IREE_STATUS_PERMISSION_DENIED:
+      str = "PERMISSION_DENIED";
+      break;
+    case IREE_STATUS_RESOURCE_EXHAUSTED:
+      str = "RESOURCE_EXHAUSTED";
+      break;
+    case IREE_STATUS_FAILED_PRECONDITION:
+      str = "FAILED_PRECONDITION";
+      break;
+    case IREE_STATUS_ABORTED:
+      str = "ABORTED";
+      break;
+    case IREE_STATUS_OUT_OF_RANGE:
+      str = "OUT_OF_RANGE";
+      break;
+    case IREE_STATUS_UNIMPLEMENTED:
+      str = "UNIMPLEMENTED";
+      break;
+    case IREE_STATUS_INTERNAL:
+      str = "INTERNAL";
+      break;
+    case IREE_STATUS_UNAVAILABLE:
+      str = "UNAVAILABLE";
+      break;
+    case IREE_STATUS_DATA_LOSS:
+      str = "DATA_LOSS";
+      break;
+    case IREE_STATUS_UNAUTHENTICATED:
+      str = "UNAUTHENTICATED";
+      break;
+    case IREE_STATUS_DEFERRED:
+      str = "DEFERRED";
+      break;
+    default:
+      break;
+  }
+  return str;
+}
+
+// TODO(#55): move payload methods/types to header when API is stabilized.
+
+// Opaque struct backing iree_status_t pointer values. The pointer bits hold
+// an iree_status_storage_t address with the status code packed into the low
+// bits (see iree_status_allocate and the iree_status_storage macro below).
+struct iree_status_handle_t {
+  uintptr_t value;
+};
+
+// Defines the type of an iree_status_payload_t.
+typedef enum iree_status_payload_type_e {
+  // Opaque; payload may still be formatted by a formatter but is not possible
+  // to retrieve by the programmatic APIs.
+  IREE_STATUS_PAYLOAD_TYPE_OPAQUE = 0,
+  // A string message annotation of type iree_status_payload_message_t.
+  IREE_STATUS_PAYLOAD_TYPE_MESSAGE = 1,
+  // Starting type ID for user payloads. IREE reserves all payloads with types
+  // less than this.
+  IREE_STATUS_PAYLOAD_TYPE_MIN_USER = 0x70000000u,
+} iree_status_payload_type_t;
+
+typedef struct iree_status_payload_t iree_status_payload_t;
+
+// Function that formats a payload into a human-readable string form for logs.
+typedef void(IREE_API_PTR* iree_status_payload_formatter_t)(
+    const iree_status_payload_t* payload, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length);
+
+// Header for optional status payloads.
+// Each status may have zero or more payloads associated with it that can later
+// be used to produce more detailed logging or programmatically query
+// information about an error.
+struct iree_status_payload_t {
+  // Next payload in the status payload linked list.
+  struct iree_status_payload_t* next;
+  // Payload type identifier used for programmatic access to payloads. May be
+  // IREE_STATUS_PAYLOAD_TYPE_OPAQUE if the payload cannot be accessed directly.
+  iree_status_payload_type_t type;
+  // Allocator used for the payload and associated resources.
+  iree_allocator_t allocator;
+  // String formatter callback used to write the payload into a string buffer.
+  // If not present then the payload will be mentioned but not dumped when the
+  // status is logged.
+  iree_status_payload_formatter_t formatter;
+};
+
+// A string message (IREE_STATUS_PAYLOAD_TYPE_MESSAGE).
+typedef struct iree_status_payload_message_t {
+  iree_status_payload_t header;
+  // String data reference. May point to an address immediately following this
+  // struct (if copied) or a constant string reference in rodata.
+  iree_string_view_t message;
+} iree_status_payload_message_t;
+
+// Allocated storage for an iree_status_t.
+// Only statuses that have either source information or payloads will have
+// storage allocated for them.
+typedef struct iree_status_storage_t {
+  // Optional singly-linked list of payloads associated with the status; the
+  // tail pointer enables O(1) appends. Head = first added, tail = last added.
+  iree_status_payload_t* payload_head;
+  iree_status_payload_t* payload_tail;
+
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_SOURCE_LOCATION) != 0
+  // __FILE__ of the originating status allocation.
+  const char* file;
+  // __LINE__ of the originating status allocation.
+  uint32_t line;
+#endif  // has IREE_STATUS_FEATURE_SOURCE_LOCATION
+
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS) != 0
+  // Optional message that is allocated either as a constant string in rodata or
+  // present as a suffix on the storage.
+  iree_string_view_t message;
+#endif  // has IREE_STATUS_FEATURE_ANNOTATIONS
+} iree_status_storage_t;
+
+// Returns the iree_status_storage_t* embedded in |status| by masking off the
+// status code packed into the low pointer bits; NULL for code-only statuses.
+#define iree_status_storage(status) \
+  ((iree_status_storage_t*)(((uintptr_t)(status) & ~IREE_STATUS_CODE_MASK)))
+
+// Links |payload| onto the tail of |storage|'s singly-linked payload list and
+// returns |status| unchanged so call sites can chain this in a return.
+static iree_status_t iree_status_append_payload(
+    iree_status_t status, iree_status_storage_t* storage,
+    iree_status_payload_t* payload) {
+  iree_status_payload_t* tail = storage->payload_tail;
+  if (tail) {
+    tail->next = payload;
+  } else {
+    // First payload: it becomes both head and tail.
+    storage->payload_head = payload;
+  }
+  storage->payload_tail = payload;
+  return status;
+}
+
+// Formats an iree_status_payload_message_t to the given output |buffer|.
+// Follows snprintf-style semantics: |out_buffer_length| receives the total
+// number of characters (excluding NUL) required to hold the entire message
+// regardless of how much was written, allowing callers to detect truncation
+// and to perform size queries by passing a NULL |buffer|. When |buffer| is
+// provided at most |buffer_capacity| bytes are written including an always
+// present NUL terminator.
+static void iree_status_payload_message_formatter(
+    const iree_status_payload_t* payload, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length) {
+  const iree_status_payload_message_t* message_payload =
+      (const iree_status_payload_message_t*)payload;
+  iree_host_size_t message_size = message_payload->message.size;
+  // Always report the full message length (snprintf-style) so that callers
+  // comparing against their remaining capacity can detect truncation.
+  *out_buffer_length = message_size;
+  if (!buffer || !buffer_capacity) return;
+  // Reserve one byte for the NUL terminator; the previous implementation
+  // copied up to |buffer_capacity| characters and then wrote the NUL at
+  // buffer[buffer_capacity], overflowing the buffer by one byte whenever the
+  // message had to be truncated.
+  iree_host_size_t n = message_size < buffer_capacity - 1 ? message_size
+                                                          : buffer_capacity - 1;
+  memcpy(buffer, message_payload->message.data, n);
+  buffer[n] = '\0';
+}
+
+// Captures the current stack and attaches it to the status storage.
+// A count of |skip_frames| will be skipped from the top of the stack.
+// Setting |skip_frames|=0 will include the caller in the stack while
+// |skip_frames|=1 will exclude it.
+static void iree_status_attach_stack_trace(iree_status_storage_t* storage,
+                                           int skip_frames) {
+  // Stack capture is not implemented yet; mark the parameters as used to keep
+  // -Wunused-parameter builds clean until it lands.
+  (void)storage;
+  (void)skip_frames;
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_STACK_TRACE) != 0
+  // TODO(#55): backtrace or other magic.
+#endif  // has IREE_STATUS_FEATURE_STACK_TRACE
+}
+
+// Allocates status storage and packs |code| into the low bits of the returned
+// pointer. |message| is captured by reference (not copied) and must outlive
+// the status; in practice it is a rodata string literal.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t
+iree_status_allocate(iree_status_code_t code, const char* file, uint32_t line,
+                     iree_string_view_t message) {
+#if IREE_STATUS_FEATURES == 0
+  // More advanced status code features like source location and messages are
+  // disabled. All statuses are just the codes.
+  return iree_status_from_code(code);
+#else
+  // No-op for OK statuses; we won't get these from the macros but may be called
+  // with this from marshaling code.
+  if (IREE_UNLIKELY(code == IREE_STATUS_OK)) return iree_ok_status();
+
+  // Allocate storage with the appropriate alignment such that we can pack the
+  // code in the lower bits of the pointer. Since failed statuses are rare and
+  // likely have much larger costs (like string formatting) the extra bytes for
+  // alignment are worth being able to avoid pointer dereferences and other
+  // things during the normal code paths that just check codes.
+  //
+  // Note that we are using the CRT allocation function here, as we can't trust
+  // our allocator system to work when we are throwing errors (as we may be
+  // allocating this error from a failed allocation!).
+  size_t storage_alignment = (IREE_STATUS_CODE_MASK + 1);
+  size_t storage_size =
+      iree_host_align(sizeof(iree_status_storage_t), storage_alignment);
+  iree_status_storage_t* storage = (iree_status_storage_t*)iree_aligned_alloc(
+      storage_alignment, storage_size);
+  // If storage allocation fails fall back to a code-only status.
+  if (IREE_UNLIKELY(!storage)) return iree_status_from_code(code);
+  memset(storage, 0, sizeof(*storage));
+
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_SOURCE_LOCATION) != 0
+  storage->file = file;
+  storage->line = line;
+#endif  // has IREE_STATUS_FEATURE_SOURCE_LOCATION
+
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS) != 0
+  // NOTE: messages are rodata strings here and not retained.
+  storage->message = message;
+#endif  // has IREE_STATUS_FEATURE_ANNOTATIONS
+
+  iree_status_attach_stack_trace(storage, /*skip_frames=*/1);
+  return (iree_status_t)((uintptr_t)storage | (code & IREE_STATUS_CODE_MASK));
+#endif  // has any IREE_STATUS_FEATURES
+}
+
+// Allocates a status with a printf-style formatted message.
+// Starts two va_lists over the same arguments since iree_status_allocate_vf
+// must walk them twice (once to measure, once to format) and a single va_list
+// can only be traversed once.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t
+iree_status_allocate_f(iree_status_code_t code, const char* file, uint32_t line,
+                       const char* format, ...) {
+  va_list size_args;
+  va_list write_args;
+  va_start(size_args, format);
+  va_start(write_args, format);
+  iree_status_t status =
+      iree_status_allocate_vf(code, file, line, format, size_args, write_args);
+  va_end(write_args);
+  va_end(size_args);
+  return status;
+}
+
+// Allocates a status with a printf-style formatted message.
+// |varargs_0| and |varargs_1| must be two independently-started va_lists over
+// the same arguments: the first is consumed measuring the message length and
+// the second writing it (a va_list may only be walked once).
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t iree_status_allocate_vf(
+    iree_status_code_t code, const char* file, uint32_t line,
+    const char* format, va_list varargs_0, va_list varargs_1) {
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS) == 0
+  // Annotations disabled; ignore the format string/args.
+  return iree_status_allocate(code, file, line, iree_string_view_empty());
+#else
+  // No-op for OK statuses; we won't get these from the macros but may be called
+  // with this from marshaling code.
+  if (IREE_UNLIKELY(code == IREE_STATUS_OK)) return iree_ok_status();
+
+  // Compute the total number of bytes (including NUL) required to store the
+  // message.
+  int message_size =
+      vsnprintf(/*buffer=*/NULL, /*buffer_count=*/0, format, varargs_0);
+  if (message_size < 0) return iree_status_from_code(code);
+  ++message_size;  // NUL byte
+
+  // Allocate storage with the additional room to store the formatted message.
+  // This avoids additional allocations for the common case of a message coming
+  // only from the original status error site.
+  size_t storage_alignment = (IREE_STATUS_CODE_MASK + 1);
+  size_t storage_size = iree_host_align(
+      sizeof(iree_status_storage_t) + message_size, storage_alignment);
+  iree_status_storage_t* storage = (iree_status_storage_t*)iree_aligned_alloc(
+      storage_alignment, storage_size);
+  if (IREE_UNLIKELY(!storage)) return iree_status_from_code(code);
+  memset(storage, 0, sizeof(*storage));
+
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_SOURCE_LOCATION) != 0
+  storage->file = file;
+  storage->line = line;
+#endif  // has IREE_STATUS_FEATURE_SOURCE_LOCATION
+
+  // vsnprintf directly into the message buffer appended to the storage.
+  storage->message.size = message_size - 1;
+  storage->message.data = (const char*)storage + sizeof(iree_status_storage_t);
+  int ret =
+      vsnprintf((char*)storage->message.data, message_size, format, varargs_1);
+  if (IREE_UNLIKELY(ret < 0)) {
+    iree_aligned_free(storage);
+    // Fall back to a code-only status; use iree_status_from_code like every
+    // other failure path here rather than casting the code to a pointer.
+    return iree_status_from_code(code);
+  }
+
+  iree_status_attach_stack_trace(storage, /*skip_frames=*/1);
+  return (iree_status_t)((uintptr_t)storage | (code & IREE_STATUS_CODE_MASK));
+#endif  // has IREE_STATUS_FEATURE_ANNOTATIONS
+}
+
+// Clones |status| into new storage owned by the returned status.
+// NOTE: only the code, source location, and message are carried over; any
+// attached payloads are not cloned.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t
+iree_status_clone(iree_status_t status) {
+#if IREE_STATUS_FEATURES == 0
+  // Statuses are just codes; nothing to do.
+  return status;
+#else
+  iree_status_storage_t* storage = iree_status_storage(status);
+  if (!storage) return status;
+
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_SOURCE_LOCATION) != 0
+  const char* file = storage->file;
+  uint32_t line = storage->line;
+#else
+  const char* file = NULL;
+  uint32_t line = 0;
+#endif  // has IREE_STATUS_FEATURE_SOURCE_LOCATION
+
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS) != 0
+  iree_string_view_t message = storage->message;
+#else
+  iree_string_view_t message = iree_string_view_empty();
+#endif  // has IREE_STATUS_FEATURE_ANNOTATIONS
+
+  // Always copy the message by performing the formatting as we don't know
+  // whether the original status has ownership or not.
+  return iree_status_allocate_f(iree_status_code(status), file, line, "%.*s",
+                                (int)message.size, message.data);
+#endif  // has no IREE_STATUS_FEATURES
+}
+
+// Frees the storage (and all attached payloads) associated with |status|.
+// Code-only statuses have no storage and are a no-op to free.
+IREE_API_EXPORT void iree_status_free(iree_status_t status) {
+#if IREE_STATUS_FEATURES != 0
+  iree_status_storage_t* storage = iree_status_storage(status);
+  if (!storage) return;
+  // Walk the payload list, releasing each with the allocator it came from.
+  for (iree_status_payload_t* payload = storage->payload_head;
+       payload != NULL;) {
+    iree_status_payload_t* next = payload->next;
+    iree_allocator_free(payload->allocator, payload);
+    payload = next;
+  }
+  iree_aligned_free(storage);
+#endif  // has any IREE_STATUS_FEATURES
+}
+
+// Frees any storage owned by |status| and returns OK so callers can write
+// `return iree_status_ignore(s);` to explicitly discard an error.
+IREE_API_EXPORT iree_status_t iree_status_ignore(iree_status_t status) {
+  // We can set an 'ignored' flag on the status so that we can otherwise assert
+  // in iree_status_free when statuses are freed without this being called.
+  // Hoping with the C++ Status wrapper we won't hit that often so that
+  // complexity is skipped for now.
+  iree_status_free(status);
+  return iree_ok_status();
+}
+
+// Returns |base_status| if it already failed (dropping |new_status|),
+// otherwise returns |new_status|.
+IREE_API_EXPORT iree_status_t iree_status_join(iree_status_t base_status,
+                                               iree_status_t new_status) {
+  // TODO(benvanik): annotate |base_status| with |new_status| so we see it?
+  // This is intended for failure handling and usually the first failure is the
+  // root cause and most important to see.
+  if (iree_status_is_ok(base_status)) return new_status;
+  // |base_status| already failed: free |new_status| so it doesn't leak.
+  iree_status_ignore(new_status);
+  return base_status;
+}
+
+// Prints |status| to stderr, frees it, and aborts the process.
+IREE_API_EXPORT IREE_ATTRIBUTE_NORETURN void iree_status_abort(
+    iree_status_t status) {
+  // Print first so the message is emitted even if the assert below is
+  // compiled out.
+  iree_status_fprint(stderr, status);
+  IREE_ASSERT(!iree_status_is_ok(status),
+              "only valid to call with failing status codes");
+  iree_status_free(status);
+  abort();
+}
+
+// Returns the status code of |status| and frees any storage it owns in one
+// step, for callers that only care about the code.
+IREE_API_EXPORT iree_status_code_t
+iree_status_consume_code(iree_status_t status) {
+  iree_status_code_t code = iree_status_code(status);
+  iree_status_free(status);
+  return code;
+}
+
+#if IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS
+
+// Annotates |base_status| with |message| captured by reference (not copied);
+// |message| must outlive the status. OK statuses and empty messages pass
+// through unchanged.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t
+iree_status_annotate(iree_status_t base_status, iree_string_view_t message) {
+  if (iree_status_is_ok(base_status) || iree_string_view_is_empty(message)) {
+    return base_status;
+  }
+
+  // If there's no storage yet we can just reuse normal allocation. Both that
+  // and this do not copy |message|.
+  iree_status_storage_t* storage = iree_status_storage(base_status);
+  if (!storage) {
+    return iree_status_allocate(iree_status_code(base_status), NULL, 0,
+                                message);
+  } else if (iree_string_view_is_empty(storage->message)) {
+    // Storage exists but has no message yet: take the slot directly.
+    storage->message = message;
+    return base_status;
+  }
+
+  // Storage already carries a message: attach this one as a payload.
+  // Annotation is best-effort; an allocation failure returns the base status
+  // unannotated rather than masking the original error.
+  iree_allocator_t allocator = iree_allocator_system();
+  iree_status_payload_message_t* payload = NULL;
+  iree_status_ignore(
+      iree_allocator_malloc(allocator, sizeof(*payload), (void**)&payload));
+  if (IREE_UNLIKELY(!payload)) return base_status;
+  memset(payload, 0, sizeof(*payload));
+  payload->header.type = IREE_STATUS_PAYLOAD_TYPE_MESSAGE;
+  payload->header.allocator = allocator;
+  payload->header.formatter = iree_status_payload_message_formatter;
+  payload->message = message;
+  return iree_status_append_payload(base_status, storage,
+                                    (iree_status_payload_t*)payload);
+}
+
+// Annotates |base_status| with a printf-style formatted message.
+// |varargs_0| and |varargs_1| must be two independently-started va_lists over
+// the same arguments (measure pass and write pass). Annotation is best-effort:
+// on any allocation/formatting failure the base status is returned unchanged.
+static IREE_MUST_USE_RESULT iree_status_t
+iree_status_annotate_vf(iree_status_t base_status, const char* format,
+                        va_list varargs_0, va_list varargs_1) {
+  if (iree_status_is_ok(base_status)) return base_status;
+
+  // If there's no storage yet we can just reuse normal allocation. Both that
+  // and this do not copy |message|.
+  iree_status_storage_t* storage = iree_status_storage(base_status);
+  if (!storage) {
+    return iree_status_allocate_vf(iree_status_code(base_status), NULL, 0,
+                                   format, varargs_0, varargs_1);
+  }
+
+  // Compute the total number of bytes (including NUL) required to store the
+  // message.
+  int message_size =
+      vsnprintf(/*buffer=*/NULL, /*buffer_count=*/0, format, varargs_0);
+  if (message_size < 0) return base_status;
+  ++message_size;  // NUL byte
+
+  // Allocate the payload with the additional room to store the formatted
+  // message so a single allocation covers both.
+  iree_allocator_t allocator = iree_allocator_system();
+  iree_status_payload_message_t* payload = NULL;
+  iree_status_ignore(iree_allocator_malloc(
+      allocator, sizeof(*payload) + message_size, (void**)&payload));
+  if (IREE_UNLIKELY(!payload)) return base_status;
+  memset(payload, 0, sizeof(*payload));
+  payload->header.type = IREE_STATUS_PAYLOAD_TYPE_MESSAGE;
+  payload->header.allocator = allocator;
+  payload->header.formatter = iree_status_payload_message_formatter;
+
+  // vsnprintf directly into message buffer.
+  payload->message.size = message_size - 1;
+  payload->message.data =
+      (const char*)payload + sizeof(iree_status_payload_message_t);
+  int ret = vsnprintf((char*)payload->message.data, payload->message.size + 1,
+                      format, varargs_1);
+  if (IREE_UNLIKELY(ret < 0)) {
+    // The payload came from iree_allocator_malloc above and must be released
+    // through the same allocator; the previous iree_aligned_free here mixed
+    // allocators (iree_aligned_free pairs only with iree_aligned_alloc).
+    iree_allocator_free(allocator, payload);
+    return base_status;
+  }
+  return iree_status_append_payload(base_status, storage,
+                                    (iree_status_payload_t*)payload);
+}
+
+// Annotates |base_status| with a printf-style formatted message.
+// Starts two va_lists over the same arguments because each may only be walked
+// once and iree_status_annotate_vf needs a measuring pass and a writing pass;
+// va_copy would work too but proper va_end management is trickier.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t IREE_PRINTF_ATTRIBUTE(2, 3)
+    iree_status_annotate_f(iree_status_t base_status, const char* format, ...) {
+  va_list size_args;
+  va_list write_args;
+  va_start(size_args, format);
+  va_start(write_args, format);
+  iree_status_t status =
+      iree_status_annotate_vf(base_status, format, size_args, write_args);
+  va_end(write_args);
+  va_end(size_args);
+  return status;
+}
+
+#endif  // has IREE_STATUS_FEATURE_ANNOTATIONS
+
+// Formats |status| (source location, code, message, payloads) into |buffer|
+// following snprintf-style semantics: pass a NULL |buffer| to measure the
+// total length required (excluding NUL). Once the buffer fills, |buffer| is
+// set to NULL locally and formatting continues in measure-only mode so the
+// full required length is still accumulated.
+IREE_API_EXPORT bool iree_status_format(iree_status_t status,
+                                        iree_host_size_t buffer_capacity,
+                                        char* buffer,
+                                        iree_host_size_t* out_buffer_length) {
+  *out_buffer_length = 0;
+
+  // Grab storage which may have a message and zero or more payloads.
+  iree_status_storage_t* storage IREE_ATTRIBUTE_UNUSED =
+      iree_status_storage(status);
+
+  // Prefix with source location and status code string (may be 'OK').
+  iree_host_size_t buffer_length = 0;
+  iree_status_code_t status_code = iree_status_code(status);
+  int n = 0;
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_SOURCE_LOCATION) != 0
+  if (storage && storage->file) {
+    n = snprintf(buffer ? buffer + buffer_length : NULL,
+                 buffer ? buffer_capacity - buffer_length : 0, "%s:%d: %s",
+                 storage->file, storage->line,
+                 iree_status_code_string(status_code));
+  } else {
+    n = snprintf(buffer ? buffer + buffer_length : NULL,
+                 buffer ? buffer_capacity - buffer_length : 0, "%s",
+                 iree_status_code_string(status_code));
+  }
+#else
+  n = snprintf(buffer ? buffer + buffer_length : NULL,
+               buffer ? buffer_capacity - buffer_length : 0, "%s",
+               iree_status_code_string(status_code));
+#endif  // has IREE_STATUS_FEATURE_SOURCE_LOCATION
+  // snprintf returns the required length; >= remaining capacity means the
+  // output was truncated and we switch to measure-only mode.
+  if (IREE_UNLIKELY(n < 0)) {
+    return false;
+  } else if (buffer && n >= buffer_capacity - buffer_length) {
+    buffer = NULL;
+  }
+  buffer_length += n;
+
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS) != 0
+  // Append base storage message.
+  if (storage && !iree_string_view_is_empty(storage->message)) {
+    n = snprintf(buffer ? buffer + buffer_length : NULL,
+                 buffer ? buffer_capacity - buffer_length : 0, "; %.*s",
+                 (int)storage->message.size, storage->message.data);
+    if (IREE_UNLIKELY(n < 0)) {
+      return false;
+    } else if (buffer && n >= buffer_capacity - buffer_length) {
+      buffer = NULL;
+    }
+    buffer_length += n;
+  }
+#endif  // has IREE_STATUS_FEATURE_ANNOTATIONS
+
+#if IREE_STATUS_FEATURES != 0
+  // Append each payload separated by '; '.
+  iree_status_payload_t* payload = storage ? storage->payload_head : NULL;
+  while (payload != NULL) {
+    // Skip payloads that have no textual representation.
+    if (!payload->formatter) {
+      payload = payload->next;
+      continue;
+    }
+
+    // Append a '; ' separator to join with the message above and any other
+    // payloads.
+    if (buffer) {
+      if (2 >= buffer_capacity - buffer_length) {
+        buffer = NULL;
+      } else {
+        buffer[buffer_length] = ';';
+        buffer[buffer_length + 1] = ' ';
+        buffer[buffer_length + 2] = '\0';
+      }
+    }
+    buffer_length += 2;  // '; '
+
+    // Append payload via custom formatter callback.
+    iree_host_size_t payload_buffer_length = 0;
+    payload->formatter(payload, buffer ? buffer_capacity - buffer_length : 0,
+                       buffer ? buffer + buffer_length : NULL,
+                       &payload_buffer_length);
+    if (buffer && payload_buffer_length >= buffer_capacity - buffer_length) {
+      buffer = NULL;
+    }
+    buffer_length += payload_buffer_length;
+
+    payload = payload->next;
+  }
+#endif  // has IREE_STATUS_FEATURES
+
+  *out_buffer_length = buffer_length;
+  return true;
+}
+
+// Converts the status to an allocated string value using the given allocator.
+// On success the caller takes ownership of |*out_buffer| and must free it
+// with |allocator|.
+static bool iree_status_to_string(iree_status_t status,
+                                  iree_allocator_t allocator, char** out_buffer,
+                                  iree_host_size_t* out_buffer_length) {
+  *out_buffer_length = 0;
+
+  // First pass: measure the total formatted length (excluding NUL).
+  iree_host_size_t required_length = 0;
+  if (IREE_UNLIKELY(!iree_status_format(status, /*buffer_capacity=*/0,
+                                        /*buffer=*/NULL, &required_length))) {
+    return false;
+  }
+
+  // Allocate +1 for the NUL terminator (see snprintf).
+  char* buffer = NULL;
+  iree_status_t malloc_status =
+      iree_allocator_malloc(allocator, required_length + 1, (void**)&buffer);
+  if (!iree_status_is_ok(malloc_status)) {
+    iree_status_ignore(malloc_status);
+    return false;
+  }
+
+  // Second pass: format into the exactly-sized buffer.
+  if (!iree_status_format(status, required_length + 1, buffer,
+                          out_buffer_length)) {
+    iree_allocator_free(allocator, buffer);
+    return false;
+  }
+  *out_buffer = buffer;
+  return true;
+}
+
+// Writes a formatted representation of |status| to |file| followed by a
+// newline, flushing afterward; emits "(?)" if the status can't be formatted.
+// Uses a transient system-allocator buffer for the formatted text.
+IREE_API_EXPORT void iree_status_fprint(FILE* file, iree_status_t status) {
+  // TODO(benvanik): better support for colors/etc - possibly move to logging.
+  // TODO(benvanik): do this without allocation by streaming the status.
+  iree_allocator_t allocator = iree_allocator_system();
+  char* status_buffer = NULL;
+  iree_host_size_t status_buffer_length = 0;
+  if (iree_status_to_string(status, allocator, &status_buffer,
+                            &status_buffer_length)) {
+    fprintf(file, "%.*s\n", (int)status_buffer_length, status_buffer);
+    iree_allocator_free(allocator, status_buffer);
+  } else {
+    fprintf(file, "(?)\n");
+  }
+  fflush(file);
+}
diff --git a/runtime/src/iree/base/status.h b/runtime/src/iree/base/status.h
new file mode 100644
index 0000000..7192069
--- /dev/null
+++ b/runtime/src/iree/base/status.h
@@ -0,0 +1,505 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_STATUS_H_
+#define IREE_BASE_STATUS_H_
+
+#include <errno.h>
+#include <memory.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/attributes.h"
+#include "iree/base/config.h"
+#include "iree/base/string_view.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// IREE_STATUS_FEATURE flags and IREE_STATUS_MODE setting
+//===----------------------------------------------------------------------===//
+
+// Captures origin source information on a call to iree_make_status.
+// Status storage will be allocated and reference the __FILE__ and __LINE__
+// of where it is invoked.
+#define IREE_STATUS_FEATURE_SOURCE_LOCATION (1 << 0)
+
+// Captures annotation messages provided via iree_make_status or
+// iree_status_annotate.
+// Status storage will be allocated.
+#define IREE_STATUS_FEATURE_ANNOTATIONS (1 << 1)
+
+// Captures the current callstack on a call to iree_make_status.
+// Status storage will be allocated.
+#define IREE_STATUS_FEATURE_STACK_TRACE (1 << 2)
+
+// Set IREE_STATUS_FEATURES based on IREE_STATUS_MODE if the user hasn't
+// overridden it with more specific settings.
+//
+// IREE_STATUS_MODE = 0: statuses are just integers
+// IREE_STATUS_MODE = 1: statuses have source location of error
+// IREE_STATUS_MODE = 2: statuses also have custom annotations
+// IREE_STATUS_MODE = 3: statuses also have stack traces of the error site
+//
+// NOTE(review): if IREE_STATUS_MODE is undefined (or any value other than
+// 1/2/3) all features are disabled here; iree/base/config.h presumably
+// supplies a default mode before this point - confirm.
+#if !defined(IREE_STATUS_FEATURES)
+#if defined(IREE_STATUS_MODE) && IREE_STATUS_MODE == 1
+#define IREE_STATUS_FEATURES (IREE_STATUS_FEATURE_SOURCE_LOCATION)
+#elif defined(IREE_STATUS_MODE) && IREE_STATUS_MODE == 2
+#define IREE_STATUS_FEATURES \
+  (IREE_STATUS_FEATURE_SOURCE_LOCATION | IREE_STATUS_FEATURE_ANNOTATIONS)
+#elif defined(IREE_STATUS_MODE) && IREE_STATUS_MODE == 3
+#define IREE_STATUS_FEATURES                                               \
+  (IREE_STATUS_FEATURE_SOURCE_LOCATION | IREE_STATUS_FEATURE_ANNOTATIONS | \
+   IREE_STATUS_FEATURE_STACK_TRACE)
+#else
+#define IREE_STATUS_FEATURES 0
+#endif  // IREE_STATUS_MODE
+#endif  // !IREE_STATUS_FEATURES
+
+//===----------------------------------------------------------------------===//
+// iree_status_t and error reporting
+//===----------------------------------------------------------------------===//
+
+// Well-known status codes matching iree::StatusCode.
+// Note that any code within IREE_STATUS_CODE_MASK is valid even if not
+// enumerated here. Always check for unhandled errors/have default conditions.
+typedef enum iree_status_code_e {
+  // Successful operation.
+  IREE_STATUS_OK = 0,
+
+  // Operation was cancelled by the caller.
+  IREE_STATUS_CANCELLED = 1,
+
+  // Unknown error, or error that could not be mapped to this enum.
+  IREE_STATUS_UNKNOWN = 2,
+
+  // The caller provided an invalid argument; future calls with the same
+  // arguments will fail. If the failure is predicated on system state that may
+  // change prefer IREE_STATUS_OUT_OF_RANGE.
+  IREE_STATUS_INVALID_ARGUMENT = 3,
+
+  // A deadline was exceeded before the call could complete.
+  // This can be returned even if the operation would have completed
+  // successfully had the deadline not been met.
+  IREE_STATUS_DEADLINE_EXCEEDED = 4,
+
+  // A referenced resource could not be found or was unavailable to all
+  // requesters. IREE_STATUS_PERMISSION_DENIED should be used if only an
+  // individual requester is denied access.
+  IREE_STATUS_NOT_FOUND = 5,
+
+  // The resource the caller attempted to create already exists.
+  IREE_STATUS_ALREADY_EXISTS = 6,
+
+  // The caller does not have permission to execute the operation or have access
+  // to the requested resources.
+  IREE_STATUS_PERMISSION_DENIED = 7,
+
+  // Some resource type has been exhausted and the operation is unable to
+  // reserve what it requires, either by quota or underlying system exhaustion.
+  IREE_STATUS_RESOURCE_EXHAUSTED = 8,
+
+  // The operation was rejected because the system is not in a state required
+  // for the operation's execution.
+  //
+  // Use IREE_STATUS_UNAVAILABLE if the caller can retry the operation.
+  // Use IREE_STATUS_ABORTED if the caller should restart their transaction
+  // (the entire sequence of operations is invalid).
+  // Use IREE_STATUS_FAILED_PRECONDITION if the caller should not retry until
+  // the system state has been explicitly fixed.
+  IREE_STATUS_FAILED_PRECONDITION = 9,
+
+  // The operation was aborted by the system.
+  // If responding to a caller-requested cancellation use IREE_STATUS_CANCELLED.
+  IREE_STATUS_ABORTED = 10,
+
+  // The operation was attempted past the valid range (of a resource, etc).
+  // Indicates the operation can be retried if the system state is fixed.
+  IREE_STATUS_OUT_OF_RANGE = 11,
+
+  // Operation has not been implemented or is not supported.
+  IREE_STATUS_UNIMPLEMENTED = 12,
+
+  // An internal error has occurred and some invariants expected by an
+  // underlying system have been violated. This error code is reserved for
+  // serious errors.
+  IREE_STATUS_INTERNAL = 13,
+
+  // The system used to perform the operation is currently (and transiently)
+  // unavailable. Callers can retry with backoff.
+  IREE_STATUS_UNAVAILABLE = 14,
+
+  // A serious unrecoverable data loss or corruption has occurred.
+  // Indicates that an underlying system or resource has failed in such a way
+  // that all related operations may produce incorrect results.
+  IREE_STATUS_DATA_LOSS = 15,
+
+  // The requested operation does not have proper authentication.
+  // Callers can correct this and retry.
+  IREE_STATUS_UNAUTHENTICATED = 16,
+
+  // The operation has been deferred and must be resumed at a future point.
+  // Used by resumable operations as part of scheduling and execution systems.
+  // Callers that do not handle deferred execution can treat this as a failure.
+  IREE_STATUS_DEFERRED = 17,
+
+  // Mask of the bits within a status value that hold the status code; the
+  // remaining bits hold an optional storage pointer (see iree_status_t below).
+  IREE_STATUS_CODE_MASK = 0x1Fu,
+} iree_status_code_t;
+
+// Opaque status structure containing an iree_status_code_t and optional status
+// object with more detailed information and payloads.
+//
+// The status value uses the lower 5 bits to store the iree_status_code_t and
+// the remaining uintptr_t bits to store an optional status payload pointer.
+// An OK status will always be bit-equivalent to 0 to make success/failure
+// checks as cheap as an integer non-zero comparison. As the payload is optional
+// it's legal to construct an iree_status_t from an iree_status_code_t directly
+// meaning `return iree_status_from_code(IREE_STATUS_INTERNAL);` (etc) is valid,
+// though not as useful as constructing via iree_make_status (which captures
+// additional info).
+typedef struct iree_status_handle_t* iree_status_t;
+
+// Returns an iree_status_t from the an iree_status_code_t.
+#define iree_status_from_code(code)                          \
+  ((iree_status_t)((uintptr_t)((iree_status_code_t)(code)) & \
+                   IREE_STATUS_CODE_MASK))
+
+// Returns the iree_status_code_t from an iree_status_t.
+#define iree_status_code(value) \
+  ((iree_status_code_t)(((uintptr_t)(value)) & IREE_STATUS_CODE_MASK))
+
+// Macros to check the value of a status code.
+// NOTE: iree_status_is_ok compares the entire value against 0 (no masking):
+// OK statuses never carry a payload pointer so the whole word is 0. All other
+// predicates mask off the code bits first via iree_status_code.
+#define iree_status_is_ok(value) \
+  IREE_LIKELY((uintptr_t)(value) == IREE_STATUS_OK)
+#define iree_status_is_cancelled(value) \
+  (iree_status_code(value) == IREE_STATUS_CANCELLED)
+#define iree_status_is_unknown(value) \
+  (iree_status_code(value) == IREE_STATUS_UNKNOWN)
+#define iree_status_is_invalid_argument(value) \
+  (iree_status_code(value) == IREE_STATUS_INVALID_ARGUMENT)
+#define iree_status_is_deadline_exceeded(value) \
+  (iree_status_code(value) == IREE_STATUS_DEADLINE_EXCEEDED)
+#define iree_status_is_not_found(value) \
+  (iree_status_code(value) == IREE_STATUS_NOT_FOUND)
+#define iree_status_is_already_exists(value) \
+  (iree_status_code(value) == IREE_STATUS_ALREADY_EXISTS)
+#define iree_status_is_permission_denied(value) \
+  (iree_status_code(value) == IREE_STATUS_PERMISSION_DENIED)
+#define iree_status_is_resource_exhausted(value) \
+  (iree_status_code(value) == IREE_STATUS_RESOURCE_EXHAUSTED)
+#define iree_status_is_failed_precondition(value) \
+  (iree_status_code(value) == IREE_STATUS_FAILED_PRECONDITION)
+#define iree_status_is_aborted(value) \
+  (iree_status_code(value) == IREE_STATUS_ABORTED)
+#define iree_status_is_out_of_range(value) \
+  (iree_status_code(value) == IREE_STATUS_OUT_OF_RANGE)
+#define iree_status_is_unimplemented(value) \
+  (iree_status_code(value) == IREE_STATUS_UNIMPLEMENTED)
+#define iree_status_is_internal(value) \
+  (iree_status_code(value) == IREE_STATUS_INTERNAL)
+#define iree_status_is_unavailable(value) \
+  (iree_status_code(value) == IREE_STATUS_UNAVAILABLE)
+#define iree_status_is_data_loss(value) \
+  (iree_status_code(value) == IREE_STATUS_DATA_LOSS)
+#define iree_status_is_unauthenticated(value) \
+  (iree_status_code(value) == IREE_STATUS_UNAUTHENTICATED)
+#define iree_status_is_deferred(value) \
+  (iree_status_code(value) == IREE_STATUS_DEFERRED)
+
+// Token-pasting helpers used to mint unique per-expansion local variable
+// names (e.g. __status_<__COUNTER__> in IREE_RETURN_IF_ERROR).
+#define IREE_STATUS_IMPL_CONCAT_INNER_(x, y) x##y
+#define IREE_STATUS_IMPL_CONCAT_(x, y) IREE_STATUS_IMPL_CONCAT_INNER_(x, y)
+
+// Forces an extra macro expansion pass; NOTE(review): this idiom is
+// presumably needed for MSVC's traditional preprocessor, which otherwise
+// forwards __VA_ARGS__ as a single argument - confirm.
+#define IREE_STATUS_IMPL_IDENTITY_(...) __VA_ARGS__
+// Split an (expr, args...) pack into the leading expression / trailing args.
+#define IREE_STATUS_IMPL_GET_EXPR_(expr, ...) expr
+#define IREE_STATUS_IMPL_GET_ARGS_(expr, ...) __VA_ARGS__
+// Arity-based macro selector: callers append a list of candidate macros after
+// the user arguments so that the number of user arguments determines which
+// candidate lands in the 16th position and is returned (classic
+// overload-by-argument-count preprocessor trick).
+#define IREE_STATUS_IMPL_GET_MACRO_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, \
+                                    _10, _11, _12, _13, _14, ...)           \
+  IREE_STATUS_IMPL_IDENTITY_(                                               \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_GET_EXPR_)(__VA_ARGS__))
+
+// iree_make_status(code): allocate with an empty message.
+#define IREE_STATUS_IMPL_MAKE_EMPTY_(file, line, status_code, ...) \
+  iree_status_allocate(status_code, file, line, iree_string_view_empty())
+// iree_make_status(code, "message"): allocate with a literal message.
+#define IREE_STATUS_IMPL_MAKE_ANNOTATE_(file, line, status_code, message) \
+  iree_status_allocate(status_code, file, line, iree_make_cstring_view(message))
+// iree_make_status(code, format, args...): allocate with a printf format.
+#define IREE_STATUS_IMPL_MAKE_ANNOTATE_F_(file, line, status_code, ...) \
+  iree_status_allocate_f(status_code, file, line, __VA_ARGS__)
+// Dispatches to one of the MAKE_ variants above by arity: code only ->
+// EMPTY_, code + message -> ANNOTATE_, code + format + args -> ANNOTATE_F_.
+#define IREE_STATUS_IMPL_MAKE_SWITCH_(file, line, ...)                      \
+  IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_IDENTITY_(                    \
+      IREE_STATUS_IMPL_GET_MACRO_)(                                         \
+      __VA_ARGS__, IREE_STATUS_IMPL_MAKE_ANNOTATE_F_,                       \
+      IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, \
+      IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, \
+      IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, \
+      IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, \
+      IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, \
+      IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, IREE_STATUS_IMPL_MAKE_ANNOTATE_F_, \
+      IREE_STATUS_IMPL_MAKE_ANNOTATE_, IREE_STATUS_IMPL_MAKE_EMPTY_))       \
+  (file, line, IREE_STATUS_IMPL_GET_EXPR_(__VA_ARGS__),                     \
+   IREE_STATUS_IMPL_GET_ARGS_(__VA_ARGS__))
+
+// Annotation dispatch used by IREE_RETURN_IF_ERROR and friends: with no
+// annotation args the failing status passes through unchanged (PASS_), one
+// string arg annotates (ANNOTATE_), and format + args annotate with printf
+// formatting (ANNOTATE_F_).
+#define IREE_STATUS_IMPL_PASS_(var, ...) var
+#define IREE_STATUS_IMPL_ANNOTATE_(var, ...)                  \
+  IREE_STATUS_IMPL_IDENTITY_(iree_status_annotate(            \
+      var, iree_make_cstring_view(IREE_STATUS_IMPL_IDENTITY_( \
+               IREE_STATUS_IMPL_GET_ARGS_)(__VA_ARGS__))))
+#define IREE_STATUS_IMPL_ANNOTATE_F_(var, ...)       \
+  IREE_STATUS_IMPL_IDENTITY_(iree_status_annotate_f( \
+      var,                                           \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_GET_ARGS_)(__VA_ARGS__)))
+#define IREE_STATUS_IMPL_ANNOTATE_SWITCH_(...)                                 \
+  IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_IDENTITY_(                       \
+      IREE_STATUS_IMPL_GET_MACRO_)(                                            \
+      __VA_ARGS__, IREE_STATUS_IMPL_ANNOTATE_F_, IREE_STATUS_IMPL_ANNOTATE_F_, \
+      IREE_STATUS_IMPL_ANNOTATE_F_, IREE_STATUS_IMPL_ANNOTATE_F_,              \
+      IREE_STATUS_IMPL_ANNOTATE_F_, IREE_STATUS_IMPL_ANNOTATE_F_,              \
+      IREE_STATUS_IMPL_ANNOTATE_F_, IREE_STATUS_IMPL_ANNOTATE_F_,              \
+      IREE_STATUS_IMPL_ANNOTATE_F_, IREE_STATUS_IMPL_ANNOTATE_F_,              \
+      IREE_STATUS_IMPL_ANNOTATE_F_, IREE_STATUS_IMPL_ANNOTATE_F_,              \
+      IREE_STATUS_IMPL_ANNOTATE_, IREE_STATUS_IMPL_PASS_))                     \
+  (IREE_STATUS_IMPL_GET_EXPR_(__VA_ARGS__),                                    \
+   IREE_STATUS_IMPL_GET_ARGS_(__VA_ARGS__))
+// Binds |var| to the evaluated expression; on a non-OK status returns the
+// (optionally annotated) status from the enclosing function.
+#define IREE_STATUS_IMPL_RETURN_IF_API_ERROR_(var, ...)                      \
+  iree_status_t var = (IREE_STATUS_IMPL_IDENTITY_(                           \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_GET_EXPR_)(__VA_ARGS__))); \
+  if (IREE_UNLIKELY(var)) {                                                  \
+    return IREE_STATUS_IMPL_ANNOTATE_SWITCH_(var, __VA_ARGS__);              \
+  }
+// As above but evaluates |tail_expr| (cleanup) on the error path only,
+// immediately before returning.
+#define IREE_STATUS_IMPL_RETURN_AND_EVAL_IF_API_ERROR_(tail_expr, var, ...)  \
+  iree_status_t var = (IREE_STATUS_IMPL_IDENTITY_(                           \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_GET_EXPR_)(__VA_ARGS__))); \
+  if (IREE_UNLIKELY(var)) {                                                  \
+    (tail_expr);                                                             \
+    return IREE_STATUS_IMPL_ANNOTATE_SWITCH_(var, __VA_ARGS__);              \
+  }
+
+// Evaluates |expr| and frees any non-OK status storage without acting on it.
+#define IREE_STATUS_IMPL_IGNORE_ERROR_(var, expr) \
+  iree_status_t var = (expr);                     \
+  if (IREE_UNLIKELY(var)) iree_status_ignore(var);
+
+// Evaluates |expr| and aborts the process on a non-OK status.
+#define IREE_STATUS_IMPL_CHECK_OK_(var, expr) \
+  iree_status_t var = (expr);                 \
+  if (IREE_UNLIKELY(var)) iree_status_abort(var);
+
+// We cut out all status storage code when not used.
+#if IREE_STATUS_FEATURES == 0
+// With all features disabled a status is just its code: no allocation, no
+// source location, and every message/format argument is dropped at compile
+// time.
+#define IREE_STATUS_IMPL_MAKE_(code, ...) \
+  (iree_status_t)(uintptr_t)((code)&IREE_STATUS_CODE_MASK)
+#define IREE_STATUS_IMPL_MAKE_LOC_(file, line, code, ...) \
+  IREE_STATUS_IMPL_MAKE_(code)
+// Redefined without the ANNOTATE_SWITCH_ since there is nothing to annotate.
+#undef IREE_STATUS_IMPL_RETURN_IF_API_ERROR_
+#define IREE_STATUS_IMPL_RETURN_IF_API_ERROR_(var, ...)                      \
+  iree_status_t var = (IREE_STATUS_IMPL_IDENTITY_(                           \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_GET_EXPR_)(__VA_ARGS__))); \
+  if (IREE_UNLIKELY(var)) return var;
+#undef IREE_STATUS_IMPL_RETURN_AND_EVAL_IF_API_ERROR_
+#define IREE_STATUS_IMPL_RETURN_AND_EVAL_IF_API_ERROR_(tail_expr, var, ...)  \
+  iree_status_t var = (IREE_STATUS_IMPL_IDENTITY_(                           \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_GET_EXPR_)(__VA_ARGS__))); \
+  if (IREE_UNLIKELY(var)) {                                                  \
+    (tail_expr);                                                             \
+    return var;                                                              \
+  }
+// No storage to free: ignoring is a no-op beyond evaluating the expression.
+#undef IREE_STATUS_IMPL_IGNORE_ERROR_
+#define IREE_STATUS_IMPL_IGNORE_ERROR_(var, expr) \
+  iree_status_t var = (expr);                     \
+  (void)(var);
+// NOTE(review): aborts directly via abort() instead of iree_status_abort,
+// presumably to avoid pulling in status printing when storage is compiled
+// out - confirm.
+#undef IREE_STATUS_IMPL_CHECK_OK_
+#define IREE_STATUS_IMPL_CHECK_OK_(var, expr) \
+  iree_status_t var = (expr);                 \
+  if (IREE_UNLIKELY(!iree_status_is_ok(var))) abort();
+#else
+#define IREE_STATUS_IMPL_MAKE_(...) \
+  IREE_STATUS_IMPL_MAKE_SWITCH_(__FILE__, __LINE__, __VA_ARGS__)
+#define IREE_STATUS_IMPL_MAKE_LOC_(file, line, ...) \
+  IREE_STATUS_IMPL_MAKE_SWITCH_(file, line, __VA_ARGS__)
+#endif  // !IREE_STATUS_FEATURES
+
+// Returns an IREE_STATUS_OK.
+#define iree_ok_status() iree_status_from_code(IREE_STATUS_OK)
+
+// Makes an iree_status_t with the given iree_status_code_t code and records
+// the current source location.
+//
+// Optionally either a message string literal or printf-style format string may
+// be associated with the status.
+//
+// Examples:
+//  return iree_make_status(IREE_STATUS_CANCELLED);
+//  return iree_make_status(IREE_STATUS_CANCELLED, "because reasons");
+//  return iree_make_status(IREE_STATUS_CANCELLED, "because %d > %d", a, b);
+#define iree_make_status IREE_STATUS_IMPL_MAKE_
+
+// Makes an iree_status_t with the given iree_status_code_t code using the given
+// source location. Besides taking the file and line of the source location this
+// is the same as iree_make_status.
+//
+// Examples:
+//  return iree_make_status_with_location(
+//      "file.c", 40, IREE_STATUS_CANCELLED, "because %d > %d", a, b);
+#define iree_make_status_with_location IREE_STATUS_IMPL_MAKE_LOC_
+
+// Propagates the error returned by (expr) by returning from the current
+// function on non-OK status. Optionally annotates the status with additional
+// information (see iree_status_annotate for more information).
+//
+// NOTE: the expansion declares a uniquely-named local (via __COUNTER__) and
+// an if statement, so it must be used where a declaration is valid (as a full
+// statement, not inside an expression).
+//
+// Example:
+//  iree_status_t OtherFunc(...);
+//  iree_status_t MyFunc(...) {
+//    IREE_RETURN_IF_ERROR(OtherFunc(...));
+//    IREE_RETURN_IF_ERROR(OtherFunc(...), "with a message");
+//    IREE_RETURN_IF_ERROR(OtherFunc(...), "with a value: %d", 5);
+//    return iree_ok_status();
+//  }
+#define IREE_RETURN_IF_ERROR(...)                       \
+  IREE_STATUS_IMPL_RETURN_IF_API_ERROR_(                \
+      IREE_STATUS_IMPL_CONCAT_(__status_, __COUNTER__), \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_IDENTITY_(__VA_ARGS__)))
+
+// IREE_RETURN_IF_ERROR with a custom expression to evaluate before returning.
+// |tail_expr| (e.g. cleanup) is evaluated only on the error path.
+#define IREE_RETURN_AND_EVAL_IF_ERROR(tail_expr, ...)              \
+  IREE_STATUS_IMPL_RETURN_AND_EVAL_IF_API_ERROR_(                  \
+      tail_expr, IREE_STATUS_IMPL_CONCAT_(__status_, __COUNTER__), \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_IDENTITY_(__VA_ARGS__)))
+
+// Ignores the status result of (expr) regardless of its value.
+//
+// Example:
+//  IREE_IGNORE_ERROR(some_fn_that_may_fail());
+#define IREE_IGNORE_ERROR(expr)   \
+  IREE_STATUS_IMPL_IGNORE_ERROR_( \
+      IREE_STATUS_IMPL_CONCAT_(__status_, __COUNTER__), (expr))
+
+// Aborts the program if the result of (expr) is not IREE_STATUS_OK.
+//
+// WARNING: this should only be used when absolutely required and avoided in any
+// core IREE code. Aborting is a very user-hostile behavior and on some systems
+// can cause major issues. Prefer instead to properly handle errors and route
+// them through hosting application infrastructure in a way that preserves more
+// context than just an instruction pointer and a SIGABRT.
+//
+// Example:
+//  IREE_CHECK_OK(some_fn_that_may_fail());
+#define IREE_CHECK_OK(expr)                                                    \
+  IREE_STATUS_IMPL_CHECK_OK_(IREE_STATUS_IMPL_CONCAT_(__status_, __COUNTER__), \
+                             (expr))
+
+// Returns the canonical status code for the given errno value.
+// https://en.cppreference.com/w/cpp/error/errno_macros
+IREE_API_EXPORT iree_status_code_t
+iree_status_code_from_errno(int error_number);
+
+#if defined(_WIN32) || defined(_WIN64)
+// Returns the canonical status code for the given Win32 GetLastError code.
+// https://docs.microsoft.com/en-us/windows/win32/api/errhandlingapi/nf-errhandlingapi-getlasterror
+IREE_API_EXPORT iree_status_code_t
+iree_status_code_from_win32_error(uint32_t error);
+#endif  // _WIN32 || _WIN64
+
+// Returns a NUL-terminated string constant for the given status code, such as
+// IREE_STATUS_UNAVAILABLE = "UNAVAILABLE". Do not rely on string-matching the
+// result as the exact text may change.
+IREE_API_EXPORT const char* iree_status_code_string(iree_status_code_t code);
+
+// Allocates a new status instance for a failing error |code|.
+// |file| and |line| should be populated with __FILE__ and __LINE__ at the call
+// site and an optional string |message| may be provided.
+//
+// The status will be allocated using the default system allocator and must be
+// freed using either iree_status_free or iree_status_ignore.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t
+iree_status_allocate(iree_status_code_t code, const char* file, uint32_t line,
+                     iree_string_view_t message);
+
+// Allocates a new status instance for a failing error |code| and annotates it
+// with a printf-style format string. Roughly equivalent (though more efficient)
+// than iree_status_allocate + iree_status_annotate_f.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t IREE_PRINTF_ATTRIBUTE(4, 5)
+    iree_status_allocate_f(iree_status_code_t code, const char* file,
+                           uint32_t line, const char* format, ...);
+
+// As iree_status_allocate_f but takes pre-captured va_lists.
+// NOTE(review): two va_list copies are taken, presumably because the
+// implementation walks the arguments twice (measure then format) - confirm
+// against status.c.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t iree_status_allocate_vf(
+    iree_status_code_t code, const char* file, uint32_t line,
+    const char* format, va_list varargs_0, va_list varargs_1);
+
+// Clones |status| into a new status instance.
+// No payloads, if present, will be cloned.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t
+iree_status_clone(iree_status_t status);
+
+// Frees |status| if it has any associated storage.
+IREE_API_EXPORT void iree_status_free(iree_status_t status);
+
+// Ignores |status| regardless of its value and frees any associated payloads.
+// Returns an OK status that can be used when chaining.
+IREE_API_EXPORT iree_status_t iree_status_ignore(iree_status_t status);
+
+// Returns a new status that is |base_status| if not OK and otherwise returns
+// |new_status|. This allows for chaining failure handling code that may also
+// return statuses.
+//
+// Example:
+//   iree_status_t status = do_something();
+//   return iree_status_join(status, do_cleanup());
+IREE_API_EXPORT iree_status_t iree_status_join(iree_status_t base_status,
+                                               iree_status_t new_status);
+
+// Aborts the program with a failing |status|.
+// This will trigger a SIGABRT. It's best not to use this at all outside of
+// demos or tools.
+IREE_API_EXPORT IREE_ATTRIBUTE_NORETURN void iree_status_abort(
+    iree_status_t status);
+
+// Consumes the |status| by freeing its storage and returning its code.
+IREE_API_EXPORT iree_status_code_t
+iree_status_consume_code(iree_status_t status);
+
+// NOTE: varargs don't optimize well so we hard-no-op the functions when
+// annotations are not enabled.
+#if IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS
+
+// Annotates a status message with the given constant string message.
+// Ignored if |base_status| is OK.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t
+iree_status_annotate(iree_status_t base_status, iree_string_view_t message);
+
+// Annotates a status message with the given printf-style message.
+// Ignored if |base_status| is OK.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t IREE_PRINTF_ATTRIBUTE(2, 3)
+    iree_status_annotate_f(iree_status_t base_status, const char* format, ...);
+
+#else
+// Annotation support compiled out: the message arguments are discarded and
+// the base status is passed through unchanged.
+#define iree_status_annotate(base_status, ...) (base_status)
+#define iree_status_annotate_f(base_status, ...) (base_status)
+#endif  // has IREE_STATUS_FEATURE_ANNOTATIONS
+
+// Formats the status as a multi-line string containing all associated payloads.
+// Note that this may contain PII such as file paths and must only be used for
+// presenting errors to users and not sent to a logs aggregation service.
+//
+// If |buffer_capacity| is insufficient, then |out_buffer_length| is the
+// number of characters that would have been written if |buffer_capacity|
+// had been sufficiently large, not counting the terminating null character.
+IREE_API_EXPORT bool iree_status_format(iree_status_t status,
+                                        iree_host_size_t buffer_capacity,
+                                        char* buffer,
+                                        iree_host_size_t* out_buffer_length);
+
+// Prints |status| to the given |file| as a string with all available
+// annotations. This will produce multiple lines of output and should be used
+// only when dumping a status on failure.
+IREE_API_EXPORT void iree_status_fprint(FILE* file, iree_status_t status);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_STATUS_H_
diff --git a/runtime/src/iree/base/status_cc.cc b/runtime/src/iree/base/status_cc.cc
new file mode 100644
index 0000000..edd207c
--- /dev/null
+++ b/runtime/src/iree/base/status_cc.cc
@@ -0,0 +1,65 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/status_cc.h"
+
+#include <cstddef>
+#include <cstdlib>
+#include <ostream>
+
+#include "iree/base/attributes.h"
+#include "iree/base/logging.h"
+
+namespace iree {
+
+// Streams the symbolic name of status code |x| to |os|.
+std::ostream& operator<<(std::ostream& os, const StatusCode& x) {
+  os << StatusCodeToString(x);
+  return os;
+}
+
+// Formats |status| (code plus annotations) into a std::string.
+// Returns "OK" for ok statuses and "<!>" when formatting fails.
+// static
+IREE_MUST_USE_RESULT std::string Status::ToString(iree_status_t status) {
+  if (iree_status_is_ok(status)) {
+    return "OK";
+  }
+  // First pass: measure the formatted length with a zero-capacity buffer.
+  iree_host_size_t buffer_length = 0;
+  if (IREE_UNLIKELY(!iree_status_format(status, /*buffer_capacity=*/0,
+                                        /*buffer=*/NULL, &buffer_length))) {
+    return "<!>";
+  }
+  std::string result(buffer_length, '\0');
+  // Second pass: format into the string's storage. Capacity is size()+1 so
+  // the terminating NUL lands on result.data()[size()].
+  // NOTE(review): writing through data()[size()] is questionable before
+  // C++17's mutable data(); confirm this is safe on all supported toolchains.
+  if (IREE_UNLIKELY(!iree_status_format(status, result.size() + 1,
+                                        const_cast<char*>(result.data()),
+                                        &buffer_length))) {
+    return "<!>";
+  }
+  return result;
+}
+
+// Streams the full formatted status (see Status::ToString) to |os|.
+std::ostream& operator<<(std::ostream& os, const Status& x) {
+  os << x.ToString();
+  return os;
+}
+
+namespace status_impl {
+
+// Logs an error and aborts when an OK status is passed where StatusOr<T>
+// requires an error. |status| is overwritten with an internal error before
+// aborting (only observable if the abort is trapped).
+void Helper::HandleInvalidStatusCtorArg(Status* status) {
+  const char* kMessage =
+      "An OK status is not a valid constructor argument to StatusOr<T>";
+  IREE_LOG(ERROR) << kMessage;
+  *status = Status(StatusCode::kInternal, kMessage);
+  abort();
+}
+
+// Fatal-logs |status| and aborts; invoked when a value is fetched from an
+// errored container instead of the error being handled.
+void Helper::Crash(const Status& status) {
+  IREE_LOG(FATAL) << "Attempting to fetch value instead of handling error "
+                  << status;
+  abort();
+}
+
+}  // namespace status_impl
+
+}  // namespace iree
diff --git a/runtime/src/iree/base/status_cc.h b/runtime/src/iree/base/status_cc.h
new file mode 100644
index 0000000..4795dda
--- /dev/null
+++ b/runtime/src/iree/base/status_cc.h
@@ -0,0 +1,944 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_STATUS_CC_H_
+#define IREE_BASE_STATUS_CC_H_
+
+#ifndef __cplusplus
+#error iree::Status is only usable in C++ code.
+#endif  // !__cplusplus
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "iree/base/api.h"
+#include "iree/base/attributes.h"
+#include "iree/base/logging.h"
+#include "iree/base/target_platform.h"
+
+namespace iree {
+
+namespace status_impl {
+
+// Local equivalent of std::exchange: replaces |obj| with |new_value| and
+// returns the previous value of |obj|. Kept here as constexpr rather than
+// pulling in <utility>'s version -- presumably to avoid a hard C++14/C++20
+// requirement; confirm before replacing with std::exchange.
+template <class T, class U = T>
+constexpr T exchange(T& obj, U&& new_value) {
+  T old_value = std::move(obj);
+  obj = std::forward<U>(new_value);
+  return old_value;
+}
+
+}  // namespace status_impl
+
+//===----------------------------------------------------------------------===//
+// Status codes and source location utilities
+//===----------------------------------------------------------------------===//
+
+// Class representing a specific location in the source code of a program.
+// Trivially copyable; does not own file_name_ (see constructor contract).
+class SourceLocation {
+ public:
+  // Avoid this constructor; it populates the object with dummy values.
+  constexpr SourceLocation() : line_(0), file_name_(nullptr) {}
+
+  // `file_name` must outlive all copies of the `iree::SourceLocation` object,
+  // so in practice it should be a string literal.
+  constexpr SourceLocation(std::uint_least32_t line, const char* file_name)
+      : line_(line), file_name_(file_name) {}
+
+  // The line number of the captured source location.
+  constexpr std::uint_least32_t line() const { return line_; }
+
+  // The file name of the captured source location.
+  constexpr const char* file_name() const { return file_name_; }
+
+ private:
+  // Captured at the call site; file_name_ is a borrowed pointer.
+  std::uint_least32_t line_;
+  const char* file_name_;
+};
+
+// If a function takes an `iree::SourceLocation` parameter, pass this as the
+// argument. When status features are compiled out the location collapses to
+// (0, NULL) -- presumably so __FILE__ string literals are not embedded in
+// the binary; confirm against the status feature docs.
+#if IREE_STATUS_FEATURES == 0
+#define IREE_LOC ::iree::SourceLocation(0, NULL)
+#else
+#define IREE_LOC ::iree::SourceLocation(__LINE__, __FILE__)
+#endif  // IREE_STATUS_FEATURES == 0
+
+// C++ mirror of the C API's status codes: each enumerator aliases the
+// corresponding IREE_STATUS_* value, so static_casts between StatusCode and
+// iree_status_code_t are value-preserving.
+enum class StatusCode : uint32_t {
+  kOk = IREE_STATUS_OK,
+  kCancelled = IREE_STATUS_CANCELLED,
+  kUnknown = IREE_STATUS_UNKNOWN,
+  kInvalidArgument = IREE_STATUS_INVALID_ARGUMENT,
+  kDeadlineExceeded = IREE_STATUS_DEADLINE_EXCEEDED,
+  kNotFound = IREE_STATUS_NOT_FOUND,
+  kAlreadyExists = IREE_STATUS_ALREADY_EXISTS,
+  kPermissionDenied = IREE_STATUS_PERMISSION_DENIED,
+  kResourceExhausted = IREE_STATUS_RESOURCE_EXHAUSTED,
+  kFailedPrecondition = IREE_STATUS_FAILED_PRECONDITION,
+  kAborted = IREE_STATUS_ABORTED,
+  kOutOfRange = IREE_STATUS_OUT_OF_RANGE,
+  kUnimplemented = IREE_STATUS_UNIMPLEMENTED,
+  kInternal = IREE_STATUS_INTERNAL,
+  kUnavailable = IREE_STATUS_UNAVAILABLE,
+  kDataLoss = IREE_STATUS_DATA_LOSS,
+  kUnauthenticated = IREE_STATUS_UNAUTHENTICATED,
+  kDeferred = IREE_STATUS_DEFERRED,
+};
+
+// Returns the canonical string name for |code| (delegates to the C API).
+static inline const char* StatusCodeToString(StatusCode code) {
+  return iree_status_code_string(static_cast<iree_status_code_t>(code));
+}
+
+// Prints a human-readable representation of `x` to `os`.
+std::ostream& operator<<(std::ostream& os, const StatusCode& x);
+
+//===----------------------------------------------------------------------===//
+// Status
+//===----------------------------------------------------------------------===//
+
+class IREE_MUST_USE_RESULT Status;
+
+// A Status value can be either OK or not-OK
+//   * OK indicates that the operation succeeded.
+//   * A not-OK value indicates that the operation failed and carries details
+//     about the error.
+//
+// Thin owning wrapper over the C API's iree_status_t handle; frees the
+// handle on destruction.
+class Status final {
+ public:
+  // Return a combination of the error code name and message.
+  static IREE_MUST_USE_RESULT std::string ToString(iree_status_t status);
+
+  // Creates an OK status with no message.
+  Status() = default;
+
+  // Takes ownership of a C API status instance. The source handle is reset
+  // to a code-only value so the payload is not double-freed.
+  Status(iree_status_t&& status) noexcept
+      : value_(status_impl::exchange(
+            status, iree_status_from_code(iree_status_code(status)))) {}
+
+  // Takes ownership of a C API status instance wrapped in a Status.
+  // NOTE: this overload takes a *non-const* lvalue and moves from it --
+  // there is deliberately no copying copy-constructor; |other| is left
+  // holding just its code.
+  Status(Status& other) noexcept
+      : value_(status_impl::exchange(other.value_,
+                                     iree_status_from_code(other.code()))) {}
+  Status(Status&& other) noexcept
+      : value_(status_impl::exchange(other.value_,
+                                     iree_status_from_code(other.code()))) {}
+  Status& operator=(Status&& other) {
+    if (this != &other) {
+      // Release any currently-held status before stealing |other|'s.
+      if (IREE_UNLIKELY(value_)) iree_status_ignore(value_);
+      value_ = status_impl::exchange(other.value_,
+                                     iree_status_from_code(other.code()));
+    }
+    return *this;
+  }
+
+  // Implicit conversion from a bare C status code (message-less status).
+  Status(iree_status_code_t code) : value_(iree_status_from_code(code)) {}
+  Status& operator=(const iree_status_code_t& code) {
+    if (IREE_UNLIKELY(value_)) iree_status_ignore(value_);
+    value_ = iree_status_from_code(code);
+    return *this;
+  }
+
+  // Implicit conversion from the C++ StatusCode enum.
+  Status(StatusCode code) : value_(iree_status_from_code(code)) {}
+  Status& operator=(const StatusCode& code) {
+    if (IREE_UNLIKELY(value_)) iree_status_ignore(value_);
+    value_ = iree_status_from_code(code);
+    return *this;
+  }
+
+  // Creates a status with the specified code and error message.
+  // If `code` is kOk, `message` is ignored.
+  Status(StatusCode code, const char* message) {
+    if (IREE_UNLIKELY(code != StatusCode::kOk)) {
+      // Empty messages store the code inline and skip the allocation.
+      value_ = (!message || !strlen(message))
+                   ? iree_status_from_code(code)
+                   : iree_status_allocate(static_cast<iree_status_code_t>(code),
+                                          /*file=*/nullptr, /*line=*/0,
+                                          iree_make_cstring_view(message));
+    }
+  }
+  // As above but also records the failing source location.
+  Status(StatusCode code, SourceLocation location, const char* message) {
+    if (IREE_UNLIKELY(code != StatusCode::kOk)) {
+      value_ = iree_status_allocate(static_cast<iree_status_code_t>(code),
+                                    location.file_name(), location.line(),
+                                    iree_make_cstring_view(message));
+    }
+  }
+
+  ~Status() {
+    // Only values with payload bits beyond the inline code mask own an
+    // allocation that must be freed; plain codes are encoded in the handle
+    // bits themselves.
+    if (IREE_UNLIKELY((uintptr_t)(value_) & ~IREE_STATUS_CODE_MASK)) {
+      iree_status_free(value_);
+    }
+  }
+
+  // Returns true if the Status is OK.
+  IREE_MUST_USE_RESULT bool ok() const { return iree_status_is_ok(value_); }
+
+  // Returns the error code.
+  IREE_MUST_USE_RESULT StatusCode code() const {
+    return static_cast<StatusCode>(iree_status_code(value_));
+  }
+
+  // Return a combination of the error code name and message.
+  IREE_MUST_USE_RESULT std::string ToString() const {
+    return Status::ToString(value_);
+  }
+
+  // Ignores any errors, potentially suppressing complaints from any tools.
+  void IgnoreError() { value_ = iree_status_ignore(value_); }
+
+  // Converts to a C API status instance and transfers ownership; this is
+  // left holding only the code.
+  IREE_MUST_USE_RESULT operator iree_status_t() && {
+    return status_impl::exchange(
+        value_, iree_status_from_code(iree_status_code(value_)));
+  }
+
+  // Releases ownership of the handle, resetting this to OK.
+  IREE_MUST_USE_RESULT iree_status_t release() {
+    return status_impl::exchange(value_, iree_ok_status());
+  }
+
+  // Statuses compare by code only; messages/payloads are ignored.
+  friend bool operator==(const Status& lhs, const Status& rhs) {
+    return lhs.code() == rhs.code();
+  }
+  friend bool operator!=(const Status& lhs, const Status& rhs) {
+    return !(lhs == rhs);
+  }
+
+  friend bool operator==(const Status& lhs, const StatusCode& rhs) {
+    return lhs.code() == rhs;
+  }
+  friend bool operator!=(const Status& lhs, const StatusCode& rhs) {
+    return !(lhs == rhs);
+  }
+
+  friend bool operator==(const StatusCode& lhs, const Status& rhs) {
+    return lhs == rhs.code();
+  }
+  friend bool operator!=(const StatusCode& lhs, const Status& rhs) {
+    return !(lhs == rhs);
+  }
+
+ private:
+  // Owned C API handle; iree_ok_status() when OK.
+  iree_status_t value_ = iree_ok_status();
+};
+
+// Returns an OK status, equivalent to a default constructed instance.
+IREE_MUST_USE_RESULT static inline Status OkStatus() { return Status(); }
+
+// Prints a human-readable representation of `x` to `os`.
+std::ostream& operator<<(std::ostream& os, const Status& x);
+
+// IsOk overloads let callers (and IREE_ASSIGN_OR_RETURN below) test Status,
+// raw iree_status_t, and StatusOr<T> uniformly.
+IREE_MUST_USE_RESULT static inline bool IsOk(const Status& status) {
+  return status.code() == StatusCode::kOk;
+}
+
+IREE_MUST_USE_RESULT static inline bool IsOk(const iree_status_t& status) {
+  return iree_status_is_ok(status);
+}
+
+//===----------------------------------------------------------------------===//
+// StatusOr<T>
+//===----------------------------------------------------------------------===//
+
+template <typename T>
+class IREE_MUST_USE_RESULT StatusOr;
+
+namespace status_impl {
+
+// Minimal polyfills of standard type-trait/utility helpers so this header
+// does not depend on newer <type_traits>/<utility> -- TODO confirm the
+// intended minimum language level before swapping in the std versions.
+
+// https://en.cppreference.com/w/cpp/types/conjunction
+template <typename... Ts>
+struct conjunction : std::true_type {};
+template <typename T, typename... Ts>
+struct conjunction<T, Ts...>
+    : std::conditional<T::value, conjunction<Ts...>, T>::type {};
+template <typename T>
+struct conjunction<T> : T {};
+
+// https://en.cppreference.com/w/cpp/types/disjunction
+template <typename... Ts>
+struct disjunction : std::false_type {};
+template <typename T, typename... Ts>
+struct disjunction<T, Ts...>
+    : std::conditional<T::value, T, disjunction<Ts...>>::type {};
+template <typename T>
+struct disjunction<T> : T {};
+
+// https://en.cppreference.com/w/cpp/utility/in_place
+struct in_place_t {
+  explicit in_place_t() = default;
+};
+/*inline*/ constexpr in_place_t in_place{};
+
+// https://en.cppreference.com/w/cpp/types/negation
+template <typename T>
+struct negation : std::integral_constant<bool, !T::value> {};
+
+// SFINAE helpers constraining StatusOr's converting constructors/assignments:
+// when T itself can be built or assigned from a StatusOr<U>, the converting
+// overloads must be disabled to avoid ambiguity.
+
+// True when T is constructible from or convertible from any value category
+// of StatusOr<U>.
+template <typename T, typename U>
+using IsStatusOrConversionAmbiguous =
+    status_impl::disjunction<std::is_constructible<T, StatusOr<U>&>,
+                             std::is_constructible<T, const StatusOr<U>&>,
+                             std::is_constructible<T, StatusOr<U>&&>,
+                             std::is_constructible<T, const StatusOr<U>&&>,
+                             std::is_convertible<StatusOr<U>&, T>,
+                             std::is_convertible<const StatusOr<U>&, T>,
+                             std::is_convertible<StatusOr<U>&&, T>,
+                             std::is_convertible<const StatusOr<U>&&, T>>;
+
+// As above, additionally covering direct assignment from StatusOr<U>.
+template <typename T, typename U>
+using IsStatusOrConversionAssigmentAmbiguous =
+    status_impl::disjunction<IsStatusOrConversionAmbiguous<T, U>,
+                             std::is_assignable<T&, StatusOr<U>&>,
+                             std::is_assignable<T&, const StatusOr<U>&>,
+                             std::is_assignable<T&, StatusOr<U>&&>,
+                             std::is_assignable<T&, const StatusOr<U>&&>>;
+
+template <typename T, typename U>
+struct IsAmbiguousStatusOrForInitialization
+    :  // Strip const-value refs from type and check again, else false_type.
+       public std::conditional_t<
+           std::is_same<std::remove_cv_t<std::remove_reference_t<U>>, U>::value,
+           std::false_type,
+           IsAmbiguousStatusOrForInitialization<
+               T, std::remove_cv_t<std::remove_reference_t<U>>>> {};
+
+template <typename T, typename U>
+struct IsAmbiguousStatusOrForInitialization<T, StatusOr<U>>
+    : public IsStatusOrConversionAmbiguous<T, U> {};
+
+// True when U could equally be interpreted as a StatusOr/Status/in_place tag
+// rather than a value for direct initialization.
+template <typename T, typename U>
+using IsStatusOrDirectInitializationAmbiguous = status_impl::disjunction<
+    std::is_same<StatusOr<T>, std::remove_cv_t<std::remove_reference_t<U>>>,
+    std::is_same<Status, std::remove_cv_t<std::remove_reference_t<U>>>,
+    std::is_same<status_impl::in_place_t,
+                 std::remove_cv_t<std::remove_reference_t<U>>>,
+    IsAmbiguousStatusOrForInitialization<T, U>>;
+
+template <typename T, typename U>
+using IsStatusOrDirectInitializationValid = status_impl::disjunction<
+    // The is_same allows nested status ors to ignore this check iff same type.
+    std::is_same<T, std::remove_cv_t<std::remove_reference_t<U>>>,
+    status_impl::negation<IsStatusOrDirectInitializationAmbiguous<T, U>>>;
+
+// Out-of-line crash/abort helpers (defined in status_cc.cc) so the templated
+// StatusOr code stays small at each instantiation site.
+class Helper {
+ public:
+  IREE_ATTRIBUTE_NORETURN static void HandleInvalidStatusCtorArg(Status*);
+  IREE_ATTRIBUTE_NORETURN static void Crash(const Status& status);
+};
+
+// Construct an instance of T in `p` through placement new, passing Args... to
+// the constructor.
+// This abstraction is here mostly for the gcc performance fix below; it has
+// no effect on other compilers.
+template <typename T, typename... Args>
+void PlacementNew(void* p, Args&&... args) {
+#if defined(__GNUC__) && !defined(__clang__)
+  // Teach gcc that 'p' cannot be null, fixing code size issues.
+  if (p == nullptr) __builtin_unreachable();
+#endif
+  new (p) T(std::forward<Args>(args)...);
+}
+
+// Helper base class to hold the data and all operations.
+// We move all this to a base class to allow mixing with the appropriate
+// TraitsBase specialization.
+//
+// Invariant maintained throughout: status_ is always an active union member;
+// data_ is active iff status_.ok().
+template <typename T>
+class StatusOrData {
+  template <typename U>
+  friend class StatusOrData;
+
+ public:
+  StatusOrData() = delete;
+
+  StatusOrData(const StatusOrData& other) {
+    if (other.ok()) {
+      MakeValue(other.data_);
+      MakeStatus();
+    } else {
+      MakeStatus(other.status_);
+    }
+  }
+
+  StatusOrData(StatusOrData&& other) noexcept {
+    if (other.ok()) {
+      MakeValue(std::move(other.data_));
+      MakeStatus();
+    } else {
+      // exchange leaves the moved-from status holding just its code so its
+      // destructor will not free the transferred payload.
+      MakeStatus(status_impl::exchange(other.status_, other.status_.code()));
+    }
+  }
+
+  // Converting copy: requires T constructible from const U&.
+  template <typename U>
+  explicit StatusOrData(const StatusOrData<U>& other) {
+    if (other.ok()) {
+      MakeValue(other.data_);
+      MakeStatus();
+    } else {
+      MakeStatus(other.status_);
+    }
+  }
+
+  // Converting move: requires T constructible from U&&.
+  template <typename U>
+  explicit StatusOrData(StatusOrData<U>&& other) {
+    if (other.ok()) {
+      MakeValue(std::move(other.data_));
+      MakeStatus();
+    } else {
+      MakeStatus(status_impl::exchange(other.status_, other.status_.code()));
+    }
+  }
+
+  // In-place construction of the value from args.
+  template <typename... Args>
+  explicit StatusOrData(status_impl::in_place_t, Args&&... args)
+      : data_(std::forward<Args>(args)...) {
+    MakeStatus();
+  }
+
+  explicit StatusOrData(const T& value) : data_(value) { MakeStatus(); }
+  explicit StatusOrData(T&& value) : data_(std::move(value)) { MakeStatus(); }
+
+  // Error construction; aborts (EnsureNotOk) if |status| is OK.
+  explicit StatusOrData(Status&& status)
+      : status_(status_impl::exchange(status, status.code())) {
+    EnsureNotOk();
+  }
+
+  StatusOrData& operator=(const StatusOrData& other) {
+    if (this == &other) return *this;
+    if (other.ok()) {
+      Assign(other.data_);
+    } else {
+      Assign(other.status_);
+    }
+    return *this;
+  }
+
+  StatusOrData& operator=(StatusOrData&& other) {
+    if (this == &other) return *this;
+    if (other.ok()) {
+      Assign(std::move(other.data_));
+    } else {
+      Assign(status_impl::exchange(other.status_, other.status_.code()));
+    }
+    return *this;
+  }
+
+  ~StatusOrData() {
+    // status_ is always active; data_ must be destroyed only when it is the
+    // active value member.
+    if (ok()) {
+      status_.~Status();
+      data_.~T();
+    } else {
+      status_.~Status();
+    }
+  }
+
+  void Assign(const T& value) {
+    if (ok()) {
+      data_.~T();
+      MakeValue(value);
+    } else {
+      // Was an error: placement-construct the value, then flip status_ to OK.
+      MakeValue(value);
+      status_ = StatusCode::kOk;
+    }
+  }
+
+  void Assign(T&& value) {
+    if (ok()) {
+      data_.~T();
+      MakeValue(std::move(value));
+    } else {
+      MakeValue(std::move(value));
+      status_ = StatusCode::kOk;
+    }
+  }
+
+  void Assign(Status&& status) {
+    Clear();
+    status_ = status_impl::exchange(status, status.code());
+    EnsureNotOk();
+  }
+
+  bool ok() const { return status_.ok(); }
+
+ protected:
+  // status_ will always be active after the constructor.
+  // Union to be able to initialize exactly how we need without waste.
+  // Eg. in the copy constructor we use the default constructor of Status in
+  // the ok() path to avoid an extra Ref call.
+  union {
+    Status status_;
+  };
+
+  // data_ is active iff status_.ok()==true
+  struct Dummy {};
+  union {
+    // When T is const, we need some non-const object we can cast to void* for
+    // the placement new. dummy_ is that object.
+    Dummy dummy_;
+    T data_;
+  };
+
+  // Destroys the value member if it is active (leaves status_ untouched).
+  void Clear() {
+    if (ok()) data_.~T();
+  }
+
+  // Aborts via Helper::Crash if no value is held (error path of value()).
+  void EnsureOk() const {
+    if (IREE_UNLIKELY(!ok())) Helper::Crash(status_);
+  }
+
+  // Aborts via Helper if an OK status was supplied without a value.
+  void EnsureNotOk() {
+    if (IREE_UNLIKELY(ok())) Helper::HandleInvalidStatusCtorArg(&status_);
+  }
+
+  // Construct the value (data_) through placement new with the passed arg.
+  template <typename Arg>
+  void MakeValue(Arg&& arg) {
+    status_impl::PlacementNew<T>(&dummy_, std::forward<Arg>(arg));
+  }
+
+  // Construct the status (status_) through placement new with the passed arg.
+  template <typename... Args>
+  void MakeStatus(Args&&... args) {
+    status_impl::PlacementNew<Status>(&status_, std::forward<Args>(args)...);
+  }
+};
+
+// Helper base class to allow implicitly deleted constructors and assignment
+// operations in StatusOr.
+// TraitsBase will explicitly delete what it can't support and StatusOr will
+// inherit that behavior implicitly.
+
+// T is both copy- and move-constructible.
+template <bool Copy, bool Move>
+struct TraitsBase {
+  TraitsBase() = default;
+  TraitsBase(const TraitsBase&) = default;
+  TraitsBase(TraitsBase&&) = default;
+  TraitsBase& operator=(const TraitsBase&) = default;
+  TraitsBase& operator=(TraitsBase&&) = default;
+};
+
+// T is move-only.
+template <>
+struct TraitsBase<false, true> {
+  TraitsBase() = default;
+  TraitsBase(const TraitsBase&) = delete;
+  TraitsBase(TraitsBase&&) = default;
+  TraitsBase& operator=(const TraitsBase&) = delete;
+  TraitsBase& operator=(TraitsBase&&) = default;
+};
+
+// T is neither copyable nor movable.
+template <>
+struct TraitsBase<false, false> {
+  TraitsBase() = default;
+  TraitsBase(const TraitsBase&) = delete;
+  TraitsBase(TraitsBase&&) = delete;
+  TraitsBase& operator=(const TraitsBase&) = delete;
+  TraitsBase& operator=(TraitsBase&&) = delete;
+};
+
+}  // namespace status_impl
+
+// StatusOr<T> is the union of a Status object and a T object.
+//
+// A StatusOr object either holds a usable value, or an error Status explaining
+// why such a value is not present.
+template <typename T>
+class StatusOr
+    : private status_impl::StatusOrData<T>,
+      private status_impl::TraitsBase<std::is_copy_constructible<T>::value,
+                                      std::is_move_constructible<T>::value> {
+  template <typename U>
+  friend class StatusOr;
+
+  typedef status_impl::StatusOrData<T> Base;
+
+ public:
+  typedef T element_type;
+
+  // Constructs a new StatusOr with StatusCode::kUnknown status.
+  explicit StatusOr();
+
+  // StatusOr<T> is copy constructible/assignable if T is copy constructible.
+  StatusOr(const StatusOr&) = default;
+  StatusOr& operator=(const StatusOr&) = default;
+
+  // StatusOr<T> is move constructible/assignable if T is move constructible.
+  StatusOr(StatusOr&&) = default;
+  StatusOr& operator=(StatusOr&&) = default;
+
+  // Converting constructors from StatusOr<U>, when T is constructible from U.
+  // To avoid ambiguity, they are disabled if T is also constructible from
+  // StatusOr<U>. Explicit iff the corresponding construction of T from U is
+  // explicit.
+  template <
+      typename U,
+      std::enable_if_t<
+          status_impl::conjunction<
+              status_impl::negation<std::is_same<T, U>>,
+              std::is_constructible<T, const U&>,
+              std::is_convertible<const U&, T>,
+              status_impl::negation<
+                  status_impl::IsStatusOrConversionAmbiguous<T, U>>>::value,
+          int> = 0>
+  StatusOr(const StatusOr<U>& other)  // NOLINT
+      : Base(static_cast<const typename StatusOr<U>::Base&>(other)) {}
+  template <
+      typename U,
+      std::enable_if_t<
+          status_impl::conjunction<
+              status_impl::negation<std::is_same<T, U>>,
+              std::is_constructible<T, const U&>,
+              status_impl::negation<std::is_convertible<const U&, T>>,
+              status_impl::negation<
+                  status_impl::IsStatusOrConversionAmbiguous<T, U>>>::value,
+          int> = 0>
+  explicit StatusOr(const StatusOr<U>& other)
+      : Base(static_cast<const typename StatusOr<U>::Base&>(other)) {}
+
+  template <
+      typename U,
+      std::enable_if_t<
+          status_impl::conjunction<
+              status_impl::negation<std::is_same<T, U>>,
+              std::is_constructible<T, U&&>, std::is_convertible<U&&, T>,
+              status_impl::negation<
+                  status_impl::IsStatusOrConversionAmbiguous<T, U>>>::value,
+          int> = 0>
+  StatusOr(StatusOr<U>&& other)  // NOLINT
+      : Base(static_cast<typename StatusOr<U>::Base&&>(other)) {}
+  template <
+      typename U,
+      std::enable_if_t<
+          status_impl::conjunction<
+              status_impl::negation<std::is_same<T, U>>,
+              std::is_constructible<T, U&&>,
+              status_impl::negation<std::is_convertible<U&&, T>>,
+              status_impl::negation<
+                  status_impl::IsStatusOrConversionAmbiguous<T, U>>>::value,
+          int> = 0>
+  explicit StatusOr(StatusOr<U>&& other)
+      : Base(static_cast<typename StatusOr<U>::Base&&>(other)) {}
+
+  // Conversion copy/move assignment operator, T must be constructible and
+  // assignable from U. Only enable if T cannot be directly assigned from
+  // StatusOr<U>.
+  template <typename U,
+            std::enable_if_t<
+                status_impl::conjunction<
+                    status_impl::negation<std::is_same<T, U>>,
+                    std::is_constructible<T, const U&>,
+                    std::is_assignable<T, const U&>,
+                    status_impl::negation<
+                        status_impl::IsStatusOrConversionAssigmentAmbiguous<
+                            T, U>>>::value,
+                int> = 0>
+  StatusOr& operator=(const StatusOr<U>& other) {
+    this->Assign(other);
+    return *this;
+  }
+  template <typename U,
+            std::enable_if_t<
+                status_impl::conjunction<
+                    status_impl::negation<std::is_same<T, U>>,
+                    std::is_constructible<T, U&&>, std::is_assignable<T, U&&>,
+                    status_impl::negation<
+                        status_impl::IsStatusOrConversionAssigmentAmbiguous<
+                            T, U>>>::value,
+                int> = 0>
+  StatusOr& operator=(StatusOr<U>&& other) {
+    this->Assign(std::move(other));
+    return *this;
+  }
+
+  // Constructs a new StatusOr with the given value. After calling this
+  // constructor, this->ok() will be true and the contained value may be
+  // retrieved with value(), operator*(), or operator->().
+  StatusOr(const T& value);
+
+  // Takes ownership of a C API status instance.
+  StatusOr(iree_status_t&& status) noexcept
+      : Base(status_impl::exchange(
+            status, iree_status_from_code(iree_status_code(status)))) {}
+
+  // Constructs a new StatusOr with the given non-ok status. After calling this
+  // constructor, this->ok() will be false and calls to value() will
+  // IREE_CHECK-fail.
+  StatusOr(const Status& status);
+  StatusOr& operator=(const Status& status);
+
+  // Similar to the `const T&` overload.
+  //
+  // REQUIRES: T is move constructible.
+  StatusOr(T&& value);
+
+  // RValue versions of the operations declared above.
+  StatusOr(Status&& status);
+  StatusOr& operator=(Status&& status);
+
+  // Constructs the inner value T in-place using the provided args, using the
+  // T(args...) constructor.
+  template <typename... Args>
+  explicit StatusOr(status_impl::in_place_t, Args&&... args);
+  template <typename U, typename... Args>
+  explicit StatusOr(status_impl::in_place_t, std::initializer_list<U> ilist,
+                    Args&&... args);
+
+  // Constructs the inner value T in-place using the provided args, using the
+  // T(U) (direct-initialization) constructor. Only valid if T can be
+  // constructed from a U. Can accept move or copy constructors. Explicit iff
+  // U is not convertible to T. To avoid ambiguity, this is disabled if U is
+  // a StatusOr<J>, where J is convertible to T.
+  template <typename U = T,
+            std::enable_if_t<
+                status_impl::conjunction<
+                    status_impl::IsStatusOrDirectInitializationValid<T, U&&>,
+                    std::is_constructible<T, U&&>,
+                    std::is_convertible<U&&, T>>::value,
+                int> = 0>
+  StatusOr(U&& u)  // NOLINT
+      : StatusOr(status_impl::in_place, std::forward<U>(u)) {}
+
+  template <typename U = T,
+            std::enable_if_t<
+                status_impl::conjunction<
+                    status_impl::IsStatusOrDirectInitializationValid<T, U&&>,
+                    std::is_constructible<T, U&&>,
+                    status_impl::negation<std::is_convertible<U&&, T>>>::value,
+                int> = 0>
+  explicit StatusOr(U&& u)  // NOLINT
+      : StatusOr(status_impl::in_place, std::forward<U>(u)) {}
+
+  // Returns this->ok()
+  explicit operator bool() const { return ok(); }
+
+  // Returns this->status().ok()
+  IREE_MUST_USE_RESULT bool ok() const { return this->status_.ok(); }
+
+  // Returns a reference to our status. If this contains a T, then
+  // returns OkStatus().
+  const Status& status() const&;
+  Status status() &&;
+
+  // Returns a reference to the held value if `this->ok()`, or IREE_CHECK-fails.
+  // If you have already checked the status using `this->ok()` or
+  // `operator bool()`, you probably want to use `operator*()` or `operator->()`
+  // to access the value instead of `value`.
+  const T& value() const&;
+  T& value() &;
+  const T&& value() const&&;
+  T&& value() &&;
+
+  // Returns a reference to the current value.
+  //
+  // REQUIRES: this->ok() == true, otherwise the behavior is undefined.
+  const T& operator*() const&;
+  T& operator*() &;
+  const T&& operator*() const&&;
+  T&& operator*() &&;
+
+  // Returns a pointer to the current value.
+  //
+  // REQUIRES: this->ok() == true, otherwise the behavior is undefined.
+  const T* operator->() const;
+  T* operator->();
+
+  // Returns a copy of the current value if this->ok() == true. Otherwise
+  // returns a default value.
+  template <typename U>
+  T value_or(U&& default_value) const&;
+  template <typename U>
+  T value_or(U&& default_value) &&;
+
+  // Ignores any errors. This method does nothing except potentially suppress
+  // complaints from any tools that are checking that errors are not dropped on
+  // the floor.
+  void IgnoreError() const;
+
+ private:
+  // Pull in the base's value/status Assign overloads alongside the
+  // StatusOr-to-StatusOr ones declared below.
+  using status_impl::StatusOrData<T>::Assign;
+  template <typename U>
+  void Assign(const StatusOr<U>& other);
+  template <typename U>
+  void Assign(StatusOr<U>&& other);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Implementation details for StatusOr<T>
+
+// Default construction yields an errored StatusOr (kUnknown, empty message).
+template <typename T>
+StatusOr<T>::StatusOr() : Base(Status(StatusCode::kUnknown, "")) {}
+
+template <typename T>
+StatusOr<T>::StatusOr(const T& value) : Base(value) {}
+
+template <typename T>
+StatusOr<T>::StatusOr(T&& value) : Base(std::move(value)) {}
+
+template <typename T>
+StatusOr<T>::StatusOr(const Status& status) : Base(status) {}
+
+template <typename T>
+StatusOr<T>::StatusOr(Status&& status) : Base(std::move(status)) {}
+
+template <typename T>
+StatusOr<T>& StatusOr<T>::operator=(const Status& status) {
+  this->Assign(status);
+  return *this;
+}
+
+template <typename T>
+StatusOr<T>& StatusOr<T>::operator=(Status&& status) {
+  this->Assign(std::move(status));
+  return *this;
+}
+
+// Cross-type assignment: forwards to the base's value- or status-Assign
+// depending on which member |other| holds.
+template <typename T>
+template <typename U>
+inline void StatusOr<T>::Assign(const StatusOr<U>& other) {
+  if (other.ok()) {
+    this->Assign(other.value());
+  } else {
+    this->Assign(other.status());
+  }
+}
+
+template <typename T>
+template <typename U>
+inline void StatusOr<T>::Assign(StatusOr<U>&& other) {
+  if (other.ok()) {
+    this->Assign(std::move(other).value());
+  } else {
+    this->Assign(std::move(other).status());
+  }
+}
+template <typename T>
+template <typename... Args>
+StatusOr<T>::StatusOr(status_impl::in_place_t, Args&&... args)
+    : Base(status_impl::in_place, std::forward<Args>(args)...) {}
+
+template <typename T>
+template <typename U, typename... Args>
+StatusOr<T>::StatusOr(status_impl::in_place_t, std::initializer_list<U> ilist,
+                      Args&&... args)
+    : Base(status_impl::in_place, ilist, std::forward<Args>(args)...) {}
+
+template <typename T>
+const Status& StatusOr<T>::status() const& {
+  return this->status_;
+}
+
+// Rvalue overload: moves the status out, leaving this->status_ holding just
+// its code (via exchange).
+template <typename T>
+Status StatusOr<T>::status() && {
+  if (ok()) {
+    return OkStatus();
+  } else {
+    return status_impl::exchange(this->status_, this->status_.code());
+  }
+}
+
+// value()/operator* accessors: value() crash-checks via EnsureOk in all
+// cases; operator* documents ok()==true as a precondition but currently also
+// calls EnsureOk.
+template <typename T>
+const T& StatusOr<T>::value() const& {
+  this->EnsureOk();
+  return this->data_;
+}
+
+template <typename T>
+T& StatusOr<T>::value() & {
+  this->EnsureOk();
+  return this->data_;
+}
+
+template <typename T>
+const T&& StatusOr<T>::value() const&& {
+  this->EnsureOk();
+  return std::move(this->data_);
+}
+
+template <typename T>
+T&& StatusOr<T>::value() && {
+  this->EnsureOk();
+  return std::move(this->data_);
+}
+
+template <typename T>
+const T& StatusOr<T>::operator*() const& {
+  this->EnsureOk();
+  return this->data_;
+}
+
+template <typename T>
+T& StatusOr<T>::operator*() & {
+  this->EnsureOk();
+  return this->data_;
+}
+
+template <typename T>
+const T&& StatusOr<T>::operator*() const&& {
+  this->EnsureOk();
+  return std::move(this->data_);
+}
+
+template <typename T>
+T&& StatusOr<T>::operator*() && {
+  this->EnsureOk();
+  return std::move(this->data_);
+}
+
+template <typename T>
+const T* StatusOr<T>::operator->() const {
+  this->EnsureOk();
+  return &this->data_;
+}
+
+template <typename T>
+T* StatusOr<T>::operator->() {
+  this->EnsureOk();
+  return &this->data_;
+}
+
+// Returns the held value or |default_value| without crashing on error.
+template <typename T>
+template <typename U>
+T StatusOr<T>::value_or(U&& default_value) const& {
+  if (ok()) {
+    return this->data_;
+  }
+  return std::forward<U>(default_value);
+}
+
+template <typename T>
+template <typename U>
+T StatusOr<T>::value_or(U&& default_value) && {
+  if (ok()) {
+    return std::move(this->data_);
+  }
+  return std::forward<U>(default_value);
+}
+
+template <typename T>
+void StatusOr<T>::IgnoreError() const {
+  this->status_.IgnoreError();
+}
+
+// StatusOr overload of the IsOk() family declared above.
+template <typename T>
+IREE_MUST_USE_RESULT static inline bool IsOk(const StatusOr<T>& status_or) {
+  return status_or.ok();
+}
+
+}  // namespace iree
+
+// Executes an expression `rexpr` that returns a `iree::StatusOr<T>`. On OK,
+// moves its value into the variable defined by `lhs`, otherwise returns
+// from the current function. `rexpr` is evaluated exactly once; `lhs` may
+// declare a new variable, e.g. IREE_ASSIGN_OR_RETURN(auto x, F()).
+#define IREE_ASSIGN_OR_RETURN(lhs, rexpr)      \
+  IREE_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_( \
+      IREE_STATUS_IMPL_CONCAT_(_status_or_value, __LINE__), lhs, (rexpr))
+
+// Implementation detail: the __LINE__-derived temporary name avoids
+// shadowing when the macro is used multiple times in one scope.
+#define IREE_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_(statusor, lhs, rexpr) \
+  auto statusor = rexpr;                                                  \
+  if (IREE_UNLIKELY(!::iree::IsOk(statusor))) {                           \
+    return std::move(statusor).status();                                  \
+  }                                                                       \
+  lhs = std::move(statusor).value()
+
+#endif  // IREE_BASE_STATUS_CC_H_
diff --git a/runtime/src/iree/base/status_test.cc b/runtime/src/iree/base/status_test.cc
new file mode 100644
index 0000000..c035e1e
--- /dev/null
+++ b/runtime/src/iree/base/status_test.cc
@@ -0,0 +1,102 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <ostream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "iree/base/api.h"
+#include "iree/base/status_cc.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace {
+
+using ::iree::testing::status::StatusIs;
+using ::testing::HasSubstr;
+
+// When status annotations are compiled in, verify both the status code and
+// the attached message; otherwise messages are stripped from statuses so only
+// the code can be checked.
+#if (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS) != 0
+#define CHECK_STATUS_MESSAGE(status, message_substr)         \
+  EXPECT_THAT(status.ToString(),                             \
+              HasSubstr(StatusCodeToString(status.code()))); \
+  EXPECT_THAT(status.ToString(), HasSubstr(message_substr))
+#define CHECK_STREAM_MESSAGE(status, os, message_substr)               \
+  EXPECT_THAT(os.str(), HasSubstr(StatusCodeToString(status.code()))); \
+  EXPECT_THAT(os.str(), HasSubstr(message_substr))
+#else
+#define CHECK_STATUS_MESSAGE(status, message_substr) \
+  EXPECT_THAT(status.ToString(), HasSubstr(StatusCodeToString(status.code())));
+#define CHECK_STREAM_MESSAGE(status, os, message_substr) \
+  EXPECT_THAT(os.str(), HasSubstr(StatusCodeToString(status.code())));
+#endif  // has IREE_STATUS_FEATURE_ANNOTATIONS
+
+// A Status retains the message it was constructed with (when annotations
+// are enabled; see CHECK_STATUS_MESSAGE above).
+TEST(Status, ConstructedWithMessage) {
+  Status status = Status(StatusCode::kInvalidArgument, "message");
+  CHECK_STATUS_MESSAGE(status, "message");
+}
+
+// operator<< renders the status code (and message when annotations are on).
+TEST(Status, StreamInsertion) {
+  Status status = Status(StatusCode::kInvalidArgument, "message");
+  std::ostringstream os;
+  os << status;
+  CHECK_STREAM_MESSAGE(status, os, "message");
+}
+
+// Stream insertion composes with subsequent stream output.
+TEST(Status, StreamInsertionContinued) {
+  Status status = Status(StatusCode::kInvalidArgument, "message");
+  std::ostringstream os;
+  os << status << " annotation";
+  CHECK_STREAM_MESSAGE(status, os, "message");
+  CHECK_STREAM_MESSAGE(status, os, "annotation");
+}
+
+// IREE_RETURN_IF_ERROR propagates a failing status out of the enclosing
+// function, appending the given annotation; OK statuses pass through.
+TEST(StatusMacro, ReturnIfError) {
+  auto returnIfError = [](iree_status_t status) -> iree_status_t {
+    IREE_RETURN_IF_ERROR(status, "annotation");
+    return iree_ok_status();
+  };
+  Status status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "message");
+  status = returnIfError(std::move(status));
+  EXPECT_THAT(status, StatusIs(StatusCode::kInvalidArgument));
+  CHECK_STATUS_MESSAGE(status, "message");
+  CHECK_STATUS_MESSAGE(status, "annotation");
+
+  IREE_EXPECT_OK(returnIfError(OkStatus()));
+}
+
+// As above but the annotation is a printf-style formatted string.
+TEST(StatusMacro, ReturnIfErrorFormat) {
+  auto returnIfError = [](iree_status_t status) -> iree_status_t {
+    IREE_RETURN_IF_ERROR(status, "annotation %d %d %d", 1, 2, 3);
+    return iree_ok_status();
+  };
+  Status status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "message");
+  status = returnIfError(std::move(status));
+  EXPECT_THAT(status, StatusIs(StatusCode::kInvalidArgument));
+  CHECK_STATUS_MESSAGE(status, "message");
+  CHECK_STATUS_MESSAGE(status, "annotation 1 2 3");
+
+  IREE_EXPECT_OK(returnIfError(OkStatus()));
+}
+
+// IREE_ASSIGN_OR_RETURN unwraps the value from an OK StatusOr and propagates
+// the status from a failing one.
+TEST(StatusMacro, AssignOrReturn) {
+  auto assignOrReturn = [](StatusOr<std::string> statusOr) -> iree_status_t {
+    IREE_ASSIGN_OR_RETURN(auto ret, std::move(statusOr));
+    (void)ret;
+    return iree_ok_status();
+  };
+  StatusOr<std::string> statusOr =
+      iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "message");
+  Status status = assignOrReturn(std::move(statusOr));
+  EXPECT_THAT(status, StatusIs(StatusCode::kInvalidArgument));
+  CHECK_STATUS_MESSAGE(status, "message");
+
+  IREE_EXPECT_OK(assignOrReturn("foo"));
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/base/string_builder.c b/runtime/src/iree/base/string_builder.c
new file mode 100644
index 0000000..590e1d5
--- /dev/null
+++ b/runtime/src/iree/base/string_builder.c
@@ -0,0 +1,151 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/string_builder.h"
+
+#include "iree/base/alignment.h"
+
+// Growth granularity: storage capacity is rounded up to a multiple of this.
+#define IREE_STRING_BUILDER_ALIGNMENT 128
+
+// Initializes |out_builder| to allocate growable storage from |allocator|.
+// Passing iree_allocator_null() yields a size-calculation-only builder.
+IREE_API_EXPORT void iree_string_builder_initialize(
+    iree_allocator_t allocator, iree_string_builder_t* out_builder) {
+  memset(out_builder, 0, sizeof(*out_builder));
+  out_builder->allocator = allocator;
+}
+
+// Initializes |out_builder| over caller-provided fixed-size storage.
+// The null allocator marks the storage as not owned by the builder.
+IREE_API_EXPORT void iree_string_builder_initialize_with_storage(
+    char* buffer, iree_host_size_t buffer_capacity,
+    iree_string_builder_t* out_builder) {
+  iree_string_builder_initialize(iree_allocator_null(), out_builder);
+  out_builder->buffer = buffer;
+  out_builder->capacity = buffer_capacity;
+}
+
+// Releases owned storage (if any) and zeroes all fields.
+// NOTE(review): for caller-provided storage this frees via the null
+// allocator — presumably a no-op; confirm against iree_allocator_free.
+IREE_API_EXPORT void iree_string_builder_deinitialize(
+    iree_string_builder_t* builder) {
+  if (builder->buffer != NULL) {
+    iree_allocator_free(builder->allocator, builder->buffer);
+  }
+  memset(builder, 0, sizeof(*builder));
+}
+
+// Returns the current storage pointer (NULL in size-calculation mode).
+IREE_API_EXPORT const char* iree_string_builder_buffer(
+    const iree_string_builder_t* builder) {
+  return builder->buffer;
+}
+
+// Returns the length of the accumulated string in characters (excluding NUL).
+IREE_API_EXPORT iree_host_size_t
+iree_string_builder_size(const iree_string_builder_t* builder) {
+  return builder->size;
+}
+
+// Returns the total allocated (or caller-provided) capacity in bytes.
+IREE_API_EXPORT iree_host_size_t
+iree_string_builder_capacity(const iree_string_builder_t* builder) {
+  return builder->capacity;
+}
+
+// Returns a view over the accumulated contents; invalidated by any mutation.
+IREE_API_EXPORT iree_string_view_t
+iree_string_builder_view(const iree_string_builder_t* builder) {
+  return iree_make_string_view(iree_string_builder_buffer(builder),
+                               iree_string_builder_size(builder));
+}
+
+// Transfers ownership of the storage buffer to the caller and resets the
+// builder to its empty state. Empty builders free any storage inline and
+// return NULL so the caller can immediately discard the builder.
+IREE_API_EXPORT char* iree_string_builder_take_storage(
+    iree_string_builder_t* builder) {
+  char* storage = builder->buffer;
+  if (builder->size == 0 && storage != NULL) {
+    // Nothing accumulated: release the (unused) storage now rather than
+    // handing the caller an empty allocation.
+    iree_allocator_free(builder->allocator, storage);
+    storage = NULL;
+  }
+  builder->buffer = NULL;
+  builder->size = 0;
+  builder->capacity = 0;
+  return storage;
+}
+
+// Ensures the builder can hold at least |minimum_capacity| bytes.
+// Grows heap-backed builders; size-calculation builders (null allocator, no
+// buffer) always succeed; fixed-storage builders fail once exhausted.
+IREE_API_EXPORT iree_status_t iree_string_builder_reserve(
+    iree_string_builder_t* builder, iree_host_size_t minimum_capacity) {
+  // Fast path: existing capacity is already sufficient.
+  if (builder->capacity >= minimum_capacity) return iree_ok_status();
+  if (iree_allocator_is_null(builder->allocator)) {
+    if (builder->buffer != NULL) {
+      // Caller-provided fixed-size storage (initialize_with_storage) cannot
+      // grow; fail here so appends don't write past the buffer. The original
+      // code returned OK unconditionally, letting callers overflow.
+      return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                              "fixed-size string builder storage exhausted");
+    }
+    // Size-calculation mode: nothing to allocate.
+    return iree_ok_status();
+  }
+  // Round up to the growth granularity to amortize reallocations.
+  iree_host_size_t new_capacity =
+      iree_host_align(minimum_capacity, IREE_STRING_BUILDER_ALIGNMENT);
+  IREE_RETURN_IF_ERROR(iree_allocator_realloc(builder->allocator, new_capacity,
+                                              (void**)&builder->buffer));
+  // Keep the contents NUL-terminated for C-string style consumption.
+  builder->buffer[builder->size] = 0;
+  builder->capacity = new_capacity;
+  return iree_ok_status();
+}
+
+// Appends |value| (plus a trailing NUL) to the builder.
+// In size-calculation mode only the size is accumulated.
+IREE_API_EXPORT iree_status_t iree_string_builder_append_string(
+    iree_string_builder_t* builder, iree_string_view_t value) {
+  // Ensure capacity for the value + NUL terminator.
+  IREE_RETURN_IF_ERROR(
+      iree_string_builder_reserve(builder, builder->size + value.size + 1));
+  if (builder->buffer != NULL) {
+    // Defense-in-depth: reserve may be a no-op for fixed-size storage
+    // builders (null allocator); never memcpy past the available capacity.
+    if (builder->size + value.size + 1 > builder->capacity) {
+      return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                              "string builder storage exhausted");
+    }
+    // Only copy the bytes if we are not doing a size calculation.
+    memcpy(builder->buffer + builder->size, value.data, value.size);
+    builder->buffer[builder->size + value.size] = 0;  // NUL
+  }
+  builder->size += value.size;
+  return iree_ok_status();
+}
+
+// Appends a NUL-terminated C string; thin wrapper over append_string.
+IREE_API_EXPORT iree_status_t iree_string_builder_append_cstring(
+    iree_string_builder_t* builder, const char* value) {
+  return iree_string_builder_append_string(builder,
+                                           iree_make_cstring_view(value));
+}
+
+// Formats into the builder using two independent va_lists: |varargs_0| for
+// the sizing/first attempt and |varargs_1| for the retry after growth.
+static iree_status_t iree_string_builder_append_format_impl(
+    iree_string_builder_t* builder, const char* format, va_list varargs_0,
+    va_list varargs_1) {
+  // Try to directly print into the buffer we have. This may work if we have
+  // capacity but otherwise will yield us the size we need to grow our buffer.
+  int n = vsnprintf(builder->buffer ? builder->buffer + builder->size : NULL,
+                    builder->buffer ? builder->capacity - builder->size : 0,
+                    format, varargs_0);
+  if (IREE_UNLIKELY(n < 0)) {
+    return iree_make_status(IREE_STATUS_INTERNAL, "printf try failed");
+  }
+  if (builder->buffer != NULL &&
+      (iree_host_size_t)n < builder->capacity - builder->size) {
+    // Printed into the buffer (vsnprintf needs n+1 bytes including NUL).
+    builder->size += n;
+    return iree_ok_status();
+  }
+
+  // Reserve new minimum capacity.
+  IREE_RETURN_IF_ERROR(iree_string_builder_reserve(
+      builder, iree_string_builder_size(builder) + n + /*NUL*/ 1));
+
+  if (builder->buffer != NULL &&
+      builder->size + (iree_host_size_t)n + 1 > builder->capacity) {
+    // Fixed-size storage that could not be grown: fail instead of silently
+    // truncating while over-reporting the size as the original code did.
+    // Bytes past |size| are unspecified on this path.
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "string builder storage exhausted");
+  }
+
+  // Try printing again (no-op sizing pass when there is no buffer).
+  vsnprintf(builder->buffer ? builder->buffer + builder->size : NULL,
+            builder->buffer ? builder->capacity - builder->size : 0, format,
+            varargs_1);
+  builder->size += n;
+  return iree_ok_status();
+}
+
+// Appends printf-style formatted text to the builder.
+// Two va_lists are started up front because the impl may need to traverse
+// the arguments twice (sizing pass + final print) and va_list cannot be
+// reused after vsnprintf consumes it.
+IREE_API_EXPORT iree_status_t IREE_PRINTF_ATTRIBUTE(2, 3)
+    iree_string_builder_append_format(iree_string_builder_t* builder,
+                                      const char* format, ...) {
+  va_list varargs_0, varargs_1;
+  va_start(varargs_0, format);
+  va_start(varargs_1, format);
+  iree_status_t status = iree_string_builder_append_format_impl(
+      builder, format, varargs_0, varargs_1);
+  va_end(varargs_1);
+  va_end(varargs_0);
+  return status;
+}
diff --git a/runtime/src/iree/base/string_builder.h b/runtime/src/iree/base/string_builder.h
new file mode 100644
index 0000000..ff6eeba
--- /dev/null
+++ b/runtime/src/iree/base/string_builder.h
@@ -0,0 +1,126 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_STRING_BUILDER_H_
+#define IREE_BASE_STRING_BUILDER_H_
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "iree/base/allocator.h"
+#include "iree/base/attributes.h"
+#include "iree/base/status.h"
+#include "iree/base/string_view.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Lightweight string builder.
+// Used to dynamically produce strings in a growable buffer.
+//
+// Usage:
+//  iree_string_builder_t builder;
+//  iree_string_builder_initialize(iree_allocator_system(), &builder);
+//  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(&builder, "hel"));
+//  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(&builder, "lo"));
+//  fprintf(stream, "%.*s", (int)iree_string_builder_size(&builder),
+//                          iree_string_builder_buffer(&builder));
+//  iree_string_builder_deinitialize(&builder);
+//
+// Usage for preallocation:
+//  iree_string_builder_t builder;
+//  iree_string_builder_initialize(iree_allocator_null(), &builder);
+//  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(&builder, "123"));
+//  // str_length is total number of characters (excluding NUL).
+//  iree_host_size_t str_length = iree_string_builder_size(&builder);
+//  iree_string_builder_deinitialize(&builder);
+typedef struct iree_string_builder_t {
+  // Allocator used for buffer storage.
+  // May be iree_allocator_null() to have the builder total up the required
+  // size.
+  iree_allocator_t allocator;
+  // Allocated storage buffer, if any. Kept NUL-terminated by the append
+  // routines when storage is present.
+  char* buffer;
+  // Total length of the string in the buffer in characters (excluding NUL).
+  iree_host_size_t size;
+  // Total allocated buffer capacity in bytes (>= size + 1 when buffer set).
+  iree_host_size_t capacity;
+} iree_string_builder_t;
+
+// Initializes a string builder in |out_builder| with the given |allocator|.
+IREE_API_EXPORT void iree_string_builder_initialize(
+    iree_allocator_t allocator, iree_string_builder_t* out_builder);
+
+// Initializes a string builder in |out_builder| using the given storage.
+// Once the capacity is reached further appending will fail.
+IREE_API_EXPORT void iree_string_builder_initialize_with_storage(
+    char* buffer, iree_host_size_t buffer_capacity,
+    iree_string_builder_t* out_builder);
+
+// Deinitializes |builder| and releases allocated storage.
+IREE_API_EXPORT void iree_string_builder_deinitialize(
+    iree_string_builder_t* builder);
+
+// Returns a pointer into the builder storage.
+// The pointer is only valid so long as the string builder is initialized and
+// unmodified.
+IREE_API_EXPORT const char* iree_string_builder_buffer(
+    const iree_string_builder_t* builder);
+
+// Returns the total length of the string in the buffer in characters (excluding
+// NUL).
+IREE_API_EXPORT iree_host_size_t
+iree_string_builder_size(const iree_string_builder_t* builder);
+
+// Returns the total allocated buffer capacity in bytes.
+IREE_API_EXPORT iree_host_size_t
+iree_string_builder_capacity(const iree_string_builder_t* builder);
+
+// Returns a string view into the builder storage.
+// The pointer is only valid so long as the string builder is initialized and
+// unmodified.
+IREE_API_EXPORT iree_string_view_t
+iree_string_builder_view(const iree_string_builder_t* builder);
+
+// Releases the storage from the builder and returns ownership to the caller.
+// The caller must free the string using the same allocator used by the builder.
+// Returns NULL if the string builder is empty.
+//
+// Usage:
+//  iree_string_builder_t builder;
+//  iree_string_builder_initialize(iree_allocator_system(), &builder);
+//  ...
+//  char* buffer = iree_string_builder_take_storage(&builder);
+//  iree_host_size_t buffer_size = iree_string_builder_size(&builder);
+//  iree_string_builder_deinitialize(&builder);
+//  ...
+//  iree_allocator_free(iree_allocator_system(), buffer);
+IREE_API_EXPORT IREE_MUST_USE_RESULT char* iree_string_builder_take_storage(
+    iree_string_builder_t* builder);
+
+// Reserves storage for at least |minimum_capacity|.
+IREE_API_EXPORT iree_status_t iree_string_builder_reserve(
+    iree_string_builder_t* builder, iree_host_size_t minimum_capacity);
+
+// Appends a string to the builder.
+IREE_API_EXPORT iree_status_t iree_string_builder_append_string(
+    iree_string_builder_t* builder, iree_string_view_t value);
+
+// Appends a NUL-terminated C string to the builder.
+IREE_API_EXPORT iree_status_t iree_string_builder_append_cstring(
+    iree_string_builder_t* builder, const char* value);
+
+// Appends a printf-style formatted string to the builder.
+IREE_API_EXPORT IREE_PRINTF_ATTRIBUTE(2, 3) iree_status_t
+    iree_string_builder_append_format(iree_string_builder_t* builder,
+                                      const char* format, ...);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_STRING_BUILDER_H_
diff --git a/runtime/src/iree/base/string_builder_test.cc b/runtime/src/iree/base/string_builder_test.cc
new file mode 100644
index 0000000..fad7034
--- /dev/null
+++ b/runtime/src/iree/base/string_builder_test.cc
@@ -0,0 +1,164 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <string>
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+// RAII convenience wrapper around iree_string_builder_t for the tests:
+// deinitializes on destruction and converts implicitly to the C pointer.
+struct StringBuilder {
+  // Heap-backed builder using the system allocator.
+  static StringBuilder MakeSystem() {
+    iree_string_builder_t builder;
+    iree_string_builder_initialize(iree_allocator_system(), &builder);
+    return StringBuilder(builder);
+  }
+
+  // Size-calculation-only builder (null allocator, no storage).
+  static StringBuilder MakeEmpty() {
+    iree_string_builder_t builder;
+    iree_string_builder_initialize(iree_allocator_null(), &builder);
+    return StringBuilder(builder);
+  }
+
+  // Takes over the C builder; std::move on the POD struct is just a copy.
+  explicit StringBuilder(iree_string_builder_t builder)
+      : builder(std::move(builder)) {}
+
+  ~StringBuilder() { iree_string_builder_deinitialize(&builder); }
+
+  // Implicit conversion so instances can be passed straight to the C API.
+  operator iree_string_builder_t*() { return &builder; }
+
+  std::string ToString() const {
+    return std::string(builder.buffer, builder.size);
+  }
+
+  iree_string_builder_t builder;
+};
+
+// Size-calculation mode (null allocator): all queries on a fresh builder
+// report empty/zero and there is no storage to take.
+TEST(StringBuilderTest, QueryEmpty) {
+  auto builder = StringBuilder::MakeEmpty();
+  EXPECT_EQ(iree_string_builder_buffer(builder),
+            static_cast<const char*>(NULL));
+  EXPECT_EQ(iree_string_builder_size(builder), 0);
+  EXPECT_EQ(iree_string_builder_capacity(builder), 0);
+  EXPECT_TRUE(iree_string_view_is_empty(iree_string_builder_view(builder)));
+  EXPECT_EQ(iree_string_builder_take_storage(builder),
+            static_cast<char*>(NULL));
+}
+
+// Size-calculation mode accumulates sizes without copying any bytes.
+TEST(StringBuilderTest, QueryAppendString) {
+  auto builder = StringBuilder::MakeEmpty();
+  EXPECT_EQ(iree_string_builder_size(builder), 0);
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, ""));
+  EXPECT_EQ(iree_string_builder_size(builder), 0);
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, "a"));
+  EXPECT_EQ(iree_string_builder_size(builder), 1);
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, "abc"));
+  EXPECT_EQ(iree_string_builder_size(builder), 1 + 3);
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, ""));
+  EXPECT_EQ(iree_string_builder_size(builder), 1 + 3);
+
+  char kLongString[1024];
+  memset(kLongString, 'x', IREE_ARRAYSIZE(kLongString));
+  IREE_EXPECT_OK(iree_string_builder_append_string(
+      builder,
+      iree_make_string_view(kLongString, IREE_ARRAYSIZE(kLongString))));
+  EXPECT_EQ(iree_string_builder_size(builder),
+            1 + 3 + IREE_ARRAYSIZE(kLongString));
+}
+
+// Size-calculation mode also works for formatted appends (vsnprintf sizing).
+TEST(StringBuilderTest, QueryFormat) {
+  auto builder = StringBuilder::MakeEmpty();
+  EXPECT_EQ(iree_string_builder_size(builder), 0);
+  IREE_EXPECT_OK(iree_string_builder_append_format(builder, ""));
+  EXPECT_EQ(iree_string_builder_size(builder), 0);
+  IREE_EXPECT_OK(iree_string_builder_append_format(builder, "abc"));
+  EXPECT_EQ(iree_string_builder_size(builder), 3);
+  IREE_EXPECT_OK(iree_string_builder_append_format(builder, "a%cc", 'b'));
+  EXPECT_EQ(iree_string_builder_size(builder), 6);
+  // %*c with width 1024 expands to 1023 padding spaces plus 'x'.
+  IREE_EXPECT_OK(iree_string_builder_append_format(builder, "%*c", 1024, 'x'));
+  EXPECT_EQ(iree_string_builder_size(builder), 6 + 1024);
+}
+
+// Heap-backed builder: empty state queries and take_storage of nothing.
+TEST(StringBuilderTest, Empty) {
+  auto builder = StringBuilder::MakeSystem();
+  EXPECT_EQ(iree_string_builder_size(builder), 0);
+  EXPECT_GE(iree_string_builder_capacity(builder), 0);
+  EXPECT_TRUE(iree_string_view_is_empty(iree_string_builder_view(builder)));
+  EXPECT_EQ(iree_string_builder_take_storage(builder),
+            static_cast<char*>(NULL));
+}
+
+// Appends accumulate contents and keep the buffer NUL-terminated
+// (the strlen checks verify the trailing NUL).
+TEST(StringBuilderTest, AppendString) {
+  auto builder = StringBuilder::MakeSystem();
+  EXPECT_EQ(iree_string_builder_size(builder), 0);
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, ""));
+  EXPECT_EQ(builder.ToString(), "");
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, "a"));
+  EXPECT_EQ(builder.ToString(), "a");
+  EXPECT_EQ(strlen(builder.builder.buffer), 1);  // NUL check
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, "abc"));
+  EXPECT_EQ(builder.ToString(), "aabc");
+  EXPECT_EQ(strlen(builder.builder.buffer), 1 + 3);  // NUL check
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, ""));
+  EXPECT_EQ(builder.ToString(), "aabc");
+  EXPECT_EQ(iree_string_builder_size(builder), 1 + 3);
+  EXPECT_EQ(strlen(builder.builder.buffer), 1 + 3);  // NUL check
+
+  // Large append forces at least one buffer growth.
+  char kLongString[1024];
+  memset(kLongString, 'x', IREE_ARRAYSIZE(kLongString));
+  IREE_EXPECT_OK(iree_string_builder_append_string(
+      builder,
+      iree_make_string_view(kLongString, IREE_ARRAYSIZE(kLongString))));
+  EXPECT_EQ(iree_string_builder_size(builder),
+            1 + 3 + IREE_ARRAYSIZE(kLongString));
+  EXPECT_EQ(strlen(builder.builder.buffer),
+            1 + 3 + IREE_ARRAYSIZE(kLongString));  // NUL check
+  EXPECT_EQ(builder.ToString(),
+            std::string("aabc") +
+                std::string(kLongString, IREE_ARRAYSIZE(kLongString)));
+}
+
+// take_storage transfers ownership to the caller and resets the builder;
+// the caller frees with the builder's allocator.
+TEST(StringBuilderTest, TakeStorage) {
+  auto builder = StringBuilder::MakeSystem();
+  EXPECT_EQ(iree_string_builder_size(builder), 0);
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, "a"));
+  EXPECT_EQ(builder.ToString(), "a");
+  IREE_EXPECT_OK(iree_string_builder_append_cstring(builder, "abc"));
+  EXPECT_EQ(builder.ToString(), "aabc");
+  EXPECT_EQ(iree_string_builder_size(builder), 1 + 3);
+  EXPECT_EQ(strlen(builder.builder.buffer),
+            1 + 3);  // NUL check
+
+  char* storage = iree_string_builder_take_storage(builder);
+  EXPECT_EQ(iree_string_builder_buffer(builder),
+            static_cast<const char*>(NULL));
+  EXPECT_EQ(iree_string_builder_size(builder), 0);
+  EXPECT_EQ(iree_string_builder_capacity(builder), 0);
+  EXPECT_NE(storage, static_cast<char*>(NULL));
+  EXPECT_STREQ(storage, "aabc");
+  EXPECT_EQ(builder.builder.buffer, static_cast<char*>(NULL));
+  iree_allocator_free(builder.builder.allocator, storage);
+}
+
+// Formatted appends write real bytes (incl. a growth-forcing %*c expansion).
+TEST(StringBuilderTest, Format) {
+  auto builder = StringBuilder::MakeSystem();
+  EXPECT_EQ(builder.ToString(), "");
+  IREE_EXPECT_OK(iree_string_builder_append_format(builder, ""));
+  EXPECT_EQ(builder.ToString(), "");
+  IREE_EXPECT_OK(iree_string_builder_append_format(builder, "abc"));
+  EXPECT_EQ(builder.ToString(), "abc");
+  IREE_EXPECT_OK(iree_string_builder_append_format(builder, "a%cc", 'b'));
+  EXPECT_EQ(builder.ToString(), "abcabc");
+  IREE_EXPECT_OK(iree_string_builder_append_format(builder, "%*c", 1024, 'x'));
+  EXPECT_EQ(iree_string_builder_size(builder), 6 + 1024);
+  EXPECT_EQ(strlen(builder.builder.buffer), 6 + 1024);  // NUL check
+  EXPECT_EQ(builder.ToString(),
+            std::string("abcabc") + std::string(1023, ' ') + std::string("x"));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/base/string_view.c b/runtime/src/iree/base/string_view.c
new file mode 100644
index 0000000..f117939
--- /dev/null
+++ b/runtime/src/iree/base/string_view.c
@@ -0,0 +1,387 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/string_view.h"
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+
+// Returns the smaller of two sizes (local helper; avoids pulling in a
+// generic min macro for this translation unit).
+static inline size_t iree_min_host_size(size_t a, size_t b) {
+  return a < b ? a : b;
+}
+
+// Returns true if both views have identical length and bytes.
+// Byte-wise comparison: embedded NUL bytes participate like any other byte.
+IREE_API_EXPORT bool iree_string_view_equal(iree_string_view_t lhs,
+                                            iree_string_view_t rhs) {
+  return lhs.size == rhs.size &&
+         (lhs.size == 0 || memcmp(lhs.data, rhs.data, lhs.size) == 0);
+}
+
+// Lexicographically compares two views (<0, 0, >0 like strcmp).
+// Uses memcmp rather than strncmp: views are not NUL-terminated and may
+// legitimately contain embedded NUL bytes, which strncmp would stop at and
+// thus compare incorrectly.
+IREE_API_EXPORT int iree_string_view_compare(iree_string_view_t lhs,
+                                             iree_string_view_t rhs) {
+  iree_host_size_t min_size = iree_min_host_size(lhs.size, rhs.size);
+  int cmp = min_size > 0 ? memcmp(lhs.data, rhs.data, min_size) : 0;
+  if (cmp != 0) {
+    return cmp;
+  } else if (lhs.size == rhs.size) {
+    return 0;
+  }
+  // Shared prefix equal: the shorter view orders first.
+  return lhs.size < rhs.size ? -1 : 1;
+}
+
+// Returns the index of the first occurrence of |c| at or after |pos|, or
+// IREE_STRING_VIEW_NPOS if not found (or |pos| is out of range).
+IREE_API_EXPORT iree_host_size_t iree_string_view_find_char(
+    iree_string_view_t value, char c, iree_host_size_t pos) {
+  if (iree_string_view_is_empty(value) || pos >= value.size) {
+    return IREE_STRING_VIEW_NPOS;
+  }
+  const char* result =
+      (const char*)(memchr(value.data + pos, c, value.size - pos));
+  return result != NULL ? result - value.data : IREE_STRING_VIEW_NPOS;
+}
+
+// Returns the index of the first character at or after |pos| that appears in
+// |s|, or IREE_STRING_VIEW_NPOS if none does.
+IREE_API_EXPORT iree_host_size_t iree_string_view_find_first_of(
+    iree_string_view_t value, iree_string_view_t s, iree_host_size_t pos) {
+  if (iree_string_view_is_empty(value) || iree_string_view_is_empty(s)) {
+    return IREE_STRING_VIEW_NPOS;
+  }
+  if (s.size == 1) {
+    // Avoid the cost of the lookup table for a single-character search.
+    return iree_string_view_find_char(value, s.data[0], pos);
+  }
+  // O(1) membership table over the search set, then a single linear scan.
+  bool lookup_table[UCHAR_MAX + 1] = {0};
+  for (iree_host_size_t i = 0; i < s.size; ++i) {
+    lookup_table[(uint8_t)s.data[i]] = true;
+  }
+  for (iree_host_size_t i = pos; i < value.size; ++i) {
+    if (lookup_table[(uint8_t)value.data[i]]) {
+      return i;
+    }
+  }
+  return IREE_STRING_VIEW_NPOS;
+}
+
+// Returns the index of the last character at or before |pos| that appears in
+// |s|, or IREE_STRING_VIEW_NPOS if none does. Pass IREE_STRING_VIEW_NPOS as
+// |pos| to search the entire view.
+IREE_API_EXPORT iree_host_size_t iree_string_view_find_last_of(
+    iree_string_view_t value, iree_string_view_t s, iree_host_size_t pos) {
+  if (iree_string_view_is_empty(value) || iree_string_view_is_empty(s)) {
+    return IREE_STRING_VIEW_NPOS;
+  }
+  // O(1) membership table over the search set.
+  bool lookup_table[UCHAR_MAX + 1] = {0};
+  for (iree_host_size_t i = 0; i < s.size; ++i) {
+    lookup_table[(uint8_t)s.data[i]] = true;
+  }
+  // Clamp |pos| to the last valid index (value is non-empty here). The
+  // original `iree_min(pos, value.size) + 1` read one byte past the end of
+  // the view whenever pos >= value.size (e.g. the NPOS default).
+  iree_host_size_t i = iree_min(pos, value.size - 1) + 1;
+  while (i != 0) {
+    --i;
+    if (lookup_table[(uint8_t)value.data[i]]) {
+      return i;
+    }
+  }
+  return IREE_STRING_VIEW_NPOS;
+}
+
+// Returns true if |value| begins with |prefix|.
+// NOTE: an empty or NULL |prefix| returns false (preserved existing
+// behavior), as does a prefix longer than |value|. memcmp (not strncmp) so
+// embedded NUL bytes compare correctly in non-NUL-terminated views.
+IREE_API_EXPORT bool iree_string_view_starts_with(iree_string_view_t value,
+                                                  iree_string_view_t prefix) {
+  if (!value.data || !prefix.data || !prefix.size || prefix.size > value.size) {
+    return false;
+  }
+  return memcmp(value.data, prefix.data, prefix.size) == 0;
+}
+
+// Returns true if |value| ends with |suffix|; same empty/NULL semantics and
+// embedded-NUL handling as iree_string_view_starts_with.
+IREE_API_EXPORT bool iree_string_view_ends_with(iree_string_view_t value,
+                                                iree_string_view_t suffix) {
+  if (!value.data || !suffix.data || !suffix.size || suffix.size > value.size) {
+    return false;
+  }
+  return memcmp(value.data + value.size - suffix.size, suffix.data,
+                suffix.size) == 0;
+}
+
+// Returns |value| with its first |n| characters removed; removing |n| >=
+// value.size yields the empty view.
+IREE_API_EXPORT iree_string_view_t
+iree_string_view_remove_prefix(iree_string_view_t value, iree_host_size_t n) {
+  if (n >= value.size) {
+    return iree_string_view_empty();
+  }
+  return iree_make_string_view(value.data + n, value.size - n);
+}
+
+// Returns |value| with its last |n| characters removed; removing |n| >=
+// value.size yields the empty view.
+IREE_API_EXPORT iree_string_view_t
+iree_string_view_remove_suffix(iree_string_view_t value, iree_host_size_t n) {
+  if (n >= value.size) {
+    return iree_string_view_empty();
+  }
+  return iree_make_string_view(value.data, value.size - n);
+}
+
+// Returns |value| without |prefix| if present, otherwise |value| unchanged.
+IREE_API_EXPORT iree_string_view_t iree_string_view_strip_prefix(
+    iree_string_view_t value, iree_string_view_t prefix) {
+  if (iree_string_view_starts_with(value, prefix)) {
+    return iree_string_view_remove_prefix(value, prefix.size);
+  }
+  return value;
+}
+
+// Returns |value| without |suffix| if present, otherwise |value| unchanged.
+IREE_API_EXPORT iree_string_view_t iree_string_view_strip_suffix(
+    iree_string_view_t value, iree_string_view_t suffix) {
+  if (iree_string_view_ends_with(value, suffix)) {
+    return iree_string_view_remove_suffix(value, suffix.size);
+  }
+  return value;
+}
+
+// In-place variant: strips |prefix| from |*value| and returns true if it was
+// present; leaves |*value| unchanged and returns false otherwise.
+IREE_API_EXPORT bool iree_string_view_consume_prefix(
+    iree_string_view_t* value, iree_string_view_t prefix) {
+  if (iree_string_view_starts_with(*value, prefix)) {
+    *value = iree_string_view_remove_prefix(*value, prefix.size);
+    return true;
+  }
+  return false;
+}
+
+// In-place variant: strips |suffix| from |*value| and returns true if it was
+// present; leaves |*value| unchanged and returns false otherwise.
+IREE_API_EXPORT bool iree_string_view_consume_suffix(
+    iree_string_view_t* value, iree_string_view_t suffix) {
+  if (iree_string_view_ends_with(*value, suffix)) {
+    *value = iree_string_view_remove_suffix(*value, suffix.size);
+    return true;
+  }
+  return false;
+}
+
+// Returns |value| with leading and trailing whitespace (per isspace) removed.
+// All-whitespace input yields an empty view anchored past the last character.
+IREE_API_EXPORT iree_string_view_t
+iree_string_view_trim(iree_string_view_t value) {
+  if (iree_string_view_is_empty(value)) return value;
+  iree_host_size_t start = 0;
+  iree_host_size_t end = value.size - 1;
+  // Cast through unsigned char: passing a negative char (e.g. UTF-8 bytes on
+  // platforms where char is signed) to isspace() is undefined behavior.
+  while (start <= end && isspace((unsigned char)value.data[start])) {
+    ++start;
+  }
+  while (end > start && isspace((unsigned char)value.data[end])) {
+    --end;
+  }
+  return iree_make_string_view(value.data + start, end - start + 1);
+}
+
+// Returns the sub-view [pos, pos+n); both |pos| and |n| are clamped to the
+// bounds of |value| so out-of-range requests are safe (NPOS = "to the end").
+IREE_API_EXPORT iree_string_view_t iree_string_view_substr(
+    iree_string_view_t value, iree_host_size_t pos, iree_host_size_t n) {
+  pos = iree_min_host_size(pos, value.size);
+  n = iree_min_host_size(n, value.size - pos);
+  return iree_make_string_view(value.data + pos, n);
+}
+
+// Splits |value| at the first occurrence of |split_char| into |out_lhs| and
+// |out_rhs| (either may be NULL if not needed) and returns the split offset,
+// or -1 when the character is not found (lhs then receives all of |value|).
+IREE_API_EXPORT intptr_t iree_string_view_split(iree_string_view_t value,
+                                                char split_char,
+                                                iree_string_view_t* out_lhs,
+                                                iree_string_view_t* out_rhs) {
+  // Tolerate NULL outputs consistently: the original dereferenced both
+  // pointers here before its later NULL checks, crashing for callers that
+  // only want one side of the split.
+  if (out_lhs) *out_lhs = iree_string_view_empty();
+  if (out_rhs) *out_rhs = iree_string_view_empty();
+  if (!value.data || !value.size) {
+    return -1;
+  }
+  const void* first_ptr = memchr(value.data, split_char, value.size);
+  if (!first_ptr) {
+    if (out_lhs) *out_lhs = value;
+    return -1;
+  }
+  intptr_t offset = (intptr_t)((const char*)(first_ptr)-value.data);
+  if (out_lhs) {
+    out_lhs->data = value.data;
+    out_lhs->size = offset;
+  }
+  if (out_rhs) {
+    out_rhs->data = value.data + offset + 1;
+    out_rhs->size = value.size - offset - 1;
+  }
+  return offset;
+}
+
+// Replaces every occurrence of |old_char| with |new_char| in place.
+// NOTE: casts away the view's const — the caller must guarantee the view
+// references mutable storage (not a string literal).
+IREE_API_EXPORT void iree_string_view_replace_char(iree_string_view_t value,
+                                                   char old_char,
+                                                   char new_char) {
+  char* p = (char*)value.data;
+  for (iree_host_size_t i = 0; i < value.size; ++i) {
+    if (p[i] == old_char) p[i] = new_char;
+  }
+}
+
+// Recursive wildcard matcher: `?` matches exactly one character, `*` matches
+// zero or more characters. Worst case is exponential in the number of `*`s
+// (classic backtracking match) — acceptable for the short patterns used here.
+static bool iree_string_view_match_pattern_impl(iree_string_view_t value,
+                                                iree_string_view_t pattern) {
+  // Consume the literal (wildcard-free) prefix of the pattern in one step.
+  iree_host_size_t next_char_index = iree_string_view_find_first_of(
+      pattern, iree_make_cstring_view("*?"), /*pos=*/0);
+  if (next_char_index == IREE_STRING_VIEW_NPOS) {
+    // No wildcards remain: must match exactly.
+    return iree_string_view_equal(value, pattern);
+  } else if (next_char_index > 0) {
+    iree_string_view_t value_prefix =
+        iree_string_view_substr(value, 0, next_char_index);
+    iree_string_view_t pattern_prefix =
+        iree_string_view_substr(pattern, 0, next_char_index);
+    if (!iree_string_view_equal(value_prefix, pattern_prefix)) {
+      return false;
+    }
+    value =
+        iree_string_view_substr(value, next_char_index, IREE_STRING_VIEW_NPOS);
+    pattern = iree_string_view_substr(pattern, next_char_index,
+                                      IREE_STRING_VIEW_NPOS);
+  }
+  if (iree_string_view_is_empty(value) && iree_string_view_is_empty(pattern)) {
+    return true;
+  }
+  // The pattern now starts with a wildcard (or the value is empty).
+  char pattern_char = pattern.data[0];
+  if (pattern_char == '*' && pattern.size > 1 &&
+      iree_string_view_is_empty(value)) {
+    return false;
+  } else if (pattern_char == '*' && pattern.size == 1) {
+    // Trailing `*` matches any remainder.
+    return true;
+  } else if (pattern_char == '?' || value.data[0] == pattern_char) {
+    // Consume one character from both sides.
+    return iree_string_view_match_pattern_impl(
+        iree_string_view_substr(value, 1, IREE_STRING_VIEW_NPOS),
+        iree_string_view_substr(pattern, 1, IREE_STRING_VIEW_NPOS));
+  } else if (pattern_char == '*') {
+    // Branch: `*` matches zero characters, or one more character.
+    return iree_string_view_match_pattern_impl(
+               value,
+               iree_string_view_substr(pattern, 1, IREE_STRING_VIEW_NPOS)) ||
+           iree_string_view_match_pattern_impl(
+               iree_string_view_substr(value, 1, IREE_STRING_VIEW_NPOS),
+               pattern);
+  }
+  return false;
+}
+
+// Public entry point for the `*`/`?` wildcard matcher above.
+IREE_API_EXPORT bool iree_string_view_match_pattern(
+    iree_string_view_t value, iree_string_view_t pattern) {
+  return iree_string_view_match_pattern_impl(value, pattern);
+}
+
+// Copies |source_value| into |buffer| (which the caller must have sized to at
+// least source_value.size bytes), points |target_value| at the copy, and
+// returns the number of bytes written. No NUL terminator is appended.
+IREE_API_EXPORT iree_host_size_t iree_string_view_append_to_buffer(
+    iree_string_view_t source_value, iree_string_view_t* target_value,
+    char* buffer) {
+  memcpy(buffer, source_value.data, source_value.size);
+  target_value->data = buffer;
+  target_value->size = source_value.size;
+  return source_value.size;
+}
+
+// NOTE: these implementations aren't great due to the enforced memcpy we
+// perform. These _should_ never be on a hot path, though, so this keeps our
+// code size small.
+
+// Parses a signed 32-bit integer (base auto-detected per strtol: 0x hex,
+// leading-0 octal, else decimal). Returns false on empty/unparseable input,
+// overflow, or values outside the int32_t range; trailing non-numeric
+// characters after a valid number are ignored (existing behavior).
+IREE_API_EXPORT bool iree_string_view_atoi_int32(iree_string_view_t value,
+                                                 int32_t* out_value) {
+  // Copy to scratch memory with a NUL terminator.
+  char temp[16] = {0};
+  if (value.size >= IREE_ARRAYSIZE(temp)) return false;
+  memcpy(temp, value.data, value.size);
+
+  // Attempt to parse.
+  errno = 0;
+  char* end = NULL;
+  long parsed_value = strtol(temp, &end, 0);
+  if (temp == end) return false;
+  if ((parsed_value == LONG_MIN || parsed_value == LONG_MAX) &&
+      errno == ERANGE) {
+    return false;
+  }
+  // On LP64 platforms long is 64-bit: reject values that would silently
+  // truncate when narrowed to int32_t (up to 15 digits fit in temp).
+  if (parsed_value < INT32_MIN || parsed_value > INT32_MAX) return false;
+  *out_value = (int32_t)parsed_value;
+  return parsed_value != 0 || errno == 0;
+}
+
+// Parses an unsigned 32-bit integer (base auto-detected per strtoul).
+// Returns false on empty/unparseable input, overflow, or values outside the
+// uint32_t range. NOTE(review): strtoul accepts a leading '-' and wraps —
+// preserved here; confirm whether negative inputs should be rejected.
+IREE_API_EXPORT bool iree_string_view_atoi_uint32(iree_string_view_t value,
+                                                  uint32_t* out_value) {
+  // Copy to scratch memory with a NUL terminator.
+  char temp[16] = {0};
+  if (value.size >= IREE_ARRAYSIZE(temp)) return false;
+  memcpy(temp, value.data, value.size);
+
+  // Attempt to parse.
+  errno = 0;
+  char* end = NULL;
+  unsigned long parsed_value = strtoul(temp, &end, 0);
+  if (temp == end) return false;
+  if (parsed_value == ULONG_MAX && errno == ERANGE) return false;
+  // On LP64 platforms unsigned long is 64-bit: reject values that would
+  // silently truncate when narrowed to uint32_t.
+  if (parsed_value > UINT32_MAX) return false;
+  *out_value = (uint32_t)parsed_value;
+  return parsed_value != 0 || errno == 0;
+}
+
+// Parses a signed 64-bit integer (base auto-detected per strtoll).
+// Returns false on empty/unparseable input or overflow; trailing non-numeric
+// characters after a valid number are ignored.
+IREE_API_EXPORT bool iree_string_view_atoi_int64(iree_string_view_t value,
+                                                 int64_t* out_value) {
+  // Copy to scratch memory with a NUL terminator.
+  char temp[32] = {0};
+  if (value.size >= IREE_ARRAYSIZE(temp)) return false;
+  memcpy(temp, value.data, value.size);
+
+  // Attempt to parse.
+  errno = 0;
+  char* end = NULL;
+  long long parsed_value = strtoll(temp, &end, 0);
+  if (temp == end) return false;
+  // strtoll saturates to LLONG_MIN/LLONG_MAX and sets ERANGE on overflow.
+  if ((parsed_value == LLONG_MIN || parsed_value == LLONG_MAX) &&
+      errno == ERANGE) {
+    return false;
+  }
+  *out_value = (int64_t)parsed_value;
+  return parsed_value != 0 || errno == 0;
+}
+
+// Parses an unsigned 64-bit integer (base auto-detected per strtoull).
+// Returns false on empty/unparseable input or overflow.
+IREE_API_EXPORT bool iree_string_view_atoi_uint64(iree_string_view_t value,
+                                                  uint64_t* out_value) {
+  // Copy to scratch memory with a NUL terminator.
+  char temp[32] = {0};
+  if (value.size >= IREE_ARRAYSIZE(temp)) return false;
+  memcpy(temp, value.data, value.size);
+
+  // Attempt to parse.
+  errno = 0;
+  char* end = NULL;
+  unsigned long long parsed_value = strtoull(temp, &end, 0);
+  if (temp == end) return false;
+  if (parsed_value == ULLONG_MAX && errno == ERANGE) return false;
+  *out_value = (uint64_t)parsed_value;
+  return parsed_value != 0 || errno == 0;
+}
+
+// Parses a float via strtof. Returns false on empty/unparseable input or
+// when strtof reports an error via errno for a zero result (e.g. underflow
+// to 0 with ERANGE); a nonzero parse succeeds regardless of errno.
+IREE_API_EXPORT bool iree_string_view_atof(iree_string_view_t value,
+                                           float* out_value) {
+  // Copy to scratch memory with a NUL terminator.
+  char temp[32] = {0};
+  if (value.size >= IREE_ARRAYSIZE(temp)) return false;
+  memcpy(temp, value.data, value.size);
+
+  // Attempt to parse.
+  errno = 0;
+  char* end = NULL;
+  *out_value = strtof(temp, &end);
+  if (temp == end) return false;
+  return *out_value != 0 || errno == 0;
+}
+
+// Parses a double via strtod; same error semantics as iree_string_view_atof.
+IREE_API_EXPORT bool iree_string_view_atod(iree_string_view_t value,
+                                           double* out_value) {
+  // Copy to scratch memory with a NUL terminator.
+  char temp[32] = {0};
+  if (value.size >= IREE_ARRAYSIZE(temp)) return false;
+  memcpy(temp, value.data, value.size);
+
+  // Attempt to parse.
+  errno = 0;
+  char* end = NULL;
+  *out_value = strtod(temp, &end);
+  if (temp == end) return false;
+  return *out_value != 0 || errno == 0;
+}
diff --git a/runtime/src/iree/base/string_view.h b/runtime/src/iree/base/string_view.h
new file mode 100644
index 0000000..5d191a4
--- /dev/null
+++ b/runtime/src/iree/base/string_view.h
@@ -0,0 +1,176 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_STRING_VIEW_H_
+#define IREE_BASE_STRING_VIEW_H_
+
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/attributes.h"
+#include "iree/base/config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#define IREE_STRING_VIEW_NPOS SIZE_MAX
+
+// A string view (ala std::string_view) into a non-NUL-terminated string.
+typedef struct iree_string_view_t {
+  const char* data;
+  iree_host_size_t size;
+} iree_string_view_t;
+
+// Returns an empty string view ("").
+static inline iree_string_view_t iree_string_view_empty(void) {
+  iree_string_view_t v = {0, 0};
+  return v;
+}
+
+// Returns true if the given string view is the empty string.
+#define iree_string_view_is_empty(sv) (((sv).data == NULL) || ((sv).size == 0))
+
+static inline iree_string_view_t iree_make_string_view(
+    const char* str, iree_host_size_t str_length) {
+  iree_string_view_t v = {str, str_length};
+  return v;
+}
+
+// Returns a string view initialized with a reference to the given
+// NUL-terminated string literal.
+static inline iree_string_view_t iree_make_cstring_view(const char* str) {
+  iree_string_view_t v = {str, strlen(str)};
+  return v;
+}
+
+#define iree_string_view_literal(str) \
+  { .data = (str), .size = IREE_ARRAYSIZE(str) - 1 }
+
+// Returns a string view initialized with the given cstring.
+#define IREE_SV(cstr) iree_make_cstring_view(cstr)
+
+// Returns a string view initialized with the given string literal.
+#define IREE_SVL(cstr) iree_string_view_literal(cstr)
+
+// Returns true if the two strings are equal (compare == 0).
+IREE_API_EXPORT bool iree_string_view_equal(iree_string_view_t lhs,
+                                            iree_string_view_t rhs);
+
+// Like std::string::compare but with iree_string_view_t values.
+IREE_API_EXPORT int iree_string_view_compare(iree_string_view_t lhs,
+                                             iree_string_view_t rhs);
+
+// Finds the first occurrence of |c| in |value| starting at |pos|.
+// Returns the found character position or IREE_STRING_VIEW_NPOS if not found.
+IREE_API_EXPORT iree_host_size_t iree_string_view_find_char(
+    iree_string_view_t value, char c, iree_host_size_t pos);
+
+// Returns the index of the first occurrence of one of the characters in |s| or
+// IREE_STRING_VIEW_NPOS if none of the characters were found.
+IREE_API_EXPORT iree_host_size_t iree_string_view_find_first_of(
+    iree_string_view_t value, iree_string_view_t s, iree_host_size_t pos);
+
+// Returns the index of the last occurrence of one of the characters in |s| or
+// IREE_STRING_VIEW_NPOS if none of the characters were found.
+IREE_API_EXPORT iree_host_size_t iree_string_view_find_last_of(
+    iree_string_view_t value, iree_string_view_t s, iree_host_size_t pos);
+
+// Returns true if the string starts with the given prefix.
+IREE_API_EXPORT bool iree_string_view_starts_with(iree_string_view_t value,
+                                                  iree_string_view_t prefix);
+
+// Returns true if the string ends with the given suffix.
+IREE_API_EXPORT bool iree_string_view_ends_with(iree_string_view_t value,
+                                                iree_string_view_t suffix);
+
+// Removes the first |n| characters from the string view (not the data).
+IREE_API_EXPORT iree_string_view_t
+iree_string_view_remove_prefix(iree_string_view_t value, iree_host_size_t n);
+
+// Removes the last |n| characters from the string view (not the data).
+IREE_API_EXPORT iree_string_view_t
+iree_string_view_remove_suffix(iree_string_view_t value, iree_host_size_t n);
+
+// Removes the given substring prefix from the string view if present.
+IREE_API_EXPORT iree_string_view_t iree_string_view_strip_prefix(
+    iree_string_view_t value, iree_string_view_t prefix);
+
+// Removes the given substring suffix from the string view if present.
+IREE_API_EXPORT iree_string_view_t iree_string_view_strip_suffix(
+    iree_string_view_t value, iree_string_view_t suffix);
+
+// Removes the given substring prefix from the string view if present in-place.
+// Returns true if the strip succeeded.
+IREE_API_EXPORT bool iree_string_view_consume_prefix(iree_string_view_t* value,
+                                                     iree_string_view_t prefix);
+
+// Removes the given substring suffix from the string view if present in-place.
+// Returns true if the strip succeeded.
+IREE_API_EXPORT bool iree_string_view_consume_suffix(iree_string_view_t* value,
+                                                     iree_string_view_t suffix);
+
+// Removes leading and trailing whitespace.
+IREE_API_EXPORT iree_string_view_t
+iree_string_view_trim(iree_string_view_t value);
+
+// Returns a substring of the string view at offset |pos| and length |n|.
+// Use |n| == INTPTR_MAX to take the remainder of the string after |pos|.
+// Returns empty string on failure.
+IREE_API_EXPORT iree_string_view_t iree_string_view_substr(
+    iree_string_view_t value, iree_host_size_t pos, iree_host_size_t n);
+
+// Splits |value| into two parts based on the first occurrence of |split_char|.
+// Returns the index of the |split_char| in the original |value| or -1 if not
+// found.
+IREE_API_EXPORT intptr_t iree_string_view_split(iree_string_view_t value,
+                                                char split_char,
+                                                iree_string_view_t* out_lhs,
+                                                iree_string_view_t* out_rhs);
+
+// Replaces all occurrences of |old_char| with |new_char|.
+IREE_API_EXPORT void iree_string_view_replace_char(iree_string_view_t value,
+                                                   char old_char,
+                                                   char new_char);
+
+// Returns true if the given |value| matches |pattern| (normal * and ? rules).
+// This accepts wildcards in the form of '*' and '?' for any delimited value.
+// '*' will match zero or more of any character and '?' will match exactly one
+// of any character.
+//
+// For example,
+// 'foo-*-bar' matches: 'foo-123-bar', 'foo-456-789-bar'
+// 'foo-10?' matches: 'foo-101', 'foo-102'
+IREE_API_EXPORT bool iree_string_view_match_pattern(iree_string_view_t value,
+                                                    iree_string_view_t pattern);
+
+// Copies the string bytes into the target buffer and returns the number of
+// characters copied. Does not include a NUL terminator.
+IREE_API_EXPORT iree_host_size_t iree_string_view_append_to_buffer(
+    iree_string_view_t source_value, iree_string_view_t* target_value,
+    char* buffer);
+
+IREE_API_EXPORT bool iree_string_view_atoi_int32(iree_string_view_t value,
+                                                 int32_t* out_value);
+IREE_API_EXPORT bool iree_string_view_atoi_uint32(iree_string_view_t value,
+                                                  uint32_t* out_value);
+IREE_API_EXPORT bool iree_string_view_atoi_int64(iree_string_view_t value,
+                                                 int64_t* out_value);
+IREE_API_EXPORT bool iree_string_view_atoi_uint64(iree_string_view_t value,
+                                                  uint64_t* out_value);
+IREE_API_EXPORT bool iree_string_view_atof(iree_string_view_t value,
+                                           float* out_value);
+IREE_API_EXPORT bool iree_string_view_atod(iree_string_view_t value,
+                                           double* out_value);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_STRING_VIEW_H_
diff --git a/runtime/src/iree/base/string_view_test.cc b/runtime/src/iree/base/string_view_test.cc
new file mode 100644
index 0000000..ac5a713
--- /dev/null
+++ b/runtime/src/iree/base/string_view_test.cc
@@ -0,0 +1,365 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <string>
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+
+namespace {
+
+std::string ToString(iree_string_view_t value) {
+  return std::string(value.data, value.size);
+}
+
+TEST(StringViewTest, Equal) {
+  auto equal = [](const char* lhs, const char* rhs) -> bool {
+    return iree_string_view_equal(iree_make_cstring_view(lhs),
+                                  iree_make_cstring_view(rhs));
+  };
+  EXPECT_TRUE(equal("", ""));
+  EXPECT_FALSE(equal("a", ""));
+  EXPECT_FALSE(equal("", "a"));
+  EXPECT_TRUE(equal("a", "a"));
+  EXPECT_FALSE(equal("a", "ab"));
+  EXPECT_FALSE(equal("b", "ab"));
+  EXPECT_TRUE(equal("abc", "abc"));
+  EXPECT_FALSE(equal("abc", "aBc"));
+}
+
+TEST(StringViewTest, FindChar) {
+  auto find_char = [](const char* value, char c, iree_host_size_t pos) {
+    return iree_string_view_find_char(iree_make_cstring_view(value), c, pos);
+  };
+  EXPECT_EQ(find_char("", 'x', 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_char("", 'x', 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_char("", 'x', IREE_STRING_VIEW_NPOS), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_char("x", 'x', 0), 0);
+  EXPECT_EQ(find_char("x", 'x', 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_char("x", 'x', IREE_STRING_VIEW_NPOS), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_char("abc", 'x', 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_char("abc", 'x', 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_char("abc", 'x', IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_char("axbxc", 'x', 0), 1);
+  EXPECT_EQ(find_char("axbxc", 'x', 1), 1);
+  EXPECT_EQ(find_char("axbxc", 'x', 2), 3);
+  EXPECT_EQ(find_char("axbxc", 'x', 3), 3);
+  EXPECT_EQ(find_char("axbxc", 'x', 4), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_char("axbxc", 'x', IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+}
+
+TEST(StringViewTest, FindFirstOf) {
+  auto find_first_of = [](const char* value, const char* s,
+                          iree_host_size_t pos) {
+    return iree_string_view_find_first_of(iree_make_cstring_view(value),
+                                          iree_make_cstring_view(s), pos);
+  };
+  EXPECT_EQ(find_first_of("", "", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("", "", 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("", "", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("", "x", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("", "x", 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("", "x", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("x", "x", 0), 0);
+  EXPECT_EQ(find_first_of("x", "x", 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("x", "x", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("x", "", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("x", "", 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("x", "", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("abc", "x", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("abc", "x", 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("abc", "x", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("axbxc", "xy", 0), 1);
+  EXPECT_EQ(find_first_of("axbxc", "xy", 1), 1);
+  EXPECT_EQ(find_first_of("axbxc", "xy", 2), 3);
+  EXPECT_EQ(find_first_of("axbxc", "xy", 3), 3);
+  EXPECT_EQ(find_first_of("axbxc", "xy", 4), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("axbxc", "xy", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("aybxc", "xy", 0), 1);
+  EXPECT_EQ(find_first_of("aybxc", "xy", 1), 1);
+  EXPECT_EQ(find_first_of("aybxc", "xy", 2), 3);
+  EXPECT_EQ(find_first_of("aybxc", "xy", 3), 3);
+  EXPECT_EQ(find_first_of("aybxc", "xy", 4), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_first_of("aybxc", "xy", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+}
+
+TEST(StringViewTest, FindLastOf) {
+  auto find_last_of = [](const char* value, const char* s,
+                         iree_host_size_t pos) {
+    return iree_string_view_find_last_of(iree_make_cstring_view(value),
+                                         iree_make_cstring_view(s), pos);
+  };
+  EXPECT_EQ(find_last_of("", "", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("", "", 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("", "", IREE_STRING_VIEW_NPOS), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("", "x", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("", "x", 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("", "x", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("x", "x", 0), 0);
+  EXPECT_EQ(find_last_of("x", "x", 1), 0);
+  EXPECT_EQ(find_last_of("x", "x", IREE_STRING_VIEW_NPOS), 0);
+  EXPECT_EQ(find_last_of("x", "", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("x", "", 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("x", "", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("abc", "x", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("abc", "x", 1), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("abc", "x", IREE_STRING_VIEW_NPOS),
+            IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("axbxc", "xy", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("axbxc", "xy", 1), 1);
+  EXPECT_EQ(find_last_of("axbxc", "xy", 2), 1);
+  EXPECT_EQ(find_last_of("axbxc", "xy", 3), 3);
+  EXPECT_EQ(find_last_of("axbxc", "xy", 4), 3);
+  EXPECT_EQ(find_last_of("axbxc", "xy", IREE_STRING_VIEW_NPOS), 3);
+  EXPECT_EQ(find_last_of("aybxc", "xy", 0), IREE_STRING_VIEW_NPOS);
+  EXPECT_EQ(find_last_of("aybxc", "xy", 1), 1);
+  EXPECT_EQ(find_last_of("aybxc", "xy", 2), 1);
+  EXPECT_EQ(find_last_of("aybxc", "xy", 3), 3);
+  EXPECT_EQ(find_last_of("aybxc", "xy", 4), 3);
+  EXPECT_EQ(find_last_of("aybxc", "xy", IREE_STRING_VIEW_NPOS), 3);
+}
+
+TEST(StringViewTest, StartsWith) {
+  auto starts_with = [](const char* value, const char* prefix) -> bool {
+    return iree_string_view_starts_with(iree_make_cstring_view(value),
+                                        iree_make_cstring_view(prefix));
+  };
+  EXPECT_TRUE(starts_with("a", "a"));
+  EXPECT_TRUE(starts_with("ab", "a"));
+  EXPECT_TRUE(starts_with("ab", "ab"));
+  EXPECT_TRUE(starts_with("abc", "ab"));
+  EXPECT_TRUE(starts_with("abc", "abc"));
+  EXPECT_FALSE(starts_with("abc", ""));
+  EXPECT_FALSE(starts_with("", ""));
+  EXPECT_FALSE(starts_with("", "a"));
+  EXPECT_FALSE(starts_with("", "abc"));
+  EXPECT_FALSE(starts_with("abc", "b"));
+  EXPECT_FALSE(starts_with("abc", "bc"));
+  EXPECT_FALSE(starts_with("a", "abc"));
+}
+
+TEST(StringViewTest, EndsWith) {
+  auto ends_with = [](const char* value, const char* suffix) -> bool {
+    return iree_string_view_ends_with(iree_make_cstring_view(value),
+                                      iree_make_cstring_view(suffix));
+  };
+  EXPECT_TRUE(ends_with("a", "a"));
+  EXPECT_TRUE(ends_with("ab", "b"));
+  EXPECT_TRUE(ends_with("ab", "ab"));
+  EXPECT_TRUE(ends_with("abc", "bc"));
+  EXPECT_TRUE(ends_with("abc", "c"));
+  EXPECT_FALSE(ends_with("abc", ""));
+  EXPECT_FALSE(ends_with("", ""));
+  EXPECT_FALSE(ends_with("", "a"));
+  EXPECT_FALSE(ends_with("", "abc"));
+  EXPECT_FALSE(ends_with("abc", "b"));
+  EXPECT_FALSE(ends_with("abc", "ab"));
+  EXPECT_FALSE(ends_with("a", "abc"));
+}
+
+TEST(StringViewTest, RemovePrefix) {
+  auto remove_prefix = [](const char* value,
+                          iree_host_size_t n) -> std::string {
+    return ToString(
+        iree_string_view_remove_prefix(iree_make_cstring_view(value), n));
+  };
+  EXPECT_EQ(remove_prefix("", 0), "");
+  EXPECT_EQ(remove_prefix("", 1), "");
+  EXPECT_EQ(remove_prefix("a", 10), "");
+  EXPECT_EQ(remove_prefix("ab", 1), "b");
+  EXPECT_EQ(remove_prefix("ab", 2), "");
+  EXPECT_EQ(remove_prefix("abcdef", 2), "cdef");
+}
+
+TEST(StringViewTest, RemoveSuffix) {
+  auto remove_suffix = [](const char* value,
+                          iree_host_size_t n) -> std::string {
+    return ToString(
+        iree_string_view_remove_suffix(iree_make_cstring_view(value), n));
+  };
+  EXPECT_EQ(remove_suffix("", 0), "");
+  EXPECT_EQ(remove_suffix("", 1), "");
+  EXPECT_EQ(remove_suffix("a", 10), "");
+  EXPECT_EQ(remove_suffix("ab", 1), "a");
+  EXPECT_EQ(remove_suffix("ab", 2), "");
+  EXPECT_EQ(remove_suffix("abcdef", 2), "abcd");
+}
+
+TEST(StringViewTest, StripPrefix) {
+  auto strip_prefix = [](const char* value, const char* prefix) -> std::string {
+    return ToString(iree_string_view_strip_prefix(
+        iree_make_cstring_view(value), iree_make_cstring_view(prefix)));
+  };
+  EXPECT_EQ(strip_prefix("", ""), "");
+  EXPECT_EQ(strip_prefix("", "a"), "");
+  EXPECT_EQ(strip_prefix("a", ""), "a");
+  EXPECT_EQ(strip_prefix("a", "a"), "");
+  EXPECT_EQ(strip_prefix("ab", "a"), "b");
+  EXPECT_EQ(strip_prefix("ab", "b"), "ab");
+  EXPECT_EQ(strip_prefix("ab", "ab"), "");
+  EXPECT_EQ(strip_prefix("ab", "abc"), "ab");
+  EXPECT_EQ(strip_prefix("abcdef", "ab"), "cdef");
+  EXPECT_EQ(strip_prefix("abcdef", "bc"), "abcdef");
+}
+
+TEST(StringViewTest, StripSuffix) {
+  auto strip_suffix = [](const char* value, const char* suffix) -> std::string {
+    return ToString(iree_string_view_strip_suffix(
+        iree_make_cstring_view(value), iree_make_cstring_view(suffix)));
+  };
+  EXPECT_EQ(strip_suffix("", ""), "");
+  EXPECT_EQ(strip_suffix("", "a"), "");
+  EXPECT_EQ(strip_suffix("a", ""), "a");
+  EXPECT_EQ(strip_suffix("a", "a"), "");
+  EXPECT_EQ(strip_suffix("ab", "a"), "ab");
+  EXPECT_EQ(strip_suffix("ab", "b"), "a");
+  EXPECT_EQ(strip_suffix("ab", "ab"), "");
+  EXPECT_EQ(strip_suffix("ab", "abc"), "ab");
+  EXPECT_EQ(strip_suffix("abcdef", "ef"), "abcd");
+  EXPECT_EQ(strip_suffix("abcdef", "de"), "abcdef");
+}
+
+TEST(StringViewTest, ConsumePrefix) {
+  auto consume_prefix = [](const char* value,
+                           const char* prefix) -> std::string {
+    iree_string_view_t value_sv = iree_make_cstring_view(value);
+    if (iree_string_view_consume_prefix(&value_sv,
+                                        iree_make_cstring_view(prefix))) {
+      return ToString(value_sv);
+    } else {
+      return "FAILED";
+    }
+  };
+  EXPECT_EQ(consume_prefix("", ""), "FAILED");
+  EXPECT_EQ(consume_prefix("", "a"), "FAILED");
+  EXPECT_EQ(consume_prefix("a", ""), "FAILED");
+  EXPECT_EQ(consume_prefix("a", "a"), "");
+  EXPECT_EQ(consume_prefix("ab", "a"), "b");
+  EXPECT_EQ(consume_prefix("ab", "b"), "FAILED");
+  EXPECT_EQ(consume_prefix("ab", "ab"), "");
+  EXPECT_EQ(consume_prefix("ab", "abc"), "FAILED");
+  EXPECT_EQ(consume_prefix("abcdef", "ab"), "cdef");
+  EXPECT_EQ(consume_prefix("abcdef", "bc"), "FAILED");
+}
+
+TEST(StringViewTest, ConsumeSuffix) {
+  auto consume_suffix = [](const char* value,
+                           const char* suffix) -> std::string {
+    iree_string_view_t value_sv = iree_make_cstring_view(value);
+    if (iree_string_view_consume_suffix(&value_sv,
+                                        iree_make_cstring_view(suffix))) {
+      return ToString(value_sv);
+    } else {
+      return "FAILED";
+    }
+  };
+  EXPECT_EQ(consume_suffix("", ""), "FAILED");
+  EXPECT_EQ(consume_suffix("", "a"), "FAILED");
+  EXPECT_EQ(consume_suffix("a", ""), "FAILED");
+  EXPECT_EQ(consume_suffix("a", "a"), "");
+  EXPECT_EQ(consume_suffix("ab", "a"), "FAILED");
+  EXPECT_EQ(consume_suffix("ab", "b"), "a");
+  EXPECT_EQ(consume_suffix("ab", "ab"), "");
+  EXPECT_EQ(consume_suffix("ab", "abc"), "FAILED");
+  EXPECT_EQ(consume_suffix("abcdef", "ef"), "abcd");
+  EXPECT_EQ(consume_suffix("abcdef", "de"), "FAILED");
+}
+
+TEST(StringViewTest, Trim) {
+  auto trim = [](const char* value) -> std::string {
+    return ToString(iree_string_view_trim(iree_make_cstring_view(value)));
+  };
+  EXPECT_EQ(trim(""), "");
+  EXPECT_EQ(trim("a"), "a");
+  EXPECT_EQ(trim(" a"), "a");
+  EXPECT_EQ(trim("a "), "a");
+  EXPECT_EQ(trim("a b"), "a b");
+  EXPECT_EQ(trim(" a b "), "a b");
+  EXPECT_EQ(trim("\t\t\na b\n \t "), "a b");
+  EXPECT_EQ(trim("\n"), "");
+  EXPECT_EQ(trim("\r\n"), "");
+}
+
+TEST(StringViewTest, Substr) {
+  auto substr = [](const char* value, iree_host_size_t pos,
+                   iree_host_size_t n) {
+    return ToString(
+        iree_string_view_substr(iree_make_cstring_view(value), pos, n));
+  };
+  EXPECT_EQ(substr("", 0, 0), "");
+  EXPECT_EQ(substr("", 0, 1), "");
+  EXPECT_EQ(substr("", 0, INTPTR_MAX), "");
+  EXPECT_EQ(substr("", 1, 0), "");
+  EXPECT_EQ(substr("", 1, 1), "");
+  EXPECT_EQ(substr("", 1, INTPTR_MAX), "");
+
+  EXPECT_EQ(substr("a", 0, 0), "");
+  EXPECT_EQ(substr("a", 0, 1), "a");
+  EXPECT_EQ(substr("a", 0, 2), "a");
+  EXPECT_EQ(substr("a", 0, INTPTR_MAX), "a");
+  EXPECT_EQ(substr("a", 1, 0), "");
+  EXPECT_EQ(substr("a", 1, 1), "");
+  EXPECT_EQ(substr("a", 1, INTPTR_MAX), "");
+
+  EXPECT_EQ(substr("abc", 0, 1), "a");
+  EXPECT_EQ(substr("abc", 1, 1), "b");
+  EXPECT_EQ(substr("abc", 2, 1), "c");
+  EXPECT_EQ(substr("abc", 0, 2), "ab");
+  EXPECT_EQ(substr("abc", 1, 2), "bc");
+  EXPECT_EQ(substr("abc", 1, INTPTR_MAX), "bc");
+  EXPECT_EQ(substr("abc", 0, 3), "abc");
+  EXPECT_EQ(substr("abc", 0, INTPTR_MAX), "abc");
+}
+
+TEST(StringViewTest, Split) {
+  auto split =
+      [](const char* value,
+         char split_char) -> std::tuple<intptr_t, std::string, std::string> {
+    iree_string_view_t lhs;
+    iree_string_view_t rhs;
+    intptr_t index = iree_string_view_split(iree_make_cstring_view(value),
+                                            split_char, &lhs, &rhs);
+    return std::make_tuple(index, ToString(lhs), ToString(rhs));
+  };
+  EXPECT_EQ(split("", 'x'), std::make_tuple(-1, "", ""));
+  EXPECT_EQ(split(" ", 'x'), std::make_tuple(-1, " ", ""));
+  EXPECT_EQ(split("x", 'x'), std::make_tuple(0, "", ""));
+  EXPECT_EQ(split(" x ", 'x'), std::make_tuple(1, " ", " "));
+  EXPECT_EQ(split("axb", 'x'), std::make_tuple(1, "a", "b"));
+  EXPECT_EQ(split("axxxb", 'x'), std::make_tuple(1, "a", "xxb"));
+  EXPECT_EQ(split("ax", 'x'), std::make_tuple(1, "a", ""));
+  EXPECT_EQ(split("xb", 'x'), std::make_tuple(0, "", "b"));
+  EXPECT_EQ(split("axbxc", 'x'), std::make_tuple(1, "a", "bxc"));
+}
+
+TEST(StringViewTest, ReplaceChar) {
+  auto replace_char = [](const char* value, char old_char, char new_char) {
+    std::string value_clone(value);
+    iree_string_view_replace_char(
+        iree_make_string_view(value_clone.data(), value_clone.size()), old_char,
+        new_char);
+    return value_clone;
+  };
+  EXPECT_EQ(replace_char("", 'x', 'y'), "");
+  EXPECT_EQ(replace_char(" ", 'x', 'y'), " ");
+  EXPECT_EQ(replace_char("a", 'x', 'y'), "a");
+  EXPECT_EQ(replace_char("x", 'x', 'y'), "y");
+  EXPECT_EQ(replace_char("xx", 'x', 'y'), "yy");
+  EXPECT_EQ(replace_char("axbxc", 'x', 'y'), "aybyc");
+}
+
+}  // namespace
diff --git a/runtime/src/iree/base/target_platform.h b/runtime/src/iree/base/target_platform.h
new file mode 100644
index 0000000..a15f80c
--- /dev/null
+++ b/runtime/src/iree/base/target_platform.h
@@ -0,0 +1,293 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_TARGET_PLATFORM_H_
+#define IREE_BASE_TARGET_PLATFORM_H_
+
+#include <assert.h>
+#include <stdint.h>
+
+// The build system defines one of the following top-level platforms and then
+// one platform+architecture pair for that platform.
+//
+// IREE_ARCH ("arm_32", "arm_64", etc)
+// IREE_ARCH_ARM_32
+// IREE_ARCH_ARM_64
+// IREE_ARCH_RISCV_32
+// IREE_ARCH_RISCV_64
+// IREE_ARCH_WASM_32
+// IREE_ARCH_WASM_64
+// IREE_ARCH_X86_32
+// IREE_ARCH_X86_64
+//
+// IREE_PTR_SIZE
+// IREE_PTR_SIZE_32
+// IREE_PTR_SIZE_64
+//
+// IREE_ENDIANNESS_LITTLE
+// IREE_ENDIANNESS_BIG
+//
+// IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED (0/1)
+//
+// IREE_COMPILER_CLANG
+// IREE_COMPILER_GCC
+// IREE_COMPILER_GCC_COMPAT
+// IREE_COMPILER_MSVC
+//
+// IREE_SANITIZER_ADDRESS
+// IREE_SANITIZER_MEMORY
+// IREE_SANITIZER_THREAD
+//
+// IREE_PLATFORM_ANDROID
+// IREE_PLATFORM_ANDROID_EMULATOR
+// IREE_PLATFORM_APPLE (IOS | MACOS)
+// IREE_PLATFORM_EMSCRIPTEN
+// IREE_PLATFORM_GENERIC
+// IREE_PLATFORM_IOS
+// IREE_PLATFORM_IOS_SIMULATOR
+// IREE_PLATFORM_LINUX
+// IREE_PLATFORM_MACOS
+// IREE_PLATFORM_WINDOWS
+
+//==============================================================================
+// IREE_ARCH_*
+//==============================================================================
+
+#if defined(__arm__) || defined(__arm64) || defined(__aarch64__) || \
+    defined(__thumb__) || defined(__TARGET_ARCH_ARM) ||             \
+    defined(__TARGET_ARCH_THUMB) || defined(_M_ARM)
+#if defined(__arm64) || defined(__aarch64__)
+#define IREE_ARCH "arm_64"
+#define IREE_ARCH_ARM_64 1
+#else
+#define IREE_ARCH "arm_32"
+#define IREE_ARCH_ARM_32 1
+#endif  // __arm64
+#endif  // ARM
+
+#if defined(__wasm32__)
+#define IREE_ARCH "wasm_32"
+#define IREE_ARCH_WASM_32 1
+#elif defined(__wasm64__)
+#define IREE_ARCH "wasm_64"
+#define IREE_ARCH_WASM_64 1
+#endif  // WASM
+
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || \
+    defined(__i686__) || defined(__i386) || defined(_M_IX86) || defined(_X86_)
+#define IREE_ARCH "x86_32"
+#define IREE_ARCH_X86_32 1
+#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64__) || \
+    defined(__amd64) || defined(_M_X64)
+#define IREE_ARCH "x86_64"
+#define IREE_ARCH_X86_64 1
+#endif  // X86
+
+#if defined(__riscv) && (__riscv_xlen == 32)
+#define IREE_ARCH "riscv_32"
+#define IREE_ARCH_RISCV_32 1
+#elif defined(__riscv) && (__riscv_xlen == 64)
+#define IREE_ARCH "riscv_64"
+#define IREE_ARCH_RISCV_64 1
+#endif
+
+#if !defined(IREE_ARCH_ARM_32) && !defined(IREE_ARCH_ARM_64) &&     \
+    !defined(IREE_ARCH_RISCV_32) && !defined(IREE_ARCH_RISCV_64) && \
+    !defined(IREE_ARCH_WASM_32) && !defined(IREE_ARCH_WASM_64) &&   \
+    !defined(IREE_ARCH_X86_32) && !defined(IREE_ARCH_X86_64)
+#error Unknown architecture.
+#endif  // all archs
+
+//==============================================================================
+// IREE_PTR_SIZE_*
+//==============================================================================
+
+// See https://stackoverflow.com/q/51616057
+static_assert(sizeof(void*) == sizeof(uintptr_t),
+              "can't determine pointer size");
+
+#if UINTPTR_MAX == 0xFFFFFFFF
+#define IREE_PTR_SIZE_32
+#define IREE_PTR_SIZE 4
+#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
+#define IREE_PTR_SIZE_64
+#define IREE_PTR_SIZE 8
+#else
+#error "can't determine pointer size"
+#endif
+
+//==============================================================================
+// IREE_ENDIANNESS_*
+//==============================================================================
+// https://en.wikipedia.org/wiki/Endianness
+
+#if (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+     __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define IREE_ENDIANNESS_LITTLE 1
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define IREE_ENDIANNESS_BIG 1
+#elif defined(_WIN32)
+#define IREE_ENDIANNESS_LITTLE 1
+#else
+#error IREE endian detection needs to be set up for your compiler
+#endif  // __BYTE_ORDER__
+
+//==============================================================================
+// IREE_MEMORY_ACCESS_*
+//==============================================================================
+// Certain architectures have specific memory access requirements that require
+// user-mode code changes to work at all or work at reasonable performance.
+
+#if !defined(IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED)
+
+#if defined(IREE_ARCH_ARM_32) || defined(IREE_ARCH_ARM_64)
+
+// Armv6-M and Armv8-M (w/o the main extension) do not support unaligned access.
+// The -munaligned-access and -mno-unaligned-access flags control this.
+// https://www.keil.com/support/man/docs/armclang_ref/armclang_ref_sam1444138667173.htm
+#if !defined(__ARM_FEATURE_UNALIGNED)
+#define IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED 1
+#else
+#define IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED 0
+#endif  // !__ARM_FEATURE_UNALIGNED
+
+#elif defined(IREE_ARCH_RISCV_32) || defined(IREE_ARCH_RISCV_64)
+
+// Though unaligned access is part of the base spec it is allowed to be
+// implemented with trap handlers. Bare-metal systems likely won't have these
+// handlers and even on systems that do (linux) we don't want to be trapping for
+// every load/store.
+#define IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED 1
+
+#endif  // IREE_ARCH_*
+
+#else
+#define IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED 0
+#endif  // !IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED
+
+//==============================================================================
+// IREE_COMPILER_*
+//==============================================================================
+
+#if defined(__clang__)
+#define IREE_COMPILER_CLANG 1
+#define IREE_COMPILER_GCC_COMPAT 1
+#elif defined(__GNUC__)
+#define IREE_COMPILER_GCC 1
+#define IREE_COMPILER_GCC_COMPAT 1
+#elif defined(_MSC_VER)
+#define IREE_COMPILER_MSVC 1
+#else
+#error Unrecognized compiler.
+#endif  // compiler versions
+
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define IREE_SANITIZER_ADDRESS 1
+#endif  // __has_feature(address_sanitizer)
+#if __has_feature(memory_sanitizer)
+#define IREE_SANITIZER_MEMORY 1
+#endif  // __has_feature(memory_sanitizer)
+#if __has_feature(thread_sanitizer)
+#define IREE_SANITIZER_THREAD 1
+#endif  // __has_feature(thread_sanitizer)
+#endif  // defined(__has_feature)
+
+//==============================================================================
+// IREE_COMPILER_HAS_BUILTIN_DEBUG_TRAP
+//==============================================================================
+
+#if defined __has_builtin
+#if __has_builtin(__builtin_debugtrap)
+#define IREE_COMPILER_HAS_BUILTIN_DEBUG_TRAP 1
+#endif
+#endif
+
+//==============================================================================
+// IREE_PLATFORM_ANDROID
+//==============================================================================
+
+#if defined(__ANDROID__)
+#define IREE_PLATFORM_ANDROID 1
+#endif  // __ANDROID__
+
+//==============================================================================
+// IREE_PLATFORM_EMSCRIPTEN
+//==============================================================================
+
+#if defined(__EMSCRIPTEN__)
+#define IREE_PLATFORM_EMSCRIPTEN 1
+#endif  // __EMSCRIPTEN__
+
+//==============================================================================
+// IREE_PLATFORM_IOS | IREE_PLATFORM_MACOS
+//==============================================================================
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>  // IWYU pragma: export
+#if TARGET_OS_IPHONE
+#define IREE_PLATFORM_IOS 1
+#else
+#define IREE_PLATFORM_MACOS 1
+#endif  // TARGET_OS_IPHONE
+#if TARGET_IPHONE_SIMULATOR
+#define IREE_PLATFORM_IOS_SIMULATOR 1
+#endif  // TARGET_IPHONE_SIMULATOR
+#endif  // __APPLE__
+
+#if defined(IREE_PLATFORM_IOS) || defined(IREE_PLATFORM_MACOS)
+#define IREE_PLATFORM_APPLE 1
+#endif  // IREE_PLATFORM_IOS || IREE_PLATFORM_MACOS
+
+//==============================================================================
+// IREE_PLATFORM_LINUX
+//==============================================================================
+
+#if defined(__linux__) || defined(linux) || defined(__linux)
+#define IREE_PLATFORM_LINUX 1
+#endif  // __linux__
+
+//==============================================================================
+// IREE_PLATFORM_WINDOWS
+//==============================================================================
+
+#if defined(_WIN32) || defined(_WIN64)
+#define IREE_PLATFORM_WINDOWS 1
+#endif  // _WIN32 || _WIN64
+
+#if defined(IREE_PLATFORM_WINDOWS)
+
+#if defined(_MSC_VER)
+// Abseil compatibility: don't include incompatible winsock versions.
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif  // WIN32_LEAN_AND_MEAN
+// Abseil compatibility: don't define min and max macros.
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif  // NOMINMAX
+#endif  // _MSC_VER
+
+#include <windows.h>  // IWYU pragma: export
+
+// WinGDI.h defines `ERROR`, undef to avoid conflict naming.
+#undef ERROR
+
+#endif  // IREE_PLATFORM_WINDOWS
+
+//==============================================================================
+// Fallthrough for unsupported platforms
+//==============================================================================
+
+#if !defined(IREE_PLATFORM_ANDROID) && !defined(IREE_PLATFORM_EMSCRIPTEN) && \
+    !defined(IREE_PLATFORM_GENERIC) && !defined(IREE_PLATFORM_IOS) &&        \
+    !defined(IREE_PLATFORM_LINUX) && !defined(IREE_PLATFORM_MACOS) &&        \
+    !defined(IREE_PLATFORM_WINDOWS)
+#error Unknown platform.
+#endif  // all platforms
+
+#endif  // IREE_BASE_TARGET_PLATFORM_H_
diff --git a/runtime/src/iree/base/testing/BUILD b/runtime/src/iree/base/testing/BUILD
new file mode 100644
index 0000000..abc160b
--- /dev/null
+++ b/runtime/src/iree/base/testing/BUILD
@@ -0,0 +1,44 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+cc_binary(
+    name = "dynamic_library_test_library.so",
+    testonly = True,
+    srcs = ["dynamic_library_test_library.cc"],
+    linkshared = True,
+)
+
+c_embed_data(
+    name = "dynamic_library_test_library",
+    testonly = True,
+    srcs = [":dynamic_library_test_library.so"],
+    c_file_output = "dynamic_library_test_library_embed.c",
+    flatten = True,
+    h_file_output = "dynamic_library_test_library_embed.h",
+)
+
+iree_runtime_cc_test(
+    name = "dynamic_library_test",
+    srcs = ["dynamic_library_test.cc"],
+    deps = [
+        ":dynamic_library_test_library",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:logging",
+        "//runtime/src/iree/base/internal:dynamic_library",
+        "//runtime/src/iree/base/internal:file_io",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
diff --git a/runtime/src/iree/base/testing/CMakeLists.txt b/runtime/src/iree/base/testing/CMakeLists.txt
new file mode 100644
index 0000000..2daa820
--- /dev/null
+++ b/runtime/src/iree/base/testing/CMakeLists.txt
@@ -0,0 +1,46 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# TODO(scotttodd): clean up bazel_to_cmake handling here
+#   * this is a cc_binary in Bazel, but `linkshared` fits iree_cc_library better
+#   * the output file name is platform-specific, get it with $<TARGET_FILE:>
+iree_cc_library(
+  NAME
+    dynamic_library_test_library.so
+  SRCS
+    "dynamic_library_test_library.cc"
+  TESTONLY
+  SHARED
+)
+
+iree_c_embed_data(
+  NAME
+    dynamic_library_test_library
+  GENERATED_SRCS
+    "$<TARGET_FILE:iree::base::testing::dynamic_library_test_library.so>"
+  C_FILE_OUTPUT
+    "dynamic_library_test_library_embed.c"
+  H_FILE_OUTPUT
+    "dynamic_library_test_library_embed.h"
+  TESTONLY
+  FLATTEN
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    dynamic_library_test
+  SRCS
+    "dynamic_library_test.cc"
+  DEPS
+    ::dynamic_library_test_library
+    iree::base
+    iree::base::internal::dynamic_library
+    iree::base::internal::file_io
+    iree::base::logging
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
diff --git a/runtime/src/iree/base/testing/dynamic_library_test.cc b/runtime/src/iree/base/testing/dynamic_library_test.cc
new file mode 100644
index 0000000..a63338b
--- /dev/null
+++ b/runtime/src/iree/base/testing/dynamic_library_test.cc
@@ -0,0 +1,138 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/dynamic_library.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/file_io.h"
+#include "iree/base/logging.h"
+#include "iree/base/testing/dynamic_library_test_library_embed.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace {
+
+using iree::testing::status::StatusIs;
+
+static const char* kUnknownName = "library_that_does_not_exist.so";
+
+class DynamicLibraryTest : public ::testing::Test {
+ public:
+  static std::string GetTempFilename(const char* suffix) {
+    static int unique_id = 0;
+    char* test_tmpdir = getenv("TEST_TMPDIR");
+    if (!test_tmpdir) {
+      test_tmpdir = getenv("TMPDIR");
+    }
+    if (!test_tmpdir) {
+      test_tmpdir = getenv("TEMP");
+    }
+    IREE_CHECK(test_tmpdir) << "TEST_TMPDIR/TMPDIR/TEMP not defined";
+    return test_tmpdir + std::string("/iree_test_") +
+           std::to_string(unique_id++) + suffix;
+  }
+
+  static void SetUpTestCase() {
+    // Making files available to tests, particularly across operating systems
+    // and build tools (Bazel/CMake) is complicated. Rather than include a test
+    // dynamic library as a "testdata" file, we use c_embed_data to package
+    // the file so it's embedded in a C module, then write that embedded file
+    // to a platform/test-environment specific temp file for loading.
+
+    // System APIs for loading dynamic libraries typically require an extension.
+#if defined(IREE_PLATFORM_WINDOWS)
+    static constexpr const char* ext = ".dll";
+#else
+    static constexpr const char* ext = ".so";
+#endif
+    library_temp_path_ = GetTempFilename(ext);
+
+    const struct iree_file_toc_t* file_toc =
+        dynamic_library_test_library_create();
+    IREE_ASSERT_OK(iree_file_write_contents(
+        library_temp_path_.c_str(),
+        iree_make_const_byte_span(file_toc->data, file_toc->size)));
+
+    std::cout << "Embedded test library written to temp path: "
+              << library_temp_path_;
+  }
+
+  static std::string library_temp_path_;
+};
+
+std::string DynamicLibraryTest::library_temp_path_;
+
+TEST_F(DynamicLibraryTest, LoadLibrarySuccess) {
+  iree_dynamic_library_t* library = NULL;
+  IREE_ASSERT_OK(iree_dynamic_library_load_from_file(
+      library_temp_path_.c_str(), IREE_DYNAMIC_LIBRARY_FLAG_NONE,
+      iree_allocator_system(), &library));
+  iree_dynamic_library_release(library);
+}
+
+TEST_F(DynamicLibraryTest, LoadLibraryFailure) {
+  iree_dynamic_library_t* library = NULL;
+  iree_status_t status = iree_dynamic_library_load_from_file(
+      kUnknownName, IREE_DYNAMIC_LIBRARY_FLAG_NONE, iree_allocator_system(),
+      &library);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_NOT_FOUND, status);
+  iree_status_free(status);
+}
+
+TEST_F(DynamicLibraryTest, LoadLibraryTwice) {
+  iree_dynamic_library_t* library1 = NULL;
+  iree_dynamic_library_t* library2 = NULL;
+  IREE_ASSERT_OK(iree_dynamic_library_load_from_file(
+      library_temp_path_.c_str(), IREE_DYNAMIC_LIBRARY_FLAG_NONE,
+      iree_allocator_system(), &library1));
+  IREE_ASSERT_OK(iree_dynamic_library_load_from_file(
+      library_temp_path_.c_str(), IREE_DYNAMIC_LIBRARY_FLAG_NONE,
+      iree_allocator_system(), &library2));
+  iree_dynamic_library_release(library1);
+  iree_dynamic_library_release(library2);
+}
+
+TEST_F(DynamicLibraryTest, GetSymbolSuccess) {
+  iree_dynamic_library_t* library = NULL;
+  IREE_ASSERT_OK(iree_dynamic_library_load_from_file(
+      library_temp_path_.c_str(), IREE_DYNAMIC_LIBRARY_FLAG_NONE,
+      iree_allocator_system(), &library));
+
+  int (*fn_ptr)(int);
+  IREE_ASSERT_OK(iree_dynamic_library_lookup_symbol(library, "times_two",
+                                                    (void**)&fn_ptr));
+  ASSERT_NE(nullptr, fn_ptr);
+  EXPECT_EQ(246, fn_ptr(123));
+
+  iree_dynamic_library_release(library);
+}
+
+TEST_F(DynamicLibraryTest, GetSymbolFailure) {
+  iree_dynamic_library_t* library = NULL;
+  IREE_ASSERT_OK(iree_dynamic_library_load_from_file(
+      library_temp_path_.c_str(), IREE_DYNAMIC_LIBRARY_FLAG_NONE,
+      iree_allocator_system(), &library));
+
+  int (*fn_ptr)(int);
+  iree_status_t status =
+      iree_dynamic_library_lookup_symbol(library, "unknown", (void**)&fn_ptr);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_NOT_FOUND, status);
+  iree_status_free(status);
+  EXPECT_EQ(nullptr, fn_ptr);
+
+  iree_dynamic_library_release(library);
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/base/testing/dynamic_library_test_library.cc b/runtime/src/iree/base/testing/dynamic_library_test_library.cc
new file mode 100644
index 0000000..d356eeb
--- /dev/null
+++ b/runtime/src/iree/base/testing/dynamic_library_test_library.cc
@@ -0,0 +1,21 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#if defined(_WIN32)
+#define IREE_SYM_EXPORT __declspec(dllexport)
+#else
+#define IREE_SYM_EXPORT __attribute__((visibility("default")))
+#endif  // _WIN32
+
+int IREE_SYM_EXPORT times_two(int value) { return value * 2; }
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/runtime/src/iree/base/time.c b/runtime/src/iree/base/time.c
new file mode 100644
index 0000000..b9ad245
--- /dev/null
+++ b/runtime/src/iree/base/time.c
@@ -0,0 +1,182 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/time.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+IREE_API_EXPORT iree_time_t iree_time_now(void) {
+#if defined(IREE_TIME_NOW_FN)
+  IREE_TIME_NOW_FN
+#elif defined(IREE_PLATFORM_WINDOWS)
+  // GetSystemTimePreciseAsFileTime requires Windows 8, add a fallback
+  // (such as using std::chrono) if older support is needed.
+  FILETIME system_time;
+  GetSystemTimePreciseAsFileTime(&system_time);
+  const int64_t kUnixEpochStartTicks = 116444736000000000i64;
+  const int64_t kFtToNanoSec = 100;
+  LARGE_INTEGER li;
+  li.LowPart = system_time.dwLowDateTime;
+  li.HighPart = system_time.dwHighDateTime;
+  li.QuadPart -= kUnixEpochStartTicks;
+  li.QuadPart *= kFtToNanoSec;
+  return li.QuadPart;
+#elif defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_APPLE) || \
+    defined(IREE_PLATFORM_LINUX) || defined(IREE_PLATFORM_EMSCRIPTEN)
+  struct timespec clock_time;
+  clock_gettime(CLOCK_REALTIME, &clock_time);
+  return clock_time.tv_sec * 1000000000ull + clock_time.tv_nsec;
+#else
+#error "IREE system clock needs to be set up for your platform"
+#endif  // IREE_PLATFORM_*
+}
+
+IREE_API_EXPORT iree_time_t
+iree_relative_timeout_to_deadline_ns(iree_duration_t timeout_ns) {
+  if (timeout_ns == IREE_DURATION_ZERO) {
+    return IREE_TIME_INFINITE_PAST;
+  } else if (timeout_ns == IREE_DURATION_INFINITE) {
+    return IREE_TIME_INFINITE_FUTURE;
+  }
+  return iree_time_now() + timeout_ns;
+}
+
+IREE_API_EXPORT iree_duration_t
+iree_absolute_deadline_to_timeout_ns(iree_time_t deadline_ns) {
+  if (deadline_ns == IREE_TIME_INFINITE_PAST) {
+    return IREE_DURATION_ZERO;
+  } else if (deadline_ns == IREE_TIME_INFINITE_FUTURE) {
+    return IREE_DURATION_INFINITE;
+  } else {
+    iree_time_t now_ns = iree_time_now();
+    return deadline_ns < now_ns ? IREE_DURATION_ZERO : deadline_ns - now_ns;
+  }
+}
+
+IREE_API_EXPORT uint32_t
+iree_absolute_deadline_to_timeout_ms(iree_time_t deadline_ns) {
+  if (deadline_ns == IREE_TIME_INFINITE_PAST) {
+    return IREE_DURATION_ZERO;
+  } else if (deadline_ns == IREE_TIME_INFINITE_FUTURE) {
+    return UINT32_MAX;
+  } else {
+    // We have either already passed the deadline (and can turn this into a
+    // poll) or want to do nanos->millis. We round up so that a deadline of 1ns
+    // results in 1ms as it should still wait, vs. if it was actually 0ns
+    // indicating the user intended a poll.
+    iree_time_t now_ns = iree_time_now();
+    return deadline_ns < now_ns
+               ? IREE_DURATION_ZERO
+               : (deadline_ns - now_ns + 1000000 - 1) / 1000000ull;
+  }
+}
+
+#if defined(IREE_WAIT_UNTIL_FN)
+
+// Define IREE_WAIT_UNTIL_FN to call out to a user-configured function.
+static bool iree_wait_until_impl(iree_time_t deadline_ns) {
+  return IREE_WAIT_UNTIL_FN(deadline_ns);
+}
+
+#elif defined(IREE_PLATFORM_WINDOWS)
+
+// No good sleep APIs on Windows; we need to accumulate low-precision relative
+// waits to reach the absolute time. Lots of slop here, but we primarily use
+// nanoseconds as a uniform time API and don't guarantee that precision. Note
+// that we try to round up to ensure we wait until at least the requested time.
+static bool iree_wait_until_impl(iree_time_t deadline_ns) {
+  iree_time_t now_ns = iree_time_now();
+  while (now_ns < deadline_ns) {
+    iree_time_t delta_ns = deadline_ns - now_ns;
+    uint32_t delta_ms = (uint32_t)((delta_ns + 1000000 - 1) / 1000000ull);
+    if (delta_ms == 0) {
+      // Sleep(0) doesn't actually sleep and instead acts as a yield; instead of
+      // potentially spinning in a tight loop when we get down near the end of
+      // the wait we bail a bit early. We don't guarantee the precision of the
+      // waits so this is fine.
+      break;
+    }
+    Sleep(delta_ms);
+    now_ns = iree_time_now();
+  }
+  return true;
+}
+
+#elif (_POSIX_C_SOURCE >= 200112L) && defined(TIMER_ABSTIME)
+
+// This is widely available on *nix-like systems (linux/bsd/etc) and in
+// most libc implementations (glibc/musl/etc). It's the best as we get to
+// tell the system the exact time we want to sleep until.
+//
+// https://man7.org/linux/man-pages/man2/clock_nanosleep.2.html
+//
+// NOTE: we could save a syscall in many cases if we returned the time upon wake
+// from the API.
+static bool iree_wait_until_impl(iree_time_t deadline_ns) {
+  struct timespec ts = {
+      .tv_sec = (time_t)(deadline_ns / 1000000000ull),
+      .tv_nsec = (long)(deadline_ns % 1000000000ull),
+  };
+  int ret = clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &ts, NULL);
+  return ret == 0;
+}
+
+#elif (_POSIX_C_SOURCE >= 199309L) || defined(IREE_PLATFORM_APPLE)
+
+// Apple doesn't have clock_nanosleep. We could use the Mach APIs on darwin to
+// do this but they require initialization and potential updates during
+// execution as clock frequencies change. Instead we use the relative nanosleep
+// and accumulate until the deadline, which is a good fallback for some other
+// platforms as well.
+//
+// https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/nanosleep.2.html
+static bool iree_wait_until_impl(iree_time_t deadline_ns) {
+  iree_time_t now_ns = iree_time_now();
+  while (now_ns < deadline_ns) {
+    iree_time_t delta_ns = deadline_ns - now_ns;
+    struct timespec abs_ts = {
+        .tv_sec = (time_t)(delta_ns / 1000000000ull),
+        .tv_nsec = (long)(delta_ns % 1000000000ull),
+    };
+    int ret = nanosleep(&abs_ts, NULL);
+    if (ret != 0) return false;
+    now_ns = iree_time_now();
+  }
+  return true;
+}
+
+#else
+
+// No waiting available; just pretend like we did. This will cause programs
+// using timers to run as fast as possible but without having a way to delay
+// time there's not much else they could do.
+static bool iree_wait_until_impl(iree_time_t deadline_ns) { return true; }
+
+#endif  // (platforms)
+
+bool iree_wait_until(iree_time_t deadline_ns) {
+  // Can't wait forever - or for the past.
+  if (deadline_ns == IREE_TIME_INFINITE_FUTURE) return false;
+  if (deadline_ns == IREE_TIME_INFINITE_PAST) return true;
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(
+      z0, (uint64_t)iree_absolute_deadline_to_timeout_ns(deadline_ns));
+
+  // NOTE: we want to use sleep APIs with absolute times as that makes retrying
+  // on spurious wakes easier; if we were using relative timeouts we would
+  // need to ensure we don't drift.
+  bool did_wait = iree_wait_until_impl(deadline_ns);
+
+  IREE_TRACE_ZONE_END(z0);
+  return did_wait;
+}
diff --git a/runtime/src/iree/base/time.h b/runtime/src/iree/base/time.h
new file mode 100644
index 0000000..89cad70
--- /dev/null
+++ b/runtime/src/iree/base/time.h
@@ -0,0 +1,194 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_TIME_H_
+#define IREE_BASE_TIME_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/attributes.h"
+#include "iree/base/config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// A point in time represented as nanoseconds since unix epoch.
+// TODO(benvanik): pick something easy to get into/out-of time_t/etc.
+typedef int64_t iree_time_t;
+
+// A time in the infinite past used to indicate "already happened".
+// This forces APIs that wait for a point in time to act as a poll and always
+// return IREE_STATUS_DEADLINE_EXCEEDED instead of blocking the caller.
+#define IREE_TIME_INFINITE_PAST INT64_MIN
+
+// A time in the infinite future used to indicate "never".
+// This causes APIs that wait for a point in time to wait however long is needed
+// to satisfy the wait condition.
+#define IREE_TIME_INFINITE_FUTURE INT64_MAX
+
+// A duration represented as relative nanoseconds.
+typedef int64_t iree_duration_t;
+
+// A zero-length duration.
+// Like IREE_TIME_INFINITE_PAST this forces APIs that would wait to instead
+// return IREE_STATUS_DEADLINE_EXCEEDED immediately.
+#define IREE_DURATION_ZERO 0
+
+// An infinite-length duration.
+// Like IREE_TIME_INFINITE_FUTURE this causes APIs that wait to do so until
+// their wait condition is satisfied without returning early.
+#define IREE_DURATION_INFINITE INT64_MAX
+
+// Returns the current system time in unix nanoseconds.
+// Depending on the system architecture and power mode this time may have a
+// very coarse granularity (on the order of microseconds to milliseconds).
+//
+// The system timer may not be monotonic; users should ensure when comparing
+// times they check for negative values in case the time moves backwards.
+IREE_API_EXPORT iree_time_t iree_time_now(void);
+
+// Converts a relative timeout duration to an absolute deadline time.
+// This handles the special cases of IREE_DURATION_ZERO and
+// IREE_DURATION_INFINITE to avoid extraneous time queries.
+IREE_API_EXPORT iree_time_t
+iree_relative_timeout_to_deadline_ns(iree_duration_t timeout_ns);
+
+// Converts an absolute deadline time to a relative timeout duration in nanos.
+// This handles the special cases of IREE_TIME_INFINITE_PAST and
+// IREE_TIME_INFINITE_FUTURE to avoid extraneous time queries.
+IREE_API_EXPORT iree_duration_t
+iree_absolute_deadline_to_timeout_ns(iree_time_t deadline_ns);
+
+// Converts an absolute deadline time to a relative timeout duration in millis.
+// This handles the special cases of IREE_TIME_INFINITE_PAST and
+// IREE_TIME_INFINITE_FUTURE to avoid extraneous time queries.
+IREE_API_EXPORT uint32_t
+iree_absolute_deadline_to_timeout_ms(iree_time_t deadline_ns);
+
+typedef enum iree_timeout_type_e {
+  // Timeout is defined by an absolute value `deadline_ns`.
+  IREE_TIMEOUT_ABSOLUTE = 0,
+  // Timeout is defined by a relative value `timeout_ns`.
+  IREE_TIMEOUT_RELATIVE = 1,
+} iree_timeout_type_t;
+
+// A timeout defined either by an absolute or relative value.
+typedef struct iree_timeout_t {
+  iree_timeout_type_t type;
+  iree_time_t nanos;
+} iree_timeout_t;
+
+// Returns a timeout that will be exceeded immediately.
+// This can be used with APIs that would otherwise wait to cause them to poll.
+//
+// Example:
+//   status = iree_wait_for_signal_or_timeout(&obj, iree_immediate_timeout());
+//   if (iree_status_is_deadline_exceeded(status)) {
+//     // Would have waited indicating the signal has not occurred. If the
+//     // timeout was not immediate the call would have blocked the caller.
+//   }
+static inline iree_timeout_t iree_immediate_timeout(void) {
+  iree_timeout_t timeout = {IREE_TIMEOUT_ABSOLUTE, IREE_TIME_INFINITE_PAST};
+  return timeout;
+}
+
+// Returns true if the |timeout| indicates an immediate/polling/nonblocking
+// timeout.
+static inline bool iree_timeout_is_immediate(iree_timeout_t timeout) {
+  return timeout.type == IREE_TIMEOUT_ABSOLUTE
+             ? timeout.nanos == IREE_TIME_INFINITE_PAST
+             : timeout.nanos == IREE_DURATION_ZERO;
+}
+
+// Returns a timeout that will never be reached.
+// This can be used with APIs that can wait to disable the early
+// deadline-exceeded returns when a condition is not met. It should be used with
+// care as it can complicate program state and make termination more prone to
+// hangs. On the other hand, it's really useful to not bother with actual
+// deadlines. YMMV.
+static inline iree_timeout_t iree_infinite_timeout(void) {
+  iree_timeout_t timeout = {IREE_TIMEOUT_ABSOLUTE, IREE_TIME_INFINITE_FUTURE};
+  return timeout;
+}
+
+// Returns true if the |timeout| indicates an infinite/forever blocking timeout.
+static inline bool iree_timeout_is_infinite(iree_timeout_t timeout) {
+  return timeout.type == IREE_TIMEOUT_ABSOLUTE
+             ? timeout.nanos == IREE_TIME_INFINITE_FUTURE
+             : timeout.nanos == IREE_DURATION_INFINITE;
+}
+
+// Defines an absolute timeout with the given time in nanoseconds.
+static inline iree_timeout_t iree_make_deadline(iree_time_t deadline_ns) {
+  iree_timeout_t timeout = {IREE_TIMEOUT_ABSOLUTE, deadline_ns};
+  return timeout;
+}
+
+// Defines a relative timeout with the given time in nanoseconds.
+static inline iree_timeout_t iree_make_timeout_ns(iree_duration_t timeout_ns) {
+  iree_timeout_t timeout = {IREE_TIMEOUT_RELATIVE, timeout_ns};
+  return timeout;
+}
+
+// Defines a relative timeout with the given time in milliseconds.
+static inline iree_timeout_t iree_make_timeout_ms(iree_duration_t timeout_ms) {
+  iree_timeout_t timeout = {
+      IREE_TIMEOUT_RELATIVE,
+      timeout_ms == IREE_DURATION_INFINITE ? IREE_DURATION_INFINITE
+                                           : timeout_ms * 1000000,
+  };
+  return timeout;
+}
+
+// Converts a timeout from relative to absolute (if it is).
+//
+// Absolute timeouts (deadlines) are better for long-running tasks or when
+// making calls that may complete in stages as relative ones will tend to skew;
+// if a wait is performed with a relative timeout of 10ms but it takes 5ms to
+// get from the origin of the call to the actual wait using the timeout then
+// the total latency of the call may be 15ms (5ms to prepare + 10ms on the
+// wait). Instead if an absolute deadline is used the caller can ensure that
+// the total time spent in the operation happens regardless of the intervening
+// work that happens.
+//
+// For this reason IREE internal APIs try to convert to absolute times and users
+// may be able to reduce overhead by populating the times as absolute to start
+// with via iree_make_deadline.
+static inline void iree_convert_timeout_to_absolute(iree_timeout_t* timeout) {
+  if (timeout->type == IREE_TIMEOUT_RELATIVE) {
+    timeout->type = IREE_TIMEOUT_ABSOLUTE;
+    timeout->nanos = iree_relative_timeout_to_deadline_ns(timeout->nanos);
+  }
+}
+
+// Returns an absolute deadline in nanoseconds from the given timeout.
+static inline iree_time_t iree_timeout_as_deadline_ns(iree_timeout_t timeout) {
+  return timeout.type == IREE_TIMEOUT_ABSOLUTE
+             ? timeout.nanos
+             : iree_relative_timeout_to_deadline_ns(timeout.nanos);
+}
+
+// Returns the earliest timeout between |lhs| and |rhs|.
+static inline iree_timeout_t iree_timeout_min(iree_timeout_t lhs,
+                                              iree_timeout_t rhs) {
+  iree_convert_timeout_to_absolute(&lhs);
+  iree_convert_timeout_to_absolute(&rhs);
+  return iree_make_deadline(lhs.nanos < rhs.nanos ? lhs.nanos : rhs.nanos);
+}
+
+// Waits until |deadline_ns| (or longer), putting the calling thread to sleep.
+// The precision of this varies across platforms and may have a minimum
+// granularity anywhere between microsecond to milliseconds.
+// Returns true if the sleep completed successfully and false if it was aborted.
+bool iree_wait_until(iree_time_t deadline_ns);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_TIME_H_
diff --git a/runtime/src/iree/base/tracing.cc b/runtime/src/iree/base/tracing.cc
new file mode 100644
index 0000000..32e2826
--- /dev/null
+++ b/runtime/src/iree/base/tracing.cc
@@ -0,0 +1,205 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/tracing.h"
+
+#include "iree/base/target_platform.h"
+
+// Textually include the Tracy implementation.
+// We do this here instead of relying on an external build target so that we can
+// ensure our configuration specified in tracing.h is picked up.
+#if IREE_TRACING_FEATURES != 0
+#include "third_party/tracy/TracyClient.cpp"
+#endif  // IREE_TRACING_FEATURES
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#if defined(TRACY_ENABLE) && defined(IREE_PLATFORM_WINDOWS)
+static HANDLE iree_dbghelp_mutex;
+void IREEDbgHelpInit(void) {
+  iree_dbghelp_mutex = CreateMutex(NULL, FALSE, NULL);
+}
+void IREEDbgHelpLock(void) {
+  WaitForSingleObject(iree_dbghelp_mutex, INFINITE);
+}
+void IREEDbgHelpUnlock(void) { ReleaseMutex(iree_dbghelp_mutex); }
+#endif  // TRACY_ENABLE && IREE_PLATFORM_WINDOWS
+
+#if IREE_TRACING_FEATURES != 0
+
+void iree_tracing_set_thread_name_impl(const char* name) {
+  tracy::SetThreadName(name);
+}
+
+iree_zone_id_t iree_tracing_zone_begin_impl(
+    const iree_tracing_location_t* src_loc, const char* name,
+    size_t name_length) {
+  const iree_zone_id_t zone_id = tracy::GetProfiler().GetNextZoneId();
+
+#ifndef TRACY_NO_VERIFY
+  {
+    TracyLfqPrepareC(tracy::QueueType::ZoneValidation);
+    tracy::MemWrite(&item->zoneValidation.id, zone_id);
+    TracyLfqCommitC;
+  }
+#endif  // TRACY_NO_VERIFY
+
+  {
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+    TracyLfqPrepareC(tracy::QueueType::ZoneBeginCallstack);
+#else
+    TracyLfqPrepareC(tracy::QueueType::ZoneBegin);
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+    tracy::MemWrite(&item->zoneBegin.time, tracy::Profiler::GetTime());
+    tracy::MemWrite(&item->zoneBegin.srcloc,
+                    reinterpret_cast<uint64_t>(src_loc));
+    TracyLfqCommitC;
+  }
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+  tracy::GetProfiler().SendCallstack(IREE_TRACING_MAX_CALLSTACK_DEPTH);
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+
+  if (name_length) {
+#ifndef TRACY_NO_VERIFY
+    {
+      TracyLfqPrepareC(tracy::QueueType::ZoneValidation);
+      tracy::MemWrite(&item->zoneValidation.id, zone_id);
+      TracyLfqCommitC;
+    }
+#endif  // TRACY_NO_VERIFY
+    auto name_ptr = reinterpret_cast<char*>(tracy::tracy_malloc(name_length));
+    memcpy(name_ptr, name, name_length);
+    TracyLfqPrepareC(tracy::QueueType::ZoneName);
+    tracy::MemWrite(&item->zoneTextFat.text,
+                    reinterpret_cast<uint64_t>(name_ptr));
+    tracy::MemWrite(&item->zoneTextFat.size,
+                    static_cast<uint64_t>(name_length));
+    TracyLfqCommitC;
+  }
+
+  return zone_id;
+}
+
+iree_zone_id_t iree_tracing_zone_begin_external_impl(
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length) {
+  uint64_t src_loc = tracy::Profiler::AllocSourceLocation(
+      line, file_name, file_name_length, function_name, function_name_length,
+      name, name_length);
+
+  const iree_zone_id_t zone_id = tracy::GetProfiler().GetNextZoneId();
+
+#ifndef TRACY_NO_VERIFY
+  {
+    TracyLfqPrepareC(tracy::QueueType::ZoneValidation);
+    tracy::MemWrite(&item->zoneValidation.id, zone_id);
+    TracyLfqCommitC;
+  }
+#endif  // TRACY_NO_VERIFY
+
+  {
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+    TracyLfqPrepareC(tracy::QueueType::ZoneBeginAllocSrcLocCallstack);
+#else
+    TracyLfqPrepareC(tracy::QueueType::ZoneBeginAllocSrcLoc);
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+    tracy::MemWrite(&item->zoneBegin.time, tracy::Profiler::GetTime());
+    tracy::MemWrite(&item->zoneBegin.srcloc, src_loc);
+    TracyLfqCommitC;
+  }
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+  tracy::GetProfiler().SendCallstack(IREE_TRACING_MAX_CALLSTACK_DEPTH);
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+
+  return zone_id;
+}
+
+void iree_tracing_set_plot_type_impl(const char* name_literal,
+                                     uint8_t plot_type) {
+  tracy::Profiler::ConfigurePlot(name_literal,
+                                 static_cast<tracy::PlotFormatType>(plot_type));
+}
+
+void iree_tracing_plot_value_i64_impl(const char* name_literal, int64_t value) {
+  tracy::Profiler::PlotData(name_literal, value);
+}
+
+void iree_tracing_plot_value_f32_impl(const char* name_literal, float value) {
+  tracy::Profiler::PlotData(name_literal, value);
+}
+
+void iree_tracing_plot_value_f64_impl(const char* name_literal, double value) {
+  tracy::Profiler::PlotData(name_literal, value);
+}
+
+void iree_tracing_mutex_announce(const iree_tracing_location_t* src_loc,
+                                 uint32_t* out_lock_id) {
+  uint32_t lock_id =
+      tracy::GetLockCounter().fetch_add(1, std::memory_order_relaxed);
+  assert(lock_id != std::numeric_limits<uint32_t>::max());
+  *out_lock_id = lock_id;
+
+  auto item = tracy::Profiler::QueueSerial();
+  tracy::MemWrite(&item->hdr.type, tracy::QueueType::LockAnnounce);
+  tracy::MemWrite(&item->lockAnnounce.id, lock_id);
+  tracy::MemWrite(&item->lockAnnounce.time, tracy::Profiler::GetTime());
+  tracy::MemWrite(&item->lockAnnounce.lckloc,
+                  reinterpret_cast<uint64_t>(src_loc));
+  tracy::MemWrite(&item->lockAnnounce.type, tracy::LockType::Lockable);
+  tracy::Profiler::QueueSerialFinish();
+}
+
+void iree_tracing_mutex_terminate(uint32_t lock_id) {
+  auto item = tracy::Profiler::QueueSerial();
+  tracy::MemWrite(&item->hdr.type, tracy::QueueType::LockTerminate);
+  tracy::MemWrite(&item->lockTerminate.id, lock_id);
+  tracy::MemWrite(&item->lockTerminate.time, tracy::Profiler::GetTime());
+  tracy::Profiler::QueueSerialFinish();
+}
+
+void iree_tracing_mutex_before_lock(uint32_t lock_id) {
+  auto item = tracy::Profiler::QueueSerial();
+  tracy::MemWrite(&item->hdr.type, tracy::QueueType::LockWait);
+  tracy::MemWrite(&item->lockWait.thread, tracy::GetThreadHandle());
+  tracy::MemWrite(&item->lockWait.id, lock_id);
+  tracy::MemWrite(&item->lockWait.time, tracy::Profiler::GetTime());
+  tracy::Profiler::QueueSerialFinish();
+}
+
+void iree_tracing_mutex_after_lock(uint32_t lock_id) {
+  auto item = tracy::Profiler::QueueSerial();
+  tracy::MemWrite(&item->hdr.type, tracy::QueueType::LockObtain);
+  tracy::MemWrite(&item->lockObtain.thread, tracy::GetThreadHandle());
+  tracy::MemWrite(&item->lockObtain.id, lock_id);
+  tracy::MemWrite(&item->lockObtain.time, tracy::Profiler::GetTime());
+  tracy::Profiler::QueueSerialFinish();
+}
+
+void iree_tracing_mutex_after_try_lock(uint32_t lock_id, bool was_acquired) {
+  if (was_acquired) {
+    iree_tracing_mutex_after_lock(lock_id);
+  }
+}
+
+void iree_tracing_mutex_after_unlock(uint32_t lock_id) {
+  auto item = tracy::Profiler::QueueSerial();
+  tracy::MemWrite(&item->hdr.type, tracy::QueueType::LockRelease);
+  tracy::MemWrite(&item->lockRelease.thread, tracy::GetThreadHandle());
+  tracy::MemWrite(&item->lockRelease.id, lock_id);
+  tracy::MemWrite(&item->lockRelease.time, tracy::Profiler::GetTime());
+  tracy::Profiler::QueueSerialFinish();
+}
+
+#endif  // IREE_TRACING_FEATURES
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/runtime/src/iree/base/tracing.h b/runtime/src/iree/base/tracing.h
new file mode 100644
index 0000000..9a879ae
--- /dev/null
+++ b/runtime/src/iree/base/tracing.h
@@ -0,0 +1,502 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Utilities for runtime tracing support.
+// These allow the various runtime subsystems to insert trace events, attach
+// metadata to events or allocations, and control tracing verbosity.
+//
+// Tracing features can be enabled with either an IREE_TRACING_MODE define that
+// allows predefined tracing modes or individual IREE_TRACING_FEATURE_* flags
+// set on IREE_TRACING_FEATURES when a more custom set of features is
+// required. Exact feature support may vary on platform and toolchain.
+//
+// The tracing infrastructure is currently designed to target the Tracy
+// profiler: https://github.com/wolfpld/tracy
+// Tracy's profiler UI allowing for streaming captures and analysis can be
+// downloaded from: https://github.com/wolfpld/tracy/releases
+// The manual provided on the releases page contains more information about how
+// Tracy works, its limitations, and how to operate the UI.
+//
+// NOTE: this header is used both from C and C++ code and only conditionally
+// enables the C++ when in a valid context. Do not use C++ features or include
+// other files that are not C-compatible.
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "iree/base/attributes.h"
+#include "iree/base/config.h"
+
+#ifndef IREE_BASE_TRACING_H_
+#define IREE_BASE_TRACING_H_
+
+//===----------------------------------------------------------------------===//
+// IREE_TRACING_FEATURE_* flags and options
+//===----------------------------------------------------------------------===//
+
+// Enables IREE_TRACE_* macros for instrumented tracing.
+#define IREE_TRACING_FEATURE_INSTRUMENTATION (1 << 0)
+
+// Captures callstacks up to IREE_TRACING_MAX_CALLSTACK_DEPTH at all
+// IREE_TRACE_* events. This has a significant performance impact and should
+// only be enabled when tracking down missing instrumentation.
+#define IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS (1 << 1)
+
+// Tracks all allocations (we know about) via new/delete/malloc/free.
+// This allows fine-grained allocation and usage tracking down to the code that
+// performed the allocations. Allocations or frees that are performed outside of
+// the IREE API or runtime library will not be tracked and unbalanced usage
+// (allocating with IREE's API then freeing with stdlib free, for example) will
+// cause Tracy to become very unhappy.
+#define IREE_TRACING_FEATURE_ALLOCATION_TRACKING (1 << 2)
+
+// Captures callstacks up to IREE_TRACING_MAX_CALLSTACK_DEPTH at all allocation
+// events when allocation tracking is enabled.
+#define IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS (1 << 3)
+
+// Tracks fast locks in all cases (both contended and uncontended).
+// This may introduce contention where there would otherwise be none as what
+// would be a handful of instructions and little memory access may become
+// hundreds. To see only locks under contention use
+// IREE_TRACING_FEATURE_SLOW_LOCKS.
+#define IREE_TRACING_FEATURE_FAST_LOCKS (1 << 4)
+
+// Tracks slow locks that end up going to the OS for waits/wakes in futexes.
+// Uncontended locks will not be displayed and only waits will be visible in the
+// Tracy UI.
+#define IREE_TRACING_FEATURE_SLOW_LOCKS (1 << 5)
+
+// Forwards log messages to traces, which will be visible under "Messages" in
+// the Tracy UI.
+#define IREE_TRACING_FEATURE_LOG_MESSAGES (1 << 6)
+
+#if !defined(IREE_TRACING_MAX_CALLSTACK_DEPTH)
+// Tracing functions that capture stack traces will only capture up to N frames.
+// The overhead for stack walking scales linearly with the number of frames
+// captured and can increase the cost of an event capture by orders of
+// magnitude.
+// Minimum: 0 (disable)
+// Maximum: 62
+#define IREE_TRACING_MAX_CALLSTACK_DEPTH 16
+#endif  // IREE_TRACING_MAX_CALLSTACK_DEPTH
+
+//===----------------------------------------------------------------------===//
+// IREE_TRACING_MODE simple setting
+//===----------------------------------------------------------------------===//
+
+// Set IREE_TRACING_FEATURES based on IREE_TRACING_MODE if the user hasn't
+// overridden it with more specific settings.
+//
+// IREE_TRACING_MODE = 0: tracing disabled
+// IREE_TRACING_MODE = 1: instrumentation, log messages, and basic statistics
+// IREE_TRACING_MODE = 2: same as 1 with added allocation tracking
+// IREE_TRACING_MODE = 3: same as 2 with callstacks for allocations
+// IREE_TRACING_MODE = 4: same as 3 with callstacks for all instrumentation
+#if !defined(IREE_TRACING_FEATURES)
+#if defined(IREE_TRACING_MODE) && IREE_TRACING_MODE == 1
+#define IREE_TRACING_FEATURES \
+  (IREE_TRACING_FEATURE_INSTRUMENTATION | IREE_TRACING_FEATURE_LOG_MESSAGES)
+#undef IREE_TRACING_MAX_CALLSTACK_DEPTH
+#define IREE_TRACING_MAX_CALLSTACK_DEPTH 0
+#elif defined(IREE_TRACING_MODE) && IREE_TRACING_MODE == 2
+#define IREE_TRACING_FEATURES                 \
+  (IREE_TRACING_FEATURE_INSTRUMENTATION |     \
+   IREE_TRACING_FEATURE_ALLOCATION_TRACKING | \
+   IREE_TRACING_FEATURE_LOG_MESSAGES)
+#elif defined(IREE_TRACING_MODE) && IREE_TRACING_MODE == 3
+#define IREE_TRACING_FEATURES                   \
+  (IREE_TRACING_FEATURE_INSTRUMENTATION |       \
+   IREE_TRACING_FEATURE_ALLOCATION_TRACKING |   \
+   IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS | \
+   IREE_TRACING_FEATURE_LOG_MESSAGES)
+#elif defined(IREE_TRACING_MODE) && IREE_TRACING_MODE >= 4
+#define IREE_TRACING_FEATURES                        \
+  (IREE_TRACING_FEATURE_INSTRUMENTATION |            \
+   IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS | \
+   IREE_TRACING_FEATURE_ALLOCATION_TRACKING |        \
+   IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS |      \
+   IREE_TRACING_FEATURE_LOG_MESSAGES)
+#else
+#define IREE_TRACING_FEATURES 0
+#endif  // IREE_TRACING_MODE
+#endif  // !IREE_TRACING_FEATURES
+
+//===----------------------------------------------------------------------===//
+// Tracy configuration
+//===----------------------------------------------------------------------===//
+// NOTE: order matters here as we are including files that require/define.
+
+// Enable Tracy only when we are using tracing features.
+#if IREE_TRACING_FEATURES != 0
+#define TRACY_ENABLE 1
+#endif  // IREE_TRACING_FEATURES
+
+// Disable zone nesting verification in release builds.
+// The verification makes it easy to find unbalanced zones but doubles the cost
+// (at least) of each zone recorded. Run in debug builds to verify new
+// instrumentation is correct before capturing traces in release builds.
+#if defined(NDEBUG)
+#define TRACY_NO_VERIFY 1
+#endif  // NDEBUG
+
+// Force callstack capture on all zones (even those without the C suffix).
+#if (IREE_TRACING_FEATURES &                             \
+     IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS) || \
+    (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS)
+#define TRACY_CALLSTACK 1
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+
+// Guard tracy use of DbgHelp on Windows via IREEDbgHelp* functions.
+// All our own usage of DbgHelp must be guarded with the same lock.
+#define TRACY_DBGHELP_LOCK IREEDbgHelp
+
+// Disable frame image capture to avoid the DXT compression code and the frame
+// capture worker thread.
+#define TRACY_NO_FRAME_IMAGE 1
+
+// We don't care about vsync events as they can pollute traces and don't have
+// much meaning in our workloads. If integrators still want them we can expose
+// this as a tracing feature flag.
+#define TRACY_NO_VSYNC_CAPTURE 1
+
+// Flush the settings we have so far; settings after this point will be
+// overriding values set by Tracy itself.
+#if defined(TRACY_ENABLE)
+#include "third_party/tracy/TracyC.h"  // IWYU pragma: export
+#endif
+
+// Disable callstack capture if our depth is 0; this allows us to avoid any
+// expensive capture (and all the associated dependencies) if we aren't going to
+// use it. Note that this means that unless code is instrumented we won't be
+// able to tell what's happening in the Tracy UI.
+#if IREE_TRACING_MAX_CALLSTACK_DEPTH == 0
+#undef TRACY_CALLSTACK
+#endif  // IREE_TRACING_MAX_CALLSTACK_DEPTH
+
+//===----------------------------------------------------------------------===//
+// C API used for Tracy control
+//===----------------------------------------------------------------------===//
+// These functions are implementation details and should not be called directly.
+// Always use the macros (or C++ RAII types).
+
+// Local zone ID used for the C IREE_TRACE_ZONE_* macros.
+typedef uint32_t iree_zone_id_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#if IREE_TRACING_FEATURES
+
+void iree_tracing_set_thread_name_impl(const char* name);
+
+typedef struct ___tracy_source_location_data iree_tracing_location_t;
+
+#ifdef __cplusplus
+#define iree_tracing_make_zone_ctx(zone_id) \
+  TracyCZoneCtx { zone_id, 1 }
+#else
+#define iree_tracing_make_zone_ctx(zone_id) \
+  (TracyCZoneCtx) { zone_id, 1 }
+#endif  // __cplusplus
+
+IREE_MUST_USE_RESULT iree_zone_id_t
+iree_tracing_zone_begin_impl(const iree_tracing_location_t* src_loc,
+                             const char* name, size_t name_length);
+IREE_MUST_USE_RESULT iree_zone_id_t iree_tracing_zone_begin_external_impl(
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length);
+
+void iree_tracing_set_plot_type_impl(const char* name_literal,
+                                     uint8_t plot_type);
+void iree_tracing_plot_value_i64_impl(const char* name_literal, int64_t value);
+void iree_tracing_plot_value_f32_impl(const char* name_literal, float value);
+void iree_tracing_plot_value_f64_impl(const char* name_literal, double value);
+
+void iree_tracing_mutex_announce(const iree_tracing_location_t* src_loc,
+                                 uint32_t* out_lock_id);
+void iree_tracing_mutex_terminate(uint32_t lock_id);
+void iree_tracing_mutex_before_lock(uint32_t lock_id);
+void iree_tracing_mutex_after_lock(uint32_t lock_id);
+void iree_tracing_mutex_after_try_lock(uint32_t lock_id, bool was_acquired);
+void iree_tracing_mutex_after_unlock(uint32_t lock_id);
+
+#endif  // IREE_TRACING_FEATURES
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Instrumentation macros (C)
+//===----------------------------------------------------------------------===//
+
+// Matches Tracy's PlotFormatType enum.
+enum {
+  // Values will be displayed as plain numbers.
+  IREE_TRACING_PLOT_TYPE_NUMBER = 0,
+  // Treats the values as memory sizes. Will display kilobytes, megabytes, etc.
+  IREE_TRACING_PLOT_TYPE_MEMORY = 1,
+  // Values will be displayed as percentage with value 100 being equal to 100%.
+  IREE_TRACING_PLOT_TYPE_PERCENTAGE = 2,
+};
+
+// Colors used for messages based on the level provided to the macro.
+enum {
+  IREE_TRACING_MESSAGE_LEVEL_ERROR = 0xFF0000u,
+  IREE_TRACING_MESSAGE_LEVEL_WARNING = 0xFFFF00u,
+  IREE_TRACING_MESSAGE_LEVEL_INFO = 0xFFFFFFu,
+  IREE_TRACING_MESSAGE_LEVEL_VERBOSE = 0xC0C0C0u,
+  IREE_TRACING_MESSAGE_LEVEL_DEBUG = 0x00FF00u,
+};
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// Sets an application-specific payload that will be stored in the trace.
+// This can be used to fingerprint traces to particular versions and denote
+// compilation options or configuration. The given string value will be copied.
+#define IREE_TRACE_SET_APP_INFO(value, value_length) \
+  ___tracy_emit_message_appinfo(value, value_length)
+
+// Sets the current thread name to the given string value.
+// This will only set the thread name as it appears in the tracing backend and
+// not set the OS thread name as it would appear in a debugger.
+// The C-string |name| will be copied and does not need to be a literal.
+#define IREE_TRACE_SET_THREAD_NAME(name) iree_tracing_set_thread_name_impl(name)
+
+// Evaluates the expression code only if tracing is enabled.
+//
+// Example:
+//  struct {
+//    IREE_TRACE(uint32_t trace_only_value);
+//  } my_object;
+//  IREE_TRACE(my_object.trace_only_value = 5);
+#define IREE_TRACE(expr) expr
+
+// Begins a new zone with the parent function name.
+#define IREE_TRACE_ZONE_BEGIN(zone_id) \
+  IREE_TRACE_ZONE_BEGIN_NAMED(zone_id, NULL)
+
+// Begins a new zone with the given compile-time literal name.
+#define IREE_TRACE_ZONE_BEGIN_NAMED(zone_id, name_literal)                    \
+  static const iree_tracing_location_t TracyConcat(                           \
+      __tracy_source_location, __LINE__) = {name_literal, __FUNCTION__,       \
+                                            __FILE__, (uint32_t)__LINE__, 0}; \
+  iree_zone_id_t zone_id = iree_tracing_zone_begin_impl(                      \
+      &TracyConcat(__tracy_source_location, __LINE__), NULL, 0);
+
+// Begins a new zone with the given runtime dynamic string name.
+// The |value| string will be copied into the trace buffer.
+#define IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(zone_id, name, name_length) \
+  static const iree_tracing_location_t TracyConcat(                     \
+      __tracy_source_location, __LINE__) = {0, __FUNCTION__, __FILE__,  \
+                                            (uint32_t)__LINE__, 0};     \
+  iree_zone_id_t zone_id = iree_tracing_zone_begin_impl(                \
+      &TracyConcat(__tracy_source_location, __LINE__), (name), (name_length));
+
+// Begins an externally defined zone with a dynamic source location.
+// The |file_name|, |function_name|, and optional |name| strings will be copied
+// into the trace buffer and do not need to persist.
+#define IREE_TRACE_ZONE_BEGIN_EXTERNAL(                                       \
+    zone_id, file_name, file_name_length, line, function_name,                \
+    function_name_length, name, name_length)                                  \
+  iree_zone_id_t zone_id = iree_tracing_zone_begin_external_impl(             \
+      file_name, file_name_length, line, function_name, function_name_length, \
+      name, name_length)
+
+// Sets the dynamic color of the zone to an XXBBGGRR value.
+#define IREE_TRACE_ZONE_SET_COLOR(zone_id, color_xbgr) \
+  ___tracy_emit_zone_color(iree_tracing_make_zone_ctx(zone_id), color_xbgr);
+
+// Appends an integer value to the parent zone. May be called multiple times.
+#define IREE_TRACE_ZONE_APPEND_VALUE(zone_id, value) \
+  ___tracy_emit_zone_value(iree_tracing_make_zone_ctx(zone_id), value);
+
+// Appends a string value to the parent zone. May be called multiple times.
+// The |value| string will be copied into the trace buffer.
+#define IREE_TRACE_ZONE_APPEND_TEXT(...)                                  \
+  IREE_TRACE_IMPL_GET_VARIADIC_((__VA_ARGS__,                             \
+                                 IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW, \
+                                 IREE_TRACE_ZONE_APPEND_TEXT_CSTRING))    \
+  (__VA_ARGS__)
+#define IREE_TRACE_ZONE_APPEND_TEXT_CSTRING(zone_id, value) \
+  IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(zone_id, value, strlen(value))
+#define IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(zone_id, value, value_length) \
+  ___tracy_emit_zone_text(iree_tracing_make_zone_ctx(zone_id), value,         \
+                          value_length)
+
+// Ends the current zone. Must be passed the |zone_id| from the _BEGIN.
+#define IREE_TRACE_ZONE_END(zone_id) \
+  ___tracy_emit_zone_end(iree_tracing_make_zone_ctx(zone_id))
+
+// Ends the current zone before returning on a failure.
+// Sugar for IREE_TRACE_ZONE_END+IREE_RETURN_IF_ERROR.
+#define IREE_RETURN_AND_END_ZONE_IF_ERROR(zone_id, ...) \
+  IREE_RETURN_AND_EVAL_IF_ERROR(IREE_TRACE_ZONE_END(zone_id), __VA_ARGS__)
+
+// Configures the named plot with an IREE_TRACING_PLOT_TYPE_* representation.
+#define IREE_TRACE_SET_PLOT_TYPE(name_literal, plot_type) \
+  iree_tracing_set_plot_type_impl(name_literal, plot_type)
+// Plots a value in the named plot group as an integer.
+#define IREE_TRACE_PLOT_VALUE_I64(name_literal, value) \
+  iree_tracing_plot_value_i64_impl(name_literal, value)
+// Plots a value in the named plot group as a single-precision float.
+#define IREE_TRACE_PLOT_VALUE_F32(name_literal, value) \
+  iree_tracing_plot_value_f32_impl(name_literal, value)
+// Plots a value in the named plot group as a double-precision float.
+#define IREE_TRACE_PLOT_VALUE_F64(name_literal, value) \
+  iree_tracing_plot_value_f64_impl(name_literal, value)
+
+// Demarcates an advancement of the top-level unnamed frame group.
+#define IREE_TRACE_FRAME_MARK() ___tracy_emit_frame_mark(NULL)
+// Demarcates an advancement of a named frame group.
+#define IREE_TRACE_FRAME_MARK_NAMED(name_literal) \
+  ___tracy_emit_frame_mark(name_literal)
+// Begins a discontinuous frame in a named frame group.
+// Must be properly matched with a IREE_TRACE_FRAME_MARK_NAMED_END.
+#define IREE_TRACE_FRAME_MARK_BEGIN_NAMED(name_literal) \
+  ___tracy_emit_frame_mark_start(name_literal)
+// Ends a discontinuous frame in a named frame group.
+#define IREE_TRACE_FRAME_MARK_END_NAMED(name_literal) \
+  ___tracy_emit_frame_mark_end(name_literal)
+
+// Logs a message at the given logging level to the trace.
+// The message text must be a compile-time string literal.
+#define IREE_TRACE_MESSAGE(level, value_literal) \
+  ___tracy_emit_messageLC(value_literal, IREE_TRACING_MESSAGE_LEVEL_##level, 0)
+// Logs a message with the given color to the trace.
+// Standard colors are defined as IREE_TRACING_MESSAGE_LEVEL_* values.
+// The message text must be a compile-time string literal.
+#define IREE_TRACE_MESSAGE_COLORED(color, value_literal) \
+  ___tracy_emit_messageLC(value_literal, color, 0)
+// Logs a dynamically-allocated message at the given logging level to the trace.
+// The string |value| will be copied into the trace buffer.
+#define IREE_TRACE_MESSAGE_DYNAMIC(level, value, value_length) \
+  ___tracy_emit_messageC(value, value_length,                  \
+                         IREE_TRACING_MESSAGE_LEVEL_##level, 0)
+// Logs a dynamically-allocated message with the given color to the trace.
+// Standard colors are defined as IREE_TRACING_MESSAGE_LEVEL_* values.
+// The string |value| will be copied into the trace buffer.
+#define IREE_TRACE_MESSAGE_DYNAMIC_COLORED(color, value, value_length) \
+  ___tracy_emit_messageC(value, value_length, color, 0)
+
+// Utilities:
+#define IREE_TRACE_IMPL_GET_VARIADIC_HELPER_(_1, _2, _3, NAME, ...) NAME
+#define IREE_TRACE_IMPL_GET_VARIADIC_(args) \
+  IREE_TRACE_IMPL_GET_VARIADIC_HELPER_ args
+
+#else
+// Tracing disabled: all instrumentation macros compile away to nothing.
+// Signatures must mirror the enabled variants above so call sites build
+// identically in both configurations.
+#define IREE_TRACE_SET_APP_INFO(value, value_length)
+#define IREE_TRACE_SET_THREAD_NAME(name)
+#define IREE_TRACE(expr)
+#define IREE_TRACE_ZONE_BEGIN(zone_id)
+#define IREE_TRACE_ZONE_BEGIN_NAMED(zone_id, name_literal)
+#define IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(zone_id, name, name_length)
+#define IREE_TRACE_ZONE_BEGIN_EXTERNAL(                        \
+    zone_id, file_name, file_name_length, line, function_name, \
+    function_name_length, name, name_length)
+#define IREE_TRACE_ZONE_SET_COLOR(zone_id, color_xbgr)
+#define IREE_TRACE_ZONE_APPEND_VALUE(zone_id, value)
+#define IREE_TRACE_ZONE_APPEND_TEXT(zone_id, ...)
+#define IREE_TRACE_ZONE_APPEND_TEXT_CSTRING(zone_id, value)
+#define IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(zone_id, value, value_length)
+#define IREE_TRACE_ZONE_END(zone_id)
+// Still evaluates the expression so errors propagate when tracing is off.
+#define IREE_RETURN_AND_END_ZONE_IF_ERROR(zone_id, ...) \
+  IREE_RETURN_IF_ERROR(__VA_ARGS__)
+#define IREE_TRACE_SET_PLOT_TYPE(name_literal, plot_type)
+#define IREE_TRACE_PLOT_VALUE_I64(name_literal, value)
+#define IREE_TRACE_PLOT_VALUE_F32(name_literal, value)
+#define IREE_TRACE_PLOT_VALUE_F64(name_literal, value)
+#define IREE_TRACE_FRAME_MARK()
+#define IREE_TRACE_FRAME_MARK_NAMED(name_literal)
+#define IREE_TRACE_FRAME_MARK_BEGIN_NAMED(name_literal)
+#define IREE_TRACE_FRAME_MARK_END_NAMED(name_literal)
+#define IREE_TRACE_MESSAGE(level, value_literal)
+#define IREE_TRACE_MESSAGE_COLORED(color, value_literal)
+#define IREE_TRACE_MESSAGE_DYNAMIC(level, value, value_length)
+#define IREE_TRACE_MESSAGE_DYNAMIC_COLORED(color, value, value_length)
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION
+
+//===----------------------------------------------------------------------===//
+// Allocation tracking macros (C/C++)
+//===----------------------------------------------------------------------===//
+//
+// IREE_TRACE_ALLOC: records an malloc.
+// IREE_TRACE_FREE: records a free.
+//
+// NOTE: realloc must be recorded as a FREE/ALLOC pair.
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS
+
+#define IREE_TRACE_ALLOC(ptr, size)               \
+  ___tracy_emit_memory_alloc_callstack(ptr, size, \
+                                       IREE_TRACING_MAX_CALLSTACK_DEPTH, 0)
+#define IREE_TRACE_FREE(ptr) \
+  ___tracy_emit_memory_free_callstack(ptr, IREE_TRACING_MAX_CALLSTACK_DEPTH, 0)
+#define IREE_TRACE_ALLOC_NAMED(name, ptr, size) \
+  ___tracy_emit_memory_alloc_callstack_named(   \
+      ptr, size, IREE_TRACING_MAX_CALLSTACK_DEPTH, 0, name)
+#define IREE_TRACE_FREE_NAMED(name, ptr)     \
+  ___tracy_emit_memory_free_callstack_named( \
+      ptr, IREE_TRACING_MAX_CALLSTACK_DEPTH, 0, name)
+
+#else
+
+#define IREE_TRACE_ALLOC(ptr, size) ___tracy_emit_memory_alloc(ptr, size, 0)
+#define IREE_TRACE_FREE(ptr) ___tracy_emit_memory_free(ptr, 0)
+#define IREE_TRACE_ALLOC_NAMED(name, ptr, size) \
+  ___tracy_emit_memory_alloc_named(ptr, size, 0, name)
+#define IREE_TRACE_FREE_NAMED(name, ptr) \
+  ___tracy_emit_memory_free_named(ptr, 0, name)
+
+#endif  // IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS
+
+#else
+#define IREE_TRACE_ALLOC(ptr, size)
+#define IREE_TRACE_FREE(ptr)
+#define IREE_TRACE_ALLOC_NAMED(name, ptr, size)
+#define IREE_TRACE_FREE_NAMED(name, ptr)
+#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+//===----------------------------------------------------------------------===//
+// Instrumentation C++ RAII types, wrappers, and macros
+//===----------------------------------------------------------------------===//
+
+#ifdef __cplusplus
+
+#if defined(TRACY_ENABLE)
+#include "third_party/tracy/Tracy.hpp"  // IWYU pragma: export
+#endif
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// TODO(#1886): update these to tracy and drop the 0.
+#define IREE_TRACE_SCOPE() ZoneScoped
+#define IREE_TRACE_SCOPE_DYNAMIC(name_cstr) \
+  ZoneTransientN(___tracy_scoped_zone, name_cstr, true)
+#define IREE_TRACE_SCOPE0(name_literal) ZoneScopedN(name_literal)
+#define IREE_TRACE_EVENT
+#define IREE_TRACE_EVENT0
+
+#else
+#define IREE_TRACE_THREAD_ENABLE(name)
+#define IREE_TRACE_SCOPE()
+#define IREE_TRACE_SCOPE_DYNAMIC(name_string_view)
+#define IREE_TRACE_SCOPE0(name_literal)
+#define IREE_TRACE_EVENT(void)
+#define IREE_TRACE_EVENT0
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// TODO(benvanik): macros for LockableCtx / Lockable mutex tracking.
+
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_TRACING_H_
diff --git a/runtime/src/iree/base/wait_source.c b/runtime/src/iree/base/wait_source.c
new file mode 100644
index 0000000..b626a69
--- /dev/null
+++ b/runtime/src/iree/base/wait_source.c
@@ -0,0 +1,117 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/wait_source.h"
+
+#include "iree/base/assert.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_wait_source_t
+//===----------------------------------------------------------------------===//
+
+// NOTE: iree_wait_source_import lives in iree/base/internal/wait_handle.c
+// for now as that lets us compile out native wait handle support at a coarse
+// level.
+
+// Exports |wait_source| to a platform wait primitive of |target_type| by
+// issuing IREE_WAIT_SOURCE_COMMAND_EXPORT to its control function. If the
+// wait source has no control function (e.g. immediate/empty sources) this
+// returns OK without writing |out_wait_primitive| — callers should treat that
+// as already-resolved. |timeout| bounds any blocking the export may perform.
+IREE_API_EXPORT iree_status_t iree_wait_source_export(
+    iree_wait_source_t wait_source, iree_wait_primitive_type_t target_type,
+    iree_timeout_t timeout, iree_wait_primitive_t* out_wait_primitive) {
+  IREE_ASSERT_ARGUMENT(out_wait_primitive);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_ok_status();
+  if (IREE_LIKELY(wait_source.ctl)) {
+    const iree_wait_source_export_params_t params = {
+        .target_type = target_type,
+        .timeout = timeout,
+    };
+    status = wait_source.ctl(wait_source, IREE_WAIT_SOURCE_COMMAND_EXPORT,
+                             &params, (void**)out_wait_primitive);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Queries the resolution state of |wait_source| without blocking by issuing
+// IREE_WAIT_SOURCE_COMMAND_QUERY to its control function. |out_wait_status_code|
+// is preinitialized to IREE_STATUS_OK so sources without a control function
+// (NULL ctl) report as already resolved.
+IREE_API_EXPORT iree_status_t iree_wait_source_query(
+    iree_wait_source_t wait_source, iree_status_code_t* out_wait_status_code) {
+  IREE_ASSERT_ARGUMENT(out_wait_status_code);
+  *out_wait_status_code = IREE_STATUS_OK;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_ok_status();
+  if (IREE_LIKELY(wait_source.ctl)) {
+    status = wait_source.ctl(wait_source, IREE_WAIT_SOURCE_COMMAND_QUERY, NULL,
+                             (void**)out_wait_status_code);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Blocks the caller until |wait_source| resolves or |timeout| elapses by
+// issuing IREE_WAIT_SOURCE_COMMAND_WAIT_ONE to its control function. Sources
+// without a control function (NULL ctl) are treated as immediately resolved
+// and return OK.
+IREE_API_EXPORT iree_status_t iree_wait_source_wait_one(
+    iree_wait_source_t wait_source, iree_timeout_t timeout) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Capture time as an absolute value as we don't know when it's going to run.
+  iree_convert_timeout_to_absolute(&timeout);
+
+  iree_status_t status = iree_ok_status();
+  if (IREE_LIKELY(wait_source.ctl)) {
+    const iree_wait_source_wait_params_t params = {
+        .timeout = timeout,
+    };
+    status = wait_source.ctl(wait_source, IREE_WAIT_SOURCE_COMMAND_WAIT_ONE,
+                             &params, NULL);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_wait_source_delay
+//===----------------------------------------------------------------------===//
+
+// Control function for delay wait sources: the absolute deadline (ns) is
+// stored inline in |wait_source.data|. QUERY reports OK once the deadline
+// has passed (DEFERRED otherwise), WAIT_ONE sleeps until the earlier of the
+// delay deadline or the caller's timeout, and EXPORT is unsupported as a
+// pure delay has no OS primitive to hand out.
+IREE_API_EXPORT iree_status_t iree_wait_source_delay_ctl(
+    iree_wait_source_t wait_source, iree_wait_source_command_t command,
+    const void* params, void** inout_ptr) {
+  iree_time_t delay_deadline_ns = (iree_time_t)wait_source.data;
+  switch (command) {
+    case IREE_WAIT_SOURCE_COMMAND_QUERY: {
+      iree_status_code_t* out_wait_status_code = (iree_status_code_t*)inout_ptr;
+      *out_wait_status_code = iree_time_now() >= delay_deadline_ns
+                                  ? IREE_STATUS_OK
+                                  : IREE_STATUS_DEFERRED;
+      return iree_ok_status();
+    }
+    case IREE_WAIT_SOURCE_COMMAND_WAIT_ONE: {
+      iree_time_t timeout_deadline_ns = iree_timeout_as_deadline_ns(
+          ((const iree_wait_source_wait_params_t*)params)->timeout);
+      if (timeout_deadline_ns > delay_deadline_ns) {
+        // Delay is before timeout and we can perform a simple sleep.
+        return iree_wait_until(delay_deadline_ns)
+                   ? iree_ok_status()
+                   : iree_status_from_code(IREE_STATUS_DEFERRED);
+      } else {
+        // Timeout is before deadline, just wait for the deadline. We _may_
+        // wake after the delay deadline but can't be sure.
+        iree_wait_until(timeout_deadline_ns);
+        return iree_time_now() >= delay_deadline_ns
+                   ? iree_ok_status()
+                   : iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+      }
+      // NOTE: both branches above return; an unreachable DEFERRED return
+      // that previously followed here has been removed.
+    }
+    case IREE_WAIT_SOURCE_COMMAND_EXPORT:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "delay wait sources cannot be exported");
+    default:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "unhandled wait source command");
+  }
+}
diff --git a/runtime/src/iree/base/wait_source.h b/runtime/src/iree/base/wait_source.h
new file mode 100644
index 0000000..4aceb49
--- /dev/null
+++ b/runtime/src/iree/base/wait_source.h
@@ -0,0 +1,336 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_WAIT_SOURCE_H_
+#define IREE_BASE_WAIT_SOURCE_H_
+
+#include "iree/base/attributes.h"
+#include "iree/base/status.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/time.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_wait_primitive_t
+//===----------------------------------------------------------------------===//
+
+#if IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+// Bare metal/no synchronization available; wait handles are no-oped.
+#define IREE_WAIT_HANDLE_DISABLED 1
+#elif defined(IREE_PLATFORM_WINDOWS)
+// Though Windows can support pipes no one uses them, so for simplicity we only
+// expose HANDLEs.
+#define IREE_HAVE_WAIT_TYPE_WIN32_HANDLE 1
+#elif defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_LINUX)
+// Treat Android and modern linux as (mostly) the same.
+#define IREE_HAVE_WAIT_TYPE_EVENTFD 1
+#define IREE_HAVE_WAIT_TYPE_PIPE 1
+#else
+// BSD/Darwin/etc all have pipe.
+#define IREE_HAVE_WAIT_TYPE_PIPE 1
+#endif  // IREE_PLATFORM_*
+
+// TODO(benvanik): see if we can get sync file on linux too:
+#if defined(IREE_PLATFORM_ANDROID)
+#define IREE_HAVE_WAIT_TYPE_SYNC_FILE 1
+#endif  // IREE_PLATFORM_ANDROID
+
+#if !IREE_SYNCHRONIZATION_DISABLE_UNSAFE
+#define IREE_HAVE_WAIT_TYPE_LOCAL_FUTEX 1
+#endif  // threading enabled
+
+// Specifies the type of a system wait primitive.
+// Enums that are unavailable on a platform are still present to allow for
+// platform-independent code to still route wait primitives but actually using
+// them will fail.
+enum iree_wait_primitive_type_bits_t {
+  // Empty handle; immediately resolved.
+  IREE_WAIT_PRIMITIVE_TYPE_NONE = 0u,
+
+  // Android/Linux eventfd handle.
+  // These are akin to pipe() but require only a single handle and have
+  // significantly lower overhead (equivalent if not slightly better than
+  // pthreads condvars).
+  //
+  // eventfds support acting as both semaphores and auto reset events.
+  //
+  // More information:
+  // http://man7.org/linux/man-pages/man2/eventfd.2.html
+  IREE_WAIT_PRIMITIVE_TYPE_EVENT_FD = 1u,
+
+  // Android/Linux sync_file handle (aka 'sync fence').
+  // The handle is allocated indirectly by the device driver via the
+  // <linux/sync_file.h> API. It may be waited upon with poll(), select(), or
+  // epoll() and must be closed with close() when no longer required. If
+  // waiting on multiple sync_files the caller should first merge them
+  // together.
+  //
+  // A sync_file must only be used as fences (one-shot manual reset events).
+  //
+  // More information:
+  // https://www.kernel.org/doc/Documentation/sync_file.txt
+  // https://lwn.net/Articles/702339/
+  // https://source.android.com/devices/graphics/implement-vsync#explicit_synchronization
+  // https://developer.android.com/ndk/reference/group/sync
+  IREE_WAIT_PRIMITIVE_TYPE_SYNC_FILE = 2u,
+
+  // Android/Linux/iOS-compatible POSIX pipe handle.
+  // Two handles are generated: one for transmitting and one for receiving.
+  //
+  // More information:
+  // http://man7.org/linux/man-pages/man2/pipe.2.html
+  IREE_WAIT_PRIMITIVE_TYPE_PIPE = 3u,
+
+  // Windows HANDLE type.
+  // The HANDLE may represent a thread, event, semaphore, timer, etc.
+  //
+  // More information:
+  // https://docs.microsoft.com/en-us/windows/win32/sysinfo/object-categories
+  // https://docs.microsoft.com/en-us/windows/win32/sync/using-event-objects
+  IREE_WAIT_PRIMITIVE_TYPE_WIN32_HANDLE = 4u,
+
+  // Process-local futex.
+  // These are only valid for multi-wait when used with an in-process wait
+  // handle implementation (IREE_WAIT_API == IREE_WAIT_API_INPROC).
+  IREE_WAIT_PRIMITIVE_TYPE_LOCAL_FUTEX = 5u,
+
+  // Placeholder for wildcard queries of primitive types.
+  // On an export request this indicates that the source may export any type it
+  // can.
+  IREE_WAIT_PRIMITIVE_TYPE_ANY = 0xFFu,
+};
+typedef uint8_t iree_wait_primitive_type_t;
+
+// A handle value whose behavior is defined by the iree_wait_primitive_type_t.
+// Only the primitives available on a platform are compiled in as syscalls and
+// other associated operations that act on them aren't available anyway.
+typedef union {
+  int reserved;  // to avoid zero-sized unions
+#if defined(IREE_HAVE_WAIT_TYPE_EVENTFD)
+  // IREE_WAIT_PRIMITIVE_TYPE_EVENT_FD
+  struct {
+    int fd;
+  } event;
+#endif  // IREE_HAVE_WAIT_TYPE_EVENTFD
+#if defined(IREE_HAVE_WAIT_TYPE_SYNC_FILE)
+  // IREE_WAIT_PRIMITIVE_TYPE_SYNC_FILE
+  struct {
+    int fd;
+  } sync_file;
+#endif  // IREE_HAVE_WAIT_TYPE_SYNC_FILE
+#if defined(IREE_HAVE_WAIT_TYPE_PIPE)
+  // IREE_WAIT_PRIMITIVE_TYPE_PIPE
+  union {
+    struct {
+      int read_fd;
+      int write_fd;
+    };
+    int fds[2];
+  } pipe;
+#endif  // IREE_HAVE_WAIT_TYPE_PIPE
+#if defined(IREE_HAVE_WAIT_TYPE_WIN32_HANDLE)
+  // IREE_WAIT_PRIMITIVE_TYPE_WIN32_HANDLE
+  struct {
+    uintptr_t handle;
+  } win32;
+#endif  // IREE_HAVE_WAIT_TYPE_WIN32_HANDLE
+#if defined(IREE_HAVE_WAIT_TYPE_LOCAL_FUTEX)
+  /*iree_futex_handle_t*/ void* local_futex;
+#endif  // IREE_HAVE_WAIT_TYPE_LOCAL_FUTEX
+} iree_wait_primitive_value_t;
+
+// A (type, value) pair describing a system wait primitive handle.
+typedef struct iree_wait_primitive_t {
+  iree_wait_primitive_type_t type;
+  iree_wait_primitive_value_t value;
+} iree_wait_primitive_t;
+
+// Constructs an iree_wait_primitive_t pairing |type| with |value|.
+static inline iree_wait_primitive_t iree_make_wait_primitive(
+    iree_wait_primitive_type_t type, iree_wait_primitive_value_t value) {
+  iree_wait_primitive_t result;
+  result.type = type;
+  result.value = value;
+  return result;
+}
+
+// Returns an empty wait primitive that resolves immediately when waited on.
+static inline iree_wait_primitive_t iree_wait_primitive_immediate(void) {
+  iree_wait_primitive_value_t null_value = {0};
+  return iree_make_wait_primitive(IREE_WAIT_PRIMITIVE_TYPE_NONE, null_value);
+}
+
+// Returns true if |wait_primitive| carries no handle (type NONE) and thus
+// resolves immediately when waited on.
+static inline bool iree_wait_primitive_is_immediate(
+    iree_wait_primitive_t wait_primitive) {
+  return IREE_WAIT_PRIMITIVE_TYPE_NONE == wait_primitive.type;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_wait_source_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_wait_source_t iree_wait_source_t;
+
+// Controls the behavior of an iree_wait_source_ctl_fn_t callback function.
+typedef enum iree_wait_source_command_e {
+  // Queries the state of the wait source.
+  // Returns IREE_STATUS_DEFERRED if the wait source is not yet resolved.
+  //
+  // iree_wait_source_ctl_fn_t:
+  //   params: unused
+  //   inout_ptr: iree_status_code_t* out_wait_status_code
+  IREE_WAIT_SOURCE_COMMAND_QUERY = 0u,
+
+  // Tries to wait for the wait source to resolve.
+  // Returns IREE_STATUS_DEFERRED if the wait source does not support waiting.
+  //
+  // iree_wait_source_ctl_fn_t:
+  //   params: iree_wait_source_wait_params_t
+  //   inout_ptr: unused
+  IREE_WAIT_SOURCE_COMMAND_WAIT_ONE,
+
+  // Exports the wait source to a system wait handle.
+  //
+  // iree_wait_source_ctl_fn_t:
+  //   params: iree_wait_source_export_params_t
+  //   inout_ptr: iree_wait_primitive_t* out_wait_primitive
+  IREE_WAIT_SOURCE_COMMAND_EXPORT,
+} iree_wait_source_command_t;
+
+// Parameters for IREE_WAIT_SOURCE_COMMAND_WAIT_ONE.
+typedef struct iree_wait_source_wait_params_t {
+  // Timeout after which the wait will return IREE_STATUS_DEADLINE_EXCEEDED
+  // even if the wait source has not yet resolved.
+  iree_timeout_t timeout;
+} iree_wait_source_wait_params_t;
+
+// Parameters for IREE_WAIT_SOURCE_COMMAND_EXPORT.
+typedef struct iree_wait_source_export_params_t {
+  // Indicates the target handle type of the export operation.
+  iree_wait_primitive_type_t target_type;
+  // Timeout after which the export will return IREE_STATUS_DEADLINE_EXCEEDED
+  // even if the wait source is not yet available for export.
+} iree_wait_source_export_params_t;
+
+// Function pointer for an iree_wait_source_t control function.
+// |command| provides the operation to perform. Optionally some commands may use
+// |params| to pass additional operation-specific parameters. |inout_ptr| usage
+// is defined by each operation.
+typedef iree_status_t(IREE_API_PTR* iree_wait_source_ctl_fn_t)(
+    iree_wait_source_t wait_source, iree_wait_source_command_t command,
+    const void* params, void** inout_ptr);
+
+// A wait source instance representing some future point in time.
+// Wait sources are promises for a system native wait handle that allow for
+// cheaper queries and waits when the full system wait path is not required.
+//
+// Wait sources may have user-defined implementations or come from system wait
+// handles via iree_wait_source_import.
+typedef struct iree_wait_source_t {
+  union {
+    struct {
+      // Control function data.
+      void* self;
+      // Implementation-defined data identifying the point in time.
+      uint64_t data;
+    };
+    // Large enough to store an iree_wait_handle_t, used when importing a
+    // system wait handle into a wait source.
+    uint64_t storage[2];
+  };
+  // ioctl-style control function servicing wait source commands.
+  // See iree_wait_source_command_t for more information.
+  iree_wait_source_ctl_fn_t ctl;
+} iree_wait_source_t;
+
+// Returns a wait source that always reports itself as already resolved.
+static inline iree_wait_source_t iree_wait_source_immediate(void) {
+  iree_wait_source_t wait_source = {{{NULL, 0ull}}, NULL};
+  return wait_source;
+}
+
+// Returns true if |wait_source| resolves immediately (it has no control
+// function). Useful for filtering no-op waits out of lists/sets.
+static inline bool iree_wait_source_is_immediate(
+    iree_wait_source_t wait_source) {
+  return !wait_source.ctl;
+}
+
+// Wait source control function for iree_wait_source_delay.
+IREE_API_EXPORT iree_status_t iree_wait_source_delay_ctl(
+    iree_wait_source_t wait_source, iree_wait_source_command_t command,
+    const void* params, void** inout_ptr);
+
+// Returns a wait source representing a delay until a point in time.
+// The source stays unresolved until |deadline_ns| is reached or exceeded and
+// resolves afterward. Export to a system primitive is unavailable.
+static inline iree_wait_source_t iree_wait_source_delay(
+    iree_time_t deadline_ns) {
+  iree_wait_source_t wait_source = {
+      {{NULL, (uint64_t)deadline_ns}},
+      iree_wait_source_delay_ctl,
+  };
+  return wait_source;
+}
+
+// Returns true if |wait_source| is a timed delay created with
+// iree_wait_source_delay; platforms can often service these as plain sleeps.
+static inline bool iree_wait_source_is_delay(iree_wait_source_t wait_source) {
+  return wait_source.ctl == &iree_wait_source_delay_ctl;
+}
+
+// Imports a system |wait_primitive| into a wait source in |out_wait_source|.
+// Ownership of the wait handle remains with the caller and it must remain
+// valid for the duration the wait source is in use.
+IREE_API_EXPORT iree_status_t iree_wait_source_import(
+    iree_wait_primitive_t wait_primitive, iree_wait_source_t* out_wait_source);
+
+// Exports a |wait_source| to a system wait primitive in |out_wait_primitive|.
+// If the wait source is already resolved then the wait handle will be set to
+// immediate and callers can check it with iree_wait_primitive_is_immediate.
+// If the wait source resolved with a failure then the error status will be
+// returned. The returned wait handle is owned by the wait source and will
+// remain valid for the lifetime of the wait source.
+//
+// Exporting may require a blocking operation and |timeout| can be used to
+// limit its duration.
+//
+// Returns IREE_STATUS_UNAVAILABLE if the requested primitive |target_type| is
+// unavailable on the current platform or from the given wait source.
+// Passing IREE_WAIT_PRIMITIVE_TYPE_ANY will allow the implementation to return
+// any primitive that it can.
+IREE_API_EXPORT iree_status_t iree_wait_source_export(
+    iree_wait_source_t wait_source, iree_wait_primitive_type_t target_type,
+    iree_timeout_t timeout, iree_wait_primitive_t* out_wait_primitive);
+
+// Queries the state of a |wait_source| without waiting.
+// |out_wait_status_code| will indicate the status of the source while the
+// returned value indicates the status of the query. |out_wait_status_code| will
+// be set to IREE_STATUS_DEFERRED if the wait source has not yet resolved and
+// IREE_STATUS_OK otherwise.
+IREE_API_EXPORT iree_status_t iree_wait_source_query(
+    iree_wait_source_t wait_source, iree_status_code_t* out_wait_status_code);
+
+// Blocks the caller and waits for a |wait_source| to resolve.
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if |timeout| is reached before the
+// wait source resolves. If the wait source resolved with a failure then the
+// error status will be returned.
+IREE_API_EXPORT iree_status_t iree_wait_source_wait_one(
+    iree_wait_source_t wait_source, iree_timeout_t timeout);
+
+// TODO(benvanik): iree_wait_source_wait_any/all: allow multiple wait sources
+// that share the same control function. The implementation can decide if it
+// wants to coalesce them or not.
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_WAIT_SOURCE_H_
diff --git a/runtime/src/iree/builtins/BUILD b/runtime/src/iree/builtins/BUILD
new file mode 100644
index 0000000..f27d209
--- /dev/null
+++ b/runtime/src/iree/builtins/BUILD
@@ -0,0 +1,11 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
diff --git a/runtime/src/iree/builtins/CMakeLists.txt b/runtime/src/iree/builtins/CMakeLists.txt
new file mode 100644
index 0000000..954e388
--- /dev/null
+++ b/runtime/src/iree/builtins/CMakeLists.txt
@@ -0,0 +1,13 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/builtins/BUILD                                              #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/device/BUILD b/runtime/src/iree/builtins/device/BUILD
new file mode 100644
index 0000000..f670428
--- /dev/null
+++ b/runtime/src/iree/builtins/device/BUILD
@@ -0,0 +1,23 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_runtime_cc_library(
+    name = "device",
+    srcs = [
+        "device_generic.c",
+    ],
+    hdrs = [
+        "device.h",
+    ],
+)
diff --git a/runtime/src/iree/builtins/device/CMakeLists.txt b/runtime/src/iree/builtins/device/CMakeLists.txt
new file mode 100644
index 0000000..b16043e
--- /dev/null
+++ b/runtime/src/iree/builtins/device/CMakeLists.txt
@@ -0,0 +1,25 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/builtins/device/BUILD                                       #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    device
+  HDRS
+    "device.h"
+  SRCS
+    "device_generic.c"
+  DEPS
+
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/device/README.md b/runtime/src/iree/builtins/device/README.md
new file mode 100644
index 0000000..288ec4c
--- /dev/null
+++ b/runtime/src/iree/builtins/device/README.md
@@ -0,0 +1,213 @@
+IREE CPU Device Library: `libdevice`
+====================================
+
+This library provides builtin functions to the IREE generated CPU code. It
+covers the role of a compiler runtime library handling things like soft float
+builtin calls produced during code generation and a support library to ease
+implementation of more complex intrinsic-like functionality. The code in this
+library is compiled into bitcode files and embedded inside the IREE compiler
+which then links it into the generated code before emitting the final user
+output.
+
+```
++------------+      +-------+      +-------------------------------+
+| device_*.c | ---> | clang | ---> |+-------------------------------+
++------------+      +-------+      +| libdevice_[arch]_[variant].bc |
+                                    +-------------------------------+
+                                                  |||
+                                                  vvv
+      +------------+      +---------+      +================+
+      | input.mlir | ---> | codegen | ---> | iree-compile   |
+      +------------+      +---------+      +================+
+                                                   |
+                      +----------------------------+
+                      v                            v
+         +------------------------+   +----------------------------+
+         | static library (.o/.a) |   | dynamic library (.so/.dll) |
+         +------------------------+   +----------------------------+
+```
+
+Good examples of things this library can provide:
+* float16/half support functions
+* MMA-like intrinsics for architecture-optimized tiled matrix multiplies
+* Atomic intrinsics
+
+Bad examples:
+* A full convolution kernel
+* Anything used in only one particular configuration or target
+* Frequently changing code
+
+### Why Not C++ Passes?
+
+This approach of an external library that is linked in via bitcode is a tradeoff
+that favors a familiar environment for architecture-specific implementations and
+reusable code to custom MLIR passes that directly construct the IR. It will
+always be better from a technical standpoint to directly perform these
+specializations inside compiler passes as all information is available, multiple
+levels of optimization at MLIR `vector` and `llvm` dialect levels can hoist and
+fold aggressively, and specialization is possible using the entire context. It's
+encouraged that work is done there when possible and some of the cases handled
+by this library may end up being done in that environment.
+
+As a reusable library this approach allows for other backends - such as the IREE
+VMVX backend - to share the same optimized implementations. Having standalone
+tests and benchmarks also allows for fast iteration without needing to modify
+the compiler.
+
+The hope is that over time things added here will be moved into the compiler and
+this becomes mostly a lightweight intrinsics library and staging ground for
+experimental features that require quick iteration in C.
+
+## Bitcode Files
+
+The IREE compiler embeds bitcode files and when producing executable libraries
+will select one for linkage based on the specified target machine. As these
+bitcode files can only be produced by a cross-compilation-enabled Clang they are
+built offline and checked into the repository. Future improvements to the
+compiler could also allow for external files to be specified to avoid the need
+to rebuild the compiler however for now this keeps things simple and hermetic.
+
+The naming convention is `libdevice_[arch]_[features].bc`, corresponding to the
+source files of `device_[arch].c` with the features specifying conditional
+target CPU features such as extended instruction sets. When no special features
+are required `generic` is used.
+
+For example, the implementations for all ISA variants of AArch64 would be found
+in a `device_aarch64.c` and an implementation for the baseline ISA
+is compiled into `libdevice_aarch64_generic.bc`. When the dot product
+instructions are available (`-march=armv8.2-a+dotprod`) the more specialized
+`libdevice_aarch64_dotprod.bc` bitcode file would be used.
+
+### Updating Bitcode Files
+
+The bitcode files need to be rebuilt whenever the source is modified, new
+variants are added, or new architectures are targeted. The
+[`bin/build.sh`](bin/build.sh) uses a compatible Clang and LLVM toolchain to
+produce the files in the correct format and location.
+
+Requirements:
+* A modern version of Clang/LLVM (tested with 13)
+* A build of llvm-as with all target architectures linked in
+
+This script could use some usability improvements, but for now a common
+invocation will look like:
+```sh
+LLVM_AS=/usr/bin/llvm-as \
+CLANG=/usr/bin/clang-13 \
+./iree/builtins/device/bin/build.sh
+```
+
+If there are complaints that llvm-as does not support a target architecture then
+the llvm-as included in the IREE CMake distribution should be built and provided
+by way of the `IREE_BUILD_DIR`:
+```sh
+IREE_BUILD_DIR=../iree-build \
+CLANG=/usr/bin/clang-13 \
+./iree/builtins/device/bin/build.sh
+```
+
+After this the newly updated/added bitcode files can be added to git.
+
+### Compiler Bitcode Selection
+
+The logic in the compiler for selecting which bitcode file to use is found in
+[`iree/compiler/Dialect/HAL/Target/LLVM/Builtins/Device.cpp`](/iree/compiler/Dialect/HAL/Target/LLVM/Builtins/Device.cpp).
+The `lookupDeviceFile` function uses the `llvm::TargetMachine` to query the
+architecture, CPU features, and other properties to choose the corresponding
+bitcode file. If no matching bitcode file is found a fallback of the WebAssembly
+generic implementation is used as its bitcode is generally portable. It's not
+fast, though, and should only be used for correctness testing during bringup.
+
+### Adding an Architecture/ISA Bitcode File
+
+First copy [`device_generic.c`](device_generic.c) and name it consistent with
+the canonical LLVM architecture (the first part of the target triple, e.g. if
+you pass `--target=aarch64-arm-none-eabi` to Clang you'd name it `aarch64`).
+
+From there guard the new file with the architecture-specific preprocessor guards
+and add the inverse to `device_generic.c` to prevent it from being used when the
+source files are globbed.
+
+To build the new bitcode file add a `make_arch_bc` call to [`bin/build.sh`](bin/build.sh).
+The flags provided are passed directly to Clang and can be used to control the
+compilation environment with the requirement being that the corresponding
+selection logic is updated in `Device.cpp`.
+
+Finally update the [`iree/compiler/Dialect/HAL/Target/LLVM/Builtins/Device.cpp`](/iree/compiler/Dialect/HAL/Target/LLVM/Builtins/Device.cpp)
+file in the compiler to select the new bitcode file based on the
+`llvm::TargetMachine` in the same way that it is produced with `make_arch_bc`.
+
+Ergonomic improvements here would allow for function-level multi-versioning such
+that bitcode files per architecture could be used instead of requiring
+per-feature variants of each bitcode file.
+
+## Engineering Requirements
+
+As this library is directly merged into the compiler-generated code there are
+specific restrictions as to what can be used inherited from the IREE executable
+requirements:
+
+* No mutable globals/static variables or thread-local storage
+* No syscalls
+* No libc calls outside of builtins (like memset/memcpy) - _no mallocs_!
+
+Though the primary usage of the library is through the precompiled bitcode files
+that only need to work with Clang the library may also be built on other
+toolchains such as GCC and MSVC (or older versions of Clang). When standard
+intrinsics are used this will generally not be a problem however inline assembly
+may need compiler-specific variants or at least exclusions that fall back to
+generic paths.
+
+### Compile-time Configuration
+
+Preprocessor statements used to control behavior must only use information known
+when the bitcode files are being compiled. This means that if the bitcode file
+being produced is for AArch64 it is safe to use the `__aarch64__` macro.
+Information that is only available after the bitcode file is produced - such as
+in the IREE compiler pipelines - must use link-time configuration.
+
+### Link-time Configuration
+
+As we are producing bitcode files we cannot rely on the C preprocessor for
+changing behavior based on some information only known during linking. In other
+cases we may want to specialize code paths based on knowledge about the context
+in which the kernels are used. To provide this link-time modification ability
+there is support for flags by way of `extern` globals. These globals are either
+specified by the IREE compiler when linking the bitcode or by the hosting
+application when linked statically.
+
+Each flag is defined in `device.h`; for example:
+```c
+extern int libdevice_platform_example_flag;
+```
+
+Any code may then use this flag to condition/control behavior:
+```c
+if (libdevice_platform_example_flag >= 1) {
+  // Do something special.
+}
+```
+
+When linking libdevice statically the flags can be provided by the hosting
+application via compiler defines: `-DLIBDEVICE_PLATFORM_EXAMPLE_FLAG=123`.
+
+When producing bitcode the flags are left symbolic and the IREE compiler
+provides their values:
+```c++
+overridePlatformGlobal(*bitcodeModule, "libdevice_platform_example_flag", 123u);
+```
+
+What flags are useful and how to handle cases where flags are arch-dependent are
+still TBD.
+
+## Testing and Benchmarking
+
+[`tools/libdevice_test.cc`](tools/libdevice_test.cc) provides a gtest runner
+that compares the results of the optimized implementations for the target
+architecture against a reference implementation for correctness.
+
+[`tools/libdevice_benchmark.c`](tools/libdevice_benchmark.c) provides a
+benchmark suite for the optimized implementations of the target architecture.
+
+Both are compiled for the CMake target and can be used to develop
+implementations without the need to rebuild/run the compiler.
diff --git a/runtime/src/iree/builtins/device/bin/BUILD b/runtime/src/iree/builtins/device/bin/BUILD
new file mode 100644
index 0000000..286e32a
--- /dev/null
+++ b/runtime/src/iree/builtins/device/bin/BUILD
@@ -0,0 +1,28 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+c_embed_data(
+    name = "libdevice",
+    srcs = [
+        "libdevice_wasm32_generic.bc",
+        "libdevice_wasm64_generic.bc",
+    ],
+    c_file_output = "libdevice.c",
+    flatten = True,
+    h_file_output = "libdevice.h",
+    identifier = "iree_builtins_libdevice",
+    deps = [
+        "//runtime/src:runtime_defines",
+    ],
+)
diff --git a/runtime/src/iree/builtins/device/bin/CMakeLists.txt b/runtime/src/iree/builtins/device/bin/CMakeLists.txt
new file mode 100644
index 0000000..105bf87
--- /dev/null
+++ b/runtime/src/iree/builtins/device/bin/CMakeLists.txt
@@ -0,0 +1,31 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/builtins/device/bin/BUILD                                   #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_c_embed_data(
+  NAME
+    libdevice
+  SRCS
+    "libdevice_wasm32_generic.bc"
+    "libdevice_wasm64_generic.bc"
+  DEPS
+
+  C_FILE_OUTPUT
+    "libdevice.c"
+  H_FILE_OUTPUT
+    "libdevice.h"
+  IDENTIFIER
+    "iree_builtins_libdevice"
+  FLATTEN
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/device/bin/build.sh b/runtime/src/iree/builtins/device/bin/build.sh
new file mode 100644
index 0000000..11f793a
--- /dev/null
+++ b/runtime/src/iree/builtins/device/bin/build.sh
@@ -0,0 +1,67 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Example command line:
+#   LLVM_AS=/usr/bin/llvm-as \
+#   CLANG=/usr/bin/clang-13 \
+#   ./iree/builtins/device/bin/build.sh
+
+set -x
+set -e
+
+# Toolchain configuration; each variable may be overridden by the environment.
+CLANG="${CLANG:-clang}"
+# TODO(benvanik): figure out how to get this path from clang itself.
+CLANG_INCLUDE="${CLANG_INCLUDE:-/usr/lib/llvm-13/lib/clang/13.0.0/include/}"
+IREE_SRC_DIR="$(git rev-parse --show-toplevel)"
+IREE_BUILD_DIR="${IREE_BUILD_DIR:-${IREE_SRC_DIR?}/../build}"
+LLVM_AS="${LLVM_AS:-${IREE_BUILD_DIR}/third_party/llvm-project/llvm/bin/llvm-as}"
+
+# Resolve the directory containing this script. Quote "$0" and use $(...) so
+# paths containing spaces work.
+SCRIPT_DIR="$(realpath "$(dirname "$0")")"
+OUT="${SCRIPT_DIR?}/"
+SRC="${SCRIPT_DIR?}/.."
+
+function make_arch_bc {
+  local ARCH=$1
+  local FEATURES=$2
+  local SOURCE_FILE=$3
+  local FILE_BASENAME="${OUT}/libdevice_${ARCH}_${FEATURES}"
+
+  # Generate an LLVM IR assembly listing so we can easily read the file.
+  # This is not checked in or used by the compiler.
+  ${CLANG?} \
+      "${@:4}" \
+      -isystem "${CLANG_INCLUDE?}" \
+      -std=c17 \
+      -O3 \
+      -fno-ident \
+      -fvisibility=hidden \
+      -nostdinc \
+      -S \
+      -emit-llvm \
+      -fdiscard-value-names \
+      -DIREE_DEVICE_STANDALONE \
+      -o "${FILE_BASENAME}.ll" \
+      -c \
+      "${SRC}/${SOURCE_FILE}"
+
+  # Clang adds a bunch of bad attributes and host-specific information that we
+  # don't want (so we get at least somewhat deterministic builds).
+  sed -i 's/^;.*$//' "${FILE_BASENAME}.ll"
+  sed -i 's/^source_filename.*$//' "${FILE_BASENAME}.ll"
+  sed -i 's/^target datalayout.*$//' "${FILE_BASENAME}.ll"
+  sed -i 's/^target triple.*$//' "${FILE_BASENAME}.ll"
+  sed -i 's/^\(attributes #[0-9]* = {\).*$/\1 inlinehint }/' "${FILE_BASENAME}.ll"
+
+  # Generate a binary bitcode file embedded into the compiler binary.
+  # NOTE: we do this from stdin so that the filename on the user's system is not
+  # embedded in the bitcode file (making it non-deterministic).
+  cat "${FILE_BASENAME}.ll" | ${LLVM_AS} -o="${FILE_BASENAME}.bc"
+}
+
+make_arch_bc "wasm32" "generic" "device_generic.c" \
+    --target=wasm32
+make_arch_bc "wasm64" "generic" "device_generic.c" \
+    --target=wasm64
diff --git a/runtime/src/iree/builtins/device/bin/libdevice_wasm32_generic.bc b/runtime/src/iree/builtins/device/bin/libdevice_wasm32_generic.bc
new file mode 100644
index 0000000..26e2310
--- /dev/null
+++ b/runtime/src/iree/builtins/device/bin/libdevice_wasm32_generic.bc
Binary files differ
diff --git a/runtime/src/iree/builtins/device/bin/libdevice_wasm64_generic.bc b/runtime/src/iree/builtins/device/bin/libdevice_wasm64_generic.bc
new file mode 100644
index 0000000..26e2310
--- /dev/null
+++ b/runtime/src/iree/builtins/device/bin/libdevice_wasm64_generic.bc
Binary files differ
diff --git a/runtime/src/iree/builtins/device/device.h b/runtime/src/iree/builtins/device/device.h
new file mode 100644
index 0000000..4378101
--- /dev/null
+++ b/runtime/src/iree/builtins/device/device.h
@@ -0,0 +1,120 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BUILTINS_DEVICE_DEVICE_H_
+#define IREE_BUILTINS_DEVICE_DEVICE_H_
+
+//===----------------------------------------------------------------------===//
+// A simplified libc/libm-alike that is designed to compile to portable LLVM IR.
+//===----------------------------------------------------------------------===//
+// This library is focused on supporting the subset of LLVM's RuntimeLibcalls
+// that we need in our embedded executable binaries. This means that things like
+// printf, malloc, etc are excluded.
+//
+// See the full list of possible functions here:
+// third_party/llvm-project/llvm/include/llvm/IR/RuntimeLibcalls.def
+//
+// Code here must not use any system headers - as almost all pull in bits/ and
+// various other target-dependent definitions that make the resulting IR
+// non-portable. This means there is no size_t, etc. Any definitions that may
+// come from an std* file must be redefined here with care.
+//
+// Code must also not use any mutable global or thread-local state ala
+// errno/rounding modes/etc. Each of the functions in the library will be called
+// concurrently from multiple threads and from multiple source modules. There
+// must be no mutable static values anywhere.
+//
+// Avoid #ifdef entirely: they indicate a leakage of host build configuration
+// into what is supposed to be a portable module. Anything that requires
+// target-specific conditional logic must be implemented via an extern that
+// can be substituted by the IREE compiler when producing the final
+// target-specific module.
+
+//===----------------------------------------------------------------------===//
+// Configuration
+//===----------------------------------------------------------------------===//
+
+// IREE_DEVICE_STANDALONE:
+// Define to have libdevice's implementation of builtins alias the standard
+// names. If undefined then the host toolchain implementations will be used.
+
+//===----------------------------------------------------------------------===//
+// Attributes and metadata
+//===----------------------------------------------------------------------===//
+
+// Tagged on functions that are part of the public API.
+#ifdef __cplusplus
+#define IREE_DEVICE_EXPORT extern "C"
+#else
+#define IREE_DEVICE_EXPORT
+#endif  // __cplusplus
+
+// `restrict` keyword, not supported by some older compilers.
+// We define our own macro in case dependencies use `restrict` differently.
+#if defined(_MSC_VER) && _MSC_VER >= 1900
+#define IREE_DEVICE_RESTRICT __restrict
+#elif defined(_MSC_VER)
+#define IREE_DEVICE_RESTRICT
+#elif defined(__cplusplus)
+#define IREE_DEVICE_RESTRICT __restrict__
+#else
+#define IREE_DEVICE_RESTRICT restrict
+#endif  // _MSC_VER
+
+//===----------------------------------------------------------------------===//
+// stdint.h
+//===----------------------------------------------------------------------===//
+// https://pubs.opengroup.org/onlinepubs/009604599/basedefs/stdint.h.html
+// NOTE: no size_t/ptrdiff_t/etc (as they are target dependent).
+
+#if !defined(INT8_MIN)
+
+typedef signed char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+typedef long long int64_t;
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+#define INT8_MIN (-127i8 - 1)
+#define INT16_MIN (-32767i16 - 1)
+#define INT32_MIN (-2147483647i32 - 1)
+#define INT64_MIN (-9223372036854775807i64 - 1)
+#define INT8_MAX 127i8
+#define INT16_MAX 32767i16
+#define INT32_MAX 2147483647i32
+#define INT64_MAX 9223372036854775807i64
+#define UINT8_MAX 0xffui8
+#define UINT16_MAX 0xffffui16
+#define UINT32_MAX 0xffffffffui32
+#define UINT64_MAX 0xffffffffffffffffui64
+
+#endif  // !INT8_MIN
+
+//===----------------------------------------------------------------------===//
+// Target-specific queries
+//===----------------------------------------------------------------------===//
+// These are substituted with values from the compiler and must not be specified
+// here in C before we generate the IR.
+
+// Do not use: here as an example. Remove once we have any other flag.
+extern int libdevice_platform_example_flag;
+// The value used when not coming from the compiler.
+#define LIBDEVICE_PLATFORM_EXAMPLE_FLAG 0
+
+//===----------------------------------------------------------------------===//
+// Public API
+//===----------------------------------------------------------------------===//
+
+// Converts a 16-bit floating-point value to a 32-bit C `float`.
+IREE_DEVICE_EXPORT float iree_h2f_ieee(short param);
+
+// Converts a 32-bit C `float` value to a 16-bit floating-point value.
+IREE_DEVICE_EXPORT short iree_f2h_ieee(float param);
+
+#endif  // IREE_BUILTINS_DEVICE_DEVICE_H_
diff --git a/runtime/src/iree/builtins/device/device_generic.c b/runtime/src/iree/builtins/device/device_generic.c
new file mode 100644
index 0000000..3d55f71
--- /dev/null
+++ b/runtime/src/iree/builtins/device/device_generic.c
@@ -0,0 +1,121 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "device.h"
+
// Host/default definition of the example platform flag. In the standalone
// bitcode build (IREE_DEVICE_STANDALONE) the symbol is left undefined so the
// IREE compiler can substitute a target-specific value (see device.h:
// "Target-specific queries").
#if !defined(IREE_DEVICE_STANDALONE)
int libdevice_platform_example_flag = LIBDEVICE_PLATFORM_EXAMPLE_FLAG;
#endif  // IREE_DEVICE_STANDALONE
+
// Widens the IEEE 754 binary16 bit pattern in |param| to a 32-bit float.
// binary16 layout: 1 sign bit | 5 exponent bits | 10 mantissa bits.
IREE_DEVICE_EXPORT float iree_h2f_ieee(short param) {
  unsigned short expHalf16 = param & 0x7C00;  // exponent field (bits 10..14).
  int exp1 = (int)expHalf16;
  unsigned short mantissa16 = param & 0x03FF;  // mantissa field (bits 0..9).
  int mantissa1 = (int)mantissa16;
  int sign = (int)(param & 0x8000);
  sign = sign << 16;  // relocate sign from half bit 15 to float bit 31.

  // nan or inf
  if (expHalf16 == 0x7C00) {
    // nan
    if (mantissa16 > 0) {
      int res = (0x7FC00000 | sign);
      // Type punning via pointer cast, as used throughout this file.
      // NOTE(review): this is formally a strict-aliasing violation; presumably
      // tolerated because the file is only compiled by clang into bitcode.
      float fres = *((float*)(&res));
      return fres;
    }
    // inf
    int res = (0x7F800000 | sign);
    float fres = *((float*)(&res));
    return fres;
  }
  // Normalized value: rebias the exponent and widen exponent+mantissa fields.
  if (expHalf16 != 0) {
    exp1 += ((127 - 15) << 10);  // exponents converted to float32 bias
    int res = (exp1 | mantissa1);
    res = res << 13;  // shift exponent+mantissa into float32 field positions.
    res = (res | sign);
    float fres = *((float*)(&res));
    return fres;
  }

  // Zero or denormal (exponent field is 0 here, so exp1 == 0 and the ternary
  // always selects 1 << 10).
  int xmm1 = exp1 > (1 << 10) ? exp1 : (1 << 10);
  xmm1 = (xmm1 << 13);
  xmm1 += ((127 - 15 - 10) << 23);  // add the bias difference to xmm1
  xmm1 = xmm1 | sign;               // Combine with the sign mask

  float res = (float)mantissa1;  // Convert mantissa to float
  // Scale by the signed denormal factor whose bit pattern was built in xmm1.
  res *= *((float*)(&xmm1));

  return res;
}
+
// Narrows a 32-bit float to its IEEE 754 binary16 bit pattern (returned in a
// short), rounding the mantissa and clamping exponent overflow to infinity.
IREE_DEVICE_EXPORT short iree_f2h_ieee(float param) {
  // Type punning via pointer cast, as used throughout this file.
  unsigned int param_bit = *((unsigned int*)(&param));
  int sign = param_bit >> 31;
  int mantissa = param_bit & 0x007FFFFF;  // 23-bit float32 mantissa field.
  // Rebias float32 exponent (bias 127) to binary16 (bias 15).
  int exp = ((param_bit & 0x7F800000) >> 23) + 15 - 127;
  short res;
  if (exp > 0 && exp < 30) {
    // use rte rounding mode, round the significand, combine sign, exponent and
    // significand into a short.
    res = (sign << 15) | (exp << 10) | ((mantissa + 0x00001000) >> 13);
  } else if (param_bit == 0) {
    res = 0;
  } else {
    if (exp <= 0) {
      if (exp < -10) {
        // value is less than min half float point
        res = 0;
      } else {
        // normalized single, magnitude is less than min normal half float
        // point.
        // Restore the implicit leading 1 bit, then shift into denormal range.
        mantissa = (mantissa | 0x00800000) >> (1 - exp);
        // round to nearest
        if ((mantissa & 0x00001000) > 0) {
          mantissa = mantissa + 0x00002000;
        }
        // combine sign & mantissa (exp is zero to get denormalized number)
        res = (sign << 15) | (mantissa >> 13);
      }
    } else if (exp == (255 - 127 + 15)) {
      // Float32 exponent field was all-ones: inf or NaN.
      if (mantissa == 0) {
        // input float is infinity, return infinity half
        res = (sign << 15) | 0x7C00;
      } else {
        // input float is NaN, return half NaN
        res = (sign << 15) | 0x7C00 | (mantissa >> 13);
      }
    } else {
      // exp > 0, normalized single, round to nearest
      if ((mantissa & 0x00001000) > 0) {
        mantissa = mantissa + 0x00002000;
        // Mantissa rounding carried into the exponent; renormalize.
        if ((mantissa & 0x00800000) > 0) {
          mantissa = 0;
          exp = exp + 1;
        }
      }
      if (exp > 30) {
        // exponent overflow - return infinity half
        res = (sign << 15) | 0x7C00;
      } else {
        // combine sign, exp and mantissa into normalized half
        res = (sign << 15) | (exp << 10) | (mantissa >> 13);
      }
    }
  }
  return res;
}
+
// In the standalone bitcode build, also export the conversions under the
// standard `__gnu_*` half-precision runtime-libcall names (see the
// RuntimeLibcalls.def reference in device.h) so compiler-generated calls
// resolve without a host toolchain runtime.
#if defined(IREE_DEVICE_STANDALONE)

IREE_DEVICE_EXPORT float __gnu_h2f_ieee(short param) {
  return iree_h2f_ieee(param);
}

IREE_DEVICE_EXPORT short __gnu_f2h_ieee(float param) {
  return iree_f2h_ieee(param);
}

#endif  // IREE_DEVICE_STANDALONE
diff --git a/runtime/src/iree/builtins/device/tools/BUILD b/runtime/src/iree/builtins/device/tools/BUILD
new file mode 100644
index 0000000..de878d0
--- /dev/null
+++ b/runtime/src/iree/builtins/device/tools/BUILD
@@ -0,0 +1,37 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
# Developer tools for exercising the libdevice builtin library.

load("//build_tools/bazel:cc_binary_benchmark.bzl", "cc_binary_benchmark")
load("//iree:build_defs.oss.bzl", "iree_runtime_cc_test")

package(
    default_visibility = ["//visibility:public"],
    features = ["layering_check"],
    licenses = ["notice"],  # Apache 2.0
)

# Benchmarks the host-compiled libdevice conversion routines
# (see libdevice_benchmark.c).
cc_binary_benchmark(
    name = "libdevice_benchmark",
    srcs = ["libdevice_benchmark.c"],
    deps = [
        "//runtime/src/iree/base",
        "//runtime/src/iree/base/internal:flags",
        "//runtime/src/iree/builtins/device",
        "//runtime/src/iree/testing:benchmark",
    ],
)

# Smoke test ensuring the libdevice entry points link and behave sanely
# (see libdevice_test.cc).
iree_runtime_cc_test(
    name = "libdevice_test",
    srcs = ["libdevice_test.cc"],
    deps = [
        "//runtime/src/iree/base",
        "//runtime/src/iree/base/internal:flags",
        "//runtime/src/iree/builtins/device",
        "//runtime/src/iree/testing:gtest",
        "//runtime/src/iree/testing:gtest_main",
    ],
)
diff --git a/runtime/src/iree/builtins/device/tools/CMakeLists.txt b/runtime/src/iree/builtins/device/tools/CMakeLists.txt
new file mode 100644
index 0000000..70e68f0
--- /dev/null
+++ b/runtime/src/iree/builtins/device/tools/CMakeLists.txt
@@ -0,0 +1,39 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/builtins/device/tools/BUILD                                 #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_binary_benchmark(
+  NAME
+    libdevice_benchmark
+  SRCS
+    "libdevice_benchmark.c"
+  DEPS
+    iree::base
+    iree::base::internal::flags
+    iree::builtins::device
+    iree::testing::benchmark
+  TESTONLY
+)
+
+iree_cc_test(
+  NAME
+    libdevice_test
+  SRCS
+    "libdevice_test.cc"
+  DEPS
+    iree::base
+    iree::base::internal::flags
+    iree::builtins::device
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/device/tools/libdevice_benchmark.c b/runtime/src/iree/builtins/device/tools/libdevice_benchmark.c
new file mode 100644
index 0000000..0814f56
--- /dev/null
+++ b/runtime/src/iree/builtins/device/tools/libdevice_benchmark.c
@@ -0,0 +1,78 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/api.h"
+#include "iree/base/internal/flags.h"
+#include "iree/builtins/device/device.h"
+#include "iree/testing/benchmark.h"
+
+// Example flag; not really useful:
+IREE_FLAG(int32_t, batch_count, 64, "Ops to run per benchmark iteration.");
+
// Benchmark body: performs FLAG_batch_count half->float conversions per
// reported iteration. The results are currently discarded and could be
// optimized away by the compiler (see the TODO below).
static iree_status_t iree_h2f_ieee_benchmark(
    const iree_benchmark_def_t* benchmark_def,
    iree_benchmark_state_t* benchmark_state) {
  while (iree_benchmark_keep_running(benchmark_state,
                                     /*batch_count=*/FLAG_batch_count)) {
    for (int i = 0; i < FLAG_batch_count; ++i) {
      // TODO(benvanik): iree_do_not_optimize barrier.
      iree_h2f_ieee(0x3400 + i);  // 0x3400 = binary16 pattern for 0.25.
    }
  }
  return iree_ok_status();
}
+
// Benchmark body: performs FLAG_batch_count float->half conversions per
// reported iteration. The results are currently discarded and could be
// optimized away by the compiler (see the TODO below).
static iree_status_t iree_f2h_ieee_benchmark(
    const iree_benchmark_def_t* benchmark_def,
    iree_benchmark_state_t* benchmark_state) {
  while (iree_benchmark_keep_running(benchmark_state,
                                     /*batch_count=*/FLAG_batch_count)) {
    for (int i = 0; i < FLAG_batch_count; ++i) {
      // TODO(benvanik): iree_do_not_optimize barrier.
      iree_f2h_ieee(0.25f + i);
    }
  }
  return iree_ok_status();
}
+
// Registers the h2f/f2h benchmarks and runs them. IREE flags are parsed first
// with UNDEFINED_OK so that flags not owned by the IREE flag library
// (presumably the benchmark library's own flags) pass through to
// iree_benchmark_initialize, which consumes the remaining argv.
int main(int argc, char** argv) {
  iree_flags_set_usage(
      "libdevice_benchmark",
      "Benchmarks the libdevice implementation of the target machine.\n"
      "\n");

  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv);
  iree_benchmark_initialize(&argc, argv);

  // Wall-clock + process-CPU-time benchmark of the half->float conversion.
  {
    static const iree_benchmark_def_t benchmark_def = {
        .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
                 IREE_BENCHMARK_FLAG_USE_REAL_TIME,
        .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
        .minimum_duration_ns = 0,
        .iteration_count = 0,
        .run = iree_h2f_ieee_benchmark,
        .user_data = NULL,
    };
    iree_benchmark_register(IREE_SV("iree_h2f_ieee"), &benchmark_def);
  }

  // Wall-clock + process-CPU-time benchmark of the float->half conversion.
  {
    static const iree_benchmark_def_t benchmark_def = {
        .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
                 IREE_BENCHMARK_FLAG_USE_REAL_TIME,
        .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
        .minimum_duration_ns = 0,
        .iteration_count = 0,
        .run = iree_f2h_ieee_benchmark,
        .user_data = NULL,
    };
    iree_benchmark_register(IREE_SV("iree_f2h_ieee"), &benchmark_def);
  }

  iree_benchmark_run_specified();
  return 0;
}
diff --git a/runtime/src/iree/builtins/device/tools/libdevice_test.cc b/runtime/src/iree/builtins/device/tools/libdevice_test.cc
new file mode 100644
index 0000000..adeed0a
--- /dev/null
+++ b/runtime/src/iree/builtins/device/tools/libdevice_test.cc
@@ -0,0 +1,22 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cstring>
+
+#include "iree/base/api.h"
+#include "iree/builtins/device/device.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
TEST(LibDeviceTest, iree_h2f_ieee) {
  // Just ensuring that the code links.
  // 0x3400 is the binary16 bit pattern for 0.25 (sign 0, exponent -2,
  // mantissa 0), so the conversion is exact.
  EXPECT_EQ(0.25f, iree_h2f_ieee(0x3400));
}

TEST(LibDeviceTest, iree_f2h_ieee) {
  // Just ensuring that the code links.
  // Inverse of the case above: 0.25f narrows exactly to 0x3400.
  EXPECT_EQ(0x3400, iree_f2h_ieee(0.25f));
}
diff --git a/runtime/src/iree/builtins/musl/BUILD b/runtime/src/iree/builtins/musl/BUILD
new file mode 100644
index 0000000..f27d209
--- /dev/null
+++ b/runtime/src/iree/builtins/musl/BUILD
@@ -0,0 +1,11 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
# Package marker for the musl-derived builtin library; the embedded bitcode
# targets live in bin/ and the offline build glue in the Makefile_*.iree files.
package(
    default_visibility = ["//visibility:public"],
    features = ["layering_check"],
    licenses = ["notice"],  # Apache 2.0
)
diff --git a/runtime/src/iree/builtins/musl/CMakeLists.txt b/runtime/src/iree/builtins/musl/CMakeLists.txt
new file mode 100644
index 0000000..8da1a73
--- /dev/null
+++ b/runtime/src/iree/builtins/musl/CMakeLists.txt
@@ -0,0 +1,13 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/builtins/musl/BUILD                                         #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/musl/Makefile_wasm32.iree b/runtime/src/iree/builtins/musl/Makefile_wasm32.iree
new file mode 100644
index 0000000..b79444b
--- /dev/null
+++ b/runtime/src/iree/builtins/musl/Makefile_wasm32.iree
@@ -0,0 +1,35 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
# Wraps the upstream musl build to emit textual LLVM IR (wasm32) for a subset
# of sources. MUSL_DIR must be provided by the caller (see bin/build.sh).
musldir=${MUSL_DIR}
include $(musldir)/Makefile

# Subset of musl sources needed by IREE's embedded executables.
IREE_BASE_SRCS = $(addprefix $(srcdir)/, \
	src/fenv/fenv.c \
	src/math/ceilf.c \
	src/math/floorf.c \
	src/math/fmaf.c \
	src/math/fmodf.c \
	src/math/powf.c \
	src/math/expf.c \
	src/math/powf_data.c \
	src/math/exp2f_data.c \
	src/math/__math_invalidf.c \
	src/math/__math_oflowf.c \
	src/math/__math_uflowf.c \
	src/math/__math_xflowf.c)
IREE_BASE_LLS = $(patsubst $(srcdir)/%,%.ll,$(basename $(IREE_BASE_SRCS)))
# NOTE(review): IREE_BASE_BCS is never referenced below — confirm it can go.
IREE_BASE_BCS = $(patsubst $(srcdir)/%,%.bc,$(basename $(IREE_BASE_SRCS)))
IREE_LL_FILES = $(addprefix obj/, $(IREE_BASE_LLS))
# -disable-llvm-passes: emit unoptimized IR so bin/build.sh can run opt -O3
# and post-process the output deterministically.
IREE_CFLAGS=-Xclang -disable-llvm-passes -fno-ident -fvisibility=hidden -target wasm32
LL_CMD = $(CC) $(CFLAGS_ALL) $(IREE_CFLAGS) -S -emit-llvm -o $@ -c $<

obj/%.ll: $(musldir)/%.c obj/include/bits/alltypes.h
	$(LL_CMD)

# Entry point invoked by bin/build.sh ("make -f Makefile_wasm32.iree iree").
iree: $(IREE_LL_FILES)
	$(info $$IREE_BASE_SRCS is [${IREE_BASE_SRCS}])
	$(info $$IREE_LL_FILES is [${IREE_LL_FILES}])
diff --git a/runtime/src/iree/builtins/musl/Makefile_wasm64.iree b/runtime/src/iree/builtins/musl/Makefile_wasm64.iree
new file mode 100644
index 0000000..5e3d956
--- /dev/null
+++ b/runtime/src/iree/builtins/musl/Makefile_wasm64.iree
@@ -0,0 +1,35 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
# Wraps the upstream musl build to emit textual LLVM IR (wasm64) for a subset
# of sources. MUSL_DIR must be provided by the caller (see bin/build.sh).
# Kept in sync with Makefile_wasm32.iree except for the -target flag.
musldir=${MUSL_DIR}
include $(musldir)/Makefile

# Subset of musl sources needed by IREE's embedded executables.
IREE_BASE_SRCS = $(addprefix $(srcdir)/, \
	src/fenv/fenv.c \
	src/math/ceilf.c \
	src/math/floorf.c \
	src/math/fmaf.c \
	src/math/fmodf.c \
	src/math/powf.c \
	src/math/expf.c \
	src/math/powf_data.c \
	src/math/exp2f_data.c \
	src/math/__math_invalidf.c \
	src/math/__math_oflowf.c \
	src/math/__math_uflowf.c \
	src/math/__math_xflowf.c)
IREE_BASE_LLS = $(patsubst $(srcdir)/%,%.ll,$(basename $(IREE_BASE_SRCS)))
# NOTE(review): IREE_BASE_BCS is never referenced below — confirm it can go.
IREE_BASE_BCS = $(patsubst $(srcdir)/%,%.bc,$(basename $(IREE_BASE_SRCS)))
IREE_LL_FILES = $(addprefix obj/, $(IREE_BASE_LLS))
# -disable-llvm-passes: emit unoptimized IR so bin/build.sh can run opt -O3
# and post-process the output deterministically.
IREE_CFLAGS=-Xclang -disable-llvm-passes -fno-ident -fvisibility=hidden -target wasm64
LL_CMD = $(CC) $(CFLAGS_ALL) $(IREE_CFLAGS) -S -emit-llvm -o $@ -c $<

obj/%.ll: $(musldir)/%.c obj/include/bits/alltypes.h
	$(LL_CMD)

# Entry point invoked by bin/build.sh ("make -f Makefile_wasm64.iree iree").
iree: $(IREE_LL_FILES)
	$(info $$IREE_BASE_SRCS is [${IREE_BASE_SRCS}])
	$(info $$IREE_LL_FILES is [${IREE_LL_FILES}])
diff --git a/runtime/src/iree/builtins/musl/bin/BUILD b/runtime/src/iree/builtins/musl/bin/BUILD
new file mode 100644
index 0000000..d9a2529
--- /dev/null
+++ b/runtime/src/iree/builtins/musl/bin/BUILD
@@ -0,0 +1,28 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")

package(
    default_visibility = ["//visibility:public"],
    features = ["layering_check"],
    licenses = ["notice"],  # Apache 2.0
)

# Embeds the prebuilt musl bitcode files (produced offline by build.sh in this
# directory) as C data for linking into the compiler binary.
c_embed_data(
    name = "libmusl",
    srcs = [
        "libmusl_wasm32_generic.bc",
        "libmusl_wasm64_generic.bc",
    ],
    c_file_output = "libmusl.c",
    flatten = True,
    h_file_output = "libmusl.h",
    identifier = "iree_builtins_libmusl",
    deps = [
        "//runtime/src:runtime_defines",
    ],
)
diff --git a/runtime/src/iree/builtins/musl/bin/CMakeLists.txt b/runtime/src/iree/builtins/musl/bin/CMakeLists.txt
new file mode 100644
index 0000000..433fd58
--- /dev/null
+++ b/runtime/src/iree/builtins/musl/bin/CMakeLists.txt
@@ -0,0 +1,31 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/builtins/musl/bin/BUILD                                     #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_c_embed_data(
+  NAME
+    libmusl
+  SRCS
+    "libmusl_wasm32_generic.bc"
+    "libmusl_wasm64_generic.bc"
+  DEPS
+
+  C_FILE_OUTPUT
+    "libmusl.c"
+  H_FILE_OUTPUT
+    "libmusl.h"
+  IDENTIFIER
+    "iree_builtins_libmusl"
+  FLATTEN
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/musl/bin/build.sh b/runtime/src/iree/builtins/musl/bin/build.sh
new file mode 100755
index 0000000..c024088
--- /dev/null
+++ b/runtime/src/iree/builtins/musl/bin/build.sh
@@ -0,0 +1,80 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Example command line:
+#   LLVM_AS=/usr/bin/llvm-as \
+#   LLVM_LINK=/usr/bin/llvm-link \
+#   CLANG=/usr/bin/clang-13 \
#   ./runtime/src/iree/builtins/musl/bin/build.sh
+
set -x
set -e

# Resolve the source/build trees first: the LLVM tool defaults below are
# derived from IREE_BUILD_DIR, so it must have a value before those defaults
# are expanded. (Previously the tool defaults referenced ${IREE_BUILD_DIR}
# before it was assigned, producing paths like "/third_party/..." whenever the
# variable was not exported by the caller.)
IREE_SRC_DIR="$(git rev-parse --show-toplevel)"
IREE_BUILD_DIR="${IREE_BUILD_DIR:-${IREE_SRC_DIR?}/../build}"

# Toolchain locations; each may be overridden via the environment.
CLANG="${CLANG:-clang}"
CLANGXX="${CLANGXX:-$(which clang++)}"
LLVM_AS="${LLVM_AS:-${IREE_BUILD_DIR}/third_party/llvm-project/llvm/bin/llvm-as}"
LLVM_LINK="${LLVM_LINK:-${IREE_BUILD_DIR}/third_party/llvm-project/llvm/bin/llvm-link}"
LLVM_OPT="${LLVM_OPT:-${IREE_BUILD_DIR}/third_party/llvm-project/llvm/bin/opt}"

# Output (.bc) directory and makefile location relative to this script.
SCRIPT_DIR="$(realpath `dirname $0`)"
OUT="${SCRIPT_DIR?}/"
SRC="${SCRIPT_DIR?}/.."
+
+function make_arch_bc {
+  local ARCH=$1
+  local FEATURES=$2
+  local FILE_BASENAME="${OUT}/libmusl_${ARCH}_${FEATURES}"
+  local MUSL_MAKEFILE="${SCRIPT_DIR?}/../Makefile_${ARCH}.iree"
+
+  # Generate IR with 32-bit target.
+  MUSL_DIR=${IREE_SRC_DIR?}/third_party/musl
+  cd ${MUSL_DIR}
+  rm -rf obj/
+  CC=${CLANG?} CXX=${CLANGXX?} ./configure
+  MUSL_DIR=${MUSL_DIR} make -f ${MUSL_MAKEFILE} iree
+  MUSL_LL_FILES=`find obj/ -name "*.ll"`
+  cp ${MUSL_LL_FILES?} ${OUT}
+  rm ${MUSL_LL_FILES?}
+  cd ${SCRIPT_DIR?}
+
+  ALL_LL_FILES=`find ${OUT} -name "*.ll"`
+
+  cd ${OUT}
+  # git restore ${FILE_BASENAME}.bc
+  for file in ${ALL_LL_FILES}
+  do
+    # Run full LLVM optimizations.
+    # TODO(benvanik): defer this? Some of these opts may not be portable/safe.
+    ${LLVM_OPT?} ${file} -O3 -S -o ${file}.opt.ll
+
+    # Clang adds a bunch of bad attributes and host-specific information that we
+    # don't want (so we get at least somewhat deterministic builds).
+    sed -i 's/^;.*$//' "${file}.opt.ll"
+    sed -i 's/^source_filename.*$//' "${file}.opt.ll"
+    sed -i 's/^target datalayout.*$//' "${file}.opt.ll"
+    sed -i 's/^target triple.*$//' "${file}.opt.ll"
+    sed -i 's/^\(attributes #[0-9]* = {\).*$/\1 inlinehint }/' "${file}.opt.ll"
+
+    # Generate a binary bitcode file embedded into the compiler binary.
+    # NOTE: we do this from stdin so that the filename on the user's system is not
+    # embedded in the bitcode file (making it non-deterministic).
+    cat ${file}.opt.ll | ${LLVM_AS?} -o=${file}.opt.ll.bc
+    rm ${file}.opt.ll
+  done
+  rm ${ALL_LL_FILES}
+
+  ALL_BC_FILES=`ls *.ll.bc`
+  ${LLVM_LINK?} ${ALL_BC_FILES} -o ${FILE_BASENAME}.bc
+  rm ${ALL_BC_FILES}
+}
+
+make_arch_bc "wasm32" "generic" \
+    --target=wasm32
+make_arch_bc "wasm64" "generic" \
+    --target=wasm64
diff --git a/runtime/src/iree/builtins/musl/bin/libmusl_wasm32_generic.bc b/runtime/src/iree/builtins/musl/bin/libmusl_wasm32_generic.bc
new file mode 100644
index 0000000..02ecf00
--- /dev/null
+++ b/runtime/src/iree/builtins/musl/bin/libmusl_wasm32_generic.bc
Binary files differ
diff --git a/runtime/src/iree/builtins/musl/bin/libmusl_wasm64_generic.bc b/runtime/src/iree/builtins/musl/bin/libmusl_wasm64_generic.bc
new file mode 100644
index 0000000..3e6adcf
--- /dev/null
+++ b/runtime/src/iree/builtins/musl/bin/libmusl_wasm64_generic.bc
Binary files differ
diff --git a/runtime/src/iree/hal/BUILD b/runtime/src/iree/hal/BUILD
new file mode 100644
index 0000000..97d9270
--- /dev/null
+++ b/runtime/src/iree/hal/BUILD
@@ -0,0 +1,90 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# HAL (Hardware Abstraction Layer).
+# Subdirectories contain implementations for different hardware and
+# software backends.
+
load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library", "iree_runtime_cc_test")

package(
    default_visibility = ["//visibility:public"],
    features = ["layering_check"],
    licenses = ["notice"],  # Apache 2.0
)

#===------------------------------------------------------------------------===#
# Public API
#===------------------------------------------------------------------------===#

# Core HAL library: api.h is the single public header; the per-concept .h/.c
# pairs in srcs stay internal to the library.
iree_runtime_cc_library(
    name = "hal",
    srcs = [
        "allocator.c",
        "allocator.h",
        "allocator_heap.c",
        "buffer.c",
        "buffer.h",
        "buffer_heap.c",
        "buffer_heap_impl.h",
        "buffer_view.c",
        "buffer_view.h",
        "buffer_view_util.c",
        "buffer_view_util.h",
        "command_buffer.c",
        "command_buffer.h",
        "command_buffer_validation.c",
        "command_buffer_validation.h",
        "descriptor_set.c",
        "descriptor_set.h",
        "descriptor_set_layout.c",
        "descriptor_set_layout.h",
        "detail.h",
        "device.c",
        "device.h",
        "driver.c",
        "driver.h",
        "driver_registry.c",
        "driver_registry.h",
        "event.c",
        "event.h",
        "executable.c",
        "executable.h",
        "executable_cache.c",
        "executable_cache.h",
        "executable_layout.c",
        "executable_layout.h",
        "resource.h",
        "semaphore.c",
        "semaphore.h",
        "string_util.c",
        "string_util.h",
    ],
    hdrs = [
        "api.h",
    ],
    visibility = ["//visibility:public"],
    deps = [
        "//runtime/src/iree/base",
        "//runtime/src/iree/base:core_headers",
        "//runtime/src/iree/base:tracing",
        "//runtime/src/iree/base/internal",
        "//runtime/src/iree/base/internal:synchronization",
    ],
)

# Unit tests for the string parsing/formatting helpers in string_util.h.
iree_runtime_cc_test(
    name = "string_util_test",
    srcs = ["string_util_test.cc"],
    deps = [
        ":hal",
        "//runtime/src/iree/base",
        "//runtime/src/iree/base:cc",
        "//runtime/src/iree/base/internal:span",
        "//runtime/src/iree/testing:gtest",
        "//runtime/src/iree/testing:gtest_main",
    ],
)
diff --git a/runtime/src/iree/hal/CMakeLists.txt b/runtime/src/iree/hal/CMakeLists.txt
new file mode 100644
index 0000000..a4e2bbb
--- /dev/null
+++ b/runtime/src/iree/hal/CMakeLists.txt
@@ -0,0 +1,81 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/BUILD                                                   #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    hal
+  HDRS
+    "api.h"
+  SRCS
+    "allocator.c"
+    "allocator.h"
+    "allocator_heap.c"
+    "buffer.c"
+    "buffer.h"
+    "buffer_heap.c"
+    "buffer_heap_impl.h"
+    "buffer_view.c"
+    "buffer_view.h"
+    "buffer_view_util.c"
+    "buffer_view_util.h"
+    "command_buffer.c"
+    "command_buffer.h"
+    "command_buffer_validation.c"
+    "command_buffer_validation.h"
+    "descriptor_set.c"
+    "descriptor_set.h"
+    "descriptor_set_layout.c"
+    "descriptor_set_layout.h"
+    "detail.h"
+    "device.c"
+    "device.h"
+    "driver.c"
+    "driver.h"
+    "driver_registry.c"
+    "driver_registry.h"
+    "event.c"
+    "event.h"
+    "executable.c"
+    "executable.h"
+    "executable_cache.c"
+    "executable_cache.h"
+    "executable_layout.c"
+    "executable_layout.h"
+    "resource.h"
+    "semaphore.c"
+    "semaphore.h"
+    "string_util.c"
+    "string_util.h"
+  DEPS
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::synchronization
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    string_util_test
+  SRCS
+    "string_util_test.cc"
+  DEPS
+    ::hal
+    iree::base
+    iree::base::cc
+    iree::base::internal::span
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/README.md b/runtime/src/iree/hal/README.md
new file mode 100644
index 0000000..f50befa
--- /dev/null
+++ b/runtime/src/iree/hal/README.md
@@ -0,0 +1,18 @@
+# IREE Hardware Abstraction Layer (HAL)
+
+The IREE HAL expresses a low-level abstraction over modern compute APIs like
+Vulkan (CPUs count too!). Each implementation of the HAL interface can:
+
+* Enumerate and query devices and their capabilities
+* Define executable code that runs on the device
+* Allocate unified or discrete memory and provide cache control
+* Organize work into sequences for deferred submission
+* Provide explicit synchronization primitives for ordering submissions
+
+Refer to IREE's
+[presentations and talks](../../../../README.md#presentations-and-talks) for
+details.
+
+## Testing
+
+See the [cts/ folder](./cts/) for the HAL Conformance Test Suite.
diff --git a/runtime/src/iree/hal/allocator.c b/runtime/src/iree/hal/allocator.c
new file mode 100644
index 0000000..daa418b
--- /dev/null
+++ b/runtime/src/iree/hal/allocator.c
@@ -0,0 +1,176 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/allocator.h"
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/resource.h"
+
+IREE_API_EXPORT iree_status_t iree_hal_allocator_statistics_format(
+    const iree_hal_allocator_statistics_t* statistics,
+    iree_string_builder_t* builder) {
+#if IREE_STATISTICS_ENABLE
+
+  // This could be prettier/have nice number formatting/etc.
+
+  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+      builder,
+      "  HOST_LOCAL: %12" PRIdsz "B peak / %12" PRIdsz
+      "B allocated / %12" PRIdsz "B freed / %12" PRIdsz "B live\n",
+      statistics->host_bytes_peak, statistics->host_bytes_allocated,
+      statistics->host_bytes_freed,
+      (statistics->host_bytes_allocated - statistics->host_bytes_freed)));
+
+  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+      builder,
+      "DEVICE_LOCAL: %12" PRIdsz "B peak / %12" PRIdsz
+      "B allocated / %12" PRIdsz "B freed / %12" PRIdsz "B live\n",
+      statistics->device_bytes_peak, statistics->device_bytes_allocated,
+      statistics->device_bytes_freed,
+      (statistics->device_bytes_allocated - statistics->device_bytes_freed)));
+
+#else
+  // No-op when disabled.
+#endif  // IREE_STATISTICS_ENABLE
+  return iree_ok_status();
+}
+
+#define _VTABLE_DISPATCH(allocator, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(allocator, iree_hal_allocator, method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(allocator);
+
+IREE_API_EXPORT iree_allocator_t iree_hal_allocator_host_allocator(
+    const iree_hal_allocator_t* IREE_RESTRICT allocator) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  return _VTABLE_DISPATCH(allocator, host_allocator)(allocator);
+}
+
+IREE_API_EXPORT
+iree_status_t iree_hal_allocator_trim(
+    iree_hal_allocator_t* IREE_RESTRICT allocator) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = _VTABLE_DISPATCH(allocator, trim)(allocator);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT void iree_hal_allocator_query_statistics(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  memset(out_statistics, 0, sizeof(*out_statistics));
+  IREE_STATISTICS({
+    _VTABLE_DISPATCH(allocator, query_statistics)(allocator, out_statistics);
+  });
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_allocator_statistics_fprint(
+    FILE* file, iree_hal_allocator_t* IREE_RESTRICT allocator) {
+#if IREE_STATISTICS_ENABLE
+  iree_hal_allocator_statistics_t statistics;
+  iree_hal_allocator_query_statistics(allocator, &statistics);
+
+  iree_string_builder_t builder;
+  iree_string_builder_initialize(iree_hal_allocator_host_allocator(allocator),
+                                 &builder);
+
+  // TODO(benvanik): query identifier for the allocator so we can denote which
+  // device is being reported.
+  iree_status_t status = iree_string_builder_append_cstring(
+      &builder, "[[ iree_hal_allocator_t memory statistics ]]\n");
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_allocator_statistics_format(&statistics, &builder);
+  }
+
+  if (iree_status_is_ok(status)) {
+    fprintf(file, "%.*s", (int)iree_string_builder_size(&builder),
+            iree_string_builder_buffer(&builder));
+  }
+
+  iree_string_builder_deinitialize(&builder);
+  return status;
+#else
+  // No-op.
+  return iree_ok_status();
+#endif  // IREE_STATISTICS_ENABLE
+}
+
+IREE_API_EXPORT iree_hal_buffer_compatibility_t
+iree_hal_allocator_query_compatibility(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_buffer_params_t params, iree_device_size_t allocation_size) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  iree_hal_buffer_params_canonicalize(&params);
+  return _VTABLE_DISPATCH(allocator, query_compatibility)(allocator, &params,
+                                                          allocation_size);
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_allocator_allocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_buffer_params_t params, iree_device_size_t allocation_size,
+    iree_const_byte_span_t initial_data,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  *out_buffer = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_buffer_params_canonicalize(&params);
+  iree_status_t status = _VTABLE_DISPATCH(allocator, allocate_buffer)(
+      allocator, &params, allocation_size, initial_data, out_buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT void iree_hal_allocator_deallocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT allocator, iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  _VTABLE_DISPATCH(allocator, deallocate_buffer)(allocator, buffer);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_allocator_import_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_buffer_params_t params,
+    iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
+    iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(external_buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  *out_buffer = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_buffer_params_canonicalize(&params);
+  iree_status_t status = _VTABLE_DISPATCH(allocator, import_buffer)(
+      allocator, &params, external_buffer, release_callback, out_buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_allocator_export_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_buffer_t* IREE_RESTRICT buffer,
+    iree_hal_external_buffer_type_t requested_type,
+    iree_hal_external_buffer_flags_t requested_flags,
+    iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(buffer);
+  IREE_ASSERT_ARGUMENT(out_external_buffer);
+  memset(out_external_buffer, 0, sizeof(*out_external_buffer));
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = _VTABLE_DISPATCH(allocator, export_buffer)(
+      allocator, buffer, requested_type, requested_flags, out_external_buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/allocator.h b/runtime/src/iree/hal/allocator.h
new file mode 100644
index 0000000..92c11df
--- /dev/null
+++ b/runtime/src/iree/hal/allocator.h
@@ -0,0 +1,538 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_ALLOCATOR_H_
+#define IREE_HAL_ALLOCATOR_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// A bitmap indicating logical device queue affinity.
+// Used to direct submissions to specific device queues or locate memory nearby
+// where it will be used. The meaning of the bits in the bitmap is
+// implementation-specific: a bit may represent a logical queue in an underlying
+// API such as a VkQueue or a physical queue such as a discrete virtual device.
+//
+// Bitwise operations can be performed on affinities; for example AND'ing two
+// affinities will produce the intersection and OR'ing will produce the union.
+// This enables just-in-time selection as a command buffer could be made
+// available to some set of queues when recorded and then AND'ed with an actual
+// set of queues to execute on during submission.
+typedef uint64_t iree_hal_queue_affinity_t;
+
+// Specifies that any queue may be selected.
+#define IREE_HAL_QUEUE_AFFINITY_ANY ((iree_hal_queue_affinity_t)(-1))
+
+// Parameters defining how a buffer should be allocated.
+//
+// Designed to be zero-initialized: any field with a 0 value will be assigned
+// a default as indicated in the field description.
+//
+// For ergonomics when used from C++ w/o named initializers the first field is
+// the most commonly used so that it can be initialized by location:
+//    some_fn(..., {IREE_HAL_BUFFER_USAGE_FOO}, ...)
+typedef struct iree_hal_buffer_params_t {
+  // Specifies the usage allowed by HAL APIs and aids in memory placement.
+  // Devices may have different memory types for different usage and require
+  // the intended usage to be declared upon allocation. It's always best to
+  // limit the allowed usage bits to precisely what the actual usage will be to
+  // avoid additional copies, synchronization, and expensive emulation.
+  //
+  // If 0 then the usage will default to all usage modes.
+  iree_hal_buffer_usage_t usage;
+
+  // Specifies the access allowed to the memory via the HAL APIs.
+  // For example, if the IREE_HAL_MEMORY_ACCESS_WRITE bit is not set then any
+  // API call that would write to the memory will fail (such as
+  // iree_hal_command_buffer_update_buffer). This does not limit any untrusted
+  // dispatch or external use of the buffer and should not be treated as a
+  // memory protection mechanism.
+  //
+  // If 0 then the access will be set as IREE_HAL_MEMORY_ACCESS_ALL.
+  iree_hal_memory_access_t access;
+
+  // Specifies the memory type properties used for selecting a memory space.
+  // This should often be IREE_HAL_MEMORY_TYPE_OPTIMAL to allow the allocator
+  // to place the allocation based on usage bits but can be specified if the
+  // exact memory type must be used for compatibility with external code.
+  //
+  // If 0 then the type will be set as IREE_HAL_MEMORY_TYPE_OPTIMAL.
+  iree_hal_memory_type_t type;
+
+  // Queue affinity bitmap indicating which queues may access this buffer.
+  // For NUMA devices this can be used to more tightly scope the allocation to
+  // particular device memory and provide better pool placement. When a device
+  // supports peering or replication the affinity bitmap will be used to choose
+  // which subdevices require configuration.
+  //
+  // If 0 then the buffer will be available on any queue as if
+  // IREE_HAL_QUEUE_AFFINITY_ANY was specified.
+  iree_hal_queue_affinity_t queue_affinity;
+
+  // Minimum alignment, in bytes, of the resulting allocation.
+  // The actual alignment may be any value greater-than-or-equal-to this value.
+  //
+  // If 0 then the alignment will be decided by the allocator based on optimal
+  // device parameters.
+  iree_device_size_t min_alignment;
+} iree_hal_buffer_params_t;
+
+// Canonicalizes |params| fields when zero initialization is used.
+static inline void iree_hal_buffer_params_canonicalize(
+    iree_hal_buffer_params_t* params) {
+  if (!params->usage) {
+    params->usage =
+        IREE_HAL_BUFFER_USAGE_DISPATCH | IREE_HAL_BUFFER_USAGE_TRANSFER;
+  }
+  if (!params->access) {
+    params->access = IREE_HAL_MEMORY_ACCESS_ALL;
+  }
+  if (!params->queue_affinity) {
+    params->queue_affinity = IREE_HAL_QUEUE_AFFINITY_ANY;
+  }
+}
+
+// Returns |params| with the given |usage| bits OR'ed in.
+static inline iree_hal_buffer_params_t iree_hal_buffer_params_with_usage(
+    const iree_hal_buffer_params_t params, iree_hal_buffer_usage_t usage) {
+  iree_hal_buffer_params_t result = params;
+  if (!result.usage) {
+    result.usage =
+        IREE_HAL_BUFFER_USAGE_DISPATCH | IREE_HAL_BUFFER_USAGE_TRANSFER;
+  }
+  result.usage |= usage;
+  return result;
+}
+
+// A bitfield indicating compatible behavior for buffers in an allocator.
+enum iree_hal_buffer_compatibility_bits_t {
+  // Indicates (in the absence of other bits) the buffer is not compatible with
+  // the allocator or device at all. Any attempts to use the buffer for any
+  // usage will fail. This will happen if the buffer is device-local to another
+  // device without peering and not visible to the host.
+  IREE_HAL_BUFFER_COMPATIBILITY_NONE = 0u,
+
+  // Indicates that the allocator could allocate new buffers of this type and
+  // usage natively. Allocations with the queried parameters may still fail due
+  // to runtime conditions (out of memory, fragmentation, etc) but are otherwise
+  // valid.
+  IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE = 1u << 0,
+
+  // Indicates that the allocator could import external buffers of this type and
+  // usage natively. Imports may fail due to runtime conditions (out of handles,
+  // invalid pointer address spaces/page parameters, etc) but are otherwise
+  // valid.
+  IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE = 1u << 1,
+
+  // Indicates that the allocator could export external buffers of this type and
+  // usage natively. Exports may fail due to runtime conditions (out of handles,
+  // etc) but are otherwise valid.
+  IREE_HAL_BUFFER_COMPATIBILITY_EXPORTABLE = 1u << 2,
+
+  // Indicates that the buffer can be used as a transfer source or target on
+  // a device queue (such as being the source or target of a DMA operation,
+  // etc). If not set then the buffer may still be usable for
+  // iree_hal_buffer_map_copy but not with queued operations.
+  IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER = 1u << 10,
+
+  // Indicates that the buffer can be used as an input/output to a dispatch.
+  IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH = 1u << 11,
+};
+typedef uint32_t iree_hal_buffer_compatibility_t;
+
+// Defines the type of an external buffer handle.
+// Each type may only be usable in a subset of implementations and platforms and
+// may even vary based on the runtime device properties or buffer instance.
+//
+// See the notes on each type for requirements; compatibility often depends on
+// the specific handle and trying to import/export is the most reliable way to
+// check for support.
+//
+// The Vulkan documentation on external memory covers a lot of the design
+// decisions made here:
+// https://www.khronos.org/registry/vulkan/specs/1.3-extensions/man/html/VK_KHR_external_memory.html
+typedef enum iree_hal_external_buffer_type_e {
+  IREE_HAL_EXTERNAL_BUFFER_TYPE_NONE = 0,
+
+  // A host pointer allocated from an external allocator.
+  // An imported/exported buffer does not own a reference to the memory and the
+  // caller is responsible for ensuring the memory remains live for as long as
+  // the iree_hal_buffer_t referencing it.
+  //
+  // CUDA:
+  //  Requires device support.
+  //  Uses cuMemHostRegister / cuMemHostUnregister.
+  //  The memory type specified on import/export determines the required device
+  //  capabilities.
+  //
+  // Vulkan:
+  //  Requires VK_EXT_external_memory_host.
+  //  Requires device support.
+  //  Uses VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT.
+  IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION = 1,
+
+  // A driver/device-specific POSIX file descriptor handle.
+  // The handle supports dup, dup2, close, and transport using the SCM_RIGHTS
+  // control message. All other usage with system APIs is undefined.
+  // An imported/exported handle owns a reference to the underlying allocator
+// memory. May only be shared with the same underlying driver and device.
+  //
+  // CUDA:
+  //  Requires device support.
+  //  Uses CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD.
+  //
+  // Vulkan:
+  //  Requires device support.
+  //  Uses VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT.
+  IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD = 2,
+
+  // A driver/device-specific Win32 HANDLE.
+  // The handle supports DuplicateHandle, CompareObjectHandles, CloseHandle, and
+  // Get/SetHandleInformation. All other usage with system APIs is undefined.
+  // An imported/exported handle owns a reference to the underlying allocator
+  // memory. Must only be shared with the same underlying driver and device.
+  //
+  // CUDA:
+  //  Requires device support.
+  //  Uses CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32.
+  //
+  // Vulkan:
+  //  Requires device support.
+  //  Uses VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT.
+  IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32 = 3,
+
+  // TODO(benvanik): additional memory types:
+  //  shared memory fd (shmem)/mapped file
+  //  VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT
+  //  VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID
+} iree_hal_external_buffer_type_t;
+
+// Flags for controlling iree_hal_external_buffer_t implementation details.
+enum iree_hal_external_buffer_flag_bits_t {
+  IREE_HAL_EXTERNAL_BUFFER_FLAG_NONE = 0u,
+};
+typedef uint32_t iree_hal_external_buffer_flags_t;
+
+// Handle to a typed external buffer.
+// This is a non-owning reference and the underlying allocation must remain
+// valid for as long as the handle is in use. Some buffer types support internal
+// reference counting but in general ownership remains with the caller.
+// See the type enum for more information.
+typedef struct iree_hal_external_buffer_t {
+  // Type of the resource used to interpret the handle.
+  iree_hal_external_buffer_type_t type;
+  // Flags indicating buffer compatibility.
+  iree_hal_external_buffer_flags_t flags;
+  // Total size of the external resource in bytes.
+  iree_device_size_t size;
+  union {
+    // IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION
+    struct {
+      // Host memory pointer.
+      void* ptr;
+    } host_allocation;
+    // IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD
+    struct {
+      int fd;
+    } opaque_fd;
+    // IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32
+    struct {
+      void* handle;
+    } opaque_win32;
+  } handle;
+} iree_hal_external_buffer_t;
+
+typedef void(IREE_API_PTR* iree_hal_buffer_release_fn_t)(
+    void* user_data, iree_hal_buffer_t* buffer);
+
+// A callback issued when a buffer is released.
+typedef struct {
+  // Callback function pointer.
+  iree_hal_buffer_release_fn_t fn;
+  // User data passed to the callback function. Unowned.
+  void* user_data;
+} iree_hal_buffer_release_callback_t;
+
+// Returns a no-op buffer release callback that implies that no cleanup is
+// required.
+static inline iree_hal_buffer_release_callback_t
+iree_hal_buffer_release_callback_null(void) {
+  iree_hal_buffer_release_callback_t callback = {NULL, NULL};
+  return callback;
+}
+
+//===----------------------------------------------------------------------===//
+// Statistics/reporting
+//===----------------------------------------------------------------------===//
+
+// Aggregate allocation statistics.
+typedef struct iree_hal_allocator_statistics_t {
+#if IREE_STATISTICS_ENABLE
+  iree_device_size_t host_bytes_peak;
+  iree_device_size_t host_bytes_allocated;
+  iree_device_size_t host_bytes_freed;
+  iree_device_size_t device_bytes_peak;
+  iree_device_size_t device_bytes_allocated;
+  iree_device_size_t device_bytes_freed;
+  // TODO(benvanik): mapping information (discarded, mapping ranges,
+  //                 flushed/invalidated, etc).
+#else
+  int reserved;
+#endif  // IREE_STATISTICS_ENABLE
+} iree_hal_allocator_statistics_t;
+
+// Formats allocator statistics as a pretty-printed multi-line string.
+IREE_API_EXPORT iree_status_t iree_hal_allocator_statistics_format(
+    const iree_hal_allocator_statistics_t* statistics,
+    iree_string_builder_t* builder);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_allocator_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_allocator_t iree_hal_allocator_t;
+
+// Retains the given |allocator| for the caller.
+IREE_API_EXPORT void iree_hal_allocator_retain(iree_hal_allocator_t* allocator);
+
+// Releases the given |allocator| from the caller.
+IREE_API_EXPORT void iree_hal_allocator_release(
+    iree_hal_allocator_t* allocator);
+
+// Returns the host allocator used for allocating host objects.
+IREE_API_EXPORT iree_allocator_t iree_hal_allocator_host_allocator(
+    const iree_hal_allocator_t* IREE_RESTRICT allocator);
+
+// Trims cached/unused pooled buffers, if any.
+IREE_API_EXPORT
+iree_status_t iree_hal_allocator_trim(
+    iree_hal_allocator_t* IREE_RESTRICT allocator);
+
+// Queries the aggregate statistics from the allocator since creation.
+// Thread-safe; statistics are captured at the time the call is made.
+//
+// NOTE: statistics may be compiled out in some configurations and this call
+// will become a memset(0).
+IREE_API_EXPORT void iree_hal_allocator_query_statistics(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics);
+
+// Prints the current allocation statistics of |allocator| to |file|.
+// No-op if statistics are not enabled (IREE_STATISTICS_ENABLE).
+IREE_API_EXPORT iree_status_t iree_hal_allocator_statistics_fprint(
+    FILE* file, iree_hal_allocator_t* IREE_RESTRICT allocator);
+
+// Returns a bitmask indicating what operations with buffers of the given type
+// are available on the allocator.
+//
+// For buffers allocated from the given allocator it's expected that the result
+// will always be non-NONE. For buffers that originate from another allocator
+// there may be limited support for cross-device usage.
+//
+// Returning IREE_HAL_BUFFER_COMPATIBILITY_NONE indicates that the buffer must
+// be transferred externally into a buffer compatible with the device the
+// allocator services.
+IREE_API_EXPORT iree_hal_buffer_compatibility_t
+iree_hal_allocator_query_compatibility(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_buffer_params_t params, iree_device_size_t allocation_size);
+
+// Allocates a buffer from the allocator.
+// If |initial_data| is provided then the bytes will be copied into the device
+// buffer. To avoid the copy when device-accessible constant data is used prefer
+// iree_hal_allocator_import_buffer when available.
+//
+// The memory type of the buffer returned may differ from the requested value
+// if the device can provide more functionality; for example, if requesting
+// IREE_HAL_MEMORY_TYPE_HOST_VISIBLE but the memory is really host cached you
+// may get a buffer back with IREE_HAL_MEMORY_TYPE_HOST_VISIBLE |
+// IREE_HAL_MEMORY_TYPE_HOST_CACHED. The only requirement is that the buffer
+// satisfy the required bits.
+//
+// |out_buffer| must be released by the caller.
+// Fails if the memory type requested for the given usage cannot be serviced.
+// Callers can use iree_hal_allocator_query_compatibility to decide their memory
+// use strategy.
+IREE_API_EXPORT iree_status_t iree_hal_allocator_allocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_buffer_params_t params, iree_device_size_t allocation_size,
+    iree_const_byte_span_t initial_data, iree_hal_buffer_t** out_buffer);
+
+// TODO(benvanik): iree_hal_allocator_query_external_buffer_compatibility to
+// check for support without needing an external buffer already. There's a few
+// usage modes and it'd be nice to have a single function for it to keep the
+// interface slimmer.
+
+// Imports an externally-owned |external_buffer| to a buffer handle.
+// See notes on iree_hal_external_buffer_type_t for ownership information;
+// depending on the type the caller may be responsible for ensuring the external
+// buffer remains valid for the duration it is in use by the returned
+// iree_hal_buffer_t. The returned external buffer may only be usable with the
+// same driver/device.
+//
+// iree_hal_allocator_query_compatibility can be used to query whether a
+// buffer can be imported when using the given memory type and usage. A
+// compatibility result containing IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE
+// means the import _may_ succeed however if the pointer/page range is not in a
+// supported mode (no read access, etc) this call will fail with
+// IREE_STATUS_OUT_OF_RANGE.
+//
+// An optional |release_callback| can be provided to allow the caller to listen
+// for when the underlying resource is no longer in use by the HAL. This can
+// be used to perform lifetime management or flushing.
+//
+// |out_buffer| must be released by the caller.
+// Fails with IREE_STATUS_UNAVAILABLE if the allocator cannot import the buffer
+// into the given memory type. This may be due to unavailable device/platform
+// capabilities or the memory type the external buffer was allocated with.
+IREE_API_EXPORT iree_status_t iree_hal_allocator_import_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_buffer_params_t params,
+    iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
+    iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** out_buffer);
+
+// Exports an allocator-owned |buffer| to an external buffer handle.
+// See the notes on iree_hal_external_buffer_type_t for ownership information.
+// Upon successful return the caller is responsible for any required lifetime
+// management on the external buffer which may include ensuring that the
+// provided source |buffer| is kept live. The returned external buffer may only
+// be usable with the same driver/device.
+//
+// Fails with IREE_STATUS_UNAVAILABLE if the allocator cannot export the buffer
+// into the external type. This may be due to unavailable device/platform
+// capabilities or the memory type the buffer was allocated with.
+IREE_API_EXPORT iree_status_t iree_hal_allocator_export_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_buffer_t* IREE_RESTRICT buffer,
+    iree_hal_external_buffer_type_t requested_type,
+    iree_hal_external_buffer_flags_t requested_flags,
+    iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_heap_allocator_t
+//===----------------------------------------------------------------------===//
+
+// Creates a host-local heap allocator that can be used when buffers are
+// required that will not interact with a real hardware device (such as those
+// used in file IO or tests). Buffers allocated with this will not be compatible
+// with real device allocators and will likely incur a copy (or failure) if
+// used.
+//
+// The buffers created from the allocator will use |host_allocator| for their
+// metadata and |data_allocator| for their device storage allocations. If the
+// two are the same the buffers will be allocated in a single flat slab.
+IREE_API_EXPORT iree_status_t iree_hal_allocator_create_heap(
+    iree_string_view_t identifier, iree_allocator_t data_allocator,
+    iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_allocator_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_allocator_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_allocator_t* IREE_RESTRICT allocator);
+
+  iree_allocator_t(IREE_API_PTR* host_allocator)(
+      const iree_hal_allocator_t* IREE_RESTRICT allocator);
+
+  iree_status_t(IREE_API_PTR* trim)(
+      iree_hal_allocator_t* IREE_RESTRICT allocator);
+
+  void(IREE_API_PTR* query_statistics)(
+      iree_hal_allocator_t* IREE_RESTRICT allocator,
+      iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics);
+
+  iree_hal_buffer_compatibility_t(IREE_API_PTR* query_compatibility)(
+      iree_hal_allocator_t* IREE_RESTRICT allocator,
+      const iree_hal_buffer_params_t* IREE_RESTRICT params,
+      iree_device_size_t allocation_size);
+
+  iree_status_t(IREE_API_PTR* allocate_buffer)(
+      iree_hal_allocator_t* IREE_RESTRICT allocator,
+      const iree_hal_buffer_params_t* IREE_RESTRICT params,
+      iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
+      iree_hal_buffer_t** IREE_RESTRICT out_buffer);
+
+  void(IREE_API_PTR* deallocate_buffer)(
+      iree_hal_allocator_t* IREE_RESTRICT allocator,
+      iree_hal_buffer_t* IREE_RESTRICT buffer);
+
+  iree_status_t(IREE_API_PTR* import_buffer)(
+      iree_hal_allocator_t* IREE_RESTRICT allocator,
+      const iree_hal_buffer_params_t* IREE_RESTRICT params,
+      iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
+      iree_hal_buffer_release_callback_t release_callback,
+      iree_hal_buffer_t** IREE_RESTRICT out_buffer);
+
+  iree_status_t(IREE_API_PTR* export_buffer)(
+      iree_hal_allocator_t* IREE_RESTRICT allocator,
+      iree_hal_buffer_t* IREE_RESTRICT buffer,
+      iree_hal_external_buffer_type_t requested_type,
+      iree_hal_external_buffer_flags_t requested_flags,
+      iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer);
+} iree_hal_allocator_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_allocator_vtable_t);
+
+IREE_API_EXPORT void iree_hal_allocator_destroy(
+    iree_hal_allocator_t* IREE_RESTRICT allocator);
+
+IREE_API_EXPORT void iree_hal_allocator_deallocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT allocator,
+    iree_hal_buffer_t* IREE_RESTRICT buffer);
+
+#if IREE_STATISTICS_ENABLE
+
+// Records a buffer allocation to |statistics|.
+static inline void iree_hal_allocator_statistics_record_alloc(
+    iree_hal_allocator_statistics_t* statistics,
+    iree_hal_memory_type_t memory_type, iree_device_size_t allocation_size) {
+  if (iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_HOST_LOCAL)) {
+    statistics->host_bytes_allocated += allocation_size;
+    statistics->host_bytes_peak =
+        iree_max(statistics->host_bytes_peak, statistics->host_bytes_allocated -
+                                                  statistics->host_bytes_freed);
+  } else {
+    statistics->device_bytes_allocated += allocation_size;
+    statistics->device_bytes_peak = iree_max(
+        statistics->device_bytes_peak,
+        statistics->device_bytes_allocated - statistics->device_bytes_freed);
+  }
+}
+
+// Records a buffer deallocation to |statistics|.
+static inline void iree_hal_allocator_statistics_record_free(
+    iree_hal_allocator_statistics_t* statistics,
+    iree_hal_memory_type_t memory_type, iree_device_size_t allocation_size) {
+  if (iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_HOST_LOCAL)) {
+    statistics->host_bytes_freed += allocation_size;
+  } else {
+    statistics->device_bytes_freed += allocation_size;
+  }
+}
+
+#else
+#define iree_hal_allocator_statistics_record_alloc(...)
+#define iree_hal_allocator_statistics_record_free(...)
+#endif  // IREE_STATISTICS_ENABLE
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_ALLOCATOR_H_
diff --git a/runtime/src/iree/hal/allocator_heap.c b/runtime/src/iree/hal/allocator_heap.c
new file mode 100644
index 0000000..7b53c27
--- /dev/null
+++ b/runtime/src/iree/hal/allocator_heap.c
@@ -0,0 +1,238 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/allocator.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/buffer_heap_impl.h"
+#include "iree/hal/resource.h"
+
+// Heap allocator state: services all buffer requests from host memory.
+// |data_allocator| provides buffer contents while |host_allocator| provides
+// wrapper/metadata storage (the two may differ, e.g. for pinned memory).
+typedef struct iree_hal_heap_allocator_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  iree_allocator_t data_allocator;
+  iree_string_view_t identifier;
+  IREE_STATISTICS(iree_hal_heap_allocator_statistics_t statistics;)
+} iree_hal_heap_allocator_t;
+
+static const iree_hal_allocator_vtable_t iree_hal_heap_allocator_vtable;
+
+// Downcasts the opaque base allocator to the heap allocator implementation.
+// Valid only for allocators created via iree_hal_allocator_create_heap.
+iree_hal_heap_allocator_t* iree_hal_heap_allocator_cast(
+    iree_hal_allocator_t* IREE_RESTRICT base_value) {
+  return (iree_hal_heap_allocator_t*)base_value;
+}
+
+// Creates a heap allocator identified by |identifier| for debugging/tracing.
+// The allocator struct and a copy of the identifier string are stored in a
+// single |host_allocator| allocation; buffer contents will later come from
+// |data_allocator|. On success |out_allocator| owns a new reference.
+IREE_API_EXPORT iree_status_t iree_hal_allocator_create_heap(
+    iree_string_view_t identifier, iree_allocator_t data_allocator,
+    iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) {
+  IREE_ASSERT_ARGUMENT(out_allocator);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  *out_allocator = NULL;
+
+  iree_hal_heap_allocator_t* allocator = NULL;
+  // Single allocation holds the struct followed by the identifier characters.
+  iree_host_size_t total_size =
+      iree_sizeof_struct(*allocator) + identifier.size;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&allocator);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_heap_allocator_vtable,
+                                 &allocator->resource);
+    allocator->host_allocator = host_allocator;
+    allocator->data_allocator = data_allocator;
+    // Copy the identifier into the trailing storage after the struct.
+    iree_string_view_append_to_buffer(
+        identifier, &allocator->identifier,
+        (char*)allocator + iree_sizeof_struct(*allocator));
+
+    IREE_STATISTICS({
+      // All start initialized to zero.
+      iree_slim_mutex_initialize(&allocator->statistics.mutex);
+    });
+
+    *out_allocator = (iree_hal_allocator_t*)allocator;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys the heap allocator, releasing its single backing allocation.
+static void iree_hal_heap_allocator_destroy(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  iree_hal_heap_allocator_t* allocator =
+      iree_hal_heap_allocator_cast(base_allocator);
+  // Capture the host allocator before freeing the struct that stores it.
+  iree_allocator_t host_allocator = allocator->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_STATISTICS(iree_slim_mutex_deinitialize(&allocator->statistics.mutex));
+
+  iree_allocator_free(host_allocator, allocator);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the host allocator used for allocator/buffer metadata.
+static iree_allocator_t iree_hal_heap_allocator_host_allocator(
+    const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  iree_hal_heap_allocator_t* allocator =
+      (iree_hal_heap_allocator_t*)base_allocator;
+  return allocator->host_allocator;
+}
+
+// No-op: the heap allocator does no pooling or caching so there is nothing
+// to trim; always succeeds.
+static iree_status_t iree_hal_heap_allocator_trim(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  (void)base_allocator;  // unused; present to satisfy the vtable signature
+  return iree_ok_status();
+}
+
+// Copies a consistent snapshot of allocation statistics to |out_statistics|.
+// Compiles to nothing when IREE_STATISTICS_ENABLE is off.
+static void iree_hal_heap_allocator_query_statistics(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
+  IREE_STATISTICS({
+    iree_hal_heap_allocator_t* allocator =
+        iree_hal_heap_allocator_cast(base_allocator);
+    // Lock so the copied counters are mutually consistent.
+    iree_slim_mutex_lock(&allocator->statistics.mutex);
+    memcpy(out_statistics, &allocator->statistics.base,
+           sizeof(*out_statistics));
+    iree_slim_mutex_unlock(&allocator->statistics.mutex);
+  });
+}
+
+// Reports what the heap allocator can do with a buffer of the given
+// |params|/|allocation_size| without actually allocating it.
+static iree_hal_buffer_compatibility_t
+iree_hal_heap_allocator_query_compatibility(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size) {
+  // All buffers can be allocated on the heap and all heap-accessible buffers
+  // can be imported/exported.
+  iree_hal_buffer_compatibility_t compatibility =
+      IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE |
+      IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE |
+      IREE_HAL_BUFFER_COMPATIBILITY_EXPORTABLE;
+
+  // Buffers can only be used on the queue if they are device visible.
+  // This is not a strict requirement of heap buffers but matches devices that
+  // have discrete memory spaces (remoting/sandboxed, GPUs, etc) and makes it
+  // much easier to find issues of buffer definition with local devices that
+  // will cause issues when used with real devices.
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+    if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+      compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
+    }
+    if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_DISPATCH)) {
+      compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH;
+    }
+  }
+
+  return compatibility;
+}
+
+// Widens |params| to the set required by heap-backed buffers (host-visible,
+// mappable, transferable). Returns the adjusted copy; |params| is unchanged.
+static iree_hal_buffer_params_t iree_hal_heap_allocator_make_compatible(
+    const iree_hal_buffer_params_t* IREE_RESTRICT params) {
+  iree_hal_buffer_params_t result = *params;
+
+  // Always ensure we are host-visible.
+  result.type |= IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
+
+  // Host currently uses mapping to copy buffers, which is done a lot.
+  // We could probably remove this mutation by preventing copies in those cases.
+  // TODO(benvanik): check if transfer is still required for DMA copy source.
+  result.usage |=
+      IREE_HAL_BUFFER_USAGE_MAPPING | IREE_HAL_BUFFER_USAGE_TRANSFER;
+
+  return result;
+}
+
+// Allocates a heap buffer of |allocation_size| bytes, optionally copying in
+// |initial_data|. On success |out_buffer| owns a new reference.
+static iree_status_t iree_hal_heap_allocator_allocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  iree_hal_heap_allocator_t* allocator =
+      iree_hal_heap_allocator_cast(base_allocator);
+
+  // Coerce options into those required for use by heap-based devices.
+  iree_hal_buffer_params_t compat_params =
+      iree_hal_heap_allocator_make_compatible(params);
+
+  // Allocate the buffer (both the wrapper and the contents).
+  // Statistics pointer is NULL when stats are compiled out.
+  iree_hal_heap_allocator_statistics_t* statistics = NULL;
+  IREE_STATISTICS(statistics = &allocator->statistics);
+  iree_hal_buffer_t* buffer = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_heap_buffer_create(
+      base_allocator, statistics, &compat_params, allocation_size, initial_data,
+      allocator->data_allocator, allocator->host_allocator, &buffer));
+
+  *out_buffer = buffer;
+  return iree_ok_status();
+}
+
+// Returns a buffer to the allocator; without pooling this just destroys it.
+static void iree_hal_heap_allocator_deallocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
+  // We don't do any pooling yet.
+  // TODO(benvanik): move stats tracking here.
+  iree_hal_buffer_destroy(base_buffer);
+}
+
+// Wraps an externally-owned host allocation as a HAL buffer without copying.
+// Only IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION is supported;
+// |release_callback| is notified when the wrapper no longer needs the memory.
+static iree_status_t iree_hal_heap_allocator_import_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
+    iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  if (external_buffer->type != IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION) {
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "external buffer type not supported");
+  }
+
+  // Coerce options into those required for use by heap-based devices.
+  iree_hal_buffer_params_t compat_params =
+      iree_hal_heap_allocator_make_compatible(params);
+
+  return iree_hal_heap_buffer_wrap(
+      base_allocator, compat_params.type, compat_params.access,
+      compat_params.usage, external_buffer->size,
+      iree_make_byte_span(external_buffer->handle.host_allocation.ptr,
+                          external_buffer->size),
+      release_callback, out_buffer);
+}
+
+// Exposes |buffer| contents as an external host allocation by mapping it.
+// The returned pointer is unowned and remains valid only while |buffer|
+// lives; only HOST_ALLOCATION export is supported.
+static iree_status_t iree_hal_heap_allocator_export_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_t* IREE_RESTRICT buffer,
+    iree_hal_external_buffer_type_t requested_type,
+    iree_hal_external_buffer_flags_t requested_flags,
+    iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
+  if (requested_type != IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION) {
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "external buffer type not supported");
+  }
+
+  // Map the entire buffer persistently, if possible.
+  iree_hal_buffer_mapping_t mapping;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+      buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
+      iree_hal_buffer_allowed_access(buffer), 0, IREE_WHOLE_BUFFER, &mapping));
+
+  // Note that the returned pointer is unowned.
+  out_external_buffer->type = requested_type;
+  out_external_buffer->flags = requested_flags;
+  out_external_buffer->size = mapping.contents.data_length;
+  out_external_buffer->handle.host_allocation.ptr = mapping.contents.data;
+  return iree_ok_status();
+}
+
+// Virtual dispatch table binding the heap allocator implementation to the
+// generic iree_hal_allocator_t interface.
+static const iree_hal_allocator_vtable_t iree_hal_heap_allocator_vtable = {
+    .destroy = iree_hal_heap_allocator_destroy,
+    .host_allocator = iree_hal_heap_allocator_host_allocator,
+    .trim = iree_hal_heap_allocator_trim,
+    .query_statistics = iree_hal_heap_allocator_query_statistics,
+    .query_compatibility = iree_hal_heap_allocator_query_compatibility,
+    .allocate_buffer = iree_hal_heap_allocator_allocate_buffer,
+    .deallocate_buffer = iree_hal_heap_allocator_deallocate_buffer,
+    .import_buffer = iree_hal_heap_allocator_import_buffer,
+    .export_buffer = iree_hal_heap_allocator_export_buffer,
+};
diff --git a/runtime/src/iree/hal/api.h b/runtime/src/iree/hal/api.h
new file mode 100644
index 0000000..0ca7171
--- /dev/null
+++ b/runtime/src/iree/hal/api.h
@@ -0,0 +1,30 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// See iree/base/api.h for documentation on the API conventions used.
+
+#ifndef IREE_HAL_API_H_
+#define IREE_HAL_API_H_
+
+#include "iree/hal/allocator.h"              // IWYU pragma: export
+#include "iree/hal/buffer.h"                 // IWYU pragma: export
+#include "iree/hal/buffer_view.h"            // IWYU pragma: export
+#include "iree/hal/buffer_view_util.h"       // IWYU pragma: export
+#include "iree/hal/command_buffer.h"         // IWYU pragma: export
+#include "iree/hal/descriptor_set.h"         // IWYU pragma: export
+#include "iree/hal/descriptor_set_layout.h"  // IWYU pragma: export
+#include "iree/hal/device.h"                 // IWYU pragma: export
+#include "iree/hal/driver.h"                 // IWYU pragma: export
+#include "iree/hal/driver_registry.h"        // IWYU pragma: export
+#include "iree/hal/event.h"                  // IWYU pragma: export
+#include "iree/hal/executable.h"             // IWYU pragma: export
+#include "iree/hal/executable_cache.h"       // IWYU pragma: export
+#include "iree/hal/executable_layout.h"      // IWYU pragma: export
+#include "iree/hal/resource.h"               // IWYU pragma: export
+#include "iree/hal/semaphore.h"              // IWYU pragma: export
+#include "iree/hal/string_util.h"            // IWYU pragma: export
+
+#endif  // IREE_HAL_API_H_
diff --git a/runtime/src/iree/hal/buffer.c b/runtime/src/iree/hal/buffer.c
new file mode 100644
index 0000000..3af50a6
--- /dev/null
+++ b/runtime/src/iree/hal/buffer.c
@@ -0,0 +1,880 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/buffer.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/allocator.h"
+#include "iree/hal/detail.h"
+
+#define _VTABLE_DISPATCH(buffer, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(buffer, iree_hal_buffer, method_name)
+
+//===----------------------------------------------------------------------===//
+// String utils
+//===----------------------------------------------------------------------===//
+
+// Formats |value| as a |-separated bitfield string (e.g. "HOST_LOCAL|...").
+// |out_temp| provides the backing storage; combined masks are listed first so
+// they take precedence over their component bits.
+IREE_API_EXPORT iree_string_view_t iree_hal_memory_type_format(
+    iree_hal_memory_type_t value, iree_bitfield_string_temp_t* out_temp) {
+  static const iree_bitfield_string_mapping_t mappings[] = {
+      // Combined:
+      {IREE_HAL_MEMORY_TYPE_HOST_LOCAL, IREE_SVL("HOST_LOCAL")},
+      {IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, IREE_SVL("DEVICE_LOCAL")},
+      // Separate:
+      {IREE_HAL_MEMORY_TYPE_TRANSIENT, IREE_SVL("TRANSIENT")},
+      {IREE_HAL_MEMORY_TYPE_HOST_VISIBLE, IREE_SVL("HOST_VISIBLE")},
+      {IREE_HAL_MEMORY_TYPE_HOST_COHERENT, IREE_SVL("HOST_COHERENT")},
+      {IREE_HAL_MEMORY_TYPE_HOST_CACHED, IREE_SVL("HOST_CACHED")},
+      {IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE, IREE_SVL("DEVICE_VISIBLE")},
+  };
+  return iree_bitfield_format_inline(value, mappings, IREE_ARRAYSIZE(mappings),
+                                     out_temp);
+}
+
+// Formats a memory-access bitfield for diagnostics; see
+// iree_hal_memory_type_format for the formatting conventions.
+IREE_API_EXPORT iree_string_view_t iree_hal_memory_access_format(
+    iree_hal_memory_access_t value, iree_bitfield_string_temp_t* out_temp) {
+  static const iree_bitfield_string_mapping_t mappings[] = {
+      // Combined:
+      {IREE_HAL_MEMORY_ACCESS_ALL, IREE_SVL("ALL")},
+      {IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE, IREE_SVL("DISCARD_WRITE")},
+      // Separate:
+      {IREE_HAL_MEMORY_ACCESS_READ, IREE_SVL("READ")},
+      {IREE_HAL_MEMORY_ACCESS_WRITE, IREE_SVL("WRITE")},
+      {IREE_HAL_MEMORY_ACCESS_DISCARD, IREE_SVL("DISCARD")},
+      {IREE_HAL_MEMORY_ACCESS_MAY_ALIAS, IREE_SVL("MAY_ALIAS")},
+      {IREE_HAL_MEMORY_ACCESS_ANY, IREE_SVL("ANY")},
+  };
+  return iree_bitfield_format_inline(value, mappings, IREE_ARRAYSIZE(mappings),
+                                     out_temp);
+}
+
+// Formats a buffer-usage bitfield for diagnostics; see
+// iree_hal_memory_type_format for the formatting conventions.
+IREE_API_EXPORT iree_string_view_t iree_hal_buffer_usage_format(
+    iree_hal_buffer_usage_t value, iree_bitfield_string_temp_t* out_temp) {
+  static const iree_bitfield_string_mapping_t mappings[] = {
+      // Combined:
+      // Separate:
+      {IREE_HAL_BUFFER_USAGE_CONSTANT, IREE_SVL("CONSTANT")},
+      {IREE_HAL_BUFFER_USAGE_TRANSFER, IREE_SVL("TRANSFER")},
+      {IREE_HAL_BUFFER_USAGE_MAPPING, IREE_SVL("MAPPING")},
+      {IREE_HAL_BUFFER_USAGE_DISPATCH, IREE_SVL("DISPATCH")},
+  };
+  return iree_bitfield_format_inline(value, mappings, IREE_ARRAYSIZE(mappings),
+                                     out_temp);
+}
+
+//===----------------------------------------------------------------------===//
+// Subspan indirection buffer
+//===----------------------------------------------------------------------===//
+
+static const iree_hal_buffer_vtable_t iree_hal_subspan_buffer_vtable;
+
+// Initializes caller-provided storage |out_buffer| as a subspan view over
+// [byte_offset, byte_offset+byte_length) of |allocated_buffer|. The view
+// inherits the parent's memory type/access/usage and retains the parent
+// (released by iree_hal_subspan_buffer_deinitialize).
+IREE_API_EXPORT void iree_hal_subspan_buffer_initialize(
+    iree_hal_buffer_t* allocated_buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, iree_hal_allocator_t* device_allocator,
+    iree_allocator_t host_allocator, iree_hal_buffer_t* out_buffer) {
+  IREE_ASSERT_ARGUMENT(allocated_buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_hal_buffer_initialize(host_allocator, device_allocator, allocated_buffer,
+                             allocated_buffer->allocation_size, byte_offset,
+                             byte_length, allocated_buffer->memory_type,
+                             allocated_buffer->allowed_access,
+                             allocated_buffer->allowed_usage,
+                             &iree_hal_subspan_buffer_vtable, out_buffer);
+}
+
+// Deinitializes a subspan previously set up with
+// iree_hal_subspan_buffer_initialize, dropping the parent reference. The
+// NULLed pointer also disables further unmap dispatch (see unmap_range).
+IREE_API_EXPORT void iree_hal_subspan_buffer_deinitialize(
+    iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  iree_hal_buffer_release(buffer->allocated_buffer);
+  buffer->allocated_buffer = NULL;
+}
+
+// Allocates (from |host_allocator|) a new subspan buffer referencing
+// [byte_offset, byte_offset+byte_length) of |allocated_buffer|. The subspan
+// inherits the parent's memory type/access/usage and retains the parent.
+// On success |out_buffer| owns a new reference; on failure it is NULL.
+IREE_API_EXPORT iree_status_t iree_hal_subspan_buffer_create(
+    iree_hal_buffer_t* allocated_buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, iree_hal_allocator_t* device_allocator,
+    iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(allocated_buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  *out_buffer = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_buffer_t* buffer = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_buffer_initialize(
+        host_allocator, device_allocator, allocated_buffer,
+        allocated_buffer->allocation_size, byte_offset, byte_length,
+        allocated_buffer->memory_type, allocated_buffer->allowed_access,
+        allocated_buffer->allowed_usage, &iree_hal_subspan_buffer_vtable,
+        buffer);
+    *out_buffer = buffer;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  // Fixed: previously returned iree_ok_status() unconditionally, which
+  // swallowed (and leaked) the allocation-failure status while reporting
+  // success with *out_buffer left NULL.
+  return status;
+}
+
+// Destroys a heap-allocated subspan buffer, releasing the parent reference
+// and then freeing the subspan storage itself.
+static void iree_hal_subspan_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  // Capture the host allocator before freeing the struct that stores it.
+  iree_allocator_t host_allocator = base_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_buffer_release(base_buffer->allocated_buffer);
+  iree_allocator_free(host_allocator, base_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Maps a range by forwarding to the parent buffer's implementation; offsets
+// passed in are already resolved into the parent allocation by the caller.
+static iree_status_t iree_hal_subspan_buffer_map_range(
+    iree_hal_buffer_t* buffer, iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping) {
+  return _VTABLE_DISPATCH(buffer->allocated_buffer, map_range)(
+      buffer->allocated_buffer, mapping_mode, memory_access, local_byte_offset,
+      local_byte_length, mapping);
+}
+
+// Unmaps a range via the parent buffer. The NULL check makes unmapping after
+// iree_hal_subspan_buffer_deinitialize (which NULLs allocated_buffer) a no-op.
+static iree_status_t iree_hal_subspan_buffer_unmap_range(
+    iree_hal_buffer_t* buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) {
+  if (!buffer->allocated_buffer) return iree_ok_status();
+  return _VTABLE_DISPATCH(buffer->allocated_buffer, unmap_range)(
+      buffer->allocated_buffer, local_byte_offset, local_byte_length, mapping);
+}
+
+// Invalidates a mapped range via the parent buffer's implementation.
+static iree_status_t iree_hal_subspan_buffer_invalidate_range(
+    iree_hal_buffer_t* buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  return _VTABLE_DISPATCH(buffer->allocated_buffer, invalidate_range)(
+      buffer->allocated_buffer, local_byte_offset, local_byte_length);
+}
+
+// Flushes a mapped range via the parent buffer's implementation.
+static iree_status_t iree_hal_subspan_buffer_flush_range(
+    iree_hal_buffer_t* buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  return _VTABLE_DISPATCH(buffer->allocated_buffer, flush_range)(
+      buffer->allocated_buffer, local_byte_offset, local_byte_length);
+}
+
+// Vtable for subspan views; all range ops forward to the parent buffer.
+static const iree_hal_buffer_vtable_t iree_hal_subspan_buffer_vtable = {
+    .recycle = iree_hal_buffer_recycle,
+    .destroy = iree_hal_subspan_buffer_destroy,
+    .map_range = iree_hal_subspan_buffer_map_range,
+    .unmap_range = iree_hal_subspan_buffer_unmap_range,
+    .invalidate_range = iree_hal_subspan_buffer_invalidate_range,
+    .flush_range = iree_hal_subspan_buffer_flush_range,
+};
+
+//===----------------------------------------------------------------------===//
+// iree_hal_buffer_t
+//===----------------------------------------------------------------------===//
+
+// Initializes the common fields of |buffer|, which may be caller-provided
+// storage or a freshly-allocated struct. |allocated_buffer| may equal
+// |buffer| for root allocations; otherwise it is retained as the parent.
+IREE_API_EXPORT void iree_hal_buffer_initialize(
+    iree_allocator_t host_allocator, iree_hal_allocator_t* device_allocator,
+    iree_hal_buffer_t* allocated_buffer, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage,
+    const iree_hal_buffer_vtable_t* vtable, iree_hal_buffer_t* buffer) {
+  iree_hal_resource_initialize(vtable, &buffer->resource);
+  buffer->host_allocator = host_allocator;
+  buffer->device_allocator = device_allocator;
+  buffer->allocated_buffer = allocated_buffer;
+  buffer->allocation_size = allocation_size;
+  buffer->byte_offset = byte_offset;
+  buffer->byte_length = byte_length;
+  buffer->memory_type = memory_type;
+  buffer->allowed_access = allowed_access;
+  buffer->allowed_usage = allowed_usage;
+
+  // Retain the base allocated buffer if it's unique from the buffer we are
+  // initializing.
+  if (allocated_buffer != buffer) {
+    iree_hal_buffer_retain(buffer->allocated_buffer);
+  }
+}
+
+// Returns |buffer| to its originating device allocator (which may pool it)
+// or destroys it directly when no device allocator is attached. NULL-safe.
+IREE_API_EXPORT void iree_hal_buffer_recycle(iree_hal_buffer_t* buffer) {
+  if (IREE_LIKELY(buffer)) {
+    IREE_TRACE_ZONE_BEGIN(z0);
+    if (buffer->device_allocator) {
+      iree_hal_allocator_deallocate_buffer(buffer->device_allocator, buffer);
+    } else {
+      iree_hal_buffer_destroy(buffer);
+    }
+    IREE_TRACE_ZONE_END(z0);
+  }
+}
+
+// Immediately destroys |buffer| via its vtable, bypassing any allocator
+// pooling (contrast with iree_hal_buffer_recycle). NULL-safe.
+IREE_API_EXPORT void iree_hal_buffer_destroy(iree_hal_buffer_t* buffer) {
+  if (IREE_LIKELY(buffer)) {
+    IREE_HAL_VTABLE_DISPATCH(buffer, iree_hal_buffer, destroy)
+    (buffer);
+  }
+}
+
+// Adds a reference to |buffer|; pair with iree_hal_buffer_release. NULL-safe.
+IREE_API_EXPORT void iree_hal_buffer_retain(iree_hal_buffer_t* buffer) {
+  if (IREE_LIKELY(buffer)) {
+    iree_atomic_ref_count_inc(&((iree_hal_resource_t*)(buffer))->ref_count);
+  }
+}
+
+// Drops a reference to |buffer|, recycling it when the last reference is
+// released (dec returns the previous count, so == 1 means it hit zero).
+// NULL-safe.
+IREE_API_EXPORT void iree_hal_buffer_release(iree_hal_buffer_t* buffer) {
+  if (IREE_LIKELY(buffer) &&
+      iree_atomic_ref_count_dec(&((iree_hal_resource_t*)(buffer))->ref_count) ==
+          1) {
+    iree_hal_buffer_recycle(buffer);
+  }
+}
+
+// Validates that |actual_memory_type| contains every bit of
+// |expected_memory_type|, returning PERMISSION_DENIED (with a formatted
+// message when IREE_STATUS_MODE is enabled) otherwise.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_validate_memory_type(
+    iree_hal_memory_type_t actual_memory_type,
+    iree_hal_memory_type_t expected_memory_type) {
+  if (IREE_UNLIKELY(
+          !iree_all_bits_set(actual_memory_type, expected_memory_type))) {
+#if IREE_STATUS_MODE
+    // Missing one or more bits.
+    iree_bitfield_string_temp_t temp0, temp1;
+    iree_string_view_t actual_memory_type_str =
+        iree_hal_memory_type_format(actual_memory_type, &temp0);
+    iree_string_view_t expected_memory_type_str =
+        iree_hal_memory_type_format(expected_memory_type, &temp1);
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "buffer memory type is not compatible with the requested operation; "
+        "buffer has %.*s, operation requires %.*s",
+        (int)actual_memory_type_str.size, actual_memory_type_str.data,
+        (int)expected_memory_type_str.size, expected_memory_type_str.data);
+#else
+    return iree_status_from_code(IREE_STATUS_PERMISSION_DENIED);
+#endif  // IREE_STATUS_MODE
+  }
+  return iree_ok_status();
+}
+
+// Validates that |allowed_memory_access| permits |required_memory_access|.
+// ANY bypasses all checks; a request with neither READ nor WRITE is an
+// INVALID_ARGUMENT; missing permissions yield PERMISSION_DENIED.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_validate_access(
+    iree_hal_memory_access_t allowed_memory_access,
+    iree_hal_memory_access_t required_memory_access) {
+  if (iree_all_bits_set(required_memory_access, IREE_HAL_MEMORY_ACCESS_ANY)) {
+    return iree_ok_status();
+  }
+  if (IREE_UNLIKELY(!iree_any_bit_set(
+          required_memory_access,
+          IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE))) {
+    // No actual access bits defined.
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "memory access must specify one or more of _READ or _WRITE");
+  } else if (IREE_UNLIKELY(!iree_all_bits_set(allowed_memory_access,
+                                              required_memory_access))) {
+#if IREE_STATUS_MODE
+    // Bits must match exactly.
+    iree_bitfield_string_temp_t temp0, temp1;
+    iree_string_view_t allowed_memory_access_str =
+        iree_hal_memory_access_format(allowed_memory_access, &temp0);
+    iree_string_view_t required_memory_access_str =
+        iree_hal_memory_access_format(required_memory_access, &temp1);
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "buffer does not support the requested access "
+        "type; buffer allows %.*s, operation requires %.*s",
+        (int)allowed_memory_access_str.size, allowed_memory_access_str.data,
+        (int)required_memory_access_str.size, required_memory_access_str.data);
+#else
+    return iree_status_from_code(IREE_STATUS_PERMISSION_DENIED);
+#endif  // IREE_STATUS_MODE
+  }
+  return iree_ok_status();
+}
+
+// Validates that |allowed_usage| contains every bit of |required_usage|,
+// returning PERMISSION_DENIED otherwise.
+IREE_API_EXPORT iree_status_t
+iree_hal_buffer_validate_usage(iree_hal_buffer_usage_t allowed_usage,
+                               iree_hal_buffer_usage_t required_usage) {
+  if (IREE_UNLIKELY(!iree_all_bits_set(allowed_usage, required_usage))) {
+#if IREE_STATUS_MODE
+    // Missing one or more bits.
+    iree_bitfield_string_temp_t temp0, temp1;
+    iree_string_view_t allowed_usage_str =
+        iree_hal_buffer_usage_format(allowed_usage, &temp0);
+    iree_string_view_t required_usage_str =
+        iree_hal_buffer_usage_format(required_usage, &temp1);
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "requested usage was not specified when the buffer was allocated; "
+        "buffer allows %.*s, operation requires %.*s",
+        (int)allowed_usage_str.size, allowed_usage_str.data,
+        (int)required_usage_str.size, required_usage_str.data);
+#else
+    return iree_status_from_code(IREE_STATUS_PERMISSION_DENIED);
+#endif  // IREE_STATUS_MODE
+  }
+  return iree_ok_status();
+}
+
+// Validates that [byte_offset, byte_offset+byte_length) lies within
+// |buffer|'s logical byte range; zero-length ranges are always valid as long
+// as the offset itself is in range. Returns OUT_OF_RANGE on violation.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_validate_range(
+    iree_hal_buffer_t* buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length) {
+  // Check if the start of the range runs off the end of the buffer.
+  if (IREE_UNLIKELY(byte_offset > iree_hal_buffer_byte_length(buffer))) {
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "attempted to access an address off the end of the valid buffer range "
+        "(offset=%" PRIdsz ", length=%" PRIdsz ", buffer byte_length=%" PRIdsz
+        ")",
+        byte_offset, byte_length, iree_hal_buffer_byte_length(buffer));
+  }
+
+  if (byte_length == 0) {
+    // Fine to have a zero length.
+    return iree_ok_status();
+  }
+
+  // Check if the end runs over the allocation.
+  // NOTE(review): byte_offset + byte_length could wrap for adversarial
+  // values near the iree_device_size_t max -- confirm callers sanitize.
+  iree_device_size_t end = byte_offset + byte_length;
+  if (IREE_UNLIKELY(end > iree_hal_buffer_byte_length(buffer))) {
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "attempted to access an address outside of the valid buffer range "
+        "(offset=%" PRIdsz ", length=%" PRIdsz ", end(inc)=%" PRIdsz
+        ", buffer byte_length=%" PRIdsz ")",
+        byte_offset, byte_length, end - 1, iree_hal_buffer_byte_length(buffer));
+  }
+
+  return iree_ok_status();
+}
+
+// Resolves a caller-relative (|offset|, |length|) pair into an absolute
+// range within the allocation: adds |base_offset| and expands
+// IREE_WHOLE_BUFFER to the remaining |max_length|. IREE_WHOLE_BUFFER is only
+// allowed when |out_adjusted_length| is provided. On failure the outputs are
+// zeroed and OUT_OF_RANGE/INVALID_ARGUMENT is returned.
+static iree_status_t iree_hal_buffer_calculate_range(
+    iree_device_size_t base_offset, iree_device_size_t max_length,
+    iree_device_size_t offset, iree_device_size_t length,
+    iree_device_size_t* out_adjusted_offset,
+    iree_device_size_t* out_adjusted_length) {
+  // Check if the start of the range runs off the end of the buffer.
+  if (IREE_UNLIKELY(offset > max_length)) {
+    *out_adjusted_offset = 0;
+    if (out_adjusted_length) *out_adjusted_length = 0;
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "attempted to access an address off the end of the valid buffer "
+        "range (offset=%" PRIdsz ", length=%" PRIdsz
+        ", buffer byte_length=%" PRIdsz ")",
+        offset, length, max_length);
+  }
+
+  // Handle length as IREE_WHOLE_BUFFER by adjusting it (if allowed).
+  if (IREE_UNLIKELY(length == IREE_WHOLE_BUFFER) &&
+      IREE_UNLIKELY(!out_adjusted_length)) {
+    *out_adjusted_offset = 0;
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "IREE_WHOLE_BUFFER may only be used with buffer "
+                            "ranges, not external pointer ranges");
+  }
+
+  // Calculate the real ranges adjusted for our region within the allocation.
+  iree_device_size_t adjusted_offset = base_offset + offset;
+  iree_device_size_t adjusted_length =
+      length == IREE_WHOLE_BUFFER ? max_length - offset : length;
+  if (adjusted_length == 0) {
+    // Fine to have a zero length.
+    *out_adjusted_offset = adjusted_offset;
+    if (out_adjusted_length) *out_adjusted_length = adjusted_length;
+    return iree_ok_status();
+  }
+
+  // Check if the end runs over the allocation.
+  // Note: end is inclusive here (offset + length - 1) so the comparison is >=.
+  iree_device_size_t end = offset + adjusted_length - 1;
+  if (IREE_UNLIKELY(end >= max_length)) {
+    *out_adjusted_offset = 0;
+    if (out_adjusted_length) *out_adjusted_length = 0;
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "attempted to access an address outside of the valid buffer "
+        "range (offset=%" PRIdsz ", adjusted_length=%" PRIdsz ", end=%" PRIdsz
+        ", buffer byte_length=%" PRIdsz ")",
+        offset, adjusted_length, end, max_length);
+  }
+
+  *out_adjusted_offset = adjusted_offset;
+  if (out_adjusted_length) *out_adjusted_length = adjusted_length;
+  return iree_ok_status();
+}
+
+// Tests whether two buffer ranges alias. Ranges on different underlying
+// allocations never overlap; otherwise both are resolved into absolute
+// offsets within the shared allocation and compared. Returns DISJOINT,
+// COMPLETE (identical range), or PARTIAL.
+IREE_API_EXPORT iree_hal_buffer_overlap_t iree_hal_buffer_test_overlap(
+    iree_hal_buffer_t* lhs_buffer, iree_device_size_t lhs_offset,
+    iree_device_size_t lhs_length, iree_hal_buffer_t* rhs_buffer,
+    iree_device_size_t rhs_offset, iree_device_size_t rhs_length) {
+  if (iree_hal_buffer_allocated_buffer(lhs_buffer) !=
+      iree_hal_buffer_allocated_buffer(rhs_buffer)) {
+    // Not even the same buffers.
+    return IREE_HAL_BUFFER_OVERLAP_DISJOINT;
+  }
+  // Resolve offsets into the underlying allocation.
+  iree_device_size_t lhs_alloc_offset =
+      iree_hal_buffer_byte_offset(lhs_buffer) + lhs_offset;
+  iree_device_size_t rhs_alloc_offset =
+      iree_hal_buffer_byte_offset(rhs_buffer) + rhs_offset;
+  // Expand IREE_WHOLE_BUFFER to the remainder of each view.
+  iree_device_size_t lhs_alloc_length =
+      lhs_length == IREE_WHOLE_BUFFER
+          ? iree_hal_buffer_byte_length(lhs_buffer) - lhs_offset
+          : lhs_length;
+  iree_device_size_t rhs_alloc_length =
+      rhs_length == IREE_WHOLE_BUFFER
+          ? iree_hal_buffer_byte_length(rhs_buffer) - rhs_offset
+          : rhs_length;
+  // Zero-length ranges never overlap anything.
+  if (!lhs_alloc_length || !rhs_alloc_length) {
+    return IREE_HAL_BUFFER_OVERLAP_DISJOINT;
+  }
+  if (lhs_alloc_offset == rhs_alloc_offset &&
+      lhs_alloc_length == rhs_alloc_length) {
+    return IREE_HAL_BUFFER_OVERLAP_COMPLETE;
+  }
+  // Standard half-open interval intersection test.
+  return lhs_alloc_offset + lhs_alloc_length > rhs_alloc_offset &&
+                 rhs_alloc_offset + rhs_alloc_length > lhs_alloc_offset
+             ? IREE_HAL_BUFFER_OVERLAP_PARTIAL
+             : IREE_HAL_BUFFER_OVERLAP_DISJOINT;
+}
+
+// Returns a buffer referencing [byte_offset, byte_offset+byte_length) of
+// |buffer|. Whole-buffer requests return |buffer| itself (retained); subspans
+// of subspans resolve directly to the root allocation to avoid chains of
+// indirection. On success |out_buffer| owns a new reference.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_subspan(
+    iree_hal_buffer_t* buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, iree_hal_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  *out_buffer = NULL;
+
+  // Fast path: if we are requesting the whole buffer (usually via
+  // IREE_WHOLE_BUFFER) then we can just return the buffer itself.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_calculate_range(
+      iree_hal_buffer_byte_offset(buffer), iree_hal_buffer_byte_length(buffer),
+      byte_offset, byte_length, &byte_offset, &byte_length));
+  if (byte_offset == 0 && byte_length == iree_hal_buffer_byte_length(buffer)) {
+    iree_hal_buffer_retain(buffer);
+    *out_buffer = buffer;
+    return iree_ok_status();
+  }
+
+  // To avoid heavy nesting of subspans that just add indirection we go to the
+  // parent buffer directly. If we wanted better accounting (to track where
+  // buffers came from) we'd want to avoid this but I'm not sure that's worth
+  // the super deep indirection that could arise.
+  iree_hal_buffer_t* allocated_buffer =
+      iree_hal_buffer_allocated_buffer(buffer);
+  if (allocated_buffer != buffer) {
+    return iree_hal_buffer_subspan(allocated_buffer, byte_offset, byte_length,
+                                   out_buffer);
+  }
+
+  return iree_hal_subspan_buffer_create(buffer, byte_offset, byte_length,
+                                        /*device_allocator=*/NULL,
+                                        buffer->host_allocator, out_buffer);
+}
+
+// Returns the buffer that owns the underlying allocation; this is |buffer|
+// itself unless |buffer| is a subspan view of a larger allocation.
+IREE_API_EXPORT iree_hal_buffer_t* iree_hal_buffer_allocated_buffer(
+    const iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  return buffer->allocated_buffer;
+}
+
+// Returns the total size in bytes of the backing allocation; this may exceed
+// the originally requested size due to device alignment requirements.
+IREE_API_EXPORT iree_device_size_t
+iree_hal_buffer_allocation_size(const iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  return buffer->allocation_size;
+}
+
+// Returns the offset in bytes of this buffer within its allocated buffer
+// (0 when the buffer is not a subspan view).
+IREE_API_EXPORT iree_device_size_t
+iree_hal_buffer_byte_offset(const iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  return buffer->byte_offset;
+}
+
+// Returns the length in bytes of the logical range this buffer exposes.
+IREE_API_EXPORT iree_device_size_t
+iree_hal_buffer_byte_length(const iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  return buffer->byte_length;
+}
+
+// Returns the iree_hal_memory_type_t bits the buffer was allocated with.
+IREE_API_EXPORT
+iree_hal_memory_type_t iree_hal_buffer_memory_type(
+    const iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  return buffer->memory_type;
+}
+
+// Returns the iree_hal_memory_access_t bits permitted on this buffer; these
+// may be stricter than what the underlying allocation supports.
+IREE_API_EXPORT
+iree_hal_memory_access_t iree_hal_buffer_allowed_access(
+    const iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  return buffer->allowed_access;
+}
+
+// Returns the iree_hal_buffer_usage_t bits permitted on this buffer.
+IREE_API_EXPORT
+iree_hal_buffer_usage_t iree_hal_buffer_allowed_usage(
+    const iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  return buffer->allowed_usage;
+}
+
+//===----------------------------------------------------------------------===//
+// Transfer
+//===----------------------------------------------------------------------===//
+
+// Zeroes a range of |buffer| via a scoped host mapping.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_zero(
+    iree_hal_buffer_t* buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length) {
+  // Zeroing is simply a fill with a single 0x00 byte pattern.
+  static const uint8_t zero_pattern = 0;
+  return iree_hal_buffer_map_fill(buffer, byte_offset, byte_length,
+                                  &zero_pattern, sizeof(zero_pattern));
+}
+
+// Fills |byte_length| bytes of |buffer| starting at |byte_offset| by
+// repeating |pattern| (1, 2, or 4 bytes) via a scoped host mapping.
+// |byte_offset| and |byte_length| must be aligned to |pattern_length|;
+// |byte_length| may be IREE_WHOLE_BUFFER to fill the remaining bytes.
+// Flushes the written range when the memory type is not host-coherent.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_fill(
+    iree_hal_buffer_t* buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, const void* pattern,
+    iree_host_size_t pattern_length) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  IREE_ASSERT_ARGUMENT(pattern);
+
+  if (IREE_UNLIKELY(pattern_length != 1 && pattern_length != 2 &&
+                    pattern_length != 4)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "fill patterns must be 1, 2, or 4 bytes (got %zu)",
+                            pattern_length);
+  }
+
+  if (byte_length == 0) {
+    return iree_ok_status();  // No-op.
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_buffer_mapping_t target_mapping = {{0}};
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_buffer_map_range(buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+                                    IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE,
+                                    byte_offset, byte_length, &target_mapping));
+  // Resolve IREE_WHOLE_BUFFER to the actual mapped extent.
+  if (byte_length == IREE_WHOLE_BUFFER) {
+    byte_length = target_mapping.contents.data_length;
+  }
+
+  // The fill is performed with pattern-sized stores so both ends of the
+  // range must land on pattern boundaries.
+  if (IREE_UNLIKELY((byte_offset % pattern_length) != 0) ||
+      IREE_UNLIKELY((byte_length % pattern_length) != 0)) {
+    iree_status_ignore(iree_hal_buffer_unmap_range(&target_mapping));
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "attempting to fill a range with %zu byte values "
+                            "that is not aligned (offset=%" PRIdsz
+                            ", length=%" PRIdsz ")",
+                            pattern_length, byte_offset, byte_length);
+  }
+
+  const uint32_t zero_32 = 0;
+  if (memcmp(pattern, &zero_32, pattern_length) == 0) {
+    // We can turn all-zero values into single-byte fills as that can be much
+    // faster on devices (doing a fill8 vs fill32).
+    pattern_length = 1;
+  }
+
+  iree_status_t status = iree_ok_status();
+  void* data_ptr = target_mapping.contents.data;
+  // The default arm is unreachable after the validation above but is kept as
+  // defensive hygiene.
+  switch (pattern_length) {
+    case 1: {
+      uint8_t* data = (uint8_t*)data_ptr;
+      uint8_t value_bits = *(const uint8_t*)(pattern);
+      memset(data, value_bits, byte_length);
+      break;
+    }
+    case 2: {
+      uint16_t* data = (uint16_t*)data_ptr;
+      uint16_t value_bits = *(const uint16_t*)(pattern);
+      for (iree_device_size_t i = 0; i < byte_length / sizeof(uint16_t); ++i) {
+        data[i] = value_bits;
+      }
+      break;
+    }
+    case 4: {
+      uint32_t* data = (uint32_t*)data_ptr;
+      uint32_t value_bits = *(const uint32_t*)(pattern);
+      for (iree_device_size_t i = 0; i < byte_length / sizeof(uint32_t); ++i) {
+        data[i] = value_bits;
+      }
+      break;
+    }
+    default:
+      status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "unsupported fill pattern length: %zu",
+                                pattern_length);
+      break;
+  }
+
+  // Non-coherent memory requires an explicit flush for the device to observe
+  // the host writes.
+  if (iree_status_is_ok(status) &&
+      !iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
+                         IREE_HAL_MEMORY_TYPE_HOST_COHERENT)) {
+    status = iree_hal_buffer_flush_range(&target_mapping, 0, IREE_WHOLE_BUFFER);
+  }
+
+  status =
+      iree_status_join(status, iree_hal_buffer_unmap_range(&target_mapping));
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Synchronously reads |data_length| bytes of |source_buffer| starting at
+// |source_offset| into host memory at |target_buffer| via a scoped mapping.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_read(
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    void* target_buffer, iree_device_size_t data_length) {
+  if (data_length == 0) {
+    return iree_ok_status();  // No-op.
+  }
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, data_length);
+  iree_hal_buffer_mapping_t source_mapping = {{0}};
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_buffer_map_range(source_buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+                                    IREE_HAL_MEMORY_ACCESS_READ, source_offset,
+                                    data_length, &source_mapping));
+
+  // NOTE(review): assumes the memory is host-coherent or that the backend
+  // map_range performs any invalidation needed before host reads — confirm.
+  memcpy(target_buffer, source_mapping.contents.data, data_length);
+
+  // Propagate unmap failures; previously the returned status was dropped
+  // without iree_status_ignore, leaking its storage on error.
+  iree_status_t status = iree_hal_buffer_unmap_range(&source_mapping);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Synchronously writes |data_length| bytes from host memory at
+// |source_buffer| into |target_buffer| starting at |target_offset| via a
+// scoped mapping, flushing the range when the memory is not host-coherent.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_write(
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    const void* source_buffer, iree_device_size_t data_length) {
+  if (data_length == 0) {
+    return iree_ok_status();  // No-op.
+  }
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_ASSERT_ARGUMENT(source_buffer);
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, data_length);
+  // Zero-initialized for consistency with map_fill/map_read.
+  iree_hal_buffer_mapping_t target_mapping = {{0}};
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0,
+      iree_hal_buffer_map_range(target_buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+                                IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE,
+                                target_offset, data_length, &target_mapping));
+
+  memcpy(target_mapping.contents.data, source_buffer, data_length);
+
+  iree_status_t status = iree_ok_status();
+  // Non-coherent memory requires an explicit flush for the device to observe
+  // the host writes.
+  if (!iree_all_bits_set(iree_hal_buffer_memory_type(target_buffer),
+                         IREE_HAL_MEMORY_TYPE_HOST_COHERENT)) {
+    status = iree_hal_buffer_flush_range(&target_mapping, 0, IREE_WHOLE_BUFFER);
+  }
+
+  // Join the unmap status instead of dropping it; previously a failing unmap
+  // was silently ignored and leaked its status storage.
+  status =
+      iree_status_join(status, iree_hal_buffer_unmap_range(&target_mapping));
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Synchronously copies |data_length| bytes from |source_buffer| at
+// |source_offset| to |target_buffer| at |target_offset| by mapping both and
+// performing a host memcpy. The ranges must be disjoint (memcpy semantics).
+// |data_length| may be IREE_WHOLE_BUFFER to copy the min of the two
+// remaining extents. Flushes the target range when it is not host-coherent.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_copy(
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t data_length) {
+  if (data_length == 0) {
+    return iree_ok_status();  // No-op.
+  }
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+
+  // Check for overlap - like memcpy we require that the two ranges don't have
+  // any overlap - because we use memcpy below!
+  if (iree_hal_buffer_test_overlap(source_buffer, source_offset, data_length,
+                                   target_buffer, target_offset, data_length) !=
+      IREE_HAL_BUFFER_OVERLAP_DISJOINT) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "source and target ranges must not overlap within the same buffer");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, data_length);
+
+  // Map source, which may have IREE_WHOLE_BUFFER length.
+  iree_hal_buffer_mapping_t source_mapping = {{0}};
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_buffer_map_range(source_buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+                                    IREE_HAL_MEMORY_ACCESS_READ, source_offset,
+                                    data_length, &source_mapping));
+
+  // Map target, which may also have IREE_WHOLE_BUFFER length.
+  iree_hal_buffer_mapping_t target_mapping = {{0}};
+  iree_status_t status =
+      iree_hal_buffer_map_range(target_buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+                                IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE,
+                                target_offset, data_length, &target_mapping);
+  if (!iree_status_is_ok(status)) {
+    iree_status_ignore(iree_hal_buffer_unmap_range(&source_mapping));
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  // Adjust the data length based on the min we have.
+  iree_device_size_t adjusted_data_length = 0;
+  if (data_length == IREE_WHOLE_BUFFER) {
+    // Whole buffer copy requested - that could mean either, so take the min.
+    adjusted_data_length = iree_min(source_mapping.contents.data_length,
+                                    target_mapping.contents.data_length);
+  } else {
+    // Specific length requested - validate that we have matching lengths.
+    IREE_ASSERT_EQ(source_mapping.contents.data_length,
+                   target_mapping.contents.data_length);
+    adjusted_data_length = target_mapping.contents.data_length;
+  }
+
+  // Elide zero length copies that can arise from IREE_WHOLE_BUFFER resolving
+  // to nothing. NOTE: both ranges must still be unmapped here or the scoped
+  // buffer retains (and any backend mapping state) would leak - the previous
+  // early return skipped the unmaps entirely.
+  if (IREE_UNLIKELY(adjusted_data_length == 0)) {
+    iree_status_t unmap_status =
+        iree_status_join(iree_hal_buffer_unmap_range(&source_mapping),
+                         iree_hal_buffer_unmap_range(&target_mapping));
+    IREE_TRACE_ZONE_END(z0);
+    return unmap_status;
+  }
+
+  memcpy(target_mapping.contents.data, source_mapping.contents.data,
+         adjusted_data_length);
+
+  // Non-coherent memory requires an explicit flush for the device to observe
+  // the host writes.
+  if (!iree_all_bits_set(iree_hal_buffer_memory_type(target_buffer),
+                         IREE_HAL_MEMORY_TYPE_HOST_COHERENT)) {
+    status =
+        iree_hal_buffer_flush_range(&target_mapping, 0, adjusted_data_length);
+  }
+
+  // Join unmap failures so they are not silently dropped (and leaked).
+  status =
+      iree_status_join(status, iree_hal_buffer_unmap_range(&source_mapping));
+  status =
+      iree_status_join(status, iree_hal_buffer_unmap_range(&target_mapping));
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// Mapping
+//===----------------------------------------------------------------------===//
+
+// Maps |byte_length| bytes of |buffer| at |byte_offset| for host access with
+// the requested |memory_access|. On success scoped mappings retain |buffer|
+// until iree_hal_buffer_unmap_range; on failure |out_buffer_mapping| is
+// zeroed.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_range(
+    iree_hal_buffer_t* buffer, iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length,
+    iree_hal_buffer_mapping_t* out_buffer_mapping) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer_mapping);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  memset(out_buffer_mapping, 0, sizeof(*out_buffer_mapping));
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_buffer_validate_access(
+              iree_hal_buffer_allowed_access(buffer), memory_access));
+
+  // Persistent mapping requires the buffer was allocated to support it.
+  const bool is_persistent =
+      iree_all_bits_set(mapping_mode, IREE_HAL_MAPPING_MODE_PERSISTENT);
+  if (is_persistent) {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(z0,
+                                      iree_hal_buffer_validate_memory_type(
+                                          iree_hal_buffer_memory_type(buffer),
+                                          IREE_HAL_MEMORY_TYPE_HOST_VISIBLE));
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
+        iree_hal_buffer_validate_usage(iree_hal_buffer_allowed_usage(buffer),
+                                       IREE_HAL_BUFFER_USAGE_MAPPING));
+  }
+
+  // Resolve the requested range (possibly IREE_WHOLE_BUFFER) against the
+  // buffer's own offset/length window within its allocation.
+  iree_device_size_t local_byte_offset = 0;
+  iree_device_size_t local_byte_length = 0;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_buffer_calculate_range(
+              iree_hal_buffer_byte_offset(buffer),
+              iree_hal_buffer_byte_length(buffer), byte_offset, byte_length,
+              &local_byte_offset, &local_byte_length));
+
+  out_buffer_mapping->buffer = buffer;
+  out_buffer_mapping->impl.allowed_access = memory_access;
+  out_buffer_mapping->impl.is_persistent = is_persistent ? 1 : 0;
+  out_buffer_mapping->impl.byte_offset = local_byte_offset;
+
+  // Backend performs the actual mapping and populates |out_buffer_mapping|.
+  iree_status_t status = _VTABLE_DISPATCH(buffer, map_range)(
+      buffer, mapping_mode, memory_access, out_buffer_mapping->impl.byte_offset,
+      local_byte_length, out_buffer_mapping);
+
+  if (iree_status_is_ok(status)) {
+    // Scoped mappings retain the buffer until unmapped.
+    if (!is_persistent) iree_hal_buffer_retain(buffer);
+  } else {
+    memset(out_buffer_mapping, 0, sizeof(*out_buffer_mapping));
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Unmaps a previously mapped range and releases the scoped retain taken in
+// iree_hal_buffer_map_range. Safe to call on a zeroed/already-unmapped
+// mapping (no-op).
+IREE_API_EXPORT iree_status_t
+iree_hal_buffer_unmap_range(iree_hal_buffer_mapping_t* buffer_mapping) {
+  IREE_ASSERT_ARGUMENT(buffer_mapping);
+  iree_hal_buffer_t* buffer = buffer_mapping->buffer;
+  if (!buffer) return iree_ok_status();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = _VTABLE_DISPATCH(buffer, unmap_range)(
+      buffer, buffer_mapping->impl.byte_offset,
+      buffer_mapping->contents.data_length, buffer_mapping);
+
+  // Only scoped mappings hold a reference; persistent mappings are kept
+  // alive by the caller.
+  if (!buffer_mapping->impl.is_persistent) {
+    iree_hal_buffer_release(buffer);
+  }
+  // Clear the mapping so a double-unmap becomes a harmless no-op.
+  memset(buffer_mapping, 0, sizeof(*buffer_mapping));
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Invalidates a subrange of a live mapping after validating the mapping
+// allows reads and resolving the range against the mapped contents, then
+// dispatches to the backend.
+// NOTE(review): unlike unmap_range, |buffer| is not NULL-checked here —
+// calling on an unmapped mapping dereferences NULL; confirm callers.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_invalidate_range(
+    iree_hal_buffer_mapping_t* buffer_mapping, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length) {
+  IREE_ASSERT_ARGUMENT(buffer_mapping);
+  iree_hal_buffer_t* buffer = buffer_mapping->buffer;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
+      buffer_mapping->impl.allowed_access, IREE_HAL_MEMORY_ACCESS_READ));
+  // Offsets are resolved relative to the mapping's own window.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_calculate_range(
+      buffer_mapping->impl.byte_offset, buffer_mapping->contents.data_length,
+      byte_offset, byte_length, &byte_offset, &byte_length));
+  return _VTABLE_DISPATCH(buffer, invalidate_range)(buffer, byte_offset,
+                                                    byte_length);
+}
+
+// Flushes a subrange of a live mapping after validating the mapping allows
+// writes and resolving the range against the mapped contents, then
+// dispatches to the backend.
+// NOTE(review): as with invalidate_range, |buffer| is not NULL-checked —
+// calling on an unmapped mapping is undefined.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_flush_range(
+    iree_hal_buffer_mapping_t* buffer_mapping, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length) {
+  IREE_ASSERT_ARGUMENT(buffer_mapping);
+  iree_hal_buffer_t* buffer = buffer_mapping->buffer;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
+      buffer_mapping->impl.allowed_access, IREE_HAL_MEMORY_ACCESS_WRITE));
+  // Offsets are resolved relative to the mapping's own window.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_calculate_range(
+      buffer_mapping->impl.byte_offset, buffer_mapping->contents.data_length,
+      byte_offset, byte_length, &byte_offset, &byte_length));
+  return _VTABLE_DISPATCH(buffer, flush_range)(buffer, byte_offset,
+                                               byte_length);
+}
+
+// Returns a span over a subrange of an existing mapping's contents without
+// remapping. |out_span| is zeroed on failure.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_mapping_subspan(
+    iree_hal_buffer_mapping_t* buffer_mapping,
+    iree_hal_memory_access_t memory_access, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, iree_byte_span_t* out_span) {
+  IREE_ASSERT_ARGUMENT(buffer_mapping);
+  IREE_ASSERT_ARGUMENT(out_span);
+  memset(out_span, 0, sizeof(*out_span));
+  // The requested access must be a subset of what the mapping allows.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
+      buffer_mapping->impl.allowed_access, memory_access));
+  // Resolve the requested range against the mapped contents; the returned
+  // offset is relative to the start of the mapping.
+  iree_device_size_t adjusted_length = 0;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_calculate_range(
+      0, buffer_mapping->contents.data_length, byte_offset, byte_length,
+      &byte_offset, &adjusted_length));
+  out_span->data = buffer_mapping->contents.data + byte_offset;
+  out_span->data_length = adjusted_length;
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree/hal/buffer.h b/runtime/src/iree/hal/buffer.h
new file mode 100644
index 0000000..2f2a16f
--- /dev/null
+++ b/runtime/src/iree/hal/buffer.h
@@ -0,0 +1,607 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_BUFFER_H_
+#define IREE_HAL_BUFFER_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_allocator_t iree_hal_allocator_t;
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// Whole length of the underlying buffer.
+#define IREE_WHOLE_BUFFER ((iree_device_size_t)(-1))
+
+// A bitfield specifying properties for a memory type.
+enum iree_hal_memory_type_bits_t {
+  IREE_HAL_MEMORY_TYPE_NONE = 0u,
+
+  // Memory is lazily allocated by the device and only exists transiently.
+  // This is the optimal mode for memory used only within a single command
+  // buffer. Transient buffers, even if they have
+  // IREE_HAL_MEMORY_TYPE_HOST_VISIBLE set, should be treated as device-local
+  // and opaque as they may have no memory attached to them outside of the time
+  // they are being evaluated on devices.
+  //
+  // This flag can be treated as a hint in most cases; allocating a buffer with
+  // it set _may_ return the same as if it had not been set. Certain allocation
+  // routines may use the hint to more tightly control reuse or defer wiring
+  // the memory.
+  IREE_HAL_MEMORY_TYPE_TRANSIENT = 1u << 0,
+
+  // Memory allocated with this type can be mapped for host access using
+  // iree_hal_buffer_map_range.
+  IREE_HAL_MEMORY_TYPE_HOST_VISIBLE = 1u << 1,
+
+  // The host cache management commands iree_hal_buffer_flush_range and
+  // iree_hal_buffer_invalidate_range are not needed to flush host writes
+  // to the device or make device writes visible to the host, respectively.
+  IREE_HAL_MEMORY_TYPE_HOST_COHERENT = 1u << 2,
+
+  // Memory allocated with this type is cached on the host. Host memory
+  // accesses to uncached memory are slower than to cached memory, however
+  // uncached memory is always host coherent. iree_hal_buffer_flush_range must
+  // be used to ensure the device has visibility into any changes made on the
+  // host and iree_hal_buffer_invalidate_range must be used to ensure the host
+  // has visibility into any changes made on the device.
+  IREE_HAL_MEMORY_TYPE_HOST_CACHED = 1u << 3,
+
+  // Memory is accessible as normal host allocated memory.
+  IREE_HAL_MEMORY_TYPE_HOST_LOCAL =
+      IREE_HAL_MEMORY_TYPE_HOST_VISIBLE | IREE_HAL_MEMORY_TYPE_HOST_COHERENT,
+
+  // Memory allocated with this type is visible to the device for execution.
+  // Being device visible does not mean the same thing as
+  // IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL. Though an allocation may be visible to
+  // the device and therefore useable for execution it may require expensive
+  // mapping or implicit transfers.
+  IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE = 1u << 4,
+
+  // Memory allocated with this type is the most efficient for device access.
+  // Devices may support using memory that is not device local via
+  // IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE but doing so can incur non-trivial
+  // performance penalties. Device local memory, on the other hand, is
+  // guaranteed to be fast for all operations.
+  IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL =
+      IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | (1u << 5),
+};
+typedef uint32_t iree_hal_memory_type_t;
+
+// A bitfield specifying how memory will be accessed in a mapped memory region.
+enum iree_hal_memory_access_bits_t {
+  // Memory is not mapped.
+  IREE_HAL_MEMORY_ACCESS_NONE = 0u,
+  // Memory will be read.
+  // If a buffer is only mapped for reading it may still be possible to write to
+  // it but the results will be undefined (as it may present coherency issues).
+  IREE_HAL_MEMORY_ACCESS_READ = 1u << 0,
+  // Memory will be written.
+  // If a buffer is only mapped for writing it may still be possible to read
+  // from it but the results will be undefined or incredibly slow (as it may
+  // be mapped by the driver as uncached).
+  IREE_HAL_MEMORY_ACCESS_WRITE = 1u << 1,
+  // Memory will be discarded prior to mapping.
+  // The existing contents will be undefined after mapping and must be written
+  // to ensure validity.
+  IREE_HAL_MEMORY_ACCESS_DISCARD = 1u << 2,
+  // Memory will be discarded and completely overwritten in a single operation.
+  IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE =
+      IREE_HAL_MEMORY_ACCESS_WRITE | IREE_HAL_MEMORY_ACCESS_DISCARD,
+  // A flag that can be applied to any access type to indicate that the buffer
+  // storage being accessed may alias with other accesses occurring concurrently
+  // within or across operations. The lack of the flag indicates that the access
+  // is guaranteed not to alias (ala C's `restrict` keyword).
+  IREE_HAL_MEMORY_ACCESS_MAY_ALIAS = 1u << 3,
+  // Memory access may perform any operation and should not be validated.
+  // Used upon access to bypass access verification at the API boundary and
+  // effectively provides a `void*`.
+  // This should only be used by device-side code where it is known-safe to
+  // bypass the access verification.
+  IREE_HAL_MEMORY_ACCESS_ANY = 1u << 4,
+  // Memory may have any operation performed on it.
+  IREE_HAL_MEMORY_ACCESS_ALL = IREE_HAL_MEMORY_ACCESS_READ |
+                               IREE_HAL_MEMORY_ACCESS_WRITE |
+                               IREE_HAL_MEMORY_ACCESS_DISCARD,
+};
+typedef uint16_t iree_hal_memory_access_t;
+
+// Bitfield that defines how a buffer is intended to be used.
+// Usage allows the driver to appropriately place the buffer for more
+// efficient operations of the specified types.
+enum iree_hal_buffer_usage_bits_t {
+  IREE_HAL_BUFFER_USAGE_NONE = 0u,
+
+  // The buffer, once defined, will not be mapped or updated again.
+  // This should be used for uniform parameter values such as runtime
+  // constants for executables. Doing so may allow drivers to inline values or
+  // represent them in command buffers more efficiently (avoiding memory reads
+  // or swapping, etc).
+  IREE_HAL_BUFFER_USAGE_CONSTANT = 1u << 0,
+
+  // The buffer can be used as the source or target of a transfer command
+  // (CopyBuffer, UpdateBuffer, etc).
+  //
+  // If |IREE_HAL_BUFFER_USAGE_MAPPING| is not specified drivers may safely
+  // assume that the host may never need visibility of this buffer as all
+  // accesses will happen via command buffers.
+  IREE_HAL_BUFFER_USAGE_TRANSFER = 1u << 1,
+
+  // The buffer can be mapped by the host application for reading and writing
+  // without a copy.
+  //
+  // As mapping may require placement in special address ranges or system
+  // calls to enable visibility the driver can use the presence (or lack of)
+  // this flag to perform allocation-type setup and avoid initial mapping
+  // overhead.
+  IREE_HAL_BUFFER_USAGE_MAPPING = 1u << 2,
+
+  // The buffer can be provided as an input or output to an executable.
+  // Buffers of this type may be directly used by drivers during dispatch.
+  IREE_HAL_BUFFER_USAGE_DISPATCH = 1u << 3,
+};
+typedef uint32_t iree_hal_buffer_usage_t;
+
+// Buffer overlap testing results.
+typedef enum iree_hal_buffer_overlap_e {
+  // No overlap between the two buffers.
+  IREE_HAL_BUFFER_OVERLAP_DISJOINT = 0,
+  // Partial overlap between the two buffers.
+  IREE_HAL_BUFFER_OVERLAP_PARTIAL,
+  // Complete overlap between the two buffers (they are the same).
+  IREE_HAL_BUFFER_OVERLAP_COMPLETE,
+} iree_hal_buffer_overlap_t;
+
+// A bitfield specifying buffer transfer behavior.
+enum iree_hal_transfer_buffer_flag_bits_t {
+  // TODO(benvanik): flags controlling blocking, flushing, invalidation, and
+  // persistence. We may also want to set a bit that causes failure on emulated
+  // transfers that would otherwise be really expensive.
+  IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT = 0,
+};
+typedef uint32_t iree_hal_transfer_buffer_flags_t;
+
+// Determines buffer mapping behavior.
+enum iree_hal_mapping_mode_bits_t {
+  // Buffers are mapped as part of a scoped map-access-unmap sequence.
+  // If there are any in-flight operations using the buffer contents are
+  // undefined though they may deceivingly still seem correct under certain
+  // implementations.
+  IREE_HAL_MAPPING_MODE_SCOPED = 1u << 0,
+
+  // Buffers are mapped persistently and concurrently accessible by both the
+  // host and device. Mapping happens once and so long as there are any live
+  // mappings the buffer will remain accessible. Not all implementations or
+  // buffer memory types support this, and even ones that do may not support
+  // coherent cross-device sharing.
+  IREE_HAL_MAPPING_MODE_PERSISTENT = 1u << 1,
+};
+typedef uint32_t iree_hal_mapping_mode_t;
+
+// Implementation-specific mapping data.
+typedef struct iree_hal_buffer_mapping_impl_t {
+  // Byte offset within the buffer where the mapped data begins.
+  iree_device_size_t byte_offset;
+  // Used for validation only.
+  iree_hal_memory_access_t allowed_access;
+  // Tracking flags.
+  uint32_t is_persistent : 1;
+  uint32_t reserved_flags : 31;
+  // Backing implementation data.
+  // For backends that require additional tracking (shadow data structures/etc)
+  // this can be used to store references to them for the duration of the
+  // mapping.
+  uint64_t reserved[1];
+} iree_hal_buffer_mapping_impl_t;
+
+// Reference to a buffer's mapped memory.
+typedef struct iree_hal_buffer_mapping_t {
+  // Contents of the buffer. Behavior is undefined if an access is performed
+  // whose type was not specified during mapping.
+  //
+  // The bytes available may be greater than what was requested if platform
+  // alignment rules require it. Only memory defined by the given span may be
+  // accessed.
+  iree_byte_span_t contents;
+
+  // Buffer providing the backing storage for the mapping.
+  // When mapped with IREE_HAL_MAPPING_MODE_SCOPED the buffer will be retained
+  // until it is unmapped. When mapped with IREE_HAL_MAPPING_MODE_PERSISTENT the
+  // caller is responsible for retaining the buffer.
+  struct iree_hal_buffer_t* buffer;
+
+  // Used internally - do not modify.
+  // Implementations are allowed to use the reserved fields for their own
+  // storage but should otherwise ignore the remaining parts.
+  iree_hal_buffer_mapping_impl_t impl;
+} iree_hal_buffer_mapping_t;
+
+// Formats a memory type bitfield as a string.
+// See iree_bitfield_format for usage.
+IREE_API_EXPORT iree_string_view_t iree_hal_memory_type_format(
+    iree_hal_memory_type_t value, iree_bitfield_string_temp_t* out_temp);
+
+// Formats a memory access bitfield as a string.
+// See iree_bitfield_format for usage.
+IREE_API_EXPORT iree_string_view_t iree_hal_memory_access_format(
+    iree_hal_memory_access_t value, iree_bitfield_string_temp_t* out_temp);
+
+// Formats a buffer usage bitfield as a string.
+// See iree_bitfield_format for usage.
+IREE_API_EXPORT iree_string_view_t iree_hal_buffer_usage_format(
+    iree_hal_buffer_usage_t value, iree_bitfield_string_temp_t* out_temp);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_buffer_t
+//===----------------------------------------------------------------------===//
+
+// Allocated memory buffer wrapper type and utilities.
+//
+// Buffers are the basic unit of memory used by the inference system. They may
+// be allocated such that they are accessible from the host (normal C++ code
+// running on the main CPU), a particular device (such as an accelerator) or
+// family of devices, or from some mix of all of those.
+//
+// The type of memory a buffer is allocated within has implications on it's
+// performance and lifetime. For example if an application attempts to use a
+// host-allocated buffer (IREE_HAL_MEMORY_TYPE_HOST_LOCAL) on an accelerator
+// with discrete memory the accelerator may either be unable to access the
+// memory or take a non-trivial performance hit when attempting to do so
+// (involving setting up kernel mappings, doing DMA transfers, etc). Likewise,
+// trying to access a device-allocated buffer
+// (IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL) may incur similar overhead or not be
+// possible at all. This may be due to restrictions in the memory visibility,
+// address spaces, mixed endianness or pointer widths, and other weirdness.
+//
+// The memory types (defined by a bitfield of iree_hal_memory_type_t values)
+// that a particular context (host or device) may use vary from device to device
+// and must be queried by the application when allocating buffers. It's strongly
+// recommended that the most specific memory type be set as possible. For
+// example allocating a buffer with IREE_HAL_MEMORY_TYPE_HOST_COHERENT even when
+// it will never be used in a way that requires coherency may occupy address
+// space reservations or memory mapping that would otherwise not be needed.
+//
+// As buffers may sometimes not be accessible from the host the base buffer
+// type does not allow for direct void* access and instead buffers must be
+// either manipulated using utility functions (such as
+// iree_hal_buffer_map_read or iree_hal_buffer_map_write) or by mapping them
+// into a host-accessible address space via iree_hal_buffer_map_range. Buffers
+// must be unmapped before any command may use them.
+//
+// Buffers may equate (roughly) 1:1 with an allocation either from the host heap
+// or a device. iree_hal_buffer_subspan can be used to reference subspans of
+// buffers like std::span - though unlike std::span the returned buffer holds
+// a reference to the parent buffer.
+typedef struct iree_hal_buffer_t iree_hal_buffer_t;
+
+// Returns success iff the buffer was allocated with the given memory type.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_validate_memory_type(
+    iree_hal_memory_type_t actual_memory_type,
+    iree_hal_memory_type_t expected_memory_type);
+
+// Returns success iff the buffer allows the requested access.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_validate_access(
+    iree_hal_memory_access_t allowed_memory_access,
+    iree_hal_memory_access_t required_memory_access);
+
+// Returns success iff the buffer usage allows the given usage type.
+IREE_API_EXPORT iree_status_t
+iree_hal_buffer_validate_usage(iree_hal_buffer_usage_t allowed_usage,
+                               iree_hal_buffer_usage_t required_usage);
+
+// Returns success iff the given byte range falls within the valid buffer.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_validate_range(
+    iree_hal_buffer_t* buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length);
+
+// Tests whether the given buffers overlap, including support for subspans.
+// IREE_WHOLE_BUFFER may be used for |lhs_length| and/or |rhs_length| to use the
+// lengths of those buffers, respectively.
+IREE_API_EXPORT iree_hal_buffer_overlap_t iree_hal_buffer_test_overlap(
+    iree_hal_buffer_t* lhs_buffer, iree_device_size_t lhs_offset,
+    iree_device_size_t lhs_length, iree_hal_buffer_t* rhs_buffer,
+    iree_device_size_t rhs_offset, iree_device_size_t rhs_length);
+
+// Returns a reference to a subspan of the |buffer|.
+// If |byte_length| is IREE_WHOLE_BUFFER the remaining bytes in the buffer after
+// |byte_offset| (possibly 0) will be selected.
+//
+// The parent buffer will remain alive for the lifetime of the subspan
+// returned. If the subspan is a small portion this may cause additional
+// memory to remain allocated longer than required.
+//
+// Returns the given |buffer| if the requested span covers the entire range.
+// |out_buffer| must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_subspan(
+    iree_hal_buffer_t* buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, iree_hal_buffer_t** out_buffer);
+
+// Retains the given |buffer| for the caller.
+// Each retain must be balanced with a matching iree_hal_buffer_release.
+IREE_API_EXPORT void iree_hal_buffer_retain(iree_hal_buffer_t* buffer);
+
+// Releases the given |buffer| from the caller.
+IREE_API_EXPORT void iree_hal_buffer_release(iree_hal_buffer_t* buffer);
+
+// Metadata accessors; see iree_hal_buffer_initialize for where the values are
+// assigned.
+
+// Returns a pointer to the buffer containing the actual allocation.
+// The buffer represents a span of the allocated bytes defined by byte_offset
+// and byte_length. If the provided buffer *is* the allocated buffer then the
+// returned value will be the provided buffer pointer.
+IREE_API_EXPORT iree_hal_buffer_t* iree_hal_buffer_allocated_buffer(
+    const iree_hal_buffer_t* buffer);
+
+// Returns the size of the resource memory allocation in bytes.
+// This may be rounded up from the originally requested size or the ideal
+// size for the resource based on device restrictions.
+IREE_API_EXPORT iree_device_size_t
+iree_hal_buffer_allocation_size(const iree_hal_buffer_t* buffer);
+
+// Returns the offset in bytes of the buffer within its allocated_buffer.
+IREE_API_EXPORT iree_device_size_t
+iree_hal_buffer_byte_offset(const iree_hal_buffer_t* buffer);
+
+// Returns the size in bytes of the buffer.
+IREE_API_EXPORT iree_device_size_t
+iree_hal_buffer_byte_length(const iree_hal_buffer_t* buffer);
+
+// Returns the memory type the buffer was allocated with.
+IREE_API_EXPORT
+iree_hal_memory_type_t iree_hal_buffer_memory_type(
+    const iree_hal_buffer_t* buffer);
+
+// Returns the allowed memory access modes.
+// These may be more strict than the underlying allocation, for example when the
+// buffer is exposing read-only memory that may be in mutable pages.
+IREE_API_EXPORT
+iree_hal_memory_access_t iree_hal_buffer_allowed_access(
+    const iree_hal_buffer_t* buffer);
+
+// Returns the allowed buffer usage modes.
+IREE_API_EXPORT
+iree_hal_buffer_usage_t iree_hal_buffer_allowed_usage(
+    const iree_hal_buffer_t* buffer);
+
+// Synchronous host-mapped buffer operations. All of these require the buffer
+// to have the IREE_HAL_BUFFER_USAGE_MAPPING bit set; per-function notes below
+// call out the flush/invalidate requirements for non-host-coherent memory.
+
+// Sets a range of the buffer to binary zero.
+//
+// Requires that the buffer has the IREE_HAL_BUFFER_USAGE_MAPPING bit set.
+// The byte range in |buffer| will be flushed if needed.
+//
+// It is strongly recommended that buffer operations are performed on transfer
+// queues; using this synchronous function may incur additional cache flushes
+// and synchronous blocking behavior and is not supported on all buffer types.
+// See iree_hal_command_buffer_fill_buffer.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_zero(
+    iree_hal_buffer_t* buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length);
+
+// Sets a range of the buffer to the given value.
+// Only |pattern_length| values with 1, 2, or 4 bytes are supported.
+//
+// Requires that the buffer has the IREE_HAL_BUFFER_USAGE_MAPPING bit set.
+// The byte range in |buffer| will be flushed if needed.
+//
+// It is strongly recommended that buffer operations are performed on transfer
+// queues; using this synchronous function may incur additional cache flushes
+// and synchronous blocking behavior and is not supported on all buffer types.
+// See iree_hal_command_buffer_fill_buffer.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_fill(
+    iree_hal_buffer_t* buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, const void* pattern,
+    iree_host_size_t pattern_length);
+
+// Reads a block of data from the buffer at the given offset.
+//
+// Requires that the buffer has the IREE_HAL_BUFFER_USAGE_MAPPING bit set.
+//
+// It is strongly recommended that buffer operations are performed on transfer
+// queues; using this synchronous function may incur additional cache flushes
+// and synchronous blocking behavior and is not supported on all buffer types.
+// See iree_hal_command_buffer_copy_buffer.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_read(
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    void* target_buffer, iree_device_size_t data_length);
+
+// Writes a block of byte data into the buffer at the given offset.
+//
+// Requires that the buffer has the IREE_HAL_BUFFER_USAGE_MAPPING bit set.
+// The byte range in |target_buffer| will be flushed if needed.
+//
+// It is strongly recommended that buffer operations are performed on transfer
+// queues; using this synchronous function may incur additional cache flushes
+// and synchronous blocking behavior and is not supported on all buffer types.
+// See iree_hal_command_buffer_update_buffer and
+// iree_hal_command_buffer_copy_buffer.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_write(
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    const void* source_buffer, iree_device_size_t data_length);
+
+// Copies data from the provided |source_buffer| into the |target_buffer|.
+//
+// Requires that both buffers have the IREE_HAL_BUFFER_USAGE_MAPPING bit set.
+// The byte range in |target_buffer| will be flushed if needed. Both buffers
+// need not come from the same device.
+//
+// It is strongly recommended that buffer operations are performed on transfer
+// queues; using this synchronous function may incur additional cache flushes
+// and synchronous blocking behavior and is not supported on all buffer types.
+// See iree_hal_command_buffer_copy_buffer.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_copy(
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t data_length);
+
+// Maps the buffer to be accessed as a host pointer into |out_buffer_mapping|.
+// The byte offset and byte length may be adjusted for device alignment.
+// The output data pointer will be properly aligned to the start of the data.
+// Fails if the memory could not be mapped (invalid access type, invalid
+// range, or unsupported memory type).
+//
+// Requires that the buffer has the IREE_HAL_BUFFER_USAGE_MAPPING bit set.
+// If the buffer is not IREE_HAL_MEMORY_TYPE_HOST_COHERENT then the caller must
+// invalidate the byte range they want to access to update the visibility of the
+// mapped memory.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_map_range(
+    iree_hal_buffer_t* buffer, iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length,
+    iree_hal_buffer_mapping_t* out_buffer_mapping);
+
+// Unmaps the buffer as was previously mapped to |buffer_mapping|.
+//
+// If the buffer is not IREE_HAL_MEMORY_TYPE_HOST_COHERENT then the caller must
+// flush the byte range they want to make available to other threads/devices.
+//
+// May fail, though unlikely to do so for read-only mapping and the result can
+// be safely ignored using iree_status_ignore. If writing then users must check
+// the status to ensure their writes succeeded.
+IREE_API_EXPORT iree_status_t
+iree_hal_buffer_unmap_range(iree_hal_buffer_mapping_t* buffer_mapping);
+
+// Invalidates ranges of non-coherent memory from the host caches.
+// This guarantees that device writes to the memory ranges provided are
+// visible on the host. Use before reading from non-coherent memory.
+//
+// Only required for memory types without IREE_HAL_MEMORY_TYPE_HOST_COHERENT.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_invalidate_range(
+    iree_hal_buffer_mapping_t* buffer_mapping, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length);
+
+// Flushes ranges of non-coherent memory from the host caches.
+// This guarantees that host writes to the memory ranges provided are available
+// for device access. Use after writing to non-coherent memory.
+//
+// Only required for memory types without IREE_HAL_MEMORY_TYPE_HOST_COHERENT.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_flush_range(
+    iree_hal_buffer_mapping_t* buffer_mapping, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length);
+
+// Calculates and returns a byte subspan range within a buffer mapping.
+// The byte range provided is local to the mapping. May return a 0-length span.
+// IREE_WHOLE_BUFFER can be used for |byte_length|.
+//
+// Note that the access requirements of the mapping still hold: if the memory is
+// not host coherent and writeable then the caller must use the
+// iree_hal_buffer_invalidate_range and iree_hal_buffer_flush_range methods to
+// ensure memory is in the expected state.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_mapping_subspan(
+    iree_hal_buffer_mapping_t* buffer_mapping,
+    iree_hal_memory_access_t memory_access, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, iree_byte_span_t* out_span);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_subspan_buffer_t
+//===----------------------------------------------------------------------===//
+//
+// Two usage models: initialize/deinitialize for caller-owned storage and
+// create for heap-allocated, reference-counted subspans.
+
+// Initializes in-place a subspan buffer stored in |out_buffer|.
+// The reference count of the buffer will be set to 1.
+//
+// This is intended to be used for provably on-stack transient subspans or
+// buffer wrapping where ownership is controlled externally. If the lifetime of
+// the subspan may extend beyond the lifetime of the |out_buffer| storage then
+// iree_hal_subspan_buffer_create must be used instead.
+//
+// iree_hal_subspan_buffer_deinitialize must be used to deinitialize the buffer.
+IREE_API_EXPORT void iree_hal_subspan_buffer_initialize(
+    iree_hal_buffer_t* allocated_buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, iree_hal_allocator_t* device_allocator,
+    iree_allocator_t host_allocator, iree_hal_buffer_t* out_buffer);
+
+// Deinitializes a subspan buffer that was initialized with
+// iree_hal_subspan_buffer_initialize.
+IREE_API_EXPORT void iree_hal_subspan_buffer_deinitialize(
+    iree_hal_buffer_t* buffer);
+
+// Creates a buffer referencing a subspan of some base allocation.
+// Optionally |device_allocator| can be provided if this subspan references
+// managed buffers that need deallocation callbacks.
+IREE_API_EXPORT iree_status_t iree_hal_subspan_buffer_create(
+    iree_hal_buffer_t* allocated_buffer, iree_device_size_t byte_offset,
+    iree_device_size_t byte_length, iree_hal_allocator_t* device_allocator,
+    iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_buffer_t implementation details
+//===----------------------------------------------------------------------===//
+
+// Virtual function table implemented by each concrete buffer type.
+// The map/unmap/invalidate/flush entries back the public iree_hal_buffer_map_*
+// API declared above; offsets passed to them are local to the allocation.
+typedef struct iree_hal_buffer_vtable_t {
+  // Must be iree_hal_buffer_recycle.
+  void(IREE_API_PTR* recycle)(iree_hal_buffer_t* buffer);
+  // Frees the buffer memory; see iree_hal_buffer_destroy below.
+  void(IREE_API_PTR* destroy)(iree_hal_buffer_t* buffer);
+
+  iree_status_t(IREE_API_PTR* map_range)(iree_hal_buffer_t* buffer,
+                                         iree_hal_mapping_mode_t mapping_mode,
+                                         iree_hal_memory_access_t memory_access,
+                                         iree_device_size_t local_byte_offset,
+                                         iree_device_size_t local_byte_length,
+                                         iree_hal_buffer_mapping_t* mapping);
+
+  iree_status_t(IREE_API_PTR* unmap_range)(iree_hal_buffer_t* buffer,
+                                           iree_device_size_t local_byte_offset,
+                                           iree_device_size_t local_byte_length,
+                                           iree_hal_buffer_mapping_t* mapping);
+
+  iree_status_t(IREE_API_PTR* invalidate_range)(
+      iree_hal_buffer_t* buffer, iree_device_size_t local_byte_offset,
+      iree_device_size_t local_byte_length);
+
+  iree_status_t(IREE_API_PTR* flush_range)(
+      iree_hal_buffer_t* buffer, iree_device_size_t local_byte_offset,
+      iree_device_size_t local_byte_length);
+} iree_hal_buffer_vtable_t;
+static_assert(offsetof(iree_hal_buffer_vtable_t, recycle) == 0,
+              "iree_hal_resource_vtable_t expects destroy at offset 0, we want "
+              "to recycle instead");
+
+struct iree_hal_buffer_t {
+  // Frequently accessed:
+  iree_hal_resource_t resource;  // must be at 0
+  iree_hal_buffer_t* allocated_buffer;  // underlying allocation (may be self)
+  iree_device_size_t allocation_size;   // total bytes in allocated_buffer
+  iree_device_size_t byte_offset;       // span start within allocated_buffer
+  iree_device_size_t byte_length;       // span length in bytes
+
+  // Rarely accessed:
+  iree_allocator_t host_allocator;       // allocator of this struct's memory
+  iree_hal_allocator_t* device_allocator;  // optional; used when recycling
+  // TODO(benvanik): bit pack these; could be ~4 bytes vs 12.
+  iree_hal_memory_type_t memory_type;
+  iree_hal_buffer_usage_t allowed_usage;
+  iree_hal_memory_access_t allowed_access;
+
+  // Implementation-defined flags.
+  uint16_t flags;
+};
+
+// Initializes the common iree_hal_buffer_t fields of a derived buffer type.
+IREE_API_EXPORT void iree_hal_buffer_initialize(
+    iree_allocator_t host_allocator, iree_hal_allocator_t* device_allocator,
+    iree_hal_buffer_t* allocated_buffer, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage,
+    const iree_hal_buffer_vtable_t* vtable, iree_hal_buffer_t* buffer);
+
+// Recycles |buffer| by returning it to its allocator (or destroying it).
+// The |buffer| pointer may remain valid if it is returned to a pool but callers
+// must assume its contents are undefined.
+IREE_API_EXPORT void iree_hal_buffer_recycle(iree_hal_buffer_t* buffer);
+
+// Destroys |buffer| and frees its memory.
+// Implementations should use iree_hal_buffer_recycle in their vtables.
+IREE_API_EXPORT void iree_hal_buffer_destroy(iree_hal_buffer_t* buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_BUFFER_H_
diff --git a/runtime/src/iree/hal/buffer_heap.c b/runtime/src/iree/hal/buffer_heap.c
new file mode 100644
index 0000000..47ec037
--- /dev/null
+++ b/runtime/src/iree/hal/buffer_heap.c
@@ -0,0 +1,311 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/allocator.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/buffer_heap_impl.h"
+#include "iree/hal/resource.h"
+
+// How the heap buffer metadata and data storage were allocated; stored in
+// iree_hal_buffer_t::flags and switched on at destroy time to pick the
+// matching free routine.
+typedef enum iree_hal_heap_buffer_storage_mode_e {
+  // Allocated as a [metadata, data] slab.
+  // The base metadata pointer must be freed with iree_allocator_free_aligned.
+  // The data storage is not freed.
+  IREE_HAL_HEAP_BUFFER_STORAGE_MODE_SLAB = 0u,
+  // Allocated as split [metadata] and [data].
+  // The base metadata pointer must be freed with iree_allocator_free.
+  // The data storage must be freed with iree_allocator_free_aligned.
+  IREE_HAL_HEAP_BUFFER_STORAGE_MODE_SPLIT = 1u,
+  // Allocated as split [metadata] and an externally-owned [data].
+  // The base metadata pointer must be freed with iree_allocator_free.
+  // A user-provided buffer release callback is notified that the buffer is no
+  // longer referencing the data.
+  IREE_HAL_HEAP_BUFFER_STORAGE_MODE_EXTERNAL = 2u,
+} iree_hal_heap_buffer_storage_mode_t;
+
+// A host-heap-backed buffer implementation of iree_hal_buffer_t.
+typedef struct iree_hal_heap_buffer_t {
+  // base.flags has the iree_hal_heap_buffer_storage_mode_t.
+  iree_hal_buffer_t base;
+
+  // Host pointer + length of the backing storage.
+  iree_byte_span_t data;
+  // Exactly one member is active, selected by base.flags.
+  union {
+    // Used for IREE_HAL_HEAP_BUFFER_STORAGE_MODE_SPLIT.
+    iree_allocator_t data_allocator;
+    // Used for IREE_HAL_HEAP_BUFFER_STORAGE_MODE_EXTERNAL.
+    iree_hal_buffer_release_callback_t release_callback;
+  };
+
+  // Optional statistics shared with the allocator.
+  IREE_STATISTICS(iree_hal_heap_allocator_statistics_t* statistics;)
+} iree_hal_heap_buffer_t;
+static_assert(sizeof(iree_hal_heap_buffer_t) <= 128,
+              "header should be <= the minimum buffer alignment so that we "
+              "don't introduce internal waste");
+
+// Defined at the bottom of the file once the member functions exist.
+static const iree_hal_buffer_vtable_t iree_hal_heap_buffer_vtable;
+
+// Allocates a buffer with the metadata and storage split.
+// This results in an additional host allocation but allows for user-overridden
+// data storage allocations.
+// On success both |out_buffer| (metadata, from |host_allocator|) and
+// |out_data| (storage, from |data_allocator|) are valid; on failure neither
+// allocation is leaked.
+static iree_status_t iree_hal_heap_buffer_allocate_split(
+    iree_device_size_t allocation_size, iree_allocator_t data_allocator,
+    iree_allocator_t host_allocator, iree_hal_heap_buffer_t** out_buffer,
+    iree_byte_span_t* out_data) {
+  // Try allocating the storage first as it's the most likely to fail if OOM.
+  // It must be aligned to the minimum buffer alignment.
+  out_data->data_length = allocation_size;
+  uint8_t* data_ptr = 0;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc_aligned(
+      data_allocator, allocation_size, IREE_HAL_HEAP_BUFFER_ALIGNMENT,
+      /*offset=*/0, (void**)&data_ptr));
+  IREE_ASSERT_TRUE(iree_host_size_has_alignment(
+      (iree_host_size_t)data_ptr, IREE_HAL_HEAP_BUFFER_ALIGNMENT));
+  out_data->data = data_ptr;
+
+  // Allocate the host metadata wrapper with natural alignment.
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(**out_buffer), (void**)out_buffer);
+  if (!iree_status_is_ok(status)) {
+    // Need to free the storage we just allocated.
+    iree_allocator_free_aligned(data_allocator, out_data->data);
+  }
+  return status;
+}
+
+// Allocates a buffer with the metadata as a prefix to the storage.
+// This results in a single allocation per buffer but requires that both the
+// metadata and storage live together.
+// The single allocation comes from |host_allocator| and must later be freed
+// with iree_allocator_free_aligned (storage mode SLAB).
+static iree_status_t iree_hal_heap_buffer_allocate_slab(
+    iree_device_size_t allocation_size, iree_allocator_t host_allocator,
+    iree_hal_heap_buffer_t** out_buffer, iree_byte_span_t* out_data) {
+  // The metadata header is always aligned and we want to ensure it's padded
+  // out to the max alignment.
+  iree_hal_heap_buffer_t* buffer = NULL;
+  iree_host_size_t header_size =
+      iree_host_align(iree_sizeof_struct(*buffer), iree_max_align_t);
+  iree_host_size_t total_size = header_size + allocation_size;
+
+  // Allocate with the data starting at offset header_size aligned to the
+  // minimum required buffer alignment. The header itself will still be aligned
+  // to the natural alignment but our buffer alignment is often much larger.
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc_aligned(
+      host_allocator, total_size, IREE_HAL_HEAP_BUFFER_ALIGNMENT, header_size,
+      (void**)&buffer));
+  *out_buffer = buffer;
+
+  // The data storage immediately follows the (padded) metadata header.
+  uint8_t* data_ptr = (uint8_t*)buffer + header_size;
+  IREE_ASSERT_TRUE(iree_host_size_has_alignment(
+      (iree_host_size_t)data_ptr, IREE_HAL_HEAP_BUFFER_ALIGNMENT));
+  *out_data = iree_make_byte_span(data_ptr, allocation_size);
+
+  return iree_ok_status();
+}
+
+// Creates a heap buffer, choosing slab vs split layout based on whether the
+// data and host allocators are the same; see buffer_heap_impl.h for the
+// public contract. Optionally copies |initial_data| into the new storage and
+// records the allocation in |statistics| when statistics are enabled.
+iree_status_t iree_hal_heap_buffer_create(
+    iree_hal_allocator_t* allocator,
+    iree_hal_heap_allocator_statistics_t* statistics,
+    const iree_hal_buffer_params_t* params, iree_device_size_t allocation_size,
+    iree_const_byte_span_t initial_data, iree_allocator_t data_allocator,
+    iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // If the data and host allocators are the same we can allocate more
+  // efficiently as a large slab. Otherwise we need to allocate both the
+  // metadata and the storage independently.
+  const bool same_allocator =
+      memcmp(&data_allocator, &host_allocator, sizeof(data_allocator)) == 0;
+
+  iree_hal_heap_buffer_t* buffer = NULL;
+  iree_byte_span_t data = iree_make_byte_span(NULL, 0);
+  iree_status_t status =
+      same_allocator
+          ? iree_hal_heap_buffer_allocate_slab(allocation_size, host_allocator,
+                                               &buffer, &data)
+          : iree_hal_heap_buffer_allocate_split(allocation_size, data_allocator,
+                                                host_allocator, &buffer, &data);
+
+  if (iree_status_is_ok(status)) {
+    iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base,
+                               allocation_size, 0, allocation_size,
+                               params->type, params->access, params->usage,
+                               &iree_hal_heap_buffer_vtable, &buffer->base);
+    buffer->data = data;
+
+    // Record the storage mode so destroy knows how to free each piece.
+    if (same_allocator) {
+      buffer->base.flags = IREE_HAL_HEAP_BUFFER_STORAGE_MODE_SLAB;
+      buffer->data_allocator = iree_allocator_null();
+    } else {
+      buffer->base.flags = IREE_HAL_HEAP_BUFFER_STORAGE_MODE_SPLIT;
+      buffer->data_allocator = data_allocator;
+    }
+
+    IREE_STATISTICS({
+      if (statistics != NULL) {
+        buffer->statistics = statistics;
+        iree_slim_mutex_lock(&statistics->mutex);
+        iree_hal_allocator_statistics_record_alloc(
+            &statistics->base, params->type, allocation_size);
+        iree_slim_mutex_unlock(&statistics->mutex);
+      }
+    });
+
+    // Copy in the optional initial contents, clamped to the allocation size.
+    if (!iree_const_byte_span_is_empty(initial_data)) {
+      const iree_device_size_t initial_length =
+          iree_min(initial_data.data_length, allocation_size);
+      memcpy(buffer->data.data, initial_data.data, initial_length);
+    }
+
+    *out_buffer = &buffer->base;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Wraps externally-owned |data| in a heap buffer (storage mode EXTERNAL);
+// see buffer_heap_impl.h for the public contract. The data pointer must meet
+// IREE_HAL_HEAP_BUFFER_ALIGNMENT or the call fails without taking ownership.
+iree_status_t iree_hal_heap_buffer_wrap(
+    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_byte_span_t data, iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  if (!iree_host_size_has_alignment((uintptr_t)data.data,
+                                    IREE_HAL_HEAP_BUFFER_ALIGNMENT)) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "imported heap buffer data must be aligned to %d; got %p",
+        (int)IREE_HAL_HEAP_BUFFER_ALIGNMENT, data.data);
+  }
+
+  iree_allocator_t host_allocator =
+      iree_hal_allocator_host_allocator(allocator);
+  iree_hal_heap_buffer_t* buffer = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base,
+                               allocation_size, 0, data.data_length,
+                               memory_type, allowed_access, allowed_usage,
+                               &iree_hal_heap_buffer_vtable, &buffer->base);
+    buffer->data = data;
+
+    // Notify the provided callback when the external data is no longer needed.
+    buffer->base.flags = IREE_HAL_HEAP_BUFFER_STORAGE_MODE_EXTERNAL;
+    buffer->release_callback = release_callback;
+
+    *out_buffer = &buffer->base;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the buffer metadata and (when owned) its data storage, dispatching on
+// the storage mode recorded in base.flags at creation time. Also unrecords the
+// allocation from the shared allocator statistics when enabled.
+static void iree_hal_heap_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  iree_hal_heap_buffer_t* buffer = (iree_hal_heap_buffer_t*)base_buffer;
+  iree_allocator_t host_allocator = base_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_STATISTICS({
+    if (buffer->statistics != NULL) {
+      iree_slim_mutex_lock(&buffer->statistics->mutex);
+      iree_hal_allocator_statistics_record_free(&buffer->statistics->base,
+                                                base_buffer->memory_type,
+                                                base_buffer->allocation_size);
+      iree_slim_mutex_unlock(&buffer->statistics->mutex);
+    }
+  });
+
+  switch (buffer->base.flags) {
+    case IREE_HAL_HEAP_BUFFER_STORAGE_MODE_SLAB: {
+      // Metadata and data are one aligned allocation; a single free suffices.
+      iree_allocator_free_aligned(host_allocator, buffer);
+      break;
+    }
+    case IREE_HAL_HEAP_BUFFER_STORAGE_MODE_SPLIT: {
+      // Data storage was allocated with iree_allocator_malloc_aligned in
+      // iree_hal_heap_buffer_allocate_split and must be released with the
+      // matching aligned free (plain free would pass the adjusted pointer).
+      iree_allocator_free_aligned(buffer->data_allocator, buffer->data.data);
+      iree_allocator_free(host_allocator, buffer);
+      break;
+    }
+    case IREE_HAL_HEAP_BUFFER_STORAGE_MODE_EXTERNAL: {
+      // We never owned the data; just notify the provider we're done with it.
+      if (buffer->release_callback.fn) {
+        buffer->release_callback.fn(buffer->release_callback.user_data,
+                                    base_buffer);
+      }
+      iree_allocator_free(host_allocator, buffer);
+      break;
+    }
+    default:
+      IREE_ASSERT_UNREACHABLE("unhandled buffer storage mode");
+      break;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Heap buffers are always host-visible: mapping is just pointer arithmetic
+// into the backing storage. Offsets here are local to the allocation.
+static iree_status_t iree_hal_heap_buffer_map_range(
+    iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping) {
+  iree_hal_heap_buffer_t* buffer = (iree_hal_heap_buffer_t*)base_buffer;
+  mapping->contents = iree_make_byte_span(buffer->data.data + local_byte_offset,
+                                          local_byte_length);
+
+  // If we mapped for discard scribble over the bytes. This is not a mandated
+  // behavior but it will make debugging issues easier. Alternatively for
+  // heap buffers we could reallocate them such that ASAN yells, but that
+  // would only work if the entire buffer was discarded.
+#ifndef NDEBUG
+  if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) {
+    memset(mapping->contents.data, 0xCD, local_byte_length);
+  }
+#endif  // !NDEBUG
+
+  return iree_ok_status();
+}
+
+// Unmapping a heap buffer is a no-op: map_range hands out the raw host
+// pointer, so there is nothing to tear down or write back.
+static iree_status_t iree_hal_heap_buffer_unmap_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) {
+  // No-op here as we always have the pointer.
+  return iree_ok_status();
+}
+
+// Host memory needs no cache-line invalidation; an acquire fence is issued so
+// prior writes made visible by other agents are observed before reads.
+static iree_status_t iree_hal_heap_buffer_invalidate_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  iree_atomic_thread_fence(iree_memory_order_acquire);
+  return iree_ok_status();
+}
+
+// Mirror of invalidate_range: a release fence makes the caller's writes to the
+// mapped range visible to subsequent acquirers; no explicit flush is needed.
+static iree_status_t iree_hal_heap_buffer_flush_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  iree_atomic_thread_fence(iree_memory_order_release);
+  return iree_ok_status();
+}
+
+// Vtable wiring; note recycle must be iree_hal_buffer_recycle per the
+// iree_hal_buffer_vtable_t contract (it sits at offset 0).
+static const iree_hal_buffer_vtable_t iree_hal_heap_buffer_vtable = {
+    .recycle = iree_hal_buffer_recycle,
+    .destroy = iree_hal_heap_buffer_destroy,
+    .map_range = iree_hal_heap_buffer_map_range,
+    .unmap_range = iree_hal_heap_buffer_unmap_range,
+    .invalidate_range = iree_hal_heap_buffer_invalidate_range,
+    .flush_range = iree_hal_heap_buffer_flush_range,
+};
diff --git a/runtime/src/iree/hal/buffer_heap_impl.h b/runtime/src/iree/hal/buffer_heap_impl.h
new file mode 100644
index 0000000..9481a3d
--- /dev/null
+++ b/runtime/src/iree/hal/buffer_heap_impl.h
@@ -0,0 +1,59 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_BUFFER_HEAP_IMPL_H_
+#define IREE_HAL_BUFFER_HEAP_IMPL_H_
+
+#include "iree/base/api.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/hal/buffer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Private utilities for working with heap buffers
+//===----------------------------------------------------------------------===//
+
+// Shared heap allocator statistics; owned by a heap allocator.
+// Access to the base statistics must be guarded by |mutex|.
+// Heap buffers hold a pointer to this and record alloc/free events into it.
+typedef struct iree_hal_heap_allocator_statistics_t {
+  iree_slim_mutex_t mutex;
+  iree_hal_allocator_statistics_t base;
+} iree_hal_heap_allocator_statistics_t;
+
+// Allocates a new heap buffer from the specified |data_allocator|.
+// |host_allocator| is used for the iree_hal_buffer_t metadata. If both
+// |data_allocator| and |host_allocator| are the same the buffer will be created
+// as a flat slab. |out_buffer| must be released by the caller.
+// |statistics| may be NULL to skip statistics tracking.
+iree_status_t iree_hal_heap_buffer_create(
+    iree_hal_allocator_t* allocator,
+    iree_hal_heap_allocator_statistics_t* statistics,
+    const iree_hal_buffer_params_t* params, iree_device_size_t allocation_size,
+    iree_const_byte_span_t initial_data, iree_allocator_t data_allocator,
+    iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer);
+
+// Wraps an existing host allocation in a buffer.
+// When the buffer is destroyed the provided |release_callback| will be called.
+//
+// The buffer must be aligned to at least IREE_HAL_HEAP_BUFFER_ALIGNMENT and if
+// it is not the call will fail with IREE_STATUS_OUT_OF_RANGE.
+//
+// |out_buffer| must be released by the caller. |data| must be kept live for the
+// lifetime of the wrapping buffer.
+iree_status_t iree_hal_heap_buffer_wrap(
+    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_byte_span_t data, iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** out_buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_BUFFER_HEAP_IMPL_H_
diff --git a/runtime/src/iree/hal/buffer_view.c b/runtime/src/iree/hal/buffer_view.c
new file mode 100644
index 0000000..c338235
--- /dev/null
+++ b/runtime/src/iree/hal/buffer_view.c
@@ -0,0 +1,235 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/buffer_view.h"
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/allocator.h"
+#include "iree/hal/buffer_view_util.h"
+#include "iree/hal/resource.h"
+
+// Concrete (file-private) representation of an iree_hal_buffer_view_t.
+// Reference-counted; a single allocation holds both the struct and the shape
+// dimensions via the trailing flexible array member.
+struct iree_hal_buffer_view_t {
+  iree_atomic_ref_count_t ref_count;  // destroyed when this reaches zero
+  iree_allocator_t host_allocator;    // allocator this struct came from
+  iree_hal_buffer_t* buffer;          // retained backing storage buffer
+  iree_hal_element_type_t element_type;
+  iree_hal_encoding_type_t encoding_type;
+  iree_device_size_t byte_length;     // dense byte length of the view contents
+  iree_host_size_t shape_rank;        // number of entries in shape[]
+  iree_hal_dim_t shape[];             // inline storage for shape_rank dims
+};
+
+// Allocates the view struct and its inline shape storage in a single malloc
+// and retains |buffer| for the lifetime of the view.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_create(
+    iree_hal_buffer_t* buffer, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type, iree_allocator_t host_allocator,
+    iree_hal_buffer_view_t** out_buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer_view);
+
+  // NULL the out param first so callers never observe garbage on failure.
+  *out_buffer_view = NULL;
+  if (IREE_UNLIKELY(shape_rank > 0 && !shape)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "no shape dimensions specified");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate and initialize the iree_hal_buffer_view_t struct.
+  // Note that we have the dynamically-sized shape dimensions on the end.
+  iree_hal_buffer_view_t* buffer_view = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator,
+      sizeof(*buffer_view) + sizeof(iree_hal_dim_t) * shape_rank,
+      (void**)&buffer_view);
+  if (iree_status_is_ok(status)) {
+    iree_atomic_ref_count_init(&buffer_view->ref_count);
+    buffer_view->host_allocator = host_allocator;
+    buffer_view->buffer = buffer;
+    iree_hal_buffer_retain(buffer_view->buffer);
+    buffer_view->element_type = element_type;
+    buffer_view->encoding_type = encoding_type;
+    // Dense byte length: element byte size times the product of all dims.
+    // NOTE(review): assumes a densely-packed layout; sub-byte element types
+    // are not special-cased here.
+    buffer_view->byte_length =
+        iree_hal_element_dense_byte_count(buffer_view->element_type);
+    buffer_view->shape_rank = shape_rank;
+    for (iree_host_size_t i = 0; i < shape_rank; ++i) {
+      buffer_view->shape[i] = shape[i];
+      buffer_view->byte_length *= shape[i];
+    }
+    *out_buffer_view = buffer_view;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Adds a reference to |buffer_view|; a NULL view is a no-op.
+IREE_API_EXPORT void iree_hal_buffer_view_retain(
+    iree_hal_buffer_view_t* buffer_view) {
+  if (IREE_UNLIKELY(!buffer_view)) return;
+  iree_atomic_ref_count_inc(&buffer_view->ref_count);
+}
+
+// Drops a reference to |buffer_view|; a NULL view is a no-op.
+IREE_API_EXPORT void iree_hal_buffer_view_release(
+    iree_hal_buffer_view_t* buffer_view) {
+  if (IREE_UNLIKELY(!buffer_view)) return;
+  // The decrement returns the previous count; 1 means we held the last ref.
+  if (iree_atomic_ref_count_dec(&buffer_view->ref_count) == 1) {
+    iree_hal_buffer_view_destroy(buffer_view);
+  }
+}
+
+// Immediately frees |buffer_view|; invoked by release when the reference
+// count hits zero. Normal code should use iree_hal_buffer_view_release.
+IREE_API_EXPORT void iree_hal_buffer_view_destroy(
+    iree_hal_buffer_view_t* buffer_view) {
+  // Capture the allocator before freeing the struct that stores it.
+  iree_allocator_t host_allocator = buffer_view->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_buffer_release(buffer_view->buffer);
+  iree_allocator_free(host_allocator, buffer_view);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the backing buffer without transferring ownership; callers must
+// retain it themselves if they need it beyond the view's lifetime.
+IREE_API_EXPORT iree_hal_buffer_t* iree_hal_buffer_view_buffer(
+    const iree_hal_buffer_view_t* buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  return buffer_view->buffer;
+}
+
+// Returns the number of dimensions in the view's shape (0 for scalars).
+IREE_API_EXPORT iree_host_size_t
+iree_hal_buffer_view_shape_rank(const iree_hal_buffer_view_t* buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  return buffer_view->shape_rank;
+}
+
+// Returns a pointer to the inline dimension storage; valid only while the
+// view is live and has iree_hal_buffer_view_shape_rank entries.
+IREE_API_EXPORT const iree_hal_dim_t* iree_hal_buffer_view_shape_dims(
+    const iree_hal_buffer_view_t* buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  return buffer_view->shape;
+}
+
+// Returns the extent of dimension |index| or 0 when |index| is out of range.
+IREE_API_EXPORT iree_hal_dim_t iree_hal_buffer_view_shape_dim(
+    const iree_hal_buffer_view_t* buffer_view, iree_host_size_t index) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  // Must be >=: index == shape_rank is one past the last valid dimension and
+  // would read past the end of the inline shape storage.
+  if (IREE_UNLIKELY(index >= buffer_view->shape_rank)) {
+    return 0;
+  }
+  return buffer_view->shape[index];
+}
+
+// Returns the product of all dimensions (1 for rank-0/scalar views).
+IREE_API_EXPORT iree_host_size_t
+iree_hal_buffer_view_element_count(const iree_hal_buffer_view_t* buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  iree_host_size_t element_count = 1;
+  for (iree_host_size_t i = 0; i < buffer_view->shape_rank; ++i) {
+    element_count *= buffer_view->shape[i];
+  }
+  return element_count;
+}
+
+// Copies the shape into |out_shape| (up to |rank_capacity| dims) and reports
+// the actual rank in |out_shape_rank| when provided. Returns OUT_OF_RANGE
+// (without copying) when the capacity is too small so callers can retry with
+// a correctly sized buffer.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_shape(
+    const iree_hal_buffer_view_t* buffer_view, iree_host_size_t rank_capacity,
+    iree_hal_dim_t* out_shape, iree_host_size_t* out_shape_rank) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  IREE_ASSERT_ARGUMENT(out_shape);
+
+  // Always report the actual rank (when requested) so a failed call still
+  // tells the caller how much capacity is needed.
+  if (out_shape_rank) {
+    *out_shape_rank = buffer_view->shape_rank;
+  }
+  if (rank_capacity < buffer_view->shape_rank) {
+    // Not an error; just a size query.
+    return iree_status_from_code(IREE_STATUS_OUT_OF_RANGE);
+  }
+
+  for (iree_host_size_t i = 0; i < buffer_view->shape_rank; ++i) {
+    out_shape[i] = buffer_view->shape[i];
+  }
+
+  return iree_ok_status();
+}
+
+// Metadata-only reshape: rewrites the stored dimensions without touching the
+// underlying buffer contents. Both the rank and the total element count must
+// match the existing view.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_reshape(
+    iree_hal_buffer_view_t* buffer_view, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  IREE_ASSERT_ARGUMENT(shape);
+
+  if (shape_rank != buffer_view->shape_rank) {
+    // Rank changes require reallocation of the structure as we inline the
+    // shape dimensions. We could lighten this restriction to allow for rank
+    // reduction but knowing that rank changes aren't allowed is easier than
+    // remembering all the conditions in which they may be.
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "buffer view reshapes must have the same rank; "
+                            "target=%zu, existing=%zu",
+                            shape_rank, buffer_view->shape_rank);
+  }
+
+  iree_device_size_t new_element_count = 1;
+  for (iree_host_size_t i = 0; i < shape_rank; ++i) {
+    new_element_count *= shape[i];
+  }
+  iree_device_size_t old_element_count =
+      iree_hal_buffer_view_element_count(buffer_view);
+  if (new_element_count != old_element_count) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "buffer view reshapes must have the same element "
+                            "count; target=%" PRIdsz ", existing=%" PRIdsz,
+                            new_element_count, old_element_count);
+  }
+
+  // Only mutate the stored dims once all validation has passed.
+  for (iree_host_size_t i = 0; i < shape_rank; ++i) {
+    buffer_view->shape[i] = shape[i];
+  }
+
+  return iree_ok_status();
+}
+
+// Returns the element type the view was created with.
+IREE_API_EXPORT iree_hal_element_type_t
+iree_hal_buffer_view_element_type(const iree_hal_buffer_view_t* buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  return buffer_view->element_type;
+}
+
+// Returns the dense byte size of a single element of the view.
+// Note that not all buffers are contiguous or densely packed.
+IREE_API_EXPORT iree_host_size_t
+iree_hal_buffer_view_element_size(const iree_hal_buffer_view_t* buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  iree_hal_element_type_t element_type =
+      iree_hal_buffer_view_element_type(buffer_view);
+  return iree_hal_element_dense_byte_count(element_type);
+}
+
+// Returns the encoding type the view was created with.
+IREE_API_EXPORT iree_hal_encoding_type_t
+iree_hal_buffer_view_encoding_type(const iree_hal_buffer_view_t* buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  return buffer_view->encoding_type;
+}
+
+// Returns the dense byte length of the view contents as computed at creation
+// time (element byte size times element count).
+IREE_API_EXPORT iree_device_size_t
+iree_hal_buffer_view_byte_length(const iree_hal_buffer_view_t* buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  return buffer_view->byte_length;
+}
+
+// Thin forwarder: computes the byte offset of the element at |indices| using
+// the view's stored shape/type/encoding metadata.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_compute_offset(
+    const iree_hal_buffer_view_t* buffer_view, const iree_hal_dim_t* indices,
+    iree_host_size_t indices_count, iree_device_size_t* out_offset) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  return iree_hal_buffer_compute_view_offset(
+      buffer_view->shape, buffer_view->shape_rank, buffer_view->element_type,
+      buffer_view->encoding_type, indices, indices_count, out_offset);
+}
+
+// Thin forwarder: computes the byte range covering the contiguous region
+// [start_indices, start_indices + lengths) using the view's metadata.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_compute_range(
+    const iree_hal_buffer_view_t* buffer_view,
+    const iree_hal_dim_t* start_indices, iree_host_size_t indices_count,
+    const iree_hal_dim_t* lengths, iree_host_size_t lengths_count,
+    iree_device_size_t* out_start_offset, iree_device_size_t* out_length) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  return iree_hal_buffer_compute_view_range(
+      buffer_view->shape, buffer_view->shape_rank, buffer_view->element_type,
+      buffer_view->encoding_type, start_indices, indices_count, lengths,
+      lengths_count, out_start_offset, out_length);
+}
diff --git a/runtime/src/iree/hal/buffer_view.h b/runtime/src/iree/hal/buffer_view.h
new file mode 100644
index 0000000..5a483e1
--- /dev/null
+++ b/runtime/src/iree/hal/buffer_view.h
@@ -0,0 +1,272 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_BUFFER_VIEW_H_
+#define IREE_HAL_BUFFER_VIEW_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// NOTE: these values must be in sync with
+//    iree/compiler/Dialect/HAL/IR/HALTypes.cpp
+
+enum iree_hal_numerical_type_bits_t {
+  // Opaque or unknown - bytes cannot be interpreted. Indexing is still allowed
+  // so long as the bit width of the elements is known.
+  IREE_HAL_NUMERICAL_TYPE_UNKNOWN = 0x00u,
+
+  // Signless integer-like.
+  IREE_HAL_NUMERICAL_TYPE_INTEGER = 0x10u,
+  // Signed integer.
+  IREE_HAL_NUMERICAL_TYPE_INTEGER_SIGNED =
+      IREE_HAL_NUMERICAL_TYPE_INTEGER | 0x01u,
+  // Unsigned integer.
+  IREE_HAL_NUMERICAL_TYPE_INTEGER_UNSIGNED =
+      IREE_HAL_NUMERICAL_TYPE_INTEGER | 0x02u,
+
+  // Float-like.
+  IREE_HAL_NUMERICAL_TYPE_FLOAT = 0x20,
+  // IEEE754-compatible floating point semantics.
+  IREE_HAL_NUMERICAL_TYPE_FLOAT_IEEE = IREE_HAL_NUMERICAL_TYPE_FLOAT | 0x01u,
+  // 'Brain' floating point semantics (currently only bf16).
+  IREE_HAL_NUMERICAL_TYPE_FLOAT_BRAIN = IREE_HAL_NUMERICAL_TYPE_FLOAT | 0x02u,
+};
+typedef uint8_t iree_hal_numerical_type_t;
+
+#define IREE_HAL_ELEMENT_TYPE_VALUE(numerical_type, bit_count) \
+  (((uint32_t)(numerical_type) << 24) | (uint32_t)(bit_count))
+
+// Composes an iree_hal_element_type_t value with the given attributes.
+#define iree_hal_make_element_type(numerical_type, bit_count) \
+  (iree_hal_element_type_t)(                                  \
+      IREE_HAL_ELEMENT_TYPE_VALUE(numerical_type, bit_count))
+
+// Returns the numerical type of the element, if known and not opaque.
+#define iree_hal_element_numerical_type(element_type) \
+  (iree_hal_numerical_type_t)((uint32_t)(element_type) >> 24)
+
+// Returns true if |element_type| is opaque and cannot be interpreted.
+#define iree_hal_element_numerical_type_is_opaque(element_type) \
+  (iree_hal_element_numerical_type(element_type) ==             \
+   IREE_HAL_NUMERICAL_TYPE_UNKNOWN)
+
+// Returns true if |element_type| is an integer of some width and semantics.
+#define iree_hal_element_numerical_type_is_integer(element_type)   \
+  iree_all_bits_set(iree_hal_element_numerical_type(element_type), \
+                    IREE_HAL_NUMERICAL_TYPE_INTEGER)
+
+// Returns true if |element_type| is a float of some width and semantics.
+#define iree_hal_element_numerical_type_is_float(element_type)     \
+  iree_all_bits_set(iree_hal_element_numerical_type(element_type), \
+                    IREE_HAL_NUMERICAL_TYPE_FLOAT)
+
+// TODO(#8193): split out logical and physical bit widths.
+// Returns the bit width of each element.
+#define iree_hal_element_bit_count(element_type) (size_t)((element_type)&0xFF)
+
+// Returns true if the element is byte-aligned.
+// Sub-byte aligned types such as i4 require user handling of the packing.
+#define iree_hal_element_is_byte_aligned(element_type) \
+  (iree_hal_element_bit_count(element_type) % 8 == 0)
+
+// Returns the number of bytes each |element_type| consumes in memory.
+// This is only valid when the encoding type is dense as sub-byte bit widths
+// may be packed in various forms (for example, i4 may be stored as nibbles
+// where each byte in memory contains two elements).
+#define iree_hal_element_dense_byte_count(element_type) \
+  ((iree_hal_element_bit_count(element_type) + 8 - 1) / 8)
+
+// Returns true if the given |element_type| represents an integer of exactly
+// |bit_width|. This ignores the signedness of the integer type.
+#define iree_hal_element_type_is_integer(element_type, bit_width) \
+  (iree_hal_element_numerical_type_is_integer(element_type) &&    \
+   iree_hal_element_bit_count(element_type) == (bit_width))
+
+// Defines the element type of a buffer in a standard format.
+//
+// Composed as a 32-bit bitfield to allow for opaque data types. Use
+// iree_hal_make_element_type to make a bitfield with the appropriate ordering.
+//
+//   MSB ----------------------------------------------- LSB
+//   [numerical type] [reserved] [reserved] [number of bits]
+//
+// clang-format off
+enum iree_hal_element_types_t {
+  IREE_HAL_ELEMENT_TYPE_NONE             = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_UNKNOWN,             0),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_OPAQUE_8         = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_UNKNOWN,             8),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_OPAQUE_16        = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_UNKNOWN,            16),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_OPAQUE_32        = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_UNKNOWN,            32),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_OPAQUE_64        = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_UNKNOWN,            64),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_INT_4            = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER,             4),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_SINT_4           = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_SIGNED,      4),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_UINT_4           = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_UNSIGNED,    4),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_INT_8            = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER,             8),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_SINT_8           = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_SIGNED,      8),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_UINT_8           = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_UNSIGNED,    8),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_INT_16           = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER,            16),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_SINT_16          = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_SIGNED,     16),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_UINT_16          = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_UNSIGNED,   16),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_INT_32           = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER,            32),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_SINT_32          = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_SIGNED,     32),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_UINT_32          = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_UNSIGNED,   32),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_INT_64           = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER,            64),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_SINT_64          = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_SIGNED,     64),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_UINT_64          = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_INTEGER_UNSIGNED,   64),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_FLOAT_16         = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_FLOAT_IEEE,         16),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_FLOAT_32         = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_FLOAT_IEEE,         32),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_FLOAT_64         = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_FLOAT_IEEE,         64),  // NOLINT
+  IREE_HAL_ELEMENT_TYPE_BFLOAT_16        = IREE_HAL_ELEMENT_TYPE_VALUE(IREE_HAL_NUMERICAL_TYPE_FLOAT_BRAIN,        16),  // NOLINT
+};
+typedef uint32_t iree_hal_element_type_t;
+// clang-format on
+
+// Defines the encoding type of a buffer when known.
+enum iree_hal_encoding_types_t {
+  // Encoding is unknown or unspecified. Generic interpretation of the buffer
+  // contents is not possible.
+  IREE_HAL_ENCODING_TYPE_OPAQUE = 0,
+  // Encoding is a densely-packed numpy/C-style row-major format.
+  // All elements are contiguous in memory.
+  IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR = 1,
+  // TODO(#6762): sparse encodings we care about (_SPARSE_CSR)
+  // We will likely want to make this a bitfield like the element type is that
+  // we can more easily distinguish between encoding types that we can use for
+  // certain operations; for example, size calculations on a DENSE_ROW_MAJOR
+  // and DENSE_COLUMN_MAJOR would be easier to perform if we had a bit to test
+  // for whether it's dense.
+};
+typedef uint32_t iree_hal_encoding_type_t;
+
+// A dimension within a shape.
+// NOTE(review): 32-bit *signed*, capping each dimension at ~2^31-1 elements;
+// negative values are representable but nothing here rejects them — presumably
+// validated at call sites, confirm.
+typedef int32_t iree_hal_dim_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_buffer_view_t
+//===----------------------------------------------------------------------===//
+
+// A shaped and typed view into a storage buffer.
+// This is the closest thing to a "tensor" we have, and it's purely used to ease
+// application code and not treated special internally by IREE. They are
+// effectively just `tuple(shape, type, buffer)`, and if the application is
+// already tracking this information in its own structures this entire type can
+// be ignored.
+typedef struct iree_hal_buffer_view_t iree_hal_buffer_view_t;
+
+// Creates a buffer view with the given |buffer|.
+// |out_buffer_view| must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_create(
+    iree_hal_buffer_t* buffer, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type, iree_allocator_t host_allocator,
+    iree_hal_buffer_view_t** out_buffer_view);
+
+// Retains the given |buffer_view| for the caller.
+IREE_API_EXPORT void iree_hal_buffer_view_retain(
+    iree_hal_buffer_view_t* buffer_view);
+
+// Releases the given |buffer_view| from the caller.
+IREE_API_EXPORT void iree_hal_buffer_view_release(
+    iree_hal_buffer_view_t* buffer_view);
+
+// Returns the buffer underlying the buffer view.
+// The caller must retain the returned buffer if they want to continue using it.
+//
+// NOTE: the returned buffer length will almost always be larger than the valid
+// bytes representing this buffer view due to padding. Always query the actual
+// valid length with iree_hal_buffer_view_byte_length instead of assuming the
+// buffer is already clamped.
+IREE_API_EXPORT iree_hal_buffer_t* iree_hal_buffer_view_buffer(
+    const iree_hal_buffer_view_t* buffer_view);
+
+// Returns the rank of the shape associated with the buffer view.
+IREE_API_EXPORT iree_host_size_t
+iree_hal_buffer_view_shape_rank(const iree_hal_buffer_view_t* buffer_view);
+
+// Returns a pointer to the shape dimensions; the array limit is defined by
+// iree_hal_buffer_view_shape_rank.
+IREE_API_EXPORT const iree_hal_dim_t* iree_hal_buffer_view_shape_dims(
+    const iree_hal_buffer_view_t* buffer_view);
+
+// Returns the value of the given dimension.
+IREE_API_EXPORT iree_hal_dim_t iree_hal_buffer_view_shape_dim(
+    const iree_hal_buffer_view_t* buffer_view, iree_host_size_t index);
+
+// Returns the dimensions of the shape in |out_shape| and its rank in
+// |out_shape_rank|. |rank_capacity| indicates the number of dimensions
+// available in the |out_shape| buffer. If there is not enough capacity to store
+// all of the dimensions IREE_STATUS_OUT_OF_RANGE is returned.
+// |out_shape_rank| can be omitted if the rank is already known.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_shape(
+    const iree_hal_buffer_view_t* buffer_view, iree_host_size_t rank_capacity,
+    iree_hal_dim_t* out_shape, iree_host_size_t* out_shape_rank);
+
+// Performs a **metadata update-only** reshape.
+// The new rank and element count must match the existing values. The buffer
+// contents are left untouched; if the buffer is not dense this may make the
+// contents undefined.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_reshape(
+    iree_hal_buffer_view_t* buffer_view, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank);
+
+// Returns the total number of elements stored in the view.
+IREE_API_EXPORT iree_host_size_t
+iree_hal_buffer_view_element_count(const iree_hal_buffer_view_t* buffer_view);
+
+// Returns the element type of the buffer.
+IREE_API_EXPORT iree_hal_element_type_t
+iree_hal_buffer_view_element_type(const iree_hal_buffer_view_t* buffer_view);
+
+// Returns the size of each element in the buffer view in bytes.
+// Note that not all buffers are contiguous or densely packed.
+IREE_API_EXPORT iree_host_size_t
+iree_hal_buffer_view_element_size(const iree_hal_buffer_view_t* buffer_view);
+
+// Returns the encoding type of the buffer.
+IREE_API_EXPORT iree_hal_encoding_type_t
+iree_hal_buffer_view_encoding_type(const iree_hal_buffer_view_t* buffer_view);
+
+// Returns the total size of the specified view in bytes.
+// Note that not all buffers are contiguous or densely packed.
+IREE_API_EXPORT iree_device_size_t
+iree_hal_buffer_view_byte_length(const iree_hal_buffer_view_t* buffer_view);
+
+// Calculates a byte offset into the |buffer_view| at the given indices.
+// Requires that the encoding and element type support indexing.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_compute_offset(
+    const iree_hal_buffer_view_t* buffer_view, const iree_hal_dim_t* indices,
+    iree_host_size_t indices_count, iree_device_size_t* out_offset);
+
+// Calculates a byte range into the |buffer_view| of the given contiguous range.
+// Requires that the encoding and element type support indexing.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_compute_range(
+    const iree_hal_buffer_view_t* buffer_view,
+    const iree_hal_dim_t* start_indices, iree_host_size_t indices_count,
+    const iree_hal_dim_t* lengths, iree_host_size_t lengths_count,
+    iree_device_size_t* out_start_offset, iree_device_size_t* out_length);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_buffer_view_t implementation details
+//===----------------------------------------------------------------------===//
+
+IREE_API_EXPORT void iree_hal_buffer_view_destroy(
+    iree_hal_buffer_view_t* buffer_view);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_BUFFER_VIEW_H_
diff --git a/runtime/src/iree/hal/buffer_view_util.c b/runtime/src/iree/hal/buffer_view_util.c
new file mode 100644
index 0000000..a791c0e
--- /dev/null
+++ b/runtime/src/iree/hal/buffer_view_util.c
@@ -0,0 +1,573 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/buffer_view_util.h"
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/allocator.h"
+#include "iree/hal/resource.h"
+#include "iree/hal/string_util.h"
+
+//===----------------------------------------------------------------------===//
+// Buffer view math
+//===----------------------------------------------------------------------===//
+
+// Computes the total byte size required to store |shape| elements of
+// |element_type| under |encoding_type|. Only dense row-major encodings are
+// supported; anything else returns UNIMPLEMENTED.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_compute_view_size(
+    const iree_hal_dim_t* shape, iree_host_size_t shape_rank,
+    iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type,
+    iree_device_size_t* out_allocation_size) {
+  // |shape| may be NULL only for rank-0 (scalar) views.
+  IREE_ASSERT_ARGUMENT(!shape_rank || shape);
+  IREE_ASSERT_ARGUMENT(out_allocation_size);
+  *out_allocation_size = 0;
+
+  iree_device_size_t byte_length = 0;
+
+  switch (encoding_type) {
+    case IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR: {
+      // Opaque (0-bit) and sub-byte-aligned types have no addressable
+      // per-element byte size and cannot be sized this way.
+      if (IREE_UNLIKELY(iree_hal_element_bit_count(element_type) == 0) ||
+          IREE_UNLIKELY(!iree_hal_element_is_byte_aligned(element_type))) {
+        return iree_make_status(
+            IREE_STATUS_INVALID_ARGUMENT,
+            "opaque and sub-byte aligned element types cannot be indexed");
+      }
+      // Dense size = element byte size * product of all dims.
+      byte_length = iree_hal_element_dense_byte_count(element_type);
+      for (iree_host_size_t i = 0; i < shape_rank; ++i) {
+        byte_length *= shape[i];
+      }
+      break;
+    }
+    default:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "unimplemented encoding type size calculation");
+  }
+
+  *out_allocation_size = byte_length;
+  return iree_ok_status();
+}
+
+// Computes the byte offset of the element at |indices| within a dense
+// row-major buffer described by |shape|/|element_type|. |indices_count| must
+// equal |shape_rank| and every index must be in bounds.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_compute_view_offset(
+    const iree_hal_dim_t* shape, iree_host_size_t shape_rank,
+    iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type, const iree_hal_dim_t* indices,
+    iree_host_size_t indices_count, iree_device_size_t* out_offset) {
+  IREE_ASSERT_ARGUMENT(shape);
+  IREE_ASSERT_ARGUMENT(indices);
+  IREE_ASSERT_ARGUMENT(out_offset);
+  *out_offset = 0;
+  if (IREE_UNLIKELY(encoding_type != IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR)) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "only dense encodings support view range computation");
+  } else if (IREE_UNLIKELY(iree_hal_element_bit_count(element_type) == 0) ||
+             IREE_UNLIKELY(!iree_hal_element_is_byte_aligned(element_type))) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "opaque and sub-byte aligned element types cannot be indexed");
+  } else if (IREE_UNLIKELY(shape_rank != indices_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "shape rank/indices mismatch: %zu != %zu",
+                            shape_rank, indices_count);
+  }
+
+  // Row-major linearization: the stride of axis i is the product of all
+  // inner dimensions shape[i+1..rank), accumulated in elements first.
+  iree_device_size_t offset = 0;
+  for (iree_host_size_t i = 0; i < indices_count; ++i) {
+    if (IREE_UNLIKELY(indices[i] >= shape[i])) {
+      return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                              "index[%zu] out of bounds: %d >= %d", i,
+                              indices[i], shape[i]);
+    }
+    iree_device_size_t axis_offset = indices[i];
+    for (iree_host_size_t j = i + 1; j < shape_rank; ++j) {
+      axis_offset *= shape[j];
+    }
+    offset += axis_offset;
+  }
+  // Convert the element offset into a byte offset at the end.
+  offset *= iree_hal_element_dense_byte_count(element_type);
+
+  *out_offset = offset;
+  return iree_ok_status();
+}
+
+// Computes the byte range [*out_start_offset, *out_start_offset+*out_length)
+// covering the region selected by |start_indices| and |lengths| in a dense
+// row-major buffer. Fails with UNIMPLEMENTED when the selected region is not
+// contiguous in memory.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_compute_view_range(
+    const iree_hal_dim_t* shape, iree_host_size_t shape_rank,
+    iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type, const iree_hal_dim_t* start_indices,
+    iree_host_size_t indices_count, const iree_hal_dim_t* lengths,
+    iree_host_size_t lengths_count, iree_device_size_t* out_start_offset,
+    iree_device_size_t* out_length) {
+  IREE_ASSERT_ARGUMENT(shape);
+  IREE_ASSERT_ARGUMENT(start_indices);
+  IREE_ASSERT_ARGUMENT(lengths);
+  IREE_ASSERT_ARGUMENT(out_start_offset);
+  IREE_ASSERT_ARGUMENT(out_length);
+  *out_start_offset = 0;
+  *out_length = 0;
+  if (IREE_UNLIKELY(encoding_type != IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR)) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "only dense encodings support view range computation");
+  } else if (IREE_UNLIKELY(iree_hal_element_bit_count(element_type) == 0) ||
+             IREE_UNLIKELY(!iree_hal_element_is_byte_aligned(element_type))) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "opaque and sub-byte aligned element types cannot be indexed");
+  } else if (IREE_UNLIKELY(indices_count != lengths_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "indices/lengths mismatch: %zu != %zu",
+                            indices_count, lengths_count);
+  } else if (IREE_UNLIKELY(shape_rank != indices_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "shape rank/indices mismatch: %zu != %zu",
+                            shape_rank, indices_count);
+  }
+
+  // Inclusive last-element indices of the selected region along each axis.
+  // Stack-allocated per call; rank is expected to be small.
+  iree_hal_dim_t* end_indices =
+      iree_alloca(shape_rank * sizeof(iree_hal_dim_t));
+  iree_device_size_t element_size =
+      iree_hal_element_dense_byte_count(element_type);
+  iree_device_size_t subspan_length = element_size;
+  for (iree_host_size_t i = 0; i < lengths_count; ++i) {
+    subspan_length *= lengths[i];
+    end_indices[i] = start_indices[i] + lengths[i] - 1;
+  }
+
+  // Byte offsets of the first and last selected elements; bounds checking
+  // happens inside iree_hal_buffer_compute_view_offset.
+  iree_device_size_t start_byte_offset = 0;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_compute_view_offset(
+      shape, shape_rank, element_type, encoding_type, start_indices,
+      indices_count, &start_byte_offset));
+  iree_device_size_t end_byte_offset = 0;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_compute_view_offset(
+      shape, shape_rank, element_type, encoding_type, end_indices, shape_rank,
+      &end_byte_offset));
+
+  // Non-contiguous regions not yet implemented. Will be easier to detect when
+  // we have strides. A contiguous region spans exactly element count * size
+  // bytes between its first and last element.
+  iree_device_size_t offset_length =
+      end_byte_offset - start_byte_offset + element_size;
+  if (subspan_length != offset_length) {
+    return iree_make_status(
+        IREE_STATUS_UNIMPLEMENTED,
+        "non-contiguous range region computation not implemented");
+  }
+
+  *out_start_offset = start_byte_offset;
+  *out_length = subspan_length;
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// Buffer view allocation and generation
+//===----------------------------------------------------------------------===//
+
+// Allocates a new buffer sized for shape/type from |allocator|, optionally
+// filling it with |initial_data|, and wraps it in a new buffer view.
+// |out_buffer_view| must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_allocate_buffer(
+    iree_hal_allocator_t* allocator, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type,
+    iree_hal_buffer_params_t buffer_params, iree_const_byte_span_t initial_data,
+    iree_hal_buffer_view_t** out_buffer_view) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(out_buffer_view);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_buffer_params_canonicalize(&buffer_params);
+
+  // Compute the dense size required; fails for unsupported encodings/types.
+  iree_device_size_t allocation_size = 0;
+  iree_status_t status = iree_hal_buffer_compute_view_size(
+      shape, shape_rank, element_type, encoding_type, &allocation_size);
+
+  iree_hal_buffer_t* buffer = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_allocator_allocate_buffer(
+        allocator, buffer_params, allocation_size, initial_data, &buffer);
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_buffer_view_create(
+        buffer, shape, shape_rank, element_type, encoding_type,
+        iree_hal_allocator_host_allocator(allocator), out_buffer_view);
+  }
+
+  // Drop our local reference: on success the view retains the buffer; on
+  // failure this frees it (and |buffer| may be NULL).
+  iree_hal_buffer_release(buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Allocates a mappable buffer view and populates it by invoking |callback|
+// directly on the mapped device memory (no host-side staging copy).
+static iree_status_t iree_hal_buffer_view_generate_buffer_in_situ(
+    iree_hal_allocator_t* allocator, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type,
+    iree_hal_buffer_params_t buffer_params,
+    iree_hal_buffer_view_generator_callback_t callback, void* user_data,
+    iree_hal_buffer_view_t** out_buffer_view) {
+  // Allocate the buffer view and entire buffer contents with the target memory
+  // type and the mapping bits.
+  iree_hal_buffer_view_t* buffer_view = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_allocate_buffer(
+      allocator, shape, shape_rank, element_type, encoding_type,
+      iree_hal_buffer_params_with_usage(buffer_params,
+                                        IREE_HAL_BUFFER_USAGE_MAPPING),
+      iree_const_byte_span_empty(), &buffer_view));
+
+  // Map the buffer into host-visible memory. DISCARD_WRITE: the existing
+  // contents are irrelevant since the callback fully generates them.
+  iree_hal_buffer_mapping_t buffer_mapping = {{0}};
+  iree_status_t status = iree_hal_buffer_map_range(
+      iree_hal_buffer_view_buffer(buffer_view), IREE_HAL_MAPPING_MODE_SCOPED,
+      IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE, 0, IREE_WHOLE_BUFFER,
+      &buffer_mapping);
+
+  // Generate using the callback directly into the buffer.
+  if (iree_status_is_ok(status)) {
+    status = callback(&buffer_mapping, user_data);
+  }
+
+  // Always unmap, joining so neither a callback error nor an unmap error is
+  // silently dropped.
+  status =
+      iree_status_join(status, iree_hal_buffer_unmap_range(&buffer_mapping));
+  if (iree_status_is_ok(status)) {
+    *out_buffer_view = buffer_view;
+  } else {
+    iree_hal_buffer_view_release(buffer_view);
+  }
+  return status;
+}
+
+// Fallback generation when the target buffer cannot be mapped: generate into
+// a transient host allocation and then allocate the device buffer with that
+// data as its initial contents (which performs the upload). The staging
+// memory is freed before returning on all paths.
+static iree_status_t iree_hal_buffer_view_generate_buffer_on_host(
+    iree_hal_allocator_t* allocator, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type,
+    iree_hal_buffer_params_t buffer_params, iree_device_size_t allocation_size,
+    iree_hal_buffer_view_generator_callback_t callback, void* user_data,
+    iree_hal_buffer_view_t** out_buffer_view) {
+  // Allocate the host memory and generate the contents.
+  iree_allocator_t host_allocator =
+      iree_hal_allocator_host_allocator(allocator);
+  void* host_ptr = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(host_allocator, allocation_size, &host_ptr));
+  // Present the staging memory to the callback as a synthetic mapping so the
+  // same callback works for both the in-situ and host paths.
+  iree_hal_buffer_mapping_t mapping = {
+      .contents = iree_make_byte_span(host_ptr, allocation_size),
+  };
+  iree_status_t status = callback(&mapping, user_data);
+  if (!iree_status_is_ok(status)) {
+    iree_allocator_free(host_allocator, host_ptr);
+    return status;
+  }
+
+  // Allocate the buffer with the data we just generated.
+  // We could try importing but that may create buffers that are slower to
+  // access and we want users to opt in to that instead.
+  status = iree_hal_buffer_view_allocate_buffer(
+      allocator, shape, shape_rank, element_type, encoding_type, buffer_params,
+      iree_make_const_byte_span(host_ptr, allocation_size), out_buffer_view);
+
+  iree_allocator_free(host_allocator, host_ptr);
+  return status;
+}
+
+// Generates a buffer view whose contents are produced by |callback|, choosing
+// between direct in-situ generation (when mappable memory is available) and a
+// transient host staging allocation. See the header for the full contract.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_generate_buffer(
+    iree_hal_allocator_t* allocator, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type,
+    iree_hal_buffer_params_t buffer_params,
+    iree_hal_buffer_view_generator_callback_t callback, void* user_data,
+    iree_hal_buffer_view_t** out_buffer_view) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(callback);
+  IREE_ASSERT_ARGUMENT(out_buffer_view);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_buffer_params_canonicalize(&buffer_params);
+
+  // Compute how large of an allocation we need to hold the whole view.
+  iree_device_size_t allocation_size = 0;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_buffer_compute_view_size(shape, shape_rank, element_type,
+                                            encoding_type, &allocation_size));
+
+  // If we can create the requested memory type with mapping then we'll do that
+  // and avoid needing to allocate the staging memory. If we can't get that
+  // memory type (or the allocator doesn't want us using it) then we'll fall
+  // back to allocation -> generation -> copy.
+  iree_hal_buffer_params_t mappable_params = buffer_params;
+  mappable_params.type |= IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
+  mappable_params.usage |= IREE_HAL_BUFFER_USAGE_MAPPING;
+  iree_hal_buffer_compatibility_t compatibility =
+      iree_hal_allocator_query_compatibility(allocator, mappable_params,
+                                             allocation_size);
+  bool is_mappable = iree_all_bits_set(
+      compatibility, IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE);
+
+  iree_status_t status = iree_ok_status();
+  if (is_mappable) {
+    // Compatible with allocate -> map -> generate.
+    status = iree_hal_buffer_view_generate_buffer_in_situ(
+        allocator, shape, shape_rank, element_type, encoding_type,
+        mappable_params, callback, user_data, out_buffer_view);
+  } else {
+    // Allocate host-local memory first and generate into that.
+    status = iree_hal_buffer_view_generate_buffer_on_host(
+        allocator, shape, shape_rank, element_type, encoding_type,
+        buffer_params, allocation_size, callback, user_data, out_buffer_view);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// Buffer view parsing and printing
+//===----------------------------------------------------------------------===//
+
+// Parameters threaded through to iree_hal_buffer_view_parse_into via the
+// generator callback's |user_data| pointer.
+typedef struct iree_hal_buffer_view_parse_params_t {
+  iree_string_view_t data_str;
+  iree_hal_element_type_t element_type;
+} iree_hal_buffer_view_parse_params_t;
+// iree_hal_buffer_view_generator_callback_t that parses the serialized
+// element text in |user_data| directly into the mapped buffer contents.
+static iree_status_t iree_hal_buffer_view_parse_into(
+    iree_hal_buffer_mapping_t* mapping, void* user_data) {
+  iree_hal_buffer_view_parse_params_t* params =
+      (iree_hal_buffer_view_parse_params_t*)user_data;
+  return iree_hal_parse_buffer_elements(params->data_str, params->element_type,
+                                        mapping->contents);
+}
+
+// Parses a `<shape>x<type>=<data>` string (e.g. `2x2xf32=1 2 3 4`) into a
+// newly-allocated buffer view backed by |buffer_allocator|.
+static iree_status_t iree_hal_buffer_view_parse_impl(
+    iree_string_view_t value, iree_hal_allocator_t* buffer_allocator,
+    iree_hal_buffer_view_t** out_buffer_view) {
+  // Strip whitespace that may come along (linefeeds/etc).
+  value = iree_string_view_trim(value);
+  value = iree_string_view_strip_prefix(value, IREE_SV("\""));
+  value = iree_string_view_strip_suffix(value, IREE_SV("\""));
+  if (iree_string_view_is_empty(value)) {
+    // Empty lines are invalid; need at least the shape/type information.
+    *out_buffer_view = NULL;
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "empty string input");
+  }
+
+  // The part of the string corresponding to the shape, e.g. 1x2x3.
+  iree_string_view_t shape_str = iree_string_view_empty();
+  // The part of the string corresponding to the type, e.g. f32
+  iree_string_view_t type_str = iree_string_view_empty();
+  // The part of the string corresponding to the buffer data, e.g. 1 2 3 4 5 6
+  iree_string_view_t data_str = iree_string_view_empty();
+
+  // Split on the first '='; the last 'x' before it separates the (possibly
+  // absent) shape from the element type.
+  iree_string_view_t shape_and_type_str = value;
+  iree_string_view_split(value, '=', &shape_and_type_str, &data_str);
+  iree_host_size_t last_x_index = iree_string_view_find_last_of(
+      shape_and_type_str, IREE_SV("x"), IREE_STRING_VIEW_NPOS);
+  if (last_x_index == IREE_STRING_VIEW_NPOS) {
+    // Scalar.
+    type_str = shape_and_type_str;
+  } else {
+    // Has a shape.
+    shape_str = iree_string_view_substr(shape_and_type_str, 0, last_x_index);
+    type_str = iree_string_view_substr(shape_and_type_str, last_x_index + 1,
+                                       IREE_STRING_VIEW_NPOS);
+  }
+
+  // AxBxC...
+  // First query just the rank so the dims can be stack-allocated below; an
+  // OUT_OF_RANGE here only means the caller-provided capacity (0) was small.
+  iree_host_size_t shape_rank = 0;
+  iree_status_t shape_result =
+      iree_hal_parse_shape(shape_str, 0, NULL, &shape_rank);
+  if (!iree_status_is_ok(shape_result) &&
+      !iree_status_is_out_of_range(shape_result)) {
+    return shape_result;
+  } else if (shape_rank > 128) {
+    // Bounds the alloca below so absurd inputs cannot blow the stack.
+    return iree_make_status(
+        IREE_STATUS_RESOURCE_EXHAUSTED,
+        "a shape rank of %zu is just a little bit excessive, eh?", shape_rank);
+  }
+  shape_result = iree_status_ignore(shape_result);
+  iree_hal_dim_t* shape =
+      (iree_hal_dim_t*)iree_alloca(shape_rank * sizeof(iree_hal_dim_t));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_parse_shape(shape_str, shape_rank, shape, &shape_rank));
+
+  // f32, i32, etc
+  iree_hal_element_type_t element_type = IREE_HAL_ELEMENT_TYPE_NONE;
+  IREE_RETURN_IF_ERROR(iree_hal_parse_element_type(type_str, &element_type));
+
+  // TODO(benvanik): allow specifying the encoding.
+  iree_hal_encoding_type_t encoding_type =
+      IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR;
+
+  // Allocate the buffer from the provided allocator and parse directly into it.
+  const iree_hal_buffer_params_t buffer_params = {
+      .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+      .usage = IREE_HAL_BUFFER_USAGE_DISPATCH | IREE_HAL_BUFFER_USAGE_TRANSFER,
+  };
+  iree_hal_buffer_view_parse_params_t parse_params = {
+      .data_str = data_str,
+      .element_type = element_type,
+  };
+  return iree_hal_buffer_view_generate_buffer(
+      buffer_allocator, shape, shape_rank, element_type, encoding_type,
+      buffer_params, iree_hal_buffer_view_parse_into, &parse_params,
+      out_buffer_view);
+}
+
+// Public entry point: thin traced wrapper around
+// iree_hal_buffer_view_parse_impl that validates arguments and ensures
+// |out_buffer_view| is NULL on failure.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_parse(
+    iree_string_view_t value, iree_hal_allocator_t* buffer_allocator,
+    iree_hal_buffer_view_t** out_buffer_view) {
+  IREE_ASSERT_ARGUMENT(buffer_allocator);
+  IREE_ASSERT_ARGUMENT(out_buffer_view);
+  *out_buffer_view = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status =
+      iree_hal_buffer_view_parse_impl(value, buffer_allocator, out_buffer_view);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Appends character |c| to the enclosing scope's |buffer| (of size
+// |buffer_capacity|), keeping it NUL-terminated. When the buffer fills up the
+// local |buffer| pointer is cleared so later appends only count; the total
+// required length keeps accumulating in |buffer_length| either way.
+// NOTE(review): `buffer_capacity - 1` underflows (unsigned) if a non-NULL
+// buffer is ever passed with capacity 0 — callers appear to guarantee
+// capacity >= 1 when buffer != NULL; confirm.
+#define APPEND_CHAR(c)                           \
+  {                                              \
+    if (buffer) {                                \
+      if (buffer_length < buffer_capacity - 1) { \
+        buffer[buffer_length] = c;               \
+        buffer[buffer_length + 1] = '\0';        \
+      } else {                                   \
+        buffer = NULL;                           \
+      }                                          \
+    }                                            \
+    ++buffer_length;                             \
+  }
+
+// Formats |buffer_view| as `<shape>x<type>=<elements>` into |buffer|.
+// Uses a "counting mode" overflow pattern: once the buffer is exhausted
+// |buffer| is set to NULL but lengths keep accumulating so the caller learns
+// the required capacity via |out_buffer_length|, and the function returns
+// OUT_OF_RANGE instead of OK.
+static iree_status_t iree_hal_buffer_view_format_impl(
+    const iree_hal_buffer_view_t* buffer_view,
+    iree_host_size_t max_element_count, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length) {
+  if (out_buffer_length) {
+    *out_buffer_length = 0;
+  }
+  if (buffer && buffer_capacity) {
+    buffer[0] = 0;
+  }
+
+  iree_host_size_t buffer_length = 0;
+  if (iree_hal_buffer_view_shape_rank(buffer_view) > 0) {
+    // Shape: 1x2x3
+    iree_host_size_t shape_length = 0;
+    iree_status_t status = iree_hal_format_shape(
+        iree_hal_buffer_view_shape_dims(buffer_view),
+        iree_hal_buffer_view_shape_rank(buffer_view),
+        buffer ? buffer_capacity - buffer_length : 0,
+        buffer ? buffer + buffer_length : NULL, &shape_length);
+    buffer_length += shape_length;
+    // OUT_OF_RANGE just means we overflowed; switch to counting mode.
+    if (iree_status_is_out_of_range(status)) {
+      status = iree_status_ignore(status);
+      buffer = NULL;
+    } else if (!iree_status_is_ok(status)) {
+      return status;
+    }
+
+    // Separator: <shape>x<format>
+    APPEND_CHAR('x');
+  }
+
+  // Element type: f32
+  iree_host_size_t element_type_length = 0;
+  iree_status_t status = iree_hal_format_element_type(
+      iree_hal_buffer_view_element_type(buffer_view),
+      buffer ? buffer_capacity - buffer_length : 0,
+      buffer ? buffer + buffer_length : NULL, &element_type_length);
+  buffer_length += element_type_length;
+  if (iree_status_is_out_of_range(status)) {
+    status = iree_status_ignore(status);
+    buffer = NULL;
+  } else if (!iree_status_is_ok(status)) {
+    return status;
+  }
+
+  // TODO(benvanik): allow printing the encoding.
+
+  // Separator: <meta>=<value>
+  APPEND_CHAR('=');
+
+  // Buffer contents: 0 1 2 3 ...
+  iree_hal_buffer_mapping_t buffer_mapping = {{0}};
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+      iree_hal_buffer_view_buffer(buffer_view), IREE_HAL_MAPPING_MODE_SCOPED,
+      IREE_HAL_MEMORY_ACCESS_READ, 0, IREE_WHOLE_BUFFER, &buffer_mapping));
+  iree_host_size_t elements_length = 0;
+  status = iree_hal_format_buffer_elements(
+      iree_make_const_byte_span(buffer_mapping.contents.data,
+                                buffer_mapping.contents.data_length),
+      iree_hal_buffer_view_shape_dims(buffer_view),
+      iree_hal_buffer_view_shape_rank(buffer_view),
+      iree_hal_buffer_view_element_type(buffer_view), max_element_count,
+      buffer ? buffer_capacity - buffer_length : 0,
+      buffer ? buffer + buffer_length : NULL, &elements_length);
+  buffer_length += elements_length;
+  // Always unmap, preserving the first failure status (if any).
+  status =
+      iree_status_join(status, iree_hal_buffer_unmap_range(&buffer_mapping));
+  if (iree_status_is_out_of_range(status)) {
+    status = iree_status_ignore(status);
+    buffer = NULL;
+  } else if (!iree_status_is_ok(status)) {
+    return status;
+  }
+
+  if (out_buffer_length) {
+    *out_buffer_length = buffer_length;
+  }
+  // NULL |buffer| here means we entered counting mode at some point.
+  return buffer ? iree_ok_status()
+                : iree_status_from_code(IREE_STATUS_OUT_OF_RANGE);
+}
+
+// Public entry point: thin traced wrapper around
+// iree_hal_buffer_view_format_impl. See the header for the capacity/length
+// contract (OUT_OF_RANGE + required length on insufficient capacity).
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_format(
+    const iree_hal_buffer_view_t* buffer_view,
+    iree_host_size_t max_element_count, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length) {
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = iree_hal_buffer_view_format_impl(
+      buffer_view, max_element_count, buffer_capacity, buffer,
+      out_buffer_length);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// TODO(benvanik): streaming all the way down (needs string_util updates).
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_fprint(
+    FILE* file, const iree_hal_buffer_view_t* buffer_view,
+    iree_host_size_t max_element_count, iree_allocator_t host_allocator) {
+  IREE_ASSERT_ARGUMENT(file);
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Query the string length (in characters).
+  iree_host_size_t buffer_length = 0;
+  iree_status_t status = iree_hal_buffer_view_format(
+      buffer_view, max_element_count, 0, NULL, &buffer_length);
+  if (!iree_status_is_out_of_range(status)) {
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  // Allocate scratch space to format in to.
+  // We should be streaming.
+  iree_host_size_t buffer_capacity = buffer_length + 1;  // NUL
+  char* buffer = NULL;
+  status =
+      iree_allocator_malloc(host_allocator, buffer_capacity, (void**)&buffer);
+
+  // Format the buffer into the string storage.
+  if (iree_status_is_ok(status)) {
+    status =
+        iree_hal_buffer_view_format(buffer_view, max_element_count,
+                                    buffer_capacity, buffer, &buffer_length);
+  }
+
+  // Dump to the file.
+  if (iree_status_is_ok(status)) {
+    fprintf(file, "%.*s", (int)buffer_length, buffer);
+  }
+
+  iree_allocator_free(host_allocator, buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/buffer_view_util.h b/runtime/src/iree/hal/buffer_view_util.h
new file mode 100644
index 0000000..a7d7f61
--- /dev/null
+++ b/runtime/src/iree/hal/buffer_view_util.h
@@ -0,0 +1,148 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_BUFFER_VIEW_UTIL_H_
+#define IREE_HAL_BUFFER_VIEW_UTIL_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/allocator.h"
+#include "iree/hal/buffer_view.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Buffer view math
+//===----------------------------------------------------------------------===//
+
+// Calculates the allocation size of a buffer view.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_compute_view_size(
+    const iree_hal_dim_t* shape, iree_host_size_t shape_rank,
+    iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type,
+    iree_device_size_t* out_allocation_size);
+
+// Calculates a byte offset into a buffer at the given indices.
+// Only works with densely-packed representations.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_compute_view_offset(
+    const iree_hal_dim_t* shape, iree_host_size_t shape_rank,
+    iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type, const iree_hal_dim_t* indices,
+    size_t indices_count, iree_device_size_t* out_offset);
+
+// Calculates a byte range into a buffer of the given contiguous range.
+// Only works with densely-packed representations.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_compute_view_range(
+    const iree_hal_dim_t* shape, iree_host_size_t shape_rank,
+    iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type, const iree_hal_dim_t* start_indices,
+    iree_host_size_t indices_count, const iree_hal_dim_t* lengths,
+    iree_host_size_t lengths_count, iree_device_size_t* out_start_offset,
+    iree_device_size_t* out_length);
+
+//===----------------------------------------------------------------------===//
+// Buffer view allocation and generation
+//===----------------------------------------------------------------------===//
+
+// Allocates a buffer from |allocator| and wraps it in a buffer view.
+//
+// This is equivalent to:
+//   1. iree_hal_buffer_compute_view_size
+//   2. iree_hal_allocator_allocate_buffer
+//   3. iree_hal_buffer_view_create
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_allocate_buffer(
+    iree_hal_allocator_t* allocator, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type,
+    iree_hal_buffer_params_t buffer_params, iree_const_byte_span_t initial_data,
+    iree_hal_buffer_view_t** out_buffer_view);
+
+// Callback invoked with a mapped (or host staging) view of a buffer so the
+// caller can fill in its initial contents; |mapping->contents| is only valid
+// for the duration of the call. Returning a failure aborts view creation.
+typedef iree_status_t(IREE_API_PTR* iree_hal_buffer_view_generator_callback_t)(
+    iree_hal_buffer_mapping_t* mapping, void* user_data);
+
+// Generates a buffer view with its initial contents produced by a callback.
+// When host and device memory are shared this allows direct generation into the
+// target device buffer. If not shared this can avoid expensive transfer mapping
+// operations at the cost of a transient host memory allocation. The mapped host
+// pointer passed to the callback is only valid within the callback.
+//
+// Buffers allocated like this do not need the IREE_HAL_BUFFER_USAGE_MAPPING bit
+// set; it will be added automatically if the allocator needs it and otherwise
+// the memory can remain unmappable (and thus fully device isolated).
+//
+// As this _may_ require allocation of the entire buffer content in host memory
+// it is always preferable to stage and issue copy commands via the device
+// queue. Even better is to do all generation on-device via dispatches without
+// the need to ever transfer. Usage of this method should be limited to times
+// where device-side generation isn't possible or memory consumption is not a
+// concern.
+//
+// This is equivalent to:
+//   1. iree_hal_buffer_compute_view_size
+//   2. iree_hal_allocator_allocate_buffer
+//   3. iree_hal_buffer_map_range + callback + iree_hal_buffer_unmap_range
+//   4. iree_hal_buffer_view_create
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_generate_buffer(
+    iree_hal_allocator_t* allocator, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_hal_encoding_type_t encoding_type,
+    iree_hal_buffer_params_t buffer_params,
+    iree_hal_buffer_view_generator_callback_t callback, void* user_data,
+    iree_hal_buffer_view_t** out_buffer_view);
+
+//===----------------------------------------------------------------------===//
+// Buffer view parsing and printing
+//===----------------------------------------------------------------------===//
+
+// Parses a serialized set of buffer elements in the canonical tensor format
+// (the same as produced by iree_hal_buffer_view_format). The underlying buffer
+// will be allocated with |buffer_allocator| as a host-local/device-visible
+// buffer.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_parse(
+    iree_string_view_t value, iree_hal_allocator_t* buffer_allocator,
+    iree_hal_buffer_view_t** out_buffer_view);
+
+// TODO(#5413): enum for printing mode (include shape, precision).
+
+// Converts buffer view elements into a fully-specified string-form format like
+// `2x4xi16=[[1 2][3 4]]`.
+//
+// |max_element_count| can be used to limit the total number of elements printed
+// when the count may be large. Elided elements will be replaced with `...`.
+//
+// |buffer_capacity| defines the size of |buffer| in bytes and
+// |out_buffer_length| will return the string length in characters. Returns
+// IREE_STATUS_OUT_OF_RANGE if the buffer capacity is insufficient to hold the
+// formatted elements and |out_buffer_length| will contain the required size.
+//
+// Follows the standard API string formatting rules. See iree/base/api.h.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_format(
+    const iree_hal_buffer_view_t* buffer_view,
+    iree_host_size_t max_element_count, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length);
+
+// Prints buffer view elements into a fully-specified string-form format like
+// `2x4xi16=[[1 2][3 4]]`.
+//
+// |max_element_count| can be used to limit the total number of elements printed
+// when the count may be large. Elided elements will be replaced with `...`.
+//
+// |host_allocator| will be used for any transient allocations required while
+// printing.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_view_fprint(
+    FILE* file, const iree_hal_buffer_view_t* buffer_view,
+    iree_host_size_t max_element_count, iree_allocator_t host_allocator);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_BUFFER_VIEW_UTIL_H_
diff --git a/runtime/src/iree/hal/command_buffer.c b/runtime/src/iree/hal/command_buffer.c
new file mode 100644
index 0000000..e4c7fdf
--- /dev/null
+++ b/runtime/src/iree/hal/command_buffer.c
@@ -0,0 +1,523 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/command_buffer.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/command_buffer_validation.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/device.h"
+#include "iree/hal/resource.h"
+
+// Conditionally executes an expression based on whether command buffer
+// validation was enabled in the build and the command buffer wants validation.
+// When validation is compiled out this expands to nothing.
+#if IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+#define IF_VALIDATING(command_buffer, expr)                                  \
+  if (((command_buffer)->mode & IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED) == \
+      0) {                                                                   \
+    expr;                                                                    \
+  }
+#else
+#define IF_VALIDATING(command_buffer, expr)
+#endif  // IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+
+// Shorthand for dispatching a method through the command buffer vtable.
+#define _VTABLE_DISPATCH(command_buffer, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(command_buffer, iree_hal_command_buffer, method_name)
+
+//===----------------------------------------------------------------------===//
+// String utils
+//===----------------------------------------------------------------------===//
+
+// Formats a command buffer mode bitfield as a human-readable string using
+// |out_temp| for storage (for logging/tracing).
+IREE_API_EXPORT iree_string_view_t
+iree_hal_command_buffer_mode_format(iree_hal_command_buffer_mode_t value,
+                                    iree_bitfield_string_temp_t* out_temp) {
+  static const iree_bitfield_string_mapping_t mappings[] = {
+      {IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, IREE_SVL("ONE_SHOT")},
+      {IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION,
+       IREE_SVL("ALLOW_INLINE_EXECUTION")},
+      {IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED, IREE_SVL("UNVALIDATED")},
+  };
+  return iree_bitfield_format_inline(value, mappings, IREE_ARRAYSIZE(mappings),
+                                     out_temp);
+}
+
+// Formats a command category bitfield as a human-readable string using
+// |out_temp| for storage (for logging/tracing).
+IREE_API_EXPORT iree_string_view_t iree_hal_command_category_format(
+    iree_hal_command_category_t value, iree_bitfield_string_temp_t* out_temp) {
+  static const iree_bitfield_string_mapping_t mappings[] = {
+      // Combined:
+      {IREE_HAL_COMMAND_CATEGORY_ANY, IREE_SVL("ANY")},
+      // Separate:
+      {IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_SVL("TRANSFER")},
+      {IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_SVL("DISPATCH")},
+  };
+  return iree_bitfield_format_inline(value, mappings, IREE_ARRAYSIZE(mappings),
+                                     out_temp);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_t
+//===----------------------------------------------------------------------===//
+
+// Emits the iree_hal_command_buffer_retain/_release implementations.
+IREE_HAL_API_RETAIN_RELEASE(command_buffer);
+
+// Initializes the base command buffer state (vtable, mode, categories,
+// affinity) shared by all implementations; called by a concrete
+// implementation after it allocates its storage.
+IREE_API_EXPORT void iree_hal_command_buffer_initialize(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_command_buffer_vtable_t* vtable,
+    iree_hal_command_buffer_t* command_buffer) {
+  iree_hal_resource_initialize(vtable, &command_buffer->resource);
+  command_buffer->mode = mode;
+  command_buffer->allowed_categories = command_categories;
+  command_buffer->queue_affinity = queue_affinity;
+
+  // Perform initialization validation after we allocate/initialize the concrete
+  // implementation.
+  IF_VALIDATING(command_buffer, {
+    iree_hal_command_buffer_initialize_validation(device, command_buffer);
+  });
+}
+
+// Creates a command buffer via the device vtable after checking mode
+// invariants: inline execution requires one-shot recording.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  *out_command_buffer = NULL;
+
+  if (iree_all_bits_set(mode,
+                        IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) {
+    // Inline command buffers must be one-shot and primary.
+    if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "inline command buffers must be one-shot and primary");
+    }
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status =
+      IREE_HAL_VTABLE_DISPATCH(device, iree_hal_device, create_command_buffer)(
+          device, mode, command_categories, queue_affinity, out_command_buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Attempts to cast |command_buffer| to the implementation identified by
+// |vtable|, first checking the object itself and then deferring to the
+// implementation's own dyn_cast.
+IREE_API_EXPORT void* iree_hal_command_buffer_dyn_cast(
+    iree_hal_command_buffer_t* command_buffer, const void* vtable) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  if (iree_hal_resource_is(command_buffer, vtable)) return command_buffer;
+  return _VTABLE_DISPATCH(command_buffer, dyn_cast)(command_buffer, vtable);
+}
+
+// Returns the mode bits the command buffer was created with.
+IREE_API_EXPORT iree_hal_command_buffer_mode_t
+iree_hal_command_buffer_mode(const iree_hal_command_buffer_t* command_buffer) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  return command_buffer->mode;
+}
+
+// Returns the categories of commands this command buffer may record.
+IREE_API_EXPORT iree_hal_command_category_t
+iree_hal_command_buffer_allowed_categories(
+    const iree_hal_command_buffer_t* command_buffer) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  return command_buffer->allowed_categories;
+}
+
+// Begins recording into the command buffer, running the validation hook
+// first when validation is enabled for this command buffer.
+IREE_API_EXPORT iree_status_t
+iree_hal_command_buffer_begin(iree_hal_command_buffer_t* command_buffer) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_begin_validation(command_buffer));
+  });
+  iree_status_t status =
+      _VTABLE_DISPATCH(command_buffer, begin)(command_buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Ends recording, running the validation hook first when enabled.
+IREE_API_EXPORT iree_status_t
+iree_hal_command_buffer_end(iree_hal_command_buffer_t* command_buffer) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_end_validation(command_buffer));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, end)(command_buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records the start of a labeled debug group. Returns no status (best-effort
+// debug aid). NOTE(review): the validation call's result, if it produces one,
+// is discarded here — presumably these hooks cannot fail; confirm.
+IREE_API_EXPORT void iree_hal_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IF_VALIDATING(command_buffer,
+                iree_hal_command_buffer_begin_debug_group_validation(
+                    command_buffer, label, label_color, location));
+  _VTABLE_DISPATCH(command_buffer, begin_debug_group)
+  (command_buffer, label, label_color, location);
+}
+
+// Records the end of the most recently begun debug group.
+IREE_API_EXPORT void iree_hal_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* command_buffer) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IF_VALIDATING(
+      command_buffer,
+      iree_hal_command_buffer_end_debug_group_validation(command_buffer));
+  _VTABLE_DISPATCH(command_buffer, end_debug_group)
+  (command_buffer);
+}
+
+// Records an execution/memory barrier, validating the arguments first when
+// validation is enabled, then dispatching to the implementation.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_execution_barrier(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_execution_barrier_validation(
+                command_buffer, source_stage_mask, target_stage_mask, flags,
+                memory_barrier_count, memory_barriers, buffer_barrier_count,
+                buffer_barriers));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, execution_barrier)(
+      command_buffer, source_stage_mask, target_stage_mask, flags,
+      memory_barrier_count, memory_barriers, buffer_barrier_count,
+      buffer_barriers);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a command that signals |event| once |source_stage_mask| completes;
+// validated first when enabled.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_signal_event(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(event);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_signal_event_validation(
+                command_buffer, event, source_stage_mask));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, signal_event)(
+      command_buffer, event, source_stage_mask);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a command that resets |event|; validated first when enabled.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_reset_event(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(event);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_reset_event_validation(
+                command_buffer, event, source_stage_mask));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, reset_event)(
+      command_buffer, event, source_stage_mask);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a wait on |event_count| events combined with a memory dependency
+// (global and/or per-buffer barriers) between the signal scope and commands
+// recorded after this call. Validates (when enabled) and then dispatches to
+// the backend implementation; returns the implementation status.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_wait_events(
+    iree_hal_command_buffer_t* command_buffer, iree_host_size_t event_count,
+    const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  // Each list pointer may only be NULL when its paired count is zero.
+  IREE_ASSERT_ARGUMENT(!event_count || events);
+  IREE_ASSERT_ARGUMENT(!memory_barrier_count || memory_barriers);
+  IREE_ASSERT_ARGUMENT(!buffer_barrier_count || buffer_barriers);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_wait_events_validation(
+                command_buffer, event_count, events, source_stage_mask,
+                target_stage_mask, memory_barrier_count, memory_barriers,
+                buffer_barrier_count, buffer_barriers));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, wait_events)(
+      command_buffer, event_count, events, source_stage_mask, target_stage_mask,
+      memory_barrier_count, memory_barriers, buffer_barrier_count,
+      buffer_barriers);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a hint that |buffer| will not be used again, allowing the backend
+// to elide write-backs or reclaim the allocation. Validates (when enabled)
+// and then dispatches to the backend implementation.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_discard_buffer(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_discard_buffer_validation(command_buffer,
+                                                              buffer));
+  });
+  iree_status_t status =
+      _VTABLE_DISPATCH(command_buffer, discard_buffer)(command_buffer, buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a fill of [target_offset, target_offset+length) in |target_buffer|
+// with the repeating |pattern| of |pattern_length| bytes. Validates (when
+// enabled) and then dispatches to the backend implementation.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length,
+    const void* pattern, iree_host_size_t pattern_length) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_fill_buffer_validation(
+                command_buffer, target_buffer, target_offset, length, pattern,
+                pattern_length));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, fill_buffer)(
+      command_buffer, target_buffer, target_offset, length, pattern,
+      pattern_length);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records an update of |target_buffer| from host memory at |source_buffer|.
+// The implementation is expected to capture the source bytes at record time;
+// this wrapper only validates (when enabled) and dispatches via the vtable.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_update_buffer(
+    iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_update_buffer_validation(
+                command_buffer, source_buffer, source_offset, target_buffer,
+                target_offset, length));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, update_buffer)(
+      command_buffer, source_buffer, source_offset, target_buffer,
+      target_offset, length);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a copy of |length| bytes from |source_buffer|+|source_offset| to
+// |target_buffer|+|target_offset|. Validates (when enabled) and then
+// dispatches to the backend implementation; returns the implementation
+// status.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* source_buffer,
+    iree_device_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  // Assert buffer arguments for consistency with the sibling entry points
+  // (fill_buffer/update_buffer assert their buffer arguments); previously
+  // only |command_buffer| was asserted here.
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_copy_buffer_validation(
+                command_buffer, source_buffer, source_offset, target_buffer,
+                target_offset, length));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, copy_buffer)(
+      command_buffer, source_buffer, source_offset, target_buffer,
+      target_offset, length);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a push of |values_length| bytes of inline constants at byte
+// |offset| for use by subsequent dispatches with a compatible
+// |executable_layout|. Validates (when enabled) and dispatches via the
+// vtable.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_push_constants(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(executable_layout);
+  IREE_ASSERT_ARGUMENT(values);
+  // A zero-length push is a no-op: it returns OK before the trace zone is
+  // opened and skips both validation and the vtable dispatch.
+  if (IREE_UNLIKELY(values_length == 0)) {
+    return iree_ok_status();
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
+        iree_hal_command_buffer_push_constants_validation(
+            command_buffer, executable_layout, offset, values, values_length));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, push_constants)(
+      command_buffer, executable_layout, offset, values, values_length);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records an inline (push) descriptor set binding |binding_count| bindings
+// to descriptor set index |set| under |executable_layout|. Validates (when
+// enabled) and then dispatches to the backend implementation.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(executable_layout);
+  // |bindings| may only be NULL when |binding_count| is zero.
+  IREE_ASSERT_ARGUMENT(!binding_count || bindings);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
+        iree_hal_command_buffer_push_descriptor_set_validation(
+            command_buffer, executable_layout, set, binding_count, bindings));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, push_descriptor_set)(
+      command_buffer, executable_layout, set, binding_count, bindings);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records binding of a pre-created |descriptor_set| to descriptor set index
+// |set| under |executable_layout|, with optional dynamic buffer offsets.
+// Validates (when enabled) and then dispatches to the backend
+// implementation.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_bind_descriptor_set(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(executable_layout);
+  IREE_ASSERT_ARGUMENT(descriptor_set);
+  // |dynamic_offsets| may only be NULL when |dynamic_offset_count| is zero.
+  IREE_ASSERT_ARGUMENT(!dynamic_offset_count || dynamic_offsets);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_bind_descriptor_set_validation(
+                command_buffer, executable_layout, set, descriptor_set,
+                dynamic_offset_count, dynamic_offsets));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, bind_descriptor_set)(
+      command_buffer, executable_layout, set, descriptor_set,
+      dynamic_offset_count, dynamic_offsets);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a dispatch of |executable| entry point |entry_point| with the
+// given XYZ workgroup counts. Validates (when enabled) and then dispatches
+// to the backend implementation; returns the implementation status.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_dispatch(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(executable);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_dispatch_validation(
+                command_buffer, executable, entry_point, workgroup_x,
+                workgroup_y, workgroup_z));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, dispatch)(
+      command_buffer, executable, entry_point, workgroup_x, workgroup_y,
+      workgroup_z);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records an indirect dispatch of |executable| entry point |entry_point|
+// whose workgroup counts are read on-device from |workgroups_buffer| at
+// |workgroups_offset|. Validates (when enabled) and then dispatches to the
+// backend implementation.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_dispatch_indirect(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer,
+    iree_device_size_t workgroups_offset) {
+  IREE_ASSERT_ARGUMENT(command_buffer);
+  IREE_ASSERT_ARGUMENT(executable);
+  IREE_ASSERT_ARGUMENT(workgroups_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IF_VALIDATING(command_buffer, {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_command_buffer_dispatch_indirect_validation(
+                command_buffer, executable, entry_point, workgroups_buffer,
+                workgroups_offset));
+  });
+  iree_status_t status = _VTABLE_DISPATCH(command_buffer, dispatch_indirect)(
+      command_buffer, executable, entry_point, workgroups_buffer,
+      workgroups_offset);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// Utilities for command buffer creation
+//===----------------------------------------------------------------------===//
+
+// Creates a one-purpose transfer command buffer on |device|, records all
+// |transfer_count| commands from |transfer_commands| (fill/copy/update),
+// ends recording, and hands ownership to the caller via
+// |out_command_buffer|. On any failure the partially-recorded command buffer
+// is released and the first error status is returned.
+IREE_API_EXPORT iree_status_t iree_hal_create_transfer_command_buffer(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t transfer_count,
+    const iree_hal_transfer_command_t* transfer_commands,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_command_buffer_create(device, mode,
+                                         IREE_HAL_COMMAND_CATEGORY_TRANSFER,
+                                         queue_affinity, &command_buffer));
+
+  iree_status_t status = iree_hal_command_buffer_begin(command_buffer);
+  if (iree_status_is_ok(status)) {
+    // Record each command in order; stop at the first failure but still fall
+    // through to end() below so recording is always terminated.
+    for (iree_host_size_t i = 0; i < transfer_count; ++i) {
+      const iree_hal_transfer_command_t* transfer_command =
+          &transfer_commands[i];
+      switch (transfer_command->type) {
+        case IREE_HAL_TRANSFER_COMMAND_TYPE_FILL:
+          status = iree_hal_command_buffer_fill_buffer(
+              command_buffer, transfer_command->fill.target_buffer,
+              transfer_command->fill.target_offset,
+              transfer_command->fill.length, transfer_command->fill.pattern,
+              transfer_command->fill.pattern_length);
+          break;
+        case IREE_HAL_TRANSFER_COMMAND_TYPE_COPY:
+          status = iree_hal_command_buffer_copy_buffer(
+              command_buffer, transfer_command->copy.source_buffer,
+              transfer_command->copy.source_offset,
+              transfer_command->copy.target_buffer,
+              transfer_command->copy.target_offset,
+              transfer_command->copy.length);
+          break;
+        case IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE:
+          status = iree_hal_command_buffer_update_buffer(
+              command_buffer, transfer_command->update.source_buffer,
+              transfer_command->update.source_offset,
+              transfer_command->update.target_buffer,
+              transfer_command->update.target_offset,
+              transfer_command->update.length);
+          break;
+        default:
+          status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                    "unknown transfer_commands[%zu] type %d", i,
+                                    (int)transfer_command->type);
+          break;
+      }
+      if (!iree_status_is_ok(status)) break;
+    }
+  }
+  // end() runs even on failure; join preserves the first (recording) error.
+  status =
+      iree_status_join(status, iree_hal_command_buffer_end(command_buffer));
+
+  // Transfer ownership to the caller only on full success.
+  if (iree_status_is_ok(status)) {
+    *out_command_buffer = command_buffer;
+  } else {
+    iree_hal_command_buffer_release(command_buffer);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/command_buffer.h b/runtime/src/iree/hal/command_buffer.h
new file mode 100644
index 0000000..c06da30
--- /dev/null
+++ b/runtime/src/iree/hal/command_buffer.h
@@ -0,0 +1,694 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_COMMAND_BUFFER_H_
+#define IREE_HAL_COMMAND_BUFFER_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/allocator.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/descriptor_set.h"
+#include "iree/hal/descriptor_set_layout.h"
+#include "iree/hal/event.h"
+#include "iree/hal/executable.h"
+#include "iree/hal/executable_layout.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_device_t iree_hal_device_t;
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// A bitfield specifying the mode of operation for a command buffer.
+enum iree_hal_command_buffer_mode_bits_t {
+  // Command buffer will be submitted once and never used again.
+  // This may enable in-place patching of command buffers that reduce overhead
+  // when it's known that command buffers will not be reused.
+  IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT = 1u << 0,
+
+  // TODO(benvanik): IREE_HAL_COMMAND_BUFFER_MODE_REUSABLE = 1u << 1,
+  // TODO(benvanik): IREE_HAL_COMMAND_BUFFER_MODE_PRIMARY = 1u << 2,
+  // TODO(benvanik): IREE_HAL_COMMAND_BUFFER_MODE_SECONDARY = 1u << 3,
+
+  // Indicates that the command buffer execution is allowed to execute inline
+  // with recording. The exact execution behavior is unspecified by the API and
+  // intentionally unknowable and must always assume to happen entirely
+  // asynchronously and that it will only have completed after waiting on device
+  // idle or the wait semaphores specified in the submission are signaled.
+  //
+  // Local backends can use this to avoid recording when the calling program can
+  // guarantee that it makes no assumptions about execution being deferred until
+  // a submission. The command buffer must still be submitted for scheduling and
+  // must have no wait semaphores specified. This allows the same program code
+  // to execute work both synchronously and asynchronously as remote backends
+  // are allowed to ignore this.
+  //
+  // Remote backends can use this to flush the command buffer more aggressively
+  // to begin early execution and overlap with continued recording.
+  //
+  // Requires IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT and
+  // IREE_HAL_COMMAND_BUFFER_MODE_PRIMARY. Compatible with
+  // IREE_HAL_COMMAND_BUFFER_MODE_REUSABLE.
+  IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION = 1u << 4,
+
+  // Disables additional command buffer validation (if present).
+  // By default all command buffers will be validated if
+  // `IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE=1` - if shimming command buffers
+  // or performing replay this validation can be disabled per-command buffer.
+  IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED = 1u << 5,
+};
+typedef uint32_t iree_hal_command_buffer_mode_t;
+
+// A bitfield specifying the category of commands in a command queue.
+enum iree_hal_command_category_bits_t {
+  // Command is considered a transfer operation (memcpy, etc).
+  IREE_HAL_COMMAND_CATEGORY_TRANSFER = 1u << 0,
+  // Command is considered a dispatch operation (dispatch/execute).
+  IREE_HAL_COMMAND_CATEGORY_DISPATCH = 1u << 1,
+  // Commands may be of any type.
+  // Using this value may prevent optimizations and if possible callers should
+  // always specify the strictest set possible (for example, only transfer
+  // commands to ensure they get placed on a DMA queue).
+  IREE_HAL_COMMAND_CATEGORY_ANY =
+      IREE_HAL_COMMAND_CATEGORY_TRANSFER | IREE_HAL_COMMAND_CATEGORY_DISPATCH,
+};
+typedef uint32_t iree_hal_command_category_t;
+
+// Bitfield specifying which execution stage a barrier should start/end at.
+//
+// Maps to VkPipelineStageFlagBits.
+enum iree_hal_execution_stage_bits_t {
+  // Top of the pipeline when commands are initially issued by the device.
+  IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE = 1u << 0,
+  // Stage of the pipeline when dispatch parameter data is consumed.
+  IREE_HAL_EXECUTION_STAGE_COMMAND_PROCESS = 1u << 1,
+  // Stage where dispatch commands execute.
+  IREE_HAL_EXECUTION_STAGE_DISPATCH = 1u << 2,
+  // Stage where transfer (copy/clear/fill/etc) commands execute.
+  IREE_HAL_EXECUTION_STAGE_TRANSFER = 1u << 3,
+  // Final stage in the pipeline when commands are retired on the device.
+  IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE = 1u << 4,
+  // Pseudo-stage for read/writes by the host. Not executed on device.
+  IREE_HAL_EXECUTION_STAGE_HOST = 1u << 5,
+};
+typedef uint32_t iree_hal_execution_stage_t;
+
+// Bitfield specifying flags controlling an execution dependency.
+//
+// Maps to VkDependencyFlags.
+enum iree_hal_execution_barrier_flag_bits_t {
+  IREE_HAL_EXECUTION_BARRIER_FLAG_NONE = 0,
+};
+typedef uint32_t iree_hal_execution_barrier_flags_t;
+
+// Bitfield specifying which scopes will access memory and how.
+//
+// Maps to VkAccessFlagBits.
+enum iree_hal_access_scope_bits_t {
+  // Read access to indirect command data as part of an indirect dispatch.
+  IREE_HAL_ACCESS_SCOPE_INDIRECT_COMMAND_READ = 1u << 0,
+  // Constant uniform buffer reads by the device.
+  IREE_HAL_ACCESS_SCOPE_CONSTANT_READ = 1u << 1,
+  // Storage buffer reads by dispatch commands.
+  IREE_HAL_ACCESS_SCOPE_DISPATCH_READ = 1u << 2,
+  // Storage buffer writes by dispatch commands.
+  IREE_HAL_ACCESS_SCOPE_DISPATCH_WRITE = 1u << 3,
+  // Source of a transfer operation.
+  IREE_HAL_ACCESS_SCOPE_TRANSFER_READ = 1u << 4,
+  // Target of a transfer operation.
+  IREE_HAL_ACCESS_SCOPE_TRANSFER_WRITE = 1u << 5,
+  // Read operation by the host through mapped memory.
+  IREE_HAL_ACCESS_SCOPE_HOST_READ = 1u << 6,
+  // Write operation by the host through mapped memory.
+  IREE_HAL_ACCESS_SCOPE_HOST_WRITE = 1u << 7,
+  // External/non-specific read.
+  IREE_HAL_ACCESS_SCOPE_MEMORY_READ = 1u << 8,
+  // External/non-specific write.
+  IREE_HAL_ACCESS_SCOPE_MEMORY_WRITE = 1u << 9,
+};
+typedef uint32_t iree_hal_access_scope_t;
+
+// Defines a global memory barrier.
+// These are cheaper to encode than buffer-specific barriers but may cause
+// stalls and bubbles in device pipelines if applied too broadly. Prefer them
+// over equivalently large sets of buffer-specific barriers (such as when
+// completely changing execution contexts).
+//
+// Maps to VkMemoryBarrier.
+typedef struct iree_hal_memory_barrier_t {
+  // All access scopes prior-to the barrier (inclusive).
+  iree_hal_access_scope_t source_scope;
+  // All access scopes following the barrier (inclusive).
+  iree_hal_access_scope_t target_scope;
+} iree_hal_memory_barrier_t;
+
+// Defines a memory barrier that applies to a range of a specific buffer.
+// Use of these (vs. global memory barriers) provides fine-grained execution
+// ordering to device command processors and allows for more aggressive
+// reordering.
+//
+// Maps to VkBufferMemoryBarrier.
+typedef struct iree_hal_buffer_barrier_t {
+  // All access scopes prior-to the barrier (inclusive).
+  iree_hal_access_scope_t source_scope;
+  // All access scopes following the barrier (inclusive).
+  iree_hal_access_scope_t target_scope;
+  // Buffer the barrier is restricted to.
+  // The barrier will apply to the entire physical device allocation.
+  iree_hal_buffer_t* buffer;
+  // Relative offset/length within |buffer| (which may itself be mapped into the
+  // device allocation at an offset).
+  iree_device_size_t offset;
+  iree_device_size_t length;
+} iree_hal_buffer_barrier_t;
+
+// An RGBA color.
+typedef struct iree_hal_label_color_t {
+  uint8_t r;
+  uint8_t g;
+  uint8_t b;
+  uint8_t a;
+} iree_hal_label_color_t;
+
+// A source location attached to debug labels.
+typedef struct iree_hal_label_location_t {
+  iree_string_view_t file;
+  int line;
+} iree_hal_label_location_t;
+
+// An unspecified color; debugging tools are to choose their own.
+// Returns an all-zero RGBA value by value; the zero color is reserved to
+// mean "unspecified" rather than opaque black.
+static inline iree_hal_label_color_t iree_hal_label_color_unspecified() {
+  iree_hal_label_color_t color = {0, 0, 0, 0};
+  return color;
+}
+
+// Formats a command buffer mode bitfield as a string.
+// See iree_bitfield_format for usage.
+IREE_API_EXPORT iree_string_view_t
+iree_hal_command_buffer_mode_format(iree_hal_command_buffer_mode_t value,
+                                    iree_bitfield_string_temp_t* out_temp);
+
+// Formats a command category bitfield as a string.
+// See iree_bitfield_format for usage.
+IREE_API_EXPORT iree_string_view_t iree_hal_command_category_format(
+    iree_hal_command_category_t value, iree_bitfield_string_temp_t* out_temp);
+
+// Storage for command buffer validation state.
+// Designed to be embedded in concrete implementations that want validation.
+typedef struct iree_hal_command_buffer_validation_state_t {
+  iree_hal_device_t* device;
+  bool is_recording;
+  int32_t debug_group_depth;
+  // TODO(benvanik): current executable layout/descriptor set layout info.
+  // TODO(benvanik): valid push constant bit ranges.
+} iree_hal_command_buffer_validation_state_t;
+
+// Maximum size of any update in iree_hal_command_buffer_update_buffer.
+// 64KB is the limit on Vulkan and we uniformly use that today across all
+// targets as to not need too much command buffer memory.
+#define IREE_HAL_COMMAND_BUFFER_MAX_UPDATE_SIZE \
+  ((iree_device_size_t)(64 * 1024))
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_t
+//===----------------------------------------------------------------------===//
+
+// Asynchronous command buffer recording interface.
+// Commands are recorded by the implementation for later submission to command
+// queues.
+//
+// Buffers, events, and programs referenced must remain valid and not be
+// modified or read while there are commands in-flight. The usual flow is to
+// populate input buffers, dispatch using those buffers, wait on a semaphore
+// until the buffers are guaranteed to no longer be in use, and then reuse the
+// buffers. Lifetimes are managed by the command buffer and all used resources
+// will be retained for as long as the command buffer is live or until it is
+// reset.
+//
+// Errors that can be recognized when operations are enqueued will be returned
+// immediately, such as invalid argument errors. Errors that can only be
+// determined at execution time will be returned on semaphores. Once a failure
+// occurs the device queue will enter an error state that invalidates all
+// operations on the device queue (as ordering is not strict and any may still
+// be in-flight). In this case the user of the device queue should treat all
+// in-flight operations as cancelled and fully reset themselves. Other device
+// queues that may be waiting on events from the device queue will also enter
+// error states. Only once a user has acknowledged and cleared the error state
+// with a Reset the queue will become usable, and otherwise all operations will
+// return errors.
+//
+// Command buffers are thread-compatible. Use multiple command buffers if trying
+// to record commands from multiple threads. Command buffers must not be mutated
+// between when they are submitted for execution on a queue and when the
+// semaphore fires indicating the completion of their execution.
+typedef struct iree_hal_command_buffer_t iree_hal_command_buffer_t;
+
+// Creates a command buffer ready to begin recording, possibly reusing an
+// existing one from the |device| pool.
+//
+// |queue_affinity| specifies the device queues the command buffer may be
+// submitted to. The queue affinity provided to iree_hal_device_queue_submit
+// must match or be a subset of the |queue_affinity|.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_command_buffer_t** out_command_buffer);
+
+// Retains the given |command_buffer| for the caller.
+IREE_API_EXPORT void iree_hal_command_buffer_retain(
+    iree_hal_command_buffer_t* command_buffer);
+
+// Releases the given |command_buffer| from the caller.
+IREE_API_EXPORT void iree_hal_command_buffer_release(
+    iree_hal_command_buffer_t* command_buffer);
+
+IREE_API_EXPORT void* iree_hal_command_buffer_dyn_cast(
+    iree_hal_command_buffer_t* command_buffer, const void* vtable);
+
+// Returns a bitmask indicating the behavior of the command buffer.
+IREE_API_EXPORT iree_hal_command_buffer_mode_t
+iree_hal_command_buffer_mode(const iree_hal_command_buffer_t* command_buffer);
+
+// Returns a bitmask indicating which command categories this command buffer
+// can record.
+IREE_API_EXPORT iree_hal_command_category_t
+iree_hal_command_buffer_allowed_categories(
+    const iree_hal_command_buffer_t* command_buffer);
+
+// Resets and begins recording into the command buffer, clearing all
+// previously recorded contents.
+// The command buffer must not be in-flight.
+IREE_API_EXPORT iree_status_t
+iree_hal_command_buffer_begin(iree_hal_command_buffer_t* command_buffer);
+
+// Ends recording into the command buffer.
+// This must be called prior to submitting the command buffer for execution.
+IREE_API_EXPORT iree_status_t
+iree_hal_command_buffer_end(iree_hal_command_buffer_t* command_buffer);
+
+// Pushes a new debug group with the given |label|.
+// All commands between this and a mandatory matching call to
+// iree_hal_command_buffer_end_debug_group will be grouped together with the
+// given label. If a source location is available it can be provided via
+// |location| to allow mapping back into the source program that issued the
+// commands.
+//
+// An optional RGBA color to show in the debug UI may be provided via
+// |label_color|; otherwise iree_hal_label_color_unspecified can be used to let
+// the debug tool choose.
+IREE_API_EXPORT void iree_hal_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location);
+
+// Pops a debug group from the stack.
+IREE_API_EXPORT void iree_hal_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* command_buffer);
+
+// Defines a memory dependency between commands recorded before and after the
+// barrier. One or more memory or buffer barriers can be specified to indicate
+// between which stages or buffers the dependencies exist.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_execution_barrier(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers);
+
+// Sets an event to the signaled state.
+// |source_stage_mask| specifies when the event is signaled.
+//
+// Events are only valid within a single command buffer. Events can only be
+// used on non-transfer queues.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_signal_event(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask);
+
+// Resets an event to the non-signaled state.
+// |source_stage_mask| specifies when the event is unsignaled.
+//
+// Events are only valid within a single command buffer. Events can only be
+// used on non-transfer queues.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_reset_event(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask);
+
+// Waits for one or more events to be signaled and defines a memory dependency
+// between the synchronization scope of the signal operations and the commands
+// following the wait.
+//
+// |source_stage_mask| must include ExecutionStage::kHost for Event::Signal to
+// be visible.
+//
+// Events are only valid within a single command buffer. Events remain
+// signaled even after waiting and must be reset to be reused. Events can only
+// be used on non-transfer queues.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_wait_events(
+    iree_hal_command_buffer_t* command_buffer, iree_host_size_t event_count,
+    const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers);
+
+// Hints to the device queue that the given buffer will not be used again.
+// After encoding a discard the buffer contents will be considered undefined.
+// This is because the discard may be used to elide write backs to host memory
+// or aggressively reuse the allocation for other purposes.
+//
+// For buffers allocated with IREE_HAL_MEMORY_TYPE_TRANSIENT this may allow
+// the device queue to reclaim the memory used by the buffer earlier than
+// otherwise possible.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_discard_buffer(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* buffer);
+
+// Fills the target buffer with the given repeating value.
+// Expects that |pattern_length| is one of 1, 2, or 4 and that the offset and
+// length are aligned to the natural alignment of the value.
+// The target buffer must be compatible with the devices owned by this
+// device queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length,
+    const void* pattern, iree_host_size_t pattern_length);
+
+// Updates a range of the given target buffer from the source host memory.
+// The source host memory is copied immediately into the command buffer and
+// occupies command buffer space. It is strongly recommended that large buffer
+// updates are performed via iree_hal_command_buffer_copy_buffer where there is
+// the possibility of a zero-copy path.
+// The |source_buffer| may be released by the caller immediately after this
+// call returns.
+// The |target_buffer| must be compatible with the devices owned by this
+// device queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_update_buffer(
+    iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length);
+
+// Copies a range of one buffer to another.
+// Both buffers must be compatible with the devices owned by this device
+// queue and be allocated with IREE_HAL_BUFFER_USAGE_TRANSFER. Though the source
+// and target buffer may be the same the ranges must not overlap (as with
+// memcpy).
+//
+// This can be used to perform device->host, host->device, and device->device
+// copies.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* source_buffer,
+    iree_device_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length);
+
+// Pushes an inline set of constants that can be accessed by subsequent
+// dispatches using a compatible executable layout.
+//
+// Push constants are treated as opaque bytes, meaning that they may be
+// bit-casted floats, bit-packed booleans, etc. |offset| and |values_length| are
+// in bytes.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_push_constants(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length);
+
+// Pushes a descriptor set and associates it with |set|.
+// This uses an internal ringbuffer inside of the command buffer to avoid the
+// need for creating and binding descriptor sets and managing their lifetime.
+//
+// The descriptor set will remain bound and valid so long as the executable
+// layouts used by dispatches are compatible (same descriptor layouts and push
+// constant sizes).
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings);
+
+// Binds a descriptor set to the given |set| matching that used in the
+// executable layout interface.
+//
+// The descriptor set will remain bound and valid so long as the executable
+// layouts used by dispatches are compatible (same descriptor layouts and push
+// constant sizes).
+//
+// If any dynamic descriptor types are defined in the descriptor set layout then
+// the dynamic offsets must be provided. These offsets will be added to the base
+// offset of the descriptor layout binding.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_bind_descriptor_set(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets);
+
+// Dispatches an execution request.
+// The request may execute overlapped with any other transfer operation or
+// dispatch made within the same barrier-defined sequence.
+//
+// The executable specified must be registered for use with the device driver
+// owning this queue. It must not be unregistered until all requests that use
+// it have completed.
+//
+// Fails if the queue does not support dispatch operations (as indicated by
+// can_dispatch).
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_dispatch(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z);
+
+// Dispatches an execution request with deferred workgroup counts.
+// This is the same as iree_hal_command_buffer_dispatch but the workgroup counts
+// are read from the given |workgroups_buffer| at offset |workgroups_offset| as
+// 3 uint32_t XYZ values before performing the dispatch. This allows prior
+// dispatches within the command sequence to populate the workgroup counts.
+//
+// The buffer must have been allocated with IREE_HAL_BUFFER_USAGE_DISPATCH and
+// be of IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_dispatch_indirect(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer, iree_device_size_t workgroups_offset);
+
+//===----------------------------------------------------------------------===//
+// Utilities for command buffer creation
+//===----------------------------------------------------------------------===//
+
+// Defines a transfer command operation.
+// Acts as the discriminator for the union payload in
+// iree_hal_transfer_command_t; each value maps 1:1 to a recording API.
+typedef enum iree_hal_transfer_command_type_t {
+  // iree_hal_command_buffer_fill_buffer
+  IREE_HAL_TRANSFER_COMMAND_TYPE_FILL = 0u,
+  // iree_hal_command_buffer_copy_buffer
+  IREE_HAL_TRANSFER_COMMAND_TYPE_COPY = 1u,
+  // iree_hal_command_buffer_update_buffer
+  IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE = 2u,
+} iree_hal_transfer_command_type_t;
+
+// Represents a single transfer command within a batch of commands.
+// Exactly one union member is valid, selected by |type|; the member fields
+// mirror the parameters of the corresponding iree_hal_command_buffer_* call.
+typedef struct iree_hal_transfer_command_t {
+  // The type of the command selecting which of the payload data is used.
+  iree_hal_transfer_command_type_t type;
+  union {
+    // IREE_HAL_TRANSFER_COMMAND_TYPE_FILL
+    // Mirrors iree_hal_command_buffer_fill_buffer.
+    struct {
+      iree_hal_buffer_t* target_buffer;
+      iree_device_size_t target_offset;
+      iree_device_size_t length;
+      const void* pattern;
+      iree_host_size_t pattern_length;
+    } fill;
+    // IREE_HAL_TRANSFER_COMMAND_TYPE_COPY
+    // Mirrors iree_hal_command_buffer_copy_buffer (device buffer to device
+    // buffer).
+    struct {
+      iree_hal_buffer_t* source_buffer;
+      iree_device_size_t source_offset;
+      iree_hal_buffer_t* target_buffer;
+      iree_device_size_t target_offset;
+      iree_device_size_t length;
+    } copy;
+    // IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE
+    // Mirrors iree_hal_command_buffer_update_buffer; |source_buffer| here is
+    // host memory (const void*), not a HAL buffer.
+    struct {
+      const void* source_buffer;
+      iree_host_size_t source_offset;
+      iree_hal_buffer_t* target_buffer;
+      iree_device_size_t target_offset;
+      iree_device_size_t length;
+    } update;
+  };
+} iree_hal_transfer_command_t;
+
+// Builds a command buffer containing a recording of all |transfer_commands|.
+// All buffers must be compatible with |device| and ranges must not overlap
+// (same as with memcpy). All commands are executed concurrently with no
+// barriers. The provided commands and any referenced data needs only remain
+// live during recording, while all referenced buffers must be kept live by
+// the caller until the command buffer has completed execution.
+//
+// This is just a utility to make it easier to quickly construct batches of
+// transfer operations. If more control is required then record the command
+// buffer as normal.
+IREE_API_EXPORT iree_status_t iree_hal_create_transfer_command_buffer(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t transfer_count,
+    const iree_hal_transfer_command_t* transfer_commands,
+    iree_hal_command_buffer_t** out_command_buffer);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_t validation wrapper
+//===----------------------------------------------------------------------===//
+
+// Wraps |target_command_buffer| with a validation layer that checks the
+// parameters to each call in an attempt to return errors where usage may result
+// in failed or incorrect execution. This layer adds many additional checks to
+// each call but must be used when dealing with untrusted incoming commands.
+//
+// The validation is strictly input argument and permission-based and not a full
+// verification of the correctness of any barriers or memory dependencies. A
+// command buffer recording that has passed validation does not indicate that it
+// is guaranteed to make forward progress or properly observe memory visibility
+// or availability rules. Instead, validation ensures that no command references
+// memory outside of the allowed ranges or accesses memory in violation of the
+// allowed usage or access rights.
+IREE_API_EXPORT iree_status_t iree_hal_command_buffer_wrap_validation(
+    iree_hal_device_t* device, iree_hal_command_buffer_t* target_command_buffer,
+    iree_hal_command_buffer_t** out_command_buffer);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_t implementation details
+//===----------------------------------------------------------------------===//
+
+// Function pointer table backing the public iree_hal_command_buffer_* API.
+// Each entry mirrors the public declaration of the same name above; see those
+// declarations for the semantics of each call. Implementations supply one
+// table per concrete command buffer type.
+typedef struct iree_hal_command_buffer_vtable_t {
+  // Lifetime management; called when the resource refcount reaches zero.
+  // NOTE(review): refcount behavior assumed from the iree_hal_resource_t
+  // member on the struct below - confirm against resource.h.
+  void(IREE_API_PTR* destroy)(iree_hal_command_buffer_t* command_buffer);
+
+  // Implementation-specific cast keyed on |vtable| identity.
+  void*(IREE_API_PTR* dyn_cast)(iree_hal_command_buffer_t* command_buffer,
+                                const void* vtable);
+
+  // Recording lifecycle: begin must precede commands, end finalizes.
+  iree_status_t(IREE_API_PTR* begin)(iree_hal_command_buffer_t* command_buffer);
+  iree_status_t(IREE_API_PTR* end)(iree_hal_command_buffer_t* command_buffer);
+
+  // Debug/tracing annotations (nestable label groups).
+  void(IREE_API_PTR* begin_debug_group)(
+      iree_hal_command_buffer_t* command_buffer, iree_string_view_t label,
+      iree_hal_label_color_t label_color,
+      const iree_hal_label_location_t* location);
+  void(IREE_API_PTR* end_debug_group)(
+      iree_hal_command_buffer_t* command_buffer);
+
+  // Synchronization commands.
+  iree_status_t(IREE_API_PTR* execution_barrier)(
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_execution_stage_t source_stage_mask,
+      iree_hal_execution_stage_t target_stage_mask,
+      iree_hal_execution_barrier_flags_t flags,
+      iree_host_size_t memory_barrier_count,
+      const iree_hal_memory_barrier_t* memory_barriers,
+      iree_host_size_t buffer_barrier_count,
+      const iree_hal_buffer_barrier_t* buffer_barriers);
+
+  iree_status_t(IREE_API_PTR* signal_event)(
+      iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+      iree_hal_execution_stage_t source_stage_mask);
+
+  iree_status_t(IREE_API_PTR* reset_event)(
+      iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+      iree_hal_execution_stage_t source_stage_mask);
+
+  iree_status_t(IREE_API_PTR* wait_events)(
+      iree_hal_command_buffer_t* command_buffer, iree_host_size_t event_count,
+      const iree_hal_event_t** events,
+      iree_hal_execution_stage_t source_stage_mask,
+      iree_hal_execution_stage_t target_stage_mask,
+      iree_host_size_t memory_barrier_count,
+      const iree_hal_memory_barrier_t* memory_barriers,
+      iree_host_size_t buffer_barrier_count,
+      const iree_hal_buffer_barrier_t* buffer_barriers);
+
+  // Transfer commands.
+  iree_status_t(IREE_API_PTR* discard_buffer)(
+      iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* buffer);
+
+  iree_status_t(IREE_API_PTR* fill_buffer)(
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+      iree_device_size_t length, const void* pattern,
+      iree_host_size_t pattern_length);
+
+  iree_status_t(IREE_API_PTR* update_buffer)(
+      iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
+      iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+      iree_device_size_t target_offset, iree_device_size_t length);
+
+  iree_status_t(IREE_API_PTR* copy_buffer)(
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+      iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+      iree_device_size_t length);
+
+  // Dispatch state and execution commands.
+  iree_status_t(IREE_API_PTR* push_constants)(
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+      const void* values, iree_host_size_t values_length);
+
+  iree_status_t(IREE_API_PTR* push_descriptor_set)(
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_executable_layout_t* executable_layout, uint32_t set,
+      iree_host_size_t binding_count,
+      const iree_hal_descriptor_set_binding_t* bindings);
+
+  iree_status_t(IREE_API_PTR* bind_descriptor_set)(
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_executable_layout_t* executable_layout, uint32_t set,
+      iree_hal_descriptor_set_t* descriptor_set,
+      iree_host_size_t dynamic_offset_count,
+      const iree_device_size_t* dynamic_offsets);
+
+  iree_status_t(IREE_API_PTR* dispatch)(
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_executable_t* executable, int32_t entry_point,
+      uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z);
+
+  iree_status_t(IREE_API_PTR* dispatch_indirect)(
+      iree_hal_command_buffer_t* command_buffer,
+      iree_hal_executable_t* executable, int32_t entry_point,
+      iree_hal_buffer_t* workgroups_buffer,
+      iree_device_size_t workgroups_offset);
+} iree_hal_command_buffer_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_command_buffer_vtable_t);
+
+// Base storage shared by all command buffer implementations; concrete types
+// embed this as their first member so the vtable dispatch helpers work.
+struct iree_hal_command_buffer_t {
+  // Common reference-counted resource header.
+  iree_hal_resource_t resource;
+  // Mode bits the command buffer was created with.
+  iree_hal_command_buffer_mode_t mode;
+  // Categories of commands (transfer/dispatch/...) permitted for recording.
+  iree_hal_command_category_t allowed_categories;
+  // Queues on the device the command buffer may be submitted to.
+  iree_hal_queue_affinity_t queue_affinity;
+
+#if IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+  // Recording-time validation bookkeeping; compiled out in release configs
+  // that disable validation.
+  iree_hal_command_buffer_validation_state_t validation;
+#endif  // IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+};
+
+IREE_API_EXPORT void iree_hal_command_buffer_initialize(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_command_buffer_vtable_t* vtable,
+    iree_hal_command_buffer_t* command_buffer);
+
+IREE_API_EXPORT void iree_hal_command_buffer_destroy(
+    iree_hal_command_buffer_t* command_buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_COMMAND_BUFFER_H_
diff --git a/runtime/src/iree/hal/command_buffer_validation.c b/runtime/src/iree/hal/command_buffer_validation.c
new file mode 100644
index 0000000..8ccf775
--- /dev/null
+++ b/runtime/src/iree/hal/command_buffer_validation.c
@@ -0,0 +1,441 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/command_buffer_validation.h"
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/allocator.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/descriptor_set.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/device.h"
+#include "iree/hal/event.h"
+#include "iree/hal/executable.h"
+#include "iree/hal/executable_layout.h"
+#include "iree/hal/resource.h"
+
+#if IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+// Accessor for the validation state embedded in iree_hal_command_buffer_t.
+#define VALIDATION_STATE(command_buffer) (&(command_buffer)->validation)
+#else
+// Validation compiled out: expands to a typed NULL. NOTE(review): any
+// dereference in this configuration would be a NULL deref - presumably this
+// translation unit is excluded from the build when validation is disabled;
+// confirm against the build rules.
+#define VALIDATION_STATE(command_buffer) \
+  ((iree_hal_command_buffer_validation_state_t*)NULL)
+#endif  // IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
+
+// Returns success iff the queue supports the given command categories.
+// |required_categories| may contain multiple bits; all must be present in the
+// command buffer's allowed_categories.
+static iree_status_t iree_hal_command_buffer_validate_categories(
+    const iree_hal_command_buffer_t* command_buffer,
+    iree_hal_command_category_t required_categories) {
+  if (!iree_all_bits_set(command_buffer->allowed_categories,
+                         required_categories)) {
+#if IREE_STATUS_MODE
+    // Status messages enabled: format both bitfields into readable strings
+    // for the error annotation.
+    iree_bitfield_string_temp_t temp0, temp1;
+    iree_string_view_t required_categories_str =
+        iree_hal_command_category_format(required_categories, &temp0);
+    iree_string_view_t allowed_categories_str =
+        iree_hal_command_category_format(command_buffer->allowed_categories,
+                                         &temp1);
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "operation requires categories %.*s but command buffer only supports "
+        "%.*s",
+        (int)required_categories_str.size, required_categories_str.data,
+        (int)allowed_categories_str.size, allowed_categories_str.data);
+#else
+    // Status messages compiled out: return the bare status code.
+    return iree_status_from_code(IREE_STATUS_FAILED_PRECONDITION);
+#endif  // IREE_STATUS_MODE
+  }
+  return iree_ok_status();
+}
+
+// Returns success iff the buffer is compatible with the device.
+// Queries the device allocator with the buffer's memory type and the
+// intersection of its allowed usage with |intended_usage|, then requires all
+// bits of |required_compatibility| to be reported back.
+static iree_status_t iree_hal_command_buffer_validate_buffer_compatibility(
+    const iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* buffer,
+    iree_hal_buffer_compatibility_t required_compatibility,
+    iree_hal_buffer_usage_t intended_usage) {
+  iree_hal_buffer_compatibility_t allowed_compatibility =
+      iree_hal_allocator_query_compatibility(
+          iree_hal_device_allocator(VALIDATION_STATE(command_buffer)->device),
+          (iree_hal_buffer_params_t){
+              .type = iree_hal_buffer_memory_type(buffer),
+              // Mask to the usage actually permitted on the buffer so the
+              // allocator judges only the realizable usage bits.
+              .usage = iree_hal_buffer_allowed_usage(buffer) & intended_usage,
+          },
+          iree_hal_buffer_allocation_size(buffer));
+  if (!iree_all_bits_set(allowed_compatibility, required_compatibility)) {
+#if IREE_STATUS_MODE
+    // Buffer cannot be used on the queue for the given usage.
+    iree_bitfield_string_temp_t temp0, temp1;
+    iree_string_view_t allowed_usage_str = iree_hal_buffer_usage_format(
+        iree_hal_buffer_allowed_usage(buffer), &temp0);
+    iree_string_view_t intended_usage_str =
+        iree_hal_buffer_usage_format(intended_usage, &temp1);
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "requested buffer usage is not supported for the buffer on this queue; "
+        "buffer allows %.*s, operation requires %.*s (allocator compatibility "
+        "mismatch)",
+        (int)allowed_usage_str.size, allowed_usage_str.data,
+        (int)intended_usage_str.size, intended_usage_str.data);
+#else
+    return iree_status_from_code(IREE_STATUS_PERMISSION_DENIED);
+#endif  // IREE_STATUS_MODE
+  }
+  return iree_ok_status();
+}
+
+// Returns success iff the currently bound descriptor sets are valid for the
+// given executable entry point.
+// Currently a placeholder that always succeeds; the intended checks are
+// tracked in the TODOs below.
+static iree_status_t iree_hal_command_buffer_validate_dispatch_bindings(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point) {
+  // TODO(benvanik): validate buffers referenced have compatible memory types
+  // and access rights.
+  // TODO(benvanik): validate no aliasing between inputs/outputs.
+  return iree_ok_status();
+}
+
+// Initializes the validation state tracked for |command_buffer|: remembers
+// the owning |device| (used for allocator compatibility queries) and marks
+// the buffer as not recording.
+void iree_hal_command_buffer_initialize_validation(
+    iree_hal_device_t* device, iree_hal_command_buffer_t* command_buffer) {
+  VALIDATION_STATE(command_buffer)->device = device;
+  VALIDATION_STATE(command_buffer)->is_recording = false;
+}
+
+// Validates a transition into the recording state.
+// Fails with FAILED_PRECONDITION if recording was already begun without a
+// matching end.
+iree_status_t iree_hal_command_buffer_begin_validation(
+    iree_hal_command_buffer_t* command_buffer) {
+  if (VALIDATION_STATE(command_buffer)->is_recording) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "command buffer is already in a recording state");
+  }
+  VALIDATION_STATE(command_buffer)->is_recording = true;
+  return iree_ok_status();
+}
+
+// Validates a transition out of the recording state.
+// Checks that all debug groups were balanced (depth back to 0) and that the
+// command buffer was actually recording; only then clears the recording flag.
+iree_status_t iree_hal_command_buffer_end_validation(
+    iree_hal_command_buffer_t* command_buffer) {
+  if (VALIDATION_STATE(command_buffer)->debug_group_depth != 0) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "unbalanced debug group depth (expected 0, is %d)",
+        VALIDATION_STATE(command_buffer)->debug_group_depth);
+  }
+  if (!VALIDATION_STATE(command_buffer)->is_recording) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "command buffer is not in a recording state");
+  }
+  VALIDATION_STATE(command_buffer)->is_recording = false;
+  return iree_ok_status();
+}
+
+// Tracks entry into a (nestable) debug group. The label/color/location
+// arguments are not validated; only the nesting depth is counted.
+void iree_hal_command_buffer_begin_debug_group_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  ++VALIDATION_STATE(command_buffer)->debug_group_depth;
+}
+
+// Tracks exit from a debug group. Unbalanced pops are not caught here; the
+// final depth is verified in iree_hal_command_buffer_end_validation.
+void iree_hal_command_buffer_end_debug_group_validation(
+    iree_hal_command_buffer_t* command_buffer) {
+  --VALIDATION_STATE(command_buffer)->debug_group_depth;
+}
+
+// Validates an execution barrier request.
+// Barriers are permitted for every command category, so no category check is
+// performed; deeper synchronization checks are deferred (see TODO).
+iree_status_t iree_hal_command_buffer_execution_barrier_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  // NOTE: all command buffer types can perform this so no need to check.
+
+  // TODO(benvanik): additional synchronization validation.
+
+  return iree_ok_status();
+}
+
+// Validates an event signal request: requires DISPATCH category support.
+iree_status_t iree_hal_command_buffer_signal_event_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
+
+  // TODO(benvanik): additional synchronization validation.
+
+  return iree_ok_status();
+}
+
+// Validates an event reset request: requires DISPATCH category support.
+iree_status_t iree_hal_command_buffer_reset_event_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
+
+  // TODO(benvanik): additional synchronization validation.
+
+  return iree_ok_status();
+}
+
+// Validates an event wait request: requires DISPATCH category support.
+// The events list and barrier arrays are not themselves inspected yet.
+iree_status_t iree_hal_command_buffer_wait_events_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_host_size_t event_count,
+    const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
+
+  // TODO(benvanik): additional synchronization validation.
+
+  return iree_ok_status();
+}
+
+// Validates a buffer discard hint: requires TRANSFER category support and a
+// device-visible buffer.
+iree_status_t iree_hal_command_buffer_discard_buffer_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* buffer) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
+
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
+      iree_hal_buffer_memory_type(buffer),
+      IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE));
+
+  return iree_ok_status();
+}
+
+// Validates a fill request against the target buffer's compatibility, memory
+// type, access rights, usage, and range, then checks that |pattern_length| is
+// 1/2/4 bytes and that |target_offset|/|length| are aligned to it.
+iree_status_t iree_hal_command_buffer_fill_buffer_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length,
+    const void* pattern, iree_host_size_t pattern_length) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
+      command_buffer, target_buffer,
+      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
+      IREE_HAL_BUFFER_USAGE_TRANSFER));
+
+  // Target must be device-visible, writable, transfer-capable, and the
+  // requested range must lie within the buffer.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
+      iree_hal_buffer_memory_type(target_buffer),
+      IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
+      iree_hal_buffer_allowed_access(target_buffer),
+      IREE_HAL_MEMORY_ACCESS_WRITE));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
+      iree_hal_buffer_allowed_usage(target_buffer),
+      IREE_HAL_BUFFER_USAGE_TRANSFER));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_validate_range(target_buffer, target_offset, length));
+
+  // Ensure the value length is supported.
+  if (pattern_length != 1 && pattern_length != 2 && pattern_length != 4) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "fill value length is not one of the supported "
+                            "values (pattern_length=%zu)",
+                            pattern_length);
+  }
+
+  // Ensure the offset and length have an alignment matching the value length.
+  if ((target_offset % pattern_length) != 0 || (length % pattern_length) != 0) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "fill offset and/or length do not match the natural alignment of the "
+        "fill value (target_offset=%" PRIdsz ", length=%" PRIdsz
+        ", pattern_length=%zu)",
+        target_offset, length, pattern_length);
+  }
+
+  return iree_ok_status();
+}
+
+// Validates a host->device update against the target buffer's compatibility,
+// memory type, access rights, usage, and range.
+// NOTE(review): |source_buffer|/|source_offset| refer to host memory and are
+// not range-checked here - presumably the caller bounds the host copy when
+// snapshotting the bytes into the command buffer; confirm at the call sites.
+iree_status_t iree_hal_command_buffer_update_buffer_validation(
+    iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
+      command_buffer, target_buffer,
+      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
+      IREE_HAL_BUFFER_USAGE_TRANSFER));
+
+  // Target must be device-visible, writable, transfer-capable, and the
+  // requested range must lie within the buffer.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
+      iree_hal_buffer_memory_type(target_buffer),
+      IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
+      iree_hal_buffer_allowed_access(target_buffer),
+      IREE_HAL_MEMORY_ACCESS_WRITE));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
+      iree_hal_buffer_allowed_usage(target_buffer),
+      IREE_HAL_BUFFER_USAGE_TRANSFER));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_validate_range(target_buffer, target_offset, length));
+
+  return iree_ok_status();
+}
+
+// Validates a buffer->buffer copy: both ends must be queue-transfer
+// compatible and transfer-capable, source readable and target writable, the
+// ranges in-bounds, at least one buffer device-visible, and the source/target
+// ranges must not overlap.
+iree_status_t iree_hal_command_buffer_copy_buffer_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* source_buffer,
+    iree_device_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_TRANSFER));
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
+      command_buffer, source_buffer,
+      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
+      IREE_HAL_BUFFER_USAGE_TRANSFER));
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
+      command_buffer, target_buffer,
+      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER,
+      IREE_HAL_BUFFER_USAGE_TRANSFER));
+
+  // Source side: readable, transfer-capable, range in-bounds.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
+      iree_hal_buffer_allowed_access(source_buffer),
+      IREE_HAL_MEMORY_ACCESS_READ));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
+      iree_hal_buffer_allowed_usage(source_buffer),
+      IREE_HAL_BUFFER_USAGE_TRANSFER));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_validate_range(source_buffer, source_offset, length));
+
+  // Target side: writable, transfer-capable, range in-bounds.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
+      iree_hal_buffer_allowed_usage(target_buffer),
+      IREE_HAL_BUFFER_USAGE_TRANSFER));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
+      iree_hal_buffer_allowed_access(target_buffer),
+      IREE_HAL_MEMORY_ACCESS_WRITE));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_validate_range(target_buffer, target_offset, length));
+
+  // At least source or destination must be device-visible to enable
+  // host->device, device->host, and device->device.
+  // TODO(b/117338171): host->host copies.
+  if (!iree_any_bit_set(iree_hal_buffer_memory_type(source_buffer),
+                        IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE) &&
+      !iree_any_bit_set(iree_hal_buffer_memory_type(target_buffer),
+                        IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+#if IREE_STATUS_MODE
+    // Format both memory types for the annotated error.
+    iree_bitfield_string_temp_t temp0, temp1;
+    iree_string_view_t source_memory_type_str = iree_hal_memory_type_format(
+        iree_hal_buffer_memory_type(source_buffer), &temp0);
+    iree_string_view_t target_memory_type_str = iree_hal_memory_type_format(
+        iree_hal_buffer_memory_type(target_buffer), &temp1);
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "at least one buffer must be device-visible for a copy; "
+        "source_buffer=%.*s, target_buffer=%.*s",
+        (int)source_memory_type_str.size, source_memory_type_str.data,
+        (int)target_memory_type_str.size, target_memory_type_str.data);
+#else
+    return iree_status_from_code(IREE_STATUS_PERMISSION_DENIED);
+#endif  // IREE_STATUS_MODE
+  }
+
+  // Check for overlap - just like memcpy we don't handle that.
+  if (iree_hal_buffer_test_overlap(source_buffer, source_offset, length,
+                                   target_buffer, target_offset, length) !=
+      IREE_HAL_BUFFER_OVERLAP_DISJOINT) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "source and target ranges overlap within the same buffer");
+  }
+
+  return iree_ok_status();
+}
+
+// Validates a push-constants update: requires DISPATCH category support and a
+// 4-byte-multiple |values_length|.
+// NOTE(review): |offset| alignment is not checked here - presumably it should
+// also be 4-byte aligned; the layout-based range check is still a TODO.
+iree_status_t iree_hal_command_buffer_push_constants_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
+
+  if (IREE_UNLIKELY((values_length % 4) != 0)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "invalid alignment %zu, must be 4-byte aligned",
+                            values_length);
+  }
+
+  // TODO(benvanik): validate offset and value count with layout.
+
+  return iree_ok_status();
+}
+
+// Validates an inline descriptor set push: requires DISPATCH category
+// support. Set index and binding contents are not yet validated (see TODOs).
+iree_status_t iree_hal_command_buffer_push_descriptor_set_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
+
+  // TODO(benvanik): validate set index.
+  // TODO(benvanik): validate binding_offset.
+  // TODO(benvanik): validate bindings.
+
+  return iree_ok_status();
+}
+
+// Validates a descriptor set bind: requires DISPATCH category support. Set
+// index and dynamic offsets are not yet validated (see TODOs).
+iree_status_t iree_hal_command_buffer_bind_descriptor_set_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
+
+  // TODO(benvanik): validate set index.
+  // TODO(benvanik): validate dynamic offsets (both count and offsets).
+
+  return iree_ok_status();
+}
+
+// Validates a direct dispatch: requires DISPATCH category support and valid
+// bindings for the entry point (binding check is currently a placeholder).
+// Workgroup counts are not range-checked here.
+iree_status_t iree_hal_command_buffer_dispatch_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_dispatch_bindings(
+      command_buffer, executable, entry_point));
+  return iree_ok_status();
+}
+
+// Validates an indirect dispatch: in addition to the direct-dispatch checks
+// the workgroup-count buffer must be dispatch-compatible, device-visible,
+// readable, and hold 3 uint32_t XYZ values at |workgroups_offset|.
+iree_status_t iree_hal_command_buffer_dispatch_indirect_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer,
+    iree_device_size_t workgroups_offset) {
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_categories(
+      command_buffer, IREE_HAL_COMMAND_CATEGORY_DISPATCH));
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_buffer_compatibility(
+      command_buffer, workgroups_buffer,
+      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH,
+      IREE_HAL_BUFFER_USAGE_DISPATCH));
+
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
+      iree_hal_buffer_memory_type(workgroups_buffer),
+      IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_access(
+      iree_hal_buffer_allowed_access(workgroups_buffer),
+      IREE_HAL_MEMORY_ACCESS_READ));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage(
+      iree_hal_buffer_allowed_usage(workgroups_buffer),
+      IREE_HAL_BUFFER_USAGE_DISPATCH));
+  // The XYZ triple read at dispatch time must be fully in-bounds.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_range(
+      workgroups_buffer, workgroups_offset, sizeof(uint32_t) * 3));
+
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_validate_dispatch_bindings(
+      command_buffer, executable, entry_point));
+
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree/hal/command_buffer_validation.h b/runtime/src/iree/hal/command_buffer_validation.h
new file mode 100644
index 0000000..42ab881
--- /dev/null
+++ b/runtime/src/iree/hal/command_buffer_validation.h
@@ -0,0 +1,104 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_COMMAND_BUFFER_VALIDATION_H_
+#define IREE_HAL_COMMAND_BUFFER_VALIDATION_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/command_buffer.h"
+
+void iree_hal_command_buffer_initialize_validation(
+    iree_hal_device_t* device, iree_hal_command_buffer_t* command_buffer);
+
+iree_status_t iree_hal_command_buffer_begin_validation(
+    iree_hal_command_buffer_t* command_buffer);
+
+iree_status_t iree_hal_command_buffer_end_validation(
+    iree_hal_command_buffer_t* command_buffer);
+
+void iree_hal_command_buffer_begin_debug_group_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location);
+
+void iree_hal_command_buffer_end_debug_group_validation(
+    iree_hal_command_buffer_t* command_buffer);
+
+iree_status_t iree_hal_command_buffer_execution_barrier_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers);
+
+iree_status_t iree_hal_command_buffer_signal_event_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask);
+
+iree_status_t iree_hal_command_buffer_reset_event_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask);
+
+iree_status_t iree_hal_command_buffer_wait_events_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_host_size_t event_count,
+    const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers);
+
+iree_status_t iree_hal_command_buffer_discard_buffer_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* buffer);
+
+iree_status_t iree_hal_command_buffer_fill_buffer_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length,
+    const void* pattern, iree_host_size_t pattern_length);
+
+iree_status_t iree_hal_command_buffer_update_buffer_validation(
+    iree_hal_command_buffer_t* command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length);
+
+iree_status_t iree_hal_command_buffer_copy_buffer_validation(
+    iree_hal_command_buffer_t* command_buffer, iree_hal_buffer_t* source_buffer,
+    iree_device_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length);
+
+iree_status_t iree_hal_command_buffer_push_constants_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length);
+
+iree_status_t iree_hal_command_buffer_push_descriptor_set_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings);
+
+iree_status_t iree_hal_command_buffer_bind_descriptor_set_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets);
+
+iree_status_t iree_hal_command_buffer_dispatch_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z);
+
+iree_status_t iree_hal_command_buffer_dispatch_indirect_validation(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer, iree_device_size_t workgroups_offset);
+
+#endif  // IREE_HAL_COMMAND_BUFFER_VALIDATION_H_
diff --git a/runtime/src/iree/hal/cts/CMakeLists.txt b/runtime/src/iree/hal/cts/CMakeLists.txt
new file mode 100644
index 0000000..0216567
--- /dev/null
+++ b/runtime/src/iree/hal/cts/CMakeLists.txt
@@ -0,0 +1,194 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+set(IREE_ALL_CTS_TESTS
+  "allocator"
+  "buffer_mapping"
+  "command_buffer"
+  "command_buffer_dispatch"
+  "descriptor_set"
+  "descriptor_set_layout"
+  "driver"
+  "event"
+  "executable_cache"
+  "executable_layout"
+  "semaphore"
+  "semaphore_submission"
+  PARENT_SCOPE
+)
+
+# These tests use executables produced by the iree-translate compiler tool.
+# If the compiler is disabled or a HAL driver implementation is not yet
+# connected to a functional compiler target, these tests can be skipped.
+set(IREE_EXECUTABLE_CTS_TESTS
+  "command_buffer_dispatch"
+  "executable_cache"
+  PARENT_SCOPE
+)
+
+# List of testdata/{name}.mlir source files.
+set(IREE_ALL_CTS_EXECUTABLE_SOURCES
+  "command_buffer_dispatch_test"
+  "executable_cache_test"
+  PARENT_SCOPE
+)
+
+iree_cc_library(
+  NAME
+    cts_test_base
+  HDRS
+    "cts_test_base.h"
+  DEPS
+    iree::base
+    iree::hal
+    iree::testing::gtest
+  TESTONLY
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    allocator_test_library
+  HDRS
+    "allocator_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    buffer_mapping_test_library
+  HDRS
+    "buffer_mapping_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    command_buffer_test_library
+  HDRS
+    "command_buffer_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    command_buffer_dispatch_test_library
+  HDRS
+    "command_buffer_dispatch_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    descriptor_set_test_library
+  HDRS
+    "descriptor_set_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    descriptor_set_layout_test_library
+  HDRS
+    "descriptor_set_layout_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    driver_test_library
+  HDRS
+    "driver_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    event_test_library
+  HDRS
+    "event_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    executable_layout_test_library
+  HDRS
+    "executable_layout_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    executable_cache_test_library
+  HDRS
+    "executable_cache_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    semaphore_test_library
+  HDRS
+    "semaphore_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
+
+iree_cc_library(
+  NAME
+    semaphore_submission_test_library
+  HDRS
+    "semaphore_submission_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+)
diff --git a/runtime/src/iree/hal/cts/README.md b/runtime/src/iree/hal/cts/README.md
new file mode 100644
index 0000000..0bd8cb6
--- /dev/null
+++ b/runtime/src/iree/hal/cts/README.md
@@ -0,0 +1,38 @@
+# Conformance Test Suite (CTS) for HAL implementations
+
+These tests exercise IREE's Hardware Abstraction Layer (HAL) in a way that
+checks for conformance across implementations and devices. The tests themselves
+are structured to help with HAL driver development by using individual features
+in isolation, demonstrating typical full-system usage, and pointing out where
+capabilities are optional.
+
+## Usage
+
+Each HAL driver (in-tree or out-of-tree) can use the `iree_hal_cts_test_suite()`
+CMake function to create a set of tests. See the documentation in
+[iree_hal_cts_test_suite.cmake](../../build_tools/cmake/iree_hal_cts_test_suite.cmake)
+and [cts_test_base.h](cts_test_base.h) for concrete details.
+
+## On testing for error conditions
+
+In general, error states are only lightly tested because the low level APIs that
+IREE's HAL is designed to thinly abstract over often assume programmer usage
+will be correct and treat errors as undefined behavior. See the Vulkan spec:
+
+* https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/chap3.html#introduction-conventions
+* https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/chap4.html#fundamentals-errors
+
+While the generic tests in the CTS may not be able to check for error conditions
+exhaustively, individual HAL implementations can implement stricter behavior
+or enable higher level checks like what the
+[Vulkan Validation Layers](https://github.com/KhronosGroup/Vulkan-ValidationLayers)
+provide.
+
+## Tips for adding new HAL implementations
+
+* Driver (`iree_hal_driver_t`) and device (`iree_hal_device_t`) creation, tested
+  in [driver_test](driver_test.h), are both prerequisites for all tests.
+* Tests for individual components (e.g.
+  [descriptor_set_layout_test](descriptor_set_layout_test.h)) are more
+  approachable than tests which use collections of components together (e.g.
+  [command_buffer_test](command_buffer_test.h)).
diff --git a/runtime/src/iree/hal/cts/allocator_test.h b/runtime/src/iree/hal/cts/allocator_test.h
new file mode 100644
index 0000000..4a1103c
--- /dev/null
+++ b/runtime/src/iree/hal/cts/allocator_test.h
@@ -0,0 +1,113 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_ALLOCATOR_TEST_H_
+#define IREE_HAL_CTS_ALLOCATOR_TEST_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+namespace {
+
+constexpr iree_device_size_t kAllocationSize = 1024;
+
+}  // namespace
+
+class allocator_test : public CtsTestBase {};
+
+// All allocators must support some baseline capabilities.
+//
+// Certain capabilities or configurations are optional and may vary between
+// driver implementations or target devices, such as:
+//   IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL
+//   IREE_HAL_BUFFER_USAGE_MAPPING
+TEST_P(allocator_test, BaselineBufferCompatibility) {
+  // Need at least one way to get data between the host and device.
+  iree_hal_buffer_params_t host_local_params = {0};
+  host_local_params.type =
+      IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+  host_local_params.usage = IREE_HAL_BUFFER_USAGE_TRANSFER;
+  iree_hal_buffer_compatibility_t transfer_compatibility_host =
+      iree_hal_allocator_query_compatibility(
+          device_allocator_, host_local_params, kAllocationSize);
+
+  iree_hal_buffer_params_t device_local_params = {0};
+  device_local_params.type =
+      IREE_HAL_MEMORY_TYPE_HOST_VISIBLE | IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL;
+  device_local_params.usage = IREE_HAL_BUFFER_USAGE_TRANSFER;
+  iree_hal_buffer_compatibility_t transfer_compatibility_device =
+      iree_hal_allocator_query_compatibility(
+          device_allocator_, device_local_params, kAllocationSize);
+
+  iree_hal_buffer_compatibility_t required_transfer_compatibility =
+      IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE |
+      IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
+  EXPECT_TRUE(iree_all_bits_set(transfer_compatibility_host,
+                                required_transfer_compatibility) ||
+              iree_all_bits_set(transfer_compatibility_device,
+                                required_transfer_compatibility));
+
+  // Need to be able to use some type of buffer as dispatch inputs or outputs.
+  iree_hal_buffer_params_t dispatch_params = {0};
+  dispatch_params.type =
+      IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+  dispatch_params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH;
+  iree_hal_buffer_compatibility_t dispatch_compatibility =
+      iree_hal_allocator_query_compatibility(device_allocator_, dispatch_params,
+                                             kAllocationSize);
+  EXPECT_TRUE(
+      iree_all_bits_set(dispatch_compatibility,
+                        IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE |
+                            IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH));
+}
+
+TEST_P(allocator_test, AllocateBuffer) {
+  iree_hal_buffer_params_t params = {0};
+  params.type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL;
+  params.usage = IREE_HAL_BUFFER_USAGE_TRANSFER;
+  iree_hal_buffer_t* buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
+      device_allocator_, params, kAllocationSize, iree_const_byte_span_empty(),
+      &buffer));
+
+  // At a minimum, the requested memory type should be respected.
+  // Additional bits may be optionally set depending on the allocator.
+  EXPECT_TRUE(
+      iree_all_bits_set(iree_hal_buffer_memory_type(buffer), params.type));
+  EXPECT_TRUE(
+      iree_all_bits_set(iree_hal_buffer_allowed_usage(buffer), params.usage));
+  EXPECT_GE(iree_hal_buffer_allocation_size(buffer),
+            kAllocationSize);  // Larger is okay.
+
+  iree_hal_buffer_release(buffer);
+}
+
+// While empty allocations aren't particularly useful, they can occur in
+// practice so we should at least be able to create them without errors.
+TEST_P(allocator_test, AllocateEmptyBuffer) {
+  iree_hal_buffer_params_t params = {0};
+  params.type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL;
+  params.usage = IREE_HAL_BUFFER_USAGE_TRANSFER;
+  iree_hal_buffer_t* buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
+      device_allocator_, params, /*allocation_size=*/0,
+      iree_const_byte_span_empty(), &buffer));
+
+  iree_hal_buffer_release(buffer);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_ALLOCATOR_TEST_H_
diff --git a/runtime/src/iree/hal/cts/buffer_mapping_test.h b/runtime/src/iree/hal/cts/buffer_mapping_test.h
new file mode 100644
index 0000000..2810efa
--- /dev/null
+++ b/runtime/src/iree/hal/cts/buffer_mapping_test.h
@@ -0,0 +1,554 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_BUFFER_MAPPING_TEST_H_
+#define IREE_HAL_CTS_BUFFER_MAPPING_TEST_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+using ::testing::ContainerEq;
+
+namespace {
+constexpr iree_device_size_t kDefaultAllocationSize = 1024;
+}  // namespace
+
+// Tests for buffer mapping (IREE_HAL_BUFFER_USAGE_MAPPING) support and
+// for `iree_hal_buffer_*` functions which require buffer mapping.
+//
+// Note that most of these tests first write into a buffer using one or more
+// functions then read the (possibly partial) contents of that buffer using
+// `iree_hal_buffer_map_read`. As the buffer read implementation is
+// nontrivial, particularly on implementations with complex host/device splits,
+// test failures may indicate issues in either the code doing the writing or the
+// code doing the reading.
+//
+// Where applicable, tests for each function are organized in increasing order
+// of complexity, such as:
+//   * write to full buffer
+//   * write with an offset and length
+//   * write into a subspan of a buffer
+
+class buffer_mapping_test : public CtsTestBase {
+ protected:
+  void AllocateUninitializedBuffer(iree_device_size_t buffer_size,
+                                   iree_hal_buffer_t** out_buffer) {
+    iree_hal_buffer_params_t params = {0};
+    params.type =
+        IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
+    params.usage =
+        IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_MAPPING;
+    iree_hal_buffer_t* device_buffer = NULL;
+    IREE_CHECK_OK(iree_hal_allocator_allocate_buffer(
+        iree_hal_device_allocator(device_), params, buffer_size,
+        iree_const_byte_span_empty(), &device_buffer));
+    *out_buffer = device_buffer;
+  }
+};
+
+TEST_P(buffer_mapping_test, AllocatorSupportsBufferMapping) {
+  iree_hal_buffer_params_t params = {0};
+  params.type = IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
+  params.usage = IREE_HAL_BUFFER_USAGE_MAPPING;
+  iree_hal_buffer_compatibility_t compatibility =
+      iree_hal_allocator_query_compatibility(device_allocator_, params,
+                                             kDefaultAllocationSize);
+  EXPECT_TRUE(iree_all_bits_set(compatibility,
+                                IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE));
+
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(kDefaultAllocationSize, &buffer);
+
+  EXPECT_TRUE(
+      iree_all_bits_set(iree_hal_buffer_memory_type(buffer), params.type));
+  EXPECT_TRUE(
+      iree_all_bits_set(iree_hal_buffer_allowed_usage(buffer), params.usage));
+  EXPECT_GE(iree_hal_buffer_allocation_size(buffer), kDefaultAllocationSize);
+
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, ZeroWholeBuffer) {
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(kDefaultAllocationSize, &buffer);
+
+  // Zero the entire buffer.
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_zero(buffer, /*byte_offset=*/0, IREE_WHOLE_BUFFER));
+
+  // Check that the contents match what we expect.
+  std::vector<uint8_t> actual_data(kDefaultAllocationSize);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer(kDefaultAllocationSize);
+  std::memset(reference_buffer.data(), 0, kDefaultAllocationSize);
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, ZeroWithOffset) {
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  // Fill the entire buffer then zero only a segment of it.
+  uint8_t fill_value = 0xFF;
+  IREE_ASSERT_OK(iree_hal_buffer_map_fill(buffer, /*byte_offset=*/0,
+                                          IREE_WHOLE_BUFFER, &fill_value,
+                                          sizeof(fill_value)));
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_zero(buffer, /*byte_offset=*/4, /*byte_length=*/8));
+
+  // Check that the contents match what we expect.
+  std::vector<uint8_t> actual_data(buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer{0xFF, 0xFF, 0xFF, 0xFF,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0xFF, 0xFF, 0xFF, 0xFF};
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, ZeroSubspan) {
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  // Fill the entire buffer.
+  uint8_t fill_value = 0xFF;
+  IREE_ASSERT_OK(iree_hal_buffer_map_fill(buffer, /*byte_offset=*/0,
+                                          IREE_WHOLE_BUFFER, &fill_value,
+                                          sizeof(fill_value)));
+
+  // Create a subspan.
+  iree_device_size_t subspan_length = 8;
+  iree_hal_buffer_t* buffer_subspan = NULL;
+  IREE_ASSERT_OK(iree_hal_buffer_subspan(buffer, /*byte_offset=*/4,
+                                         subspan_length, &buffer_subspan));
+
+  // Zero part of the subspan.
+  IREE_ASSERT_OK(iree_hal_buffer_map_zero(buffer_subspan, /*byte_offset=*/4,
+                                          /*byte_length=*/4));
+
+  // Check that the contents match what we expect.
+  std::vector<uint8_t> actual_data(buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer{0xFF, 0xFF, 0xFF, 0xFF,  //
+                                        0xFF, 0xFF, 0xFF, 0xFF,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0xFF, 0xFF, 0xFF, 0xFF};
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+  // Also check the subspan.
+  std::vector<uint8_t> actual_data_subspan(subspan_length);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(buffer_subspan, /*source_offset=*/0,
+                                          actual_data_subspan.data(),
+                                          actual_data_subspan.size()));
+  std::vector<uint8_t> reference_buffer_subspan{0xFF, 0xFF, 0xFF, 0xFF,  //
+                                                0x00, 0x00, 0x00, 0x00};
+  EXPECT_THAT(actual_data_subspan, ContainerEq(reference_buffer_subspan));
+
+  iree_hal_buffer_release(buffer_subspan);
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, FillEmpty) {
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(kDefaultAllocationSize, &buffer);
+
+  // Zero the whole buffer then "fill" 0 bytes with a different pattern.
+  IREE_ASSERT_OK(iree_hal_buffer_map_zero(buffer, 0, IREE_WHOLE_BUFFER));
+  uint8_t fill_value = 0xFF;
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_fill(buffer, /*byte_offset=*/0,
+                               /*byte_length=*/0,  // <---- empty!
+                               /*pattern=*/&fill_value,
+                               /*pattern_length=*/sizeof(fill_value)));
+
+  // Check that the buffer is still all zeroes.
+  std::vector<uint8_t> actual_data(kDefaultAllocationSize);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer(kDefaultAllocationSize);
+  std::memset(reference_buffer.data(), 0, kDefaultAllocationSize);
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, FillWholeBuffer) {
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(kDefaultAllocationSize, &buffer);
+
+  uint8_t fill_value = 0xFF;
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_fill(buffer, /*byte_offset=*/0,
+                               /*byte_length=*/IREE_WHOLE_BUFFER,
+                               /*pattern=*/&fill_value,
+                               /*pattern_length=*/sizeof(fill_value)));
+
+  // Check that the buffer is filled with the pattern.
+  std::vector<uint8_t> actual_data(kDefaultAllocationSize);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer(kDefaultAllocationSize);
+  std::memset(reference_buffer.data(), fill_value, kDefaultAllocationSize);
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, FillWithOffset) {
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  // Zero the entire buffer then fill only a segment of it.
+  IREE_ASSERT_OK(iree_hal_buffer_map_zero(buffer, 0, IREE_WHOLE_BUFFER));
+  uint8_t fill_value = 0xFF;
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_fill(buffer, /*byte_offset=*/4,
+                               /*byte_length=*/8,
+                               /*pattern=*/&fill_value,
+                               /*pattern_length=*/sizeof(fill_value)));
+
+  // Check that only the segment of the buffer is filled with the pattern.
+  std::vector<uint8_t> actual_data(buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_offset_buffer{0x00, 0x00, 0x00, 0x00,  //
+                                               0xFF, 0xFF, 0xFF, 0xFF,  //
+                                               0xFF, 0xFF, 0xFF, 0xFF,  //
+                                               0x00, 0x00, 0x00, 0x00};
+  EXPECT_THAT(actual_data, ContainerEq(reference_offset_buffer));
+
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, FillSubspan) {
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  // Zero the entire buffer.
+  IREE_ASSERT_OK(iree_hal_buffer_map_zero(buffer, 0, IREE_WHOLE_BUFFER));
+
+  // Create a subspan.
+  iree_device_size_t subspan_length = 8;
+  iree_hal_buffer_t* buffer_subspan = NULL;
+  IREE_ASSERT_OK(iree_hal_buffer_subspan(buffer, /*byte_offset=*/4,
+                                         subspan_length, &buffer_subspan));
+
+  // Fill part of the subspan.
+  uint8_t fill_value = 0xFF;
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_fill(buffer_subspan, /*byte_offset=*/4,
+                               /*byte_length=*/4,
+                               /*pattern=*/&fill_value,
+                               /*pattern_length=*/sizeof(fill_value)));
+
+  // Check that the contents match what we expect.
+  std::vector<uint8_t> actual_data(buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0xFF, 0xFF, 0xFF, 0xFF,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+  // Also check the subspan.
+  std::vector<uint8_t> actual_data_subspan(subspan_length);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(buffer_subspan, /*source_offset=*/0,
+                                          actual_data_subspan.data(),
+                                          actual_data_subspan.size()));
+  std::vector<uint8_t> reference_buffer_subspan{0x00, 0x00, 0x00, 0x00,  //
+                                                0xFF, 0xFF, 0xFF, 0xFF};
+  EXPECT_THAT(actual_data_subspan, ContainerEq(reference_buffer_subspan));
+
+  iree_hal_buffer_release(buffer_subspan);
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, ReadData) {
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  // Zero the first half, fill the second half.
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_zero(buffer, /*byte_offset=*/0, /*byte_length=*/8));
+  uint8_t fill_value = 0xFF;
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_fill(buffer, /*byte_offset=*/8,
+                               /*byte_length=*/8,
+                               /*pattern=*/&fill_value,
+                               /*pattern_length=*/sizeof(fill_value)));
+
+  // Read the entire buffer.
+  std::vector<uint8_t> actual_data(buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0xFF, 0xFF, 0xFF, 0xFF,  //
+                                        0xFF, 0xFF, 0xFF, 0xFF};
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  // Read only a segment of the buffer.
+  std::vector<uint8_t> actual_data_offset(8);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(buffer, /*source_offset=*/4,
+                                          actual_data_offset.data(),
+                                          actual_data_offset.size()));
+  std::vector<uint8_t> reference_buffer_offset{0x00, 0x00, 0x00, 0x00,  //
+                                               0xFF, 0xFF, 0xFF, 0xFF};
+  EXPECT_THAT(actual_data_offset, ContainerEq(reference_buffer_offset));
+
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, ReadDataSubspan) {
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  // Fill a few segments with distinct values.
+  uint8_t value = 0xAA;
+  IREE_ASSERT_OK(iree_hal_buffer_map_fill(buffer, 0, 4, &value, sizeof(value)));
+  value = 0xBB;
+  IREE_ASSERT_OK(iree_hal_buffer_map_fill(buffer, 4, 4, &value, sizeof(value)));
+  value = 0xCC;
+  IREE_ASSERT_OK(iree_hal_buffer_map_fill(buffer, 8, 4, &value, sizeof(value)));
+  value = 0xDD;
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_fill(buffer, 12, 4, &value, sizeof(value)));
+
+  // Create a subspan.
+  iree_device_size_t subspan_length = 8;
+  iree_hal_buffer_t* buffer_subspan = NULL;
+  IREE_ASSERT_OK(iree_hal_buffer_subspan(buffer, /*byte_offset=*/4,
+                                         subspan_length, &buffer_subspan));
+
+  // Read the entire buffer subspan.
+  std::vector<uint8_t> actual_data(subspan_length);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(buffer_subspan, /*source_offset=*/0,
+                                          actual_data.data(),
+                                          actual_data.size()));
+  std::vector<uint8_t> reference_buffer{0xBB, 0xBB, 0xBB, 0xBB,  //
+                                        0xCC, 0xCC, 0xCC, 0xCC};
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  // Read only a segment of the buffer.
+  std::vector<uint8_t> actual_data_offset(4);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(buffer_subspan, /*source_offset=*/4,
+                                          actual_data_offset.data(),
+                                          actual_data_offset.size()));
+  std::vector<uint8_t> reference_buffer_offset{0xCC, 0xCC, 0xCC, 0xCC};
+  EXPECT_THAT(actual_data_offset, ContainerEq(reference_buffer_offset));
+
+  iree_hal_buffer_release(buffer_subspan);
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, WriteDataWholeBuffer) {  // map_write spanning the full buffer, verified via map_read.
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  // Write over the whole buffer.
+  uint8_t fill_value = 0xFF;
+  std::vector<uint8_t> reference_buffer(buffer_size);
+  std::memset(reference_buffer.data(), fill_value, buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_write(buffer, /*target_offset=*/0,
+                                           reference_buffer.data(),
+                                           reference_buffer.size()));
+
+  // Check that entire buffer was written to.
+  std::vector<uint8_t> actual_data(buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));  // Read-back must match the 0xFF pattern exactly.
+
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, WriteDataWithOffset) {  // map_write at a non-zero offset leaves surrounding bytes zeroed.
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  // Zero the entire buffer.
+  IREE_ASSERT_OK(iree_hal_buffer_map_zero(buffer, 0, IREE_WHOLE_BUFFER));
+
+  // Write over part of the buffer.
+  std::vector<uint8_t> fill_buffer{0x11, 0x22, 0x33, 0x44,  //
+                                   0x55, 0x66, 0x77, 0x88};
+  IREE_ASSERT_OK(iree_hal_buffer_map_write(
+      buffer, /*target_offset=*/4, fill_buffer.data(), fill_buffer.size()));
+
+  // Check that the contents match what we expect.
+  std::vector<uint8_t> actual_data(buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x00, 0x00,  //
+                                        0x11, 0x22, 0x33, 0x44,  //
+                                        0x55, 0x66, 0x77, 0x88,  //
+                                        0x00, 0x00, 0x00, 0x00};  // 8 written bytes at [4,12); zeros elsewhere.
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, WriteDataSubspan) {  // Writing via a subspan lands at parent offset 4 + subspan offset 4.
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  // Zero the entire buffer.
+  IREE_ASSERT_OK(iree_hal_buffer_map_zero(buffer, 0, IREE_WHOLE_BUFFER));
+
+  // Create a subspan.
+  iree_device_size_t subspan_length = 8;
+  iree_hal_buffer_t* buffer_subspan = NULL;
+  IREE_ASSERT_OK(iree_hal_buffer_subspan(buffer, /*byte_offset=*/4,
+                                         subspan_length, &buffer_subspan));  // Subspan covers parent bytes [4,12).
+
+  // Write over part of the subspan.
+  std::vector<uint8_t> fill_buffer{0x11, 0x22, 0x33, 0x44};
+  IREE_ASSERT_OK(iree_hal_buffer_map_write(buffer_subspan, /*target_offset=*/4,
+                                           fill_buffer.data(),
+                                           fill_buffer.size()));
+
+  // Check that the contents match what we expect.
+  std::vector<uint8_t> actual_data(buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x11, 0x22, 0x33, 0x44,  //
+                                        0x00, 0x00, 0x00, 0x00};  // Write lands at parent bytes [8,12).
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+  // Also check the subspan.
+  std::vector<uint8_t> actual_data_subspan(subspan_length);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(buffer_subspan, /*source_offset=*/0,
+                                          actual_data_subspan.data(),
+                                          actual_data_subspan.size()));
+  std::vector<uint8_t> reference_buffer_subspan{0x00, 0x00, 0x00, 0x00,  //
+                                                0x11, 0x22, 0x33, 0x44};
+  EXPECT_THAT(actual_data_subspan, ContainerEq(reference_buffer_subspan));
+
+  iree_hal_buffer_release(buffer_subspan);
+  iree_hal_buffer_release(buffer);
+}
+
+TEST_P(buffer_mapping_test, CopyData) {  // Fill buffer_a, map_copy a->b, then read b back and compare.
+  iree_hal_buffer_t* buffer_a = NULL;
+  iree_hal_buffer_t* buffer_b = NULL;
+  AllocateUninitializedBuffer(kDefaultAllocationSize, &buffer_a);
+  AllocateUninitializedBuffer(kDefaultAllocationSize, &buffer_b);
+
+  uint8_t fill_value = 0x07;
+  IREE_ASSERT_OK(
+      iree_hal_buffer_map_fill(buffer_a, /*byte_offset=*/0,
+                               /*byte_length=*/kDefaultAllocationSize,
+                               /*pattern=*/&fill_value,
+                               /*pattern_length=*/sizeof(fill_value)));
+  IREE_ASSERT_OK(iree_hal_buffer_map_copy(
+      /*source_buffer=*/buffer_a,
+      /*source_offset=*/0, /*target_buffer=*/buffer_b, /*target_offset=*/0,
+      /*data_length=*/kDefaultAllocationSize));
+
+  std::vector<uint8_t> reference_buffer(kDefaultAllocationSize);
+  std::memset(reference_buffer.data(), fill_value, kDefaultAllocationSize);
+
+  std::vector<uint8_t> actual_data(kDefaultAllocationSize);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer_b, /*source_offset=*/0, actual_data.data(), actual_data.size()));  // Read from the copy target, not the source.
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  iree_hal_buffer_release(buffer_a);
+  iree_hal_buffer_release(buffer_b);
+}
+
+// Maps a buffer range for reading from device -> host.
+// This is roughly what iree_hal_buffer_map_read does internally.
+TEST_P(buffer_mapping_test, MapRangeRead) {  // Direct map_range with READ access; contents must show the prior fill.
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  uint8_t fill_value = 0xEF;
+  IREE_ASSERT_OK(iree_hal_buffer_map_fill(buffer, /*byte_offset=*/0,
+                                          IREE_WHOLE_BUFFER, &fill_value,
+                                          sizeof(fill_value)));
+
+  iree_hal_buffer_mapping_t mapping;
+  IREE_ASSERT_OK(iree_hal_buffer_map_range(
+      buffer, IREE_HAL_MAPPING_MODE_SCOPED, IREE_HAL_MEMORY_ACCESS_READ,
+      /*byte_offset=*/0, /*byte_length=*/buffer_size, &mapping));
+  EXPECT_EQ(buffer, mapping.buffer);
+  EXPECT_GE(mapping.contents.data_length, (iree_host_size_t)buffer_size);  // GE: implementations may map more than requested.
+
+  std::vector<uint8_t> reference_buffer(buffer_size);
+  std::memset(reference_buffer.data(), fill_value, buffer_size);
+  std::vector<uint8_t> mapping_data(
+      mapping.contents.data,
+      mapping.contents.data + mapping.contents.data_length);
+  EXPECT_THAT(mapping_data, ContainerEq(reference_buffer));
+
+  iree_hal_buffer_unmap_range(&mapping);  // Must unmap before releasing the buffer.
+  iree_hal_buffer_release(buffer);
+}
+
+// Maps a buffer range for writing from host -> device.
+// This is roughly what iree_hal_buffer_map_write does internally.
+TEST_P(buffer_mapping_test, MapRangeWrite) {  // Direct map_range with DISCARD_WRITE; memset + flush, then verify via map_read.
+  iree_device_size_t buffer_size = 16;
+  iree_hal_buffer_t* buffer = NULL;
+  AllocateUninitializedBuffer(buffer_size, &buffer);
+
+  iree_hal_buffer_mapping_t mapping;
+  IREE_ASSERT_OK(iree_hal_buffer_map_range(
+      buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+      IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE,
+      /*byte_offset=*/0, /*byte_length=*/buffer_size, &mapping));
+  EXPECT_EQ(buffer, mapping.buffer);
+  EXPECT_GE(mapping.contents.data_length, (iree_host_size_t)buffer_size);  // GE: implementations may map more than requested.
+
+  // Write into the mapped memory, flush for device access, then read back.
+  uint8_t fill_value = 0x12;
+  std::memset(mapping.contents.data, fill_value, buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_flush_range(&mapping, /*byte_offset=*/0,
+                                             /*byte_length=*/buffer_size));  // Flush makes host writes visible to the device.
+  std::vector<uint8_t> actual_data(buffer_size);
+  IREE_ASSERT_OK(iree_hal_buffer_map_read(
+      buffer, /*source_offset=*/0, actual_data.data(), actual_data.size()));
+  std::vector<uint8_t> reference_buffer(buffer_size);
+  std::memset(reference_buffer.data(), fill_value, buffer_size);
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  iree_hal_buffer_unmap_range(&mapping);
+  iree_hal_buffer_release(buffer);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_BUFFER_MAPPING_TEST_H_
diff --git a/runtime/src/iree/hal/cts/command_buffer_dispatch_test.h b/runtime/src/iree/hal/cts/command_buffer_dispatch_test.h
new file mode 100644
index 0000000..d30b5d0
--- /dev/null
+++ b/runtime/src/iree/hal/cts/command_buffer_dispatch_test.h
@@ -0,0 +1,155 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_COMMAND_BUFFER_DISPATCH_TEST_H_
+#define IREE_HAL_CTS_COMMAND_BUFFER_DISPATCH_TEST_H_
+
+#include "iree/base/api.h"
+#include "iree/base/string_view.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+class command_buffer_dispatch_test : public CtsTestBase {  // Fixture: loads the "abs" test kernel and its layout objects.
+ protected:
+  void PrepareAbsExecutable() {  // Builds cache, descriptor set layout, executable layout, and the executable itself.
+    IREE_ASSERT_OK(iree_hal_executable_cache_create(
+        device_, iree_make_cstring_view("default"),
+        iree_loop_inline(&loop_status_), &executable_cache_));
+
+    iree_hal_descriptor_set_layout_binding_t descriptor_set_layout_bindings[] =
+        {
+            {0, IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},  // binding 0: input storage buffer.
+            {1, IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},  // binding 1: output storage buffer.
+        };
+    IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+        device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_PUSH_ONLY,
+        IREE_ARRAYSIZE(descriptor_set_layout_bindings),
+        descriptor_set_layout_bindings, &descriptor_set_layout_));
+    IREE_ASSERT_OK(iree_hal_executable_layout_create(
+        device_, /*push_constants=*/0, /*set_layout_count=*/1,
+        &descriptor_set_layout_, &executable_layout_));
+
+    iree_hal_executable_params_t executable_params;
+    iree_hal_executable_params_initialize(&executable_params);
+    executable_params.caching_mode =
+        IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA;  // Executable data is aliased, not copied.
+    executable_params.executable_format =
+        iree_make_cstring_view(get_test_executable_format());
+    executable_params.executable_data = get_test_executable_data(
+        iree_make_cstring_view("command_buffer_dispatch_test.bin"));
+    executable_params.executable_layout_count = 1;
+    executable_params.executable_layouts = &executable_layout_;
+
+    IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable(
+        executable_cache_, &executable_params, &executable_));
+  }
+
+  void CleanupExecutable() {  // Releases in reverse creation order, then checks the inline loop's status.
+    iree_hal_executable_release(executable_);
+    iree_hal_executable_layout_release(executable_layout_);
+    iree_hal_descriptor_set_layout_release(descriptor_set_layout_);
+    iree_hal_executable_cache_release(executable_cache_);
+    IREE_ASSERT_OK(loop_status_);
+  }
+
+  iree_status_t loop_status_ = iree_ok_status();  // Populated by iree_loop_inline during cache creation.
+  iree_hal_executable_cache_t* executable_cache_ = NULL;
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout_ = NULL;
+  iree_hal_executable_layout_t* executable_layout_ = NULL;
+  iree_hal_executable_t* executable_ = NULL;
+};
+
+TEST_P(command_buffer_dispatch_test, DispatchAbs) {  // Dispatches the abs kernel on -2.5f and expects 2.5f back.
+  PrepareAbsExecutable();
+
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_,
+      IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
+          IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+
+  // Create input and output buffers.
+  iree_hal_buffer_params_t input_params = {0};
+  input_params.type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL;
+  input_params.usage =
+      IREE_HAL_BUFFER_USAGE_DISPATCH | IREE_HAL_BUFFER_USAGE_TRANSFER;
+  iree_hal_buffer_view_t* input_buffer_view = NULL;
+  float input_data[1] = {-2.5f};  // Single negative element; abs() should flip the sign.
+  IREE_ASSERT_OK(iree_hal_buffer_view_allocate_buffer(
+      device_allocator_, /*shape=*/NULL,
+      /*shape_rank=*/0, IREE_HAL_ELEMENT_TYPE_FLOAT_32,
+      IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, input_params,
+      iree_make_const_byte_span((void*)input_data, sizeof(input_data)),
+      &input_buffer_view));
+  iree_hal_buffer_params_t output_params = {0};
+  output_params.type =
+      IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
+  output_params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                        IREE_HAL_BUFFER_USAGE_TRANSFER |
+                        IREE_HAL_BUFFER_USAGE_MAPPING;
+  iree_hal_buffer_t* output_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
+      device_allocator_, output_params, sizeof(float),
+      iree_const_byte_span_empty(), &output_buffer));
+
+  iree_hal_descriptor_set_binding_t descriptor_set_bindings[] = {
+      {/*binding=*/0, iree_hal_buffer_view_buffer(input_buffer_view),
+       /*offset=*/0, iree_hal_buffer_view_byte_length(input_buffer_view)},
+      {/*binding=*/1, output_buffer, iree_hal_buffer_byte_offset(output_buffer),
+       iree_hal_buffer_byte_length(output_buffer)},
+  };
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_push_descriptor_set(
+      command_buffer, executable_layout_, /*set=*/0,
+      IREE_ARRAYSIZE(descriptor_set_bindings), descriptor_set_bindings));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_dispatch(
+      command_buffer, executable_, /*entry_point=*/0,
+      /*workgroup_x=*/1, /*workgroup_y=*/1, /*workgroup_z=*/1));  // Single workgroup suffices for one element.
+  IREE_ASSERT_OK(iree_hal_command_buffer_execution_barrier(
+      command_buffer,
+      /*source_stage_mask=*/IREE_HAL_EXECUTION_STAGE_DISPATCH |
+          IREE_HAL_EXECUTION_STAGE_TRANSFER |
+          IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
+      /*target_stage_mask=*/IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE |
+          IREE_HAL_EXECUTION_STAGE_DISPATCH | IREE_HAL_EXECUTION_STAGE_TRANSFER,
+      IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, /*memory_barrier_count=*/0,
+      /*memory_barriers=*/NULL,
+      /*buffer_barrier_count=*/0, /*buffer_barriers=*/NULL));  // Orders the dispatch before the later readback.
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_DISPATCH,
+                                            command_buffer));
+
+  float output_value = 0.0f;
+  IREE_ASSERT_OK(iree_hal_device_transfer_d2h(
+      device_, output_buffer,
+      /*source_offset=*/0, &output_value, sizeof(output_value),
+      IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));
+  EXPECT_EQ(2.5f, output_value);  // abs(-2.5f) == 2.5f.
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_buffer_release(output_buffer);
+  iree_hal_buffer_view_release(input_buffer_view);
+  CleanupExecutable();
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_COMMAND_BUFFER_DISPATCH_TEST_H_
diff --git a/runtime/src/iree/hal/cts/command_buffer_test.h b/runtime/src/iree/hal/cts/command_buffer_test.h
new file mode 100644
index 0000000..2327197
--- /dev/null
+++ b/runtime/src/iree/hal/cts/command_buffer_test.h
@@ -0,0 +1,574 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_COMMAND_BUFFER_TEST_H_
+#define IREE_HAL_CTS_COMMAND_BUFFER_TEST_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+using ::testing::ContainerEq;
+
+namespace {
+constexpr iree_device_size_t kDefaultAllocationSize = 1024;
+}  // namespace
+
+class command_buffer_test : public CtsTestBase {  // Fixture with helpers shared by the copy/fill tests below.
+ protected:
+  void CreateZeroedDeviceBuffer(iree_device_size_t buffer_size,
+                                iree_hal_buffer_t** out_buffer) {  // Allocates a mappable device-local buffer and zeroes it.
+    iree_hal_buffer_params_t params = {0};
+    params.type =
+        IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
+    params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                   IREE_HAL_BUFFER_USAGE_TRANSFER |
+                   IREE_HAL_BUFFER_USAGE_MAPPING;
+    iree_hal_buffer_t* device_buffer = NULL;
+    IREE_CHECK_OK(iree_hal_allocator_allocate_buffer(
+        iree_hal_device_allocator(device_), params, buffer_size,
+        iree_const_byte_span_empty(), &device_buffer));
+    IREE_ASSERT_OK(
+        iree_hal_buffer_map_zero(device_buffer, 0, IREE_WHOLE_BUFFER));  // Known contents so fill tests can assert untouched bytes.
+    *out_buffer = device_buffer;  // Ownership transfers to the caller, who must release.
+  }
+
+  std::vector<uint8_t> RunFillBufferTest(iree_device_size_t buffer_size,
+                                         iree_device_size_t target_offset,
+                                         iree_device_size_t fill_length,
+                                         const void* pattern,
+                                         iree_host_size_t pattern_length) {  // Records a fill command, submits it, returns the resulting bytes.
+    iree_hal_buffer_t* device_buffer = NULL;
+    CreateZeroedDeviceBuffer(buffer_size, &device_buffer);
+
+    iree_hal_command_buffer_t* command_buffer = NULL;
+    IREE_CHECK_OK(iree_hal_command_buffer_create(
+        device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+        IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
+        &command_buffer));
+    IREE_CHECK_OK(iree_hal_command_buffer_begin(command_buffer));
+
+    // Fill the pattern.
+    IREE_CHECK_OK(iree_hal_command_buffer_fill_buffer(
+        command_buffer, device_buffer, target_offset, fill_length, pattern,
+        pattern_length));
+    IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
+    IREE_CHECK_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_ANY,
+                                             command_buffer));
+
+    // Read data for returning.
+    std::vector<uint8_t> actual_data(buffer_size);
+    IREE_CHECK_OK(iree_hal_device_transfer_d2h(
+        device_, device_buffer, /*source_offset=*/0,
+        /*target_buffer=*/actual_data.data(),
+        /*data_length=*/buffer_size, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+        iree_infinite_timeout()));
+
+    // Cleanup and return.
+    iree_hal_command_buffer_release(command_buffer);
+    iree_hal_buffer_release(device_buffer);
+    return actual_data;
+  }
+};
+
+TEST_P(command_buffer_test, Create) {  // Creation succeeds and reports the requested DISPATCH category.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+
+  EXPECT_TRUE((iree_hal_command_buffer_allowed_categories(command_buffer) &
+               IREE_HAL_COMMAND_CATEGORY_DISPATCH) ==
+              IREE_HAL_COMMAND_CATEGORY_DISPATCH);  // DISPATCH must be within the allowed categories bitmask.
+
+  iree_hal_command_buffer_release(command_buffer);
+}
+
+TEST_P(command_buffer_test, BeginEnd) {  // Empty begin/end recording succeeds without submission.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  iree_hal_command_buffer_release(command_buffer);
+}
+
+TEST_P(command_buffer_test, SubmitEmpty) {  // An empty command buffer can be submitted and completes.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_DISPATCH,
+                                            command_buffer));
+
+  iree_hal_command_buffer_release(command_buffer);
+}
+
+TEST_P(command_buffer_test, CopyWholeBuffer) {  // Recorded copy_buffer moves a full host buffer to the device.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+
+  uint8_t i8_val = 0x54;
+  std::vector<uint8_t> reference_buffer(kDefaultAllocationSize);
+  std::memset(reference_buffer.data(), i8_val, kDefaultAllocationSize);
+
+  // Create and fill a host buffer.
+  iree_hal_buffer_params_t host_params = {0};
+  host_params.type =
+      IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+  host_params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                      IREE_HAL_BUFFER_USAGE_TRANSFER |
+                      IREE_HAL_BUFFER_USAGE_MAPPING;
+  iree_hal_buffer_t* host_buffer = nullptr;
+  IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
+      device_allocator_, host_params, kDefaultAllocationSize,
+      iree_make_const_byte_span(reference_buffer.data(),
+                                reference_buffer.size()),  // Initial contents copied in at allocation time.
+      &host_buffer));
+
+  // Create a device buffer.
+  iree_hal_buffer_params_t device_params = {0};
+  device_params.type =
+      IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
+  device_params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                        IREE_HAL_BUFFER_USAGE_TRANSFER |
+                        IREE_HAL_BUFFER_USAGE_MAPPING;
+  iree_hal_buffer_t* device_buffer = nullptr;
+  IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
+      device_allocator_, device_params, kDefaultAllocationSize,
+      iree_const_byte_span_empty(), &device_buffer));
+
+  // Copy the host buffer to the device buffer.
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_copy_buffer(
+      command_buffer, /*source_buffer=*/host_buffer, /*source_offset=*/0,
+      /*target_buffer=*/device_buffer, /*target_offset=*/0,
+      /*length=*/kDefaultAllocationSize));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_TRANSFER,
+                                            command_buffer));
+
+  // Read the device buffer and compare.
+  std::vector<uint8_t> actual_data(kDefaultAllocationSize);
+  IREE_ASSERT_OK(iree_hal_device_transfer_d2h(
+      device_, device_buffer, /*source_offset=*/0,
+      /*target_buffer=*/actual_data.data(),
+      /*data_length=*/kDefaultAllocationSize,
+      IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  // Must release the command buffer before resources used by it.
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_buffer_release(device_buffer);
+  iree_hal_buffer_release(host_buffer);
+}
+
+TEST_P(command_buffer_test, CopySubBuffer) {  // Copies a sub-range host->device with zero fills around it.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+
+  iree_hal_buffer_params_t device_params = {0};
+  device_params.type =
+      IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
+  device_params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                        IREE_HAL_BUFFER_USAGE_TRANSFER |
+                        IREE_HAL_BUFFER_USAGE_MAPPING;
+  iree_hal_buffer_t* device_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
+      device_allocator_, device_params, kDefaultAllocationSize,
+      iree_const_byte_span_empty(), &device_buffer));
+
+  uint8_t i8_val = 0x88;
+  std::vector<uint8_t> reference_buffer(kDefaultAllocationSize);
+  std::memset(reference_buffer.data() + 8, i8_val,
+              kDefaultAllocationSize / 2 - 4);  // Expected: 0x88 in [8, 8 + size/2 - 4); zeros elsewhere.
+
+  // Create a host buffer holding half of the pattern data.
+  iree_hal_buffer_params_t host_params = {0};
+  host_params.type =
+      IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+  host_params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                      IREE_HAL_BUFFER_USAGE_TRANSFER |
+                      IREE_HAL_BUFFER_USAGE_MAPPING;
+  std::vector<uint8_t> host_buffer_data(kDefaultAllocationSize, i8_val);
+  iree_hal_buffer_t* host_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
+      device_allocator_, host_params, host_buffer_data.size() / 2,
+      iree_make_const_byte_span(host_buffer_data.data(),
+                                host_buffer_data.size() / 2),
+      &host_buffer));
+
+  // Copy the host buffer to the device buffer; zero fill the untouched bytes.
+  uint8_t zero_val = 0x0;
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_fill_buffer(
+      command_buffer, device_buffer, /*target_offset=*/0, /*length=*/8,
+      &zero_val, /*pattern_length=*/sizeof(zero_val)));  // Leading 8 bytes zeroed.
+  IREE_ASSERT_OK(iree_hal_command_buffer_copy_buffer(
+      command_buffer, /*source_buffer=*/host_buffer, /*source_offset=*/4,
+      /*target_buffer=*/device_buffer, /*target_offset=*/8,
+      /*length=*/kDefaultAllocationSize / 2 - 4));  // Sub-range copy with distinct source/target offsets.
+  IREE_ASSERT_OK(iree_hal_command_buffer_fill_buffer(
+      command_buffer, device_buffer,
+      /*target_offset=*/8 + kDefaultAllocationSize / 2 - 4,
+      /*length=*/kDefaultAllocationSize - (8 + kDefaultAllocationSize / 2 - 4),
+      &zero_val,
+      /*pattern_length=*/sizeof(zero_val)));  // Trailing remainder zeroed.
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_TRANSFER,
+                                            command_buffer));
+
+  // Read the device buffer and compare.
+  std::vector<uint8_t> actual_data(kDefaultAllocationSize);
+  IREE_ASSERT_OK(iree_hal_device_transfer_d2h(
+      device_, device_buffer, /*source_offset=*/0,
+      /*target_buffer=*/actual_data.data(),
+      /*data_length=*/kDefaultAllocationSize,
+      IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  // Must release the command buffer before resources used by it.
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_buffer_release(device_buffer);
+  iree_hal_buffer_release(host_buffer);
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern1_size1_offset0_length1) {  // 1-byte pattern fills a 1-byte buffer.
+  iree_device_size_t buffer_size = 1;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 1;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x07};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern1_size5_offset0_length5) {  // 1-byte pattern, non-power-of-two length.
+  iree_device_size_t buffer_size = 5;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 5;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x07, 0x07, 0x07, 0x07,  //
+                                        0x07};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern1_size16_offset0_length1) {  // Single byte filled; rest stays zeroed.
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 1;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x07, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern1_size16_offset0_length3) {  // Odd fill length with a 1-byte pattern.
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 3;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x07, 0x07, 0x07, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern1_size16_offset0_length8) {  // Fills the first half; second half untouched.
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 8;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x07, 0x07, 0x07, 0x07,  //
+                                        0x07, 0x07, 0x07, 0x07,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern1_size16_offset2_length8) {  // Unaligned (offset 2) fill with a 1-byte pattern.
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 2;
+  iree_device_size_t fill_length = 8;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x07, 0x07,  //
+                                        0x07, 0x07, 0x07, 0x07,  //
+                                        0x07, 0x07, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern2_size2_offset0_length2) {  // 2-byte pattern; expected bytes are little-endian.
+  iree_device_size_t buffer_size = 2;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 2;
+  uint16_t pattern = 0xAB23;
+  std::vector<uint8_t> reference_buffer{0x23, 0xAB};  // Low byte first.
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern2_size16_offset0_length8) {  // 2-byte pattern repeated over the first half.
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 8;
+  uint16_t pattern = 0xAB23;
+  std::vector<uint8_t> reference_buffer{0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern2_size16_offset0_length10) {  // Length is 5 repeats of the 2-byte pattern.
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 10;
+  uint16_t pattern = 0xAB23;
+  std::vector<uint8_t> reference_buffer{0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern2_size16_offset2_length8) {  // Non-zero (2-byte-aligned) target offset; bytes outside [2,10) stay zeroed.
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 2;
+  iree_device_size_t fill_length = 8;
+  uint16_t pattern = 0xAB23;
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern4_size4_offset0_length4) {  // Fills an entire 4-byte buffer with one 32-bit pattern.
+  iree_device_size_t buffer_size = 4;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 4;
+  uint32_t pattern = 0xAB23CD45;
+  std::vector<uint8_t> reference_buffer{0x45, 0xCD, 0x23, 0xAB};  // pattern bytes, little-endian
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, FillBuffer_pattern4_size16_offset0_length8) {  // 32-bit pattern repeated over the first 8 bytes; tail stays zeroed.
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 8;
+  uint32_t pattern = 0xAB23CD45;
+  std::vector<uint8_t> reference_buffer{0x45, 0xCD, 0x23, 0xAB,  //
+                                        0x45, 0xCD, 0x23, 0xAB,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(command_buffer_test, UpdateBufferWholeBuffer) {  // update_buffer covering the full device buffer; readback must equal the source.
+  iree_device_size_t target_buffer_size = 16;
+  std::vector<uint8_t> source_buffer{0x01, 0x02, 0x03, 0x04,  //
+                                     0x05, 0x06, 0x07, 0x08,  //
+                                     0xA1, 0xA2, 0xA3, 0xA4,  //
+                                     0xA5, 0xA6, 0xA7, 0xA8};
+
+  iree_hal_buffer_t* device_buffer = NULL;
+  CreateZeroedDeviceBuffer(target_buffer_size, &device_buffer);
+
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_CHECK_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+  IREE_CHECK_OK(iree_hal_command_buffer_begin(command_buffer));
+
+  // Issue the update_buffer command.
+  IREE_CHECK_OK(iree_hal_command_buffer_update_buffer(
+      command_buffer, source_buffer.data(), /*source_offset=*/0, device_buffer,
+      /*target_offset=*/0, /*length=*/target_buffer_size));
+  IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
+  IREE_CHECK_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_ANY,
+                                           command_buffer));
+
+  // Check that the contents match what we expect.
+  IREE_CHECK_OK(iree_hal_device_transfer_d2h(
+      device_, device_buffer, /*source_offset=*/0, actual_data.data(),
+      actual_data.size(), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+      iree_infinite_timeout()));
+  EXPECT_THAT(actual_data, ContainerEq(source_buffer));
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_buffer_release(device_buffer);
+}
+
+TEST_P(command_buffer_test, UpdateBufferWithOffsets) {  // update_buffer with non-zero source and target offsets (8 bytes, [4,12) -> [4,12)).
+  iree_device_size_t target_buffer_size = 16;
+  std::vector<uint8_t> source_buffer{0x01, 0x02, 0x03, 0x04,  //
+                                     0x05, 0x06, 0x07, 0x08,  //
+                                     0xA1, 0xA2, 0xA3, 0xA4,  //
+                                     0xA5, 0xA6, 0xA7, 0xA8};
+
+  iree_hal_buffer_t* device_buffer = NULL;
+  CreateZeroedDeviceBuffer(target_buffer_size, &device_buffer);
+
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_CHECK_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+  IREE_CHECK_OK(iree_hal_command_buffer_begin(command_buffer));
+
+  // Issue the update_buffer command.
+  IREE_CHECK_OK(iree_hal_command_buffer_update_buffer(
+      command_buffer, source_buffer.data(), /*source_offset=*/4, device_buffer,
+      /*target_offset=*/4, /*length=*/8));
+  IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
+  IREE_CHECK_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_ANY,
+                                           command_buffer));
+
+  // Check that the contents match what we expect.
+  std::vector<uint8_t> actual_data(target_buffer_size);
+  IREE_CHECK_OK(iree_hal_device_transfer_d2h(
+      device_, device_buffer, /*source_offset=*/0, actual_data.data(),
+      actual_data.size(), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+      iree_infinite_timeout()));
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x00, 0x00,  //
+                                        0x05, 0x06, 0x07, 0x08,  //
+                                        0xA1, 0xA2, 0xA3, 0xA4,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_buffer_release(device_buffer);
+}
+
+TEST_P(command_buffer_test, UpdateBufferSubspan) {  // update_buffer targeting a subspan; verifies via both the parent buffer and the subspan view.
+  iree_device_size_t target_buffer_size = 16;
+  std::vector<uint8_t> source_buffer{0x01, 0x02, 0x03, 0x04,  //
+                                     0x05, 0x06, 0x07, 0x08,  //
+                                     0xA1, 0xA2, 0xA3, 0xA4,  //
+                                     0xA5, 0xA6, 0xA7, 0xA8};
+
+  iree_hal_buffer_t* device_buffer = NULL;
+  CreateZeroedDeviceBuffer(target_buffer_size, &device_buffer);
+
+  // Create a subspan.
+  iree_device_size_t subspan_length = 8;
+  iree_hal_buffer_t* buffer_subspan;  // subspan covers parent bytes [4,12)
+  IREE_ASSERT_OK(iree_hal_buffer_subspan(device_buffer, /*byte_offset=*/4,
+                                         subspan_length, &buffer_subspan));
+
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_CHECK_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+  IREE_CHECK_OK(iree_hal_command_buffer_begin(command_buffer));
+
+  // Issue the update_buffer command.
+  IREE_CHECK_OK(iree_hal_command_buffer_update_buffer(
+      command_buffer, source_buffer.data(), /*source_offset=*/4, buffer_subspan,
+      /*target_offset=*/4, /*length=*/4));
+  IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
+  IREE_CHECK_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_ANY,
+                                           command_buffer));
+
+  // Check that the contents match what we expect.
+  std::vector<uint8_t> actual_data(target_buffer_size);
+  IREE_ASSERT_OK(iree_hal_device_transfer_d2h(
+      device_, device_buffer, /*source_offset=*/0, actual_data.data(),
+      actual_data.size(), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+      iree_infinite_timeout()));
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x05, 0x06, 0x07, 0x08,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
+  // Also check the subspan.
+  std::vector<uint8_t> actual_data_subspan(subspan_length);
+  IREE_ASSERT_OK(iree_hal_device_transfer_d2h(
+      device_, buffer_subspan, /*source_offset=*/0, actual_data_subspan.data(),
+      actual_data_subspan.size(), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+      iree_infinite_timeout()));
+  std::vector<uint8_t> reference_buffer_subspan{0x00, 0x00, 0x00, 0x00,  //
+                                                0x05, 0x06, 0x07, 0x08};
+  EXPECT_THAT(actual_data_subspan, ContainerEq(reference_buffer_subspan));
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_buffer_release(buffer_subspan);
+  iree_hal_buffer_release(device_buffer);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_COMMAND_BUFFER_TEST_H_
diff --git a/runtime/src/iree/hal/cts/cts_test_base.h b/runtime/src/iree/hal/cts/cts_test_base.h
new file mode 100644
index 0000000..32e4431
--- /dev/null
+++ b/runtime/src/iree/hal/cts/cts_test_base.h
@@ -0,0 +1,179 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_CTS_TEST_BASE_H_
+#define IREE_HAL_CTS_CTS_TEST_BASE_H_
+
+#include <set>
+#include <string>
+
+#include "iree/base/api.h"
+#include "iree/base/string_view.h"
+#include "iree/hal/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+// Registers the driver that will be used with INSTANTIATE_TEST_SUITE_P.
+// Leaf test binaries must implement this function.
+iree_status_t register_test_driver(iree_hal_driver_registry_t* registry);
+
+// Returns the executable format for the driver under test.
+// Leaf test binaries must implement this function.
+const char* get_test_executable_format();
+
+// Returns a file's executable data for the driver under test.
+// Leaf test binaries must implement this function.
+iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name);
+
+// Common setup for tests parameterized on driver names.
+class CtsTestBase : public ::testing::TestWithParam<std::string> {  // Param is the HAL driver name under test.
+ protected:
+  static void SetUpTestSuite() {  // Registers the driver once per suite.
+    IREE_CHECK_OK(register_test_driver(iree_hal_driver_registry_default()));
+  }
+
+  virtual void SetUp() {
+    const std::string& driver_name = GetParam();
+
+    // Get driver with the given name and create its default device.
+    // Skip drivers that are (gracefully) unavailable, fail if creation fails.
+    iree_hal_driver_t* driver = NULL;
+    iree_status_t status = TryGetDriver(driver_name, &driver);
+    if (iree_status_is_unavailable(status)) {
+      iree_status_free(status);
+      IREE_LOG(WARNING) << "Skipping test as '" << driver_name
+                        << "' driver is unavailable";
+      GTEST_SKIP();
+      return;
+    }
+    IREE_ASSERT_OK(status);
+    driver_ = driver;
+
+    iree_hal_device_t* device = NULL;
+    status = iree_hal_driver_create_default_device(
+        driver_, iree_allocator_system(), &device);
+    if (iree_status_is_unavailable(status)) {
+      iree_status_free(status);
+      IREE_LOG(WARNING) << "Skipping test as default device for '"
+                        << driver_name << "' driver is unavailable";
+      GTEST_SKIP();
+      return;
+    }
+    IREE_ASSERT_OK(status);
+    iree_status_free(status);  // NOTE(review): the driver path above does not free after IREE_ASSERT_OK — confirm which is intended.
+    device_ = device;
+
+    device_allocator_ = iree_hal_device_allocator(device_);
+    iree_hal_allocator_retain(device_allocator_);  // retained so it outlives uses in the test body; released in TearDown.
+  }
+
+  virtual void TearDown() {  // Releases in reverse acquisition order: allocator, device, driver.
+    if (device_allocator_) {
+      iree_hal_allocator_release(device_allocator_);
+      device_allocator_ = NULL;
+    }
+    if (device_) {
+      iree_hal_device_release(device_);
+      device_ = NULL;
+    }
+    if (driver_) {
+      iree_hal_driver_release(driver_);
+      driver_ = NULL;
+    }
+  }
+
+  // Submits |command_buffer| to the device and waits for it to complete before
+  // returning.
+  iree_status_t SubmitCommandBufferAndWait(
+      iree_hal_command_category_t command_categories,
+      iree_hal_command_buffer_t* command_buffer) {
+    iree_hal_semaphore_t* signal_semaphore = NULL;
+    IREE_RETURN_IF_ERROR(
+        iree_hal_semaphore_create(device_, 0ull, &signal_semaphore));
+
+    iree_hal_submission_batch_t submission_batch;
+
+    // No wait semaphores.
+    submission_batch.wait_semaphores.count = 0;
+    submission_batch.wait_semaphores.semaphores = NULL;
+    submission_batch.wait_semaphores.payload_values = NULL;
+
+    iree_hal_command_buffer_t* command_buffer_ptrs[] = {command_buffer};
+    submission_batch.command_buffer_count = IREE_ARRAYSIZE(command_buffer_ptrs);
+    submission_batch.command_buffers = command_buffer_ptrs;
+
+    // One signal semaphore from 0 -> 1.
+    iree_hal_semaphore_t* signal_semaphore_ptrs[] = {signal_semaphore};
+    uint64_t payload_values[] = {1ull};
+    submission_batch.signal_semaphores.count =
+        IREE_ARRAYSIZE(signal_semaphore_ptrs);
+    submission_batch.signal_semaphores.semaphores = signal_semaphore_ptrs;
+    submission_batch.signal_semaphores.payload_values = payload_values;
+
+    iree_status_t status =
+        iree_hal_device_queue_submit(device_, command_categories,
+                                     /*queue_affinity=*/0,
+                                     /*batch_count=*/1, &submission_batch);
+    if (iree_status_is_ok(status)) {
+      status = iree_hal_semaphore_wait(signal_semaphore, 1ull,
+                                       iree_infinite_timeout());  // blocks until the device signals 1
+    }
+
+    iree_hal_semaphore_release(signal_semaphore);
+    return status;
+  }
+
+  iree_hal_driver_t* driver_ = NULL;
+  iree_hal_device_t* device_ = NULL;
+  iree_hal_allocator_t* device_allocator_ = NULL;
+
+ private:
+  // Gets a HAL driver with the provided name, if available.
+  static iree_status_t TryGetDriver(const std::string& driver_name,
+                                    iree_hal_driver_t** out_driver) {
+    static std::set<std::string> unavailable_driver_names;  // process-wide memo of failed drivers
+
+    // If creation failed before, don't try again.
+    if (unavailable_driver_names.find(driver_name) !=
+        unavailable_driver_names.end()) {
+      return iree_make_status(IREE_STATUS_UNAVAILABLE, "driver unavailable");
+    }
+
+    // No existing driver, attempt to create.
+    iree_hal_driver_t* driver = NULL;
+    iree_status_t status = iree_hal_driver_registry_try_create_by_name(
+        iree_hal_driver_registry_default(),
+        iree_make_string_view(driver_name.data(), driver_name.size()),
+        iree_allocator_system(), &driver);
+    if (iree_status_is_unavailable(status)) {
+      unavailable_driver_names.insert(driver_name);
+    }
+    if (iree_status_is_ok(status)) {
+      *out_driver = driver;
+    }
+    return status;  // non-ok, non-unavailable statuses propagate to the caller's assert
+  }
+};
+
+struct GenerateTestName {  // Maps driver names to gtest-legal test names ('-' is not allowed).
+  template <class ParamType>
+  std::string operator()(
+      const ::testing::TestParamInfo<ParamType>& info) const {
+    std::string name = info.param;
+    std::replace(name.begin(), name.end(), '-', '_');  // NOTE(review): std::replace needs <algorithm>, which this header does not include — confirm a transitive include covers it.
+    return name;
+  }
+};
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_CTS_TEST_BASE_H_
diff --git a/runtime/src/iree/hal/cts/cts_test_template.cc.in b/runtime/src/iree/hal/cts/cts_test_template.cc.in
new file mode 100644
index 0000000..7783f5e
--- /dev/null
+++ b/runtime/src/iree/hal/cts/cts_test_template.cc.in
@@ -0,0 +1,63 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// clang-format off
+#cmakedefine IREE_CTS_TEST_FILE_PATH "@IREE_CTS_TEST_FILE_PATH@"
+#cmakedefine IREE_CTS_DRIVER_REGISTRATION_HDR "@IREE_CTS_DRIVER_REGISTRATION_HDR@"
+#cmakedefine IREE_CTS_DRIVER_REGISTRATION_FN @IREE_CTS_DRIVER_REGISTRATION_FN@
+#cmakedefine IREE_CTS_TEST_CLASS_NAME @IREE_CTS_TEST_CLASS_NAME@
+#cmakedefine IREE_CTS_DRIVER_NAME "@IREE_CTS_DRIVER_NAME@"
+#cmakedefine IREE_CTS_EXECUTABLE_FORMAT @IREE_CTS_EXECUTABLE_FORMAT@
+#cmakedefine IREE_CTS_EXECUTABLES_TESTDATA_HDR "@IREE_CTS_EXECUTABLES_TESTDATA_HDR@"
+// clang-format on
+
+#include IREE_CTS_TEST_FILE_PATH
+
+#include IREE_CTS_DRIVER_REGISTRATION_HDR
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+
+#ifdef IREE_CTS_EXECUTABLES_TESTDATA_HDR
+#include IREE_CTS_EXECUTABLES_TESTDATA_HDR
+#endif
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) {  // Forwards to the CMake-configured driver registration function.
+  return IREE_CTS_DRIVER_REGISTRATION_FN(registry);
+}
+
+const char* get_test_executable_format() {  // Returns the configured format string, or "UNDEFINED" when CMake set none.
+#ifdef IREE_CTS_EXECUTABLE_FORMAT
+  return IREE_CTS_EXECUTABLE_FORMAT;
+#else
+  return "UNDEFINED";
+#endif
+}
+
+iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) {  // Looks up |file_name| in the embedded testdata TOC; empty span when absent.
+#ifdef IREE_CTS_EXECUTABLES_TESTDATA_HDR
+  const struct iree_file_toc_t* toc = iree_cts_testdata_executables_create();
+  for (size_t i = 0; i < iree_cts_testdata_executables_size(); ++i) {
+    const auto& file = toc[i];
+    if (iree_string_view_equal(file_name, iree_make_cstring_view(file.name))) {
+      return iree_make_const_byte_span(file.data, file.size);
+    }
+  }
+  // TODO(scotttodd): error handling / reporting? This a sharp edge.
+#endif
+  return iree_const_byte_span_empty();  // also reached when no testdata header is configured
+}
+
+INSTANTIATE_TEST_SUITE_P(CTS, IREE_CTS_TEST_CLASS_NAME,
+                         ::testing::Values(IREE_CTS_DRIVER_NAME),
+                         GenerateTestName());
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/cts/descriptor_set_layout_test.h b/runtime/src/iree/hal/cts/descriptor_set_layout_test.h
new file mode 100644
index 0000000..2b5515c
--- /dev/null
+++ b/runtime/src/iree/hal/cts/descriptor_set_layout_test.h
@@ -0,0 +1,75 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_DESCRIPTOR_SET_LAYOUT_TEST_H_
+#define IREE_HAL_CTS_DESCRIPTOR_SET_LAYOUT_TEST_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+class descriptor_set_layout_test : public CtsTestBase {};
+
+// Note: bindingCount == 0 is valid in VkDescriptorSetLayoutCreateInfo:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkDescriptorSetLayoutCreateInfo.html
+TEST_P(descriptor_set_layout_test, CreateWithNoBindings) {  // Zero bindings must be accepted (mirrors Vulkan, per note above).
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE,
+      /*binding_count=*/0,
+      /*bindings=*/NULL, &descriptor_set_layout));
+  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
+}
+
+TEST_P(descriptor_set_layout_test, CreateWithOneBinding) {  // Single storage-buffer binding, immutable usage.
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  iree_hal_descriptor_set_layout_binding_t descriptor_set_layout_bindings[] = {
+      {/*binding=*/0, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+  };
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE,
+      IREE_ARRAYSIZE(descriptor_set_layout_bindings),
+      descriptor_set_layout_bindings, &descriptor_set_layout));
+  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
+}
+
+TEST_P(descriptor_set_layout_test, CreateWithTwoBindings) {  // Two storage-buffer bindings, immutable usage.
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  iree_hal_descriptor_set_layout_binding_t descriptor_set_layout_bindings[] = {
+      {/*binding=*/0, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+      {/*binding=*/1, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+  };
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE,
+      IREE_ARRAYSIZE(descriptor_set_layout_bindings),
+      descriptor_set_layout_bindings, &descriptor_set_layout));
+  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
+}
+
+TEST_P(descriptor_set_layout_test, CreateWithPushDescriptorType) {  // Same bindings as above but PUSH_ONLY usage.
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  iree_hal_descriptor_set_layout_binding_t descriptor_set_layout_bindings[] = {
+      {/*binding=*/0, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+      {/*binding=*/1, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+  };
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_PUSH_ONLY,
+      IREE_ARRAYSIZE(descriptor_set_layout_bindings),
+      descriptor_set_layout_bindings, &descriptor_set_layout));
+  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_DESCRIPTOR_SET_LAYOUT_TEST_H_
diff --git a/runtime/src/iree/hal/cts/descriptor_set_test.h b/runtime/src/iree/hal/cts/descriptor_set_test.h
new file mode 100644
index 0000000..241ad95
--- /dev/null
+++ b/runtime/src/iree/hal/cts/descriptor_set_test.h
@@ -0,0 +1,72 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_DESCRIPTOR_SET_TEST_H_
+#define IREE_HAL_CTS_DESCRIPTOR_SET_TEST_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+class descriptor_set_test : public CtsTestBase {};
+
+TEST_P(descriptor_set_test, CreateWithNoBindings) {  // Empty layout + empty set must both succeed.
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE,
+      /*binding_count=*/0,
+      /*bindings=*/NULL, &descriptor_set_layout));
+
+  iree_hal_descriptor_set_t* descriptor_set = NULL;
+  IREE_ASSERT_OK(iree_hal_descriptor_set_create(
+      device_, descriptor_set_layout, /*binding_count=*/0,
+      /*bindings=*/NULL, &descriptor_set));
+
+  // The descriptor set struct is an opaque handle. We can't test for much more
+  // than successful creation.
+
+  iree_hal_descriptor_set_release(descriptor_set);
+  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
+}
+
+TEST_P(descriptor_set_test, CreateWithTwoBindings) {  // Two-binding layout; set bindings use NULL buffers (creation only, no dispatch).
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  iree_hal_descriptor_set_layout_binding_t descriptor_set_layout_bindings[] = {
+      {/*binding=*/0, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+      {/*binding=*/1, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+  };
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE,
+      IREE_ARRAYSIZE(descriptor_set_layout_bindings),
+      descriptor_set_layout_bindings, &descriptor_set_layout));
+
+  iree_hal_descriptor_set_binding_t descriptor_set_bindings[] = {
+      {/*binding=*/0, /*buffer=*/NULL, /*offset=*/0, /*length=*/0},
+      {/*binding=*/1, /*buffer=*/NULL, /*offset=*/0, /*length=*/0},
+  };
+  iree_hal_descriptor_set_t* descriptor_set = NULL;
+  IREE_ASSERT_OK(iree_hal_descriptor_set_create(
+      device_, descriptor_set_layout, IREE_ARRAYSIZE(descriptor_set_bindings),
+      descriptor_set_bindings, &descriptor_set));
+
+  // The descriptor set struct is an opaque handle. We can't test for much more
+  // than successful creation.
+
+  iree_hal_descriptor_set_release(descriptor_set);
+  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_DESCRIPTOR_SET_TEST_H_
diff --git a/runtime/src/iree/hal/cts/driver_test.h b/runtime/src/iree/hal/cts/driver_test.h
new file mode 100644
index 0000000..0ebb947
--- /dev/null
+++ b/runtime/src/iree/hal/cts/driver_test.h
@@ -0,0 +1,53 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_DRIVER_TEST_H_
+#define IREE_HAL_CTS_DRIVER_TEST_H_
+
+#include <iostream>
+#include <string>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+class driver_test : public CtsTestBase {};
+
+TEST_P(driver_test, QueryAndCreateAvailableDevices) {  // Enumerates devices and creates each one by id.
+  iree_hal_device_info_t* device_infos = NULL;
+  iree_host_size_t device_info_count;
+  IREE_ASSERT_OK(iree_hal_driver_query_available_devices(
+      driver_, iree_allocator_system(), &device_infos, &device_info_count));
+
+  std::cout << "Driver has " << device_info_count << " device(s)";  // NOTE(review): none of these std::cout writes emit a newline — confirm intended.
+  for (iree_host_size_t i = 0; i < device_info_count; ++i) {
+    std::cout << "  Creating device '"
+              << std::string(device_infos[i].name.data,
+                             device_infos[i].name.size)
+              << "'";
+    iree_hal_device_t* device = NULL;
+    IREE_ASSERT_OK(iree_hal_driver_create_device(
+        driver_, device_infos[i].device_id, iree_allocator_system(), &device));
+    iree_string_view_t device_id = iree_hal_device_id(device);
+    std::cout << "  Created device with id: '"
+              << std::string(device_id.data, device_id.size) << "'";
+    iree_hal_device_release(device);
+  }
+
+  iree_allocator_free(iree_allocator_system(), device_infos);  // query allocated with the same allocator
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_DRIVER_TEST_H_
diff --git a/runtime/src/iree/hal/cts/event_test.h b/runtime/src/iree/hal/cts/event_test.h
new file mode 100644
index 0000000..7bc1769
--- /dev/null
+++ b/runtime/src/iree/hal/cts/event_test.h
@@ -0,0 +1,123 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_EVENT_TEST_H_
+#define IREE_HAL_CTS_EVENT_TEST_H_
+
+#include <cstdint>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+class event_test : public CtsTestBase {};
+
+TEST_P(event_test, Create) {  // Bare create/release round-trip.
+  iree_hal_event_t* event = NULL;
+  IREE_ASSERT_OK(iree_hal_event_create(device_, &event));
+  iree_hal_event_release(event);
+}
+
+TEST_P(event_test, SignalAndReset) {  // Records signal then reset of one event in a single command buffer.
+  iree_hal_event_t* event = NULL;
+  IREE_ASSERT_OK(iree_hal_event_create(device_, &event));
+
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_signal_event(
+      command_buffer, event, IREE_HAL_EXECUTION_STAGE_COMMAND_PROCESS));
+  IREE_ASSERT_OK(iree_hal_command_buffer_reset_event(
+      command_buffer, event, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_DISPATCH,
+                                            command_buffer));
+
+  iree_hal_event_release(event);
+  iree_hal_command_buffer_release(command_buffer);
+}
+
+TEST_P(event_test, SubmitWithChainedCommandBuffers) {  // Two command buffers in one batch, ordered via an event.
+  iree_hal_event_t* event = NULL;
+  IREE_ASSERT_OK(iree_hal_event_create(device_, &event));
+
+  iree_hal_command_buffer_t* command_buffer_1 = NULL;
+  iree_hal_command_buffer_t* command_buffer_2 = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer_1));
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer_2));
+
+  // First command buffer signals the event when it completes.
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer_1));
+  IREE_ASSERT_OK(iree_hal_command_buffer_signal_event(
+      command_buffer_1, event, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer_1));
+
+  // Second command buffer waits on the event before starting.
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer_2));
+  const iree_hal_event_t* event_pts[] = {event};
+  // TODO(scotttodd): verify execution stage usage (check Vulkan spec)
+  IREE_ASSERT_OK(iree_hal_command_buffer_wait_events(
+      command_buffer_2, IREE_ARRAYSIZE(event_pts), event_pts,
+      /*source_stage_mask=*/IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
+      /*target_stage_mask=*/IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE,
+      /*memory_barrier_count=*/0,
+      /*memory_barriers=*/NULL, /*buffer_barrier_count=*/0,
+      /*buffer_barriers=*/NULL));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer_2));
+
+  // No wait semaphores, one signal which we immediately wait on after submit.
+  iree_hal_submission_batch_t submission_batch;
+  submission_batch.wait_semaphores.count = 0;
+  submission_batch.wait_semaphores.semaphores = NULL;
+  submission_batch.wait_semaphores.payload_values = NULL;
+  iree_hal_command_buffer_t* command_buffer_ptrs[] = {command_buffer_1,
+                                                      command_buffer_2};
+  submission_batch.command_buffer_count = IREE_ARRAYSIZE(command_buffer_ptrs);
+  submission_batch.command_buffers = command_buffer_ptrs;
+  iree_hal_semaphore_t* signal_semaphore;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &signal_semaphore));
+  iree_hal_semaphore_t* signal_semaphore_ptrs[] = {signal_semaphore};
+  submission_batch.signal_semaphores.count =
+      IREE_ARRAYSIZE(signal_semaphore_ptrs);
+  submission_batch.signal_semaphores.semaphores = signal_semaphore_ptrs;
+  uint64_t payload_values[] = {1ull};
+  submission_batch.signal_semaphores.payload_values = payload_values;
+
+  IREE_ASSERT_OK(
+      iree_hal_device_queue_submit(device_, IREE_HAL_COMMAND_CATEGORY_DISPATCH,
+                                   /*queue_affinity=*/0,
+                                   /*batch_count=*/1, &submission_batch));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_wait(signal_semaphore, 1ull, iree_infinite_timeout()));  // blocks until both command buffers retire
+
+  iree_hal_command_buffer_release(command_buffer_1);
+  iree_hal_command_buffer_release(command_buffer_2);
+  iree_hal_semaphore_release(signal_semaphore);
+  iree_hal_event_release(event);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_EVENT_TEST_H_
diff --git a/runtime/src/iree/hal/cts/executable_cache_test.h b/runtime/src/iree/hal/cts/executable_cache_test.h
new file mode 100644
index 0000000..f8c23b2
--- /dev/null
+++ b/runtime/src/iree/hal/cts/executable_cache_test.h
@@ -0,0 +1,96 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_EXECUTABLE_CACHE_TEST_H_
+#define IREE_HAL_CTS_EXECUTABLE_CACHE_TEST_H_
+
+#include "iree/base/api.h"
+#include "iree/base/string_view.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+class executable_cache_test : public CtsTestBase {};
+
+TEST_P(executable_cache_test, Create) {
+  iree_status_t loop_status = iree_ok_status();
+  iree_hal_executable_cache_t* executable_cache = NULL;
+  IREE_ASSERT_OK(iree_hal_executable_cache_create(
+      device_, iree_make_cstring_view("default"),
+      iree_loop_inline(&loop_status), &executable_cache));
+
+  iree_hal_executable_cache_release(executable_cache);
+  IREE_ASSERT_OK(loop_status);
+}
+
+TEST_P(executable_cache_test, CantPrepareUnknownFormat) {
+  iree_status_t loop_status = iree_ok_status();
+  iree_hal_executable_cache_t* executable_cache = NULL;
+  IREE_ASSERT_OK(iree_hal_executable_cache_create(
+      device_, iree_make_cstring_view("default"),
+      iree_loop_inline(&loop_status), &executable_cache));
+
+  EXPECT_FALSE(iree_hal_executable_cache_can_prepare_format(
+      executable_cache, /*caching_mode=*/0, iree_make_cstring_view("FOO?")));
+
+  iree_hal_executable_cache_release(executable_cache);
+  IREE_ASSERT_OK(loop_status);
+}
+
+TEST_P(executable_cache_test, PrepareExecutable) {
+  iree_status_t loop_status = iree_ok_status();
+  iree_hal_executable_cache_t* executable_cache = NULL;
+  IREE_ASSERT_OK(iree_hal_executable_cache_create(
+      device_, iree_make_cstring_view("default"),
+      iree_loop_inline(&loop_status), &executable_cache));
+
+  // Note: this layout must match the testdata executable.
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  iree_hal_descriptor_set_layout_binding_t descriptor_set_layout_bindings[] = {
+      {0, IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+      {1, IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+  };
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE,
+      IREE_ARRAYSIZE(descriptor_set_layout_bindings),
+      descriptor_set_layout_bindings, &descriptor_set_layout));
+  iree_hal_executable_layout_t* executable_layout = NULL;
+  IREE_ASSERT_OK(iree_hal_executable_layout_create(
+      device_, /*push_constants=*/0, /*set_layout_count=*/1,
+      &descriptor_set_layout, &executable_layout));
+
+  iree_hal_executable_params_t executable_params;
+  iree_hal_executable_params_initialize(&executable_params);
+  executable_params.caching_mode =
+      IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA;
+  executable_params.executable_format =
+      iree_make_cstring_view(get_test_executable_format());
+  executable_params.executable_data = get_test_executable_data(
+      iree_make_cstring_view("executable_cache_test.bin"));
+  executable_params.executable_layout_count = 1;
+  executable_params.executable_layouts = &executable_layout;
+
+  iree_hal_executable_t* executable = NULL;
+  IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable(
+      executable_cache, &executable_params, &executable));
+
+  iree_hal_executable_release(executable);
+  iree_hal_executable_layout_release(executable_layout);
+  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
+  iree_hal_executable_cache_release(executable_cache);
+  IREE_ASSERT_OK(loop_status);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_EXECUTABLE_CACHE_TEST_H_
diff --git a/runtime/src/iree/hal/cts/executable_layout_test.h b/runtime/src/iree/hal/cts/executable_layout_test.h
new file mode 100644
index 0000000..8af3e60
--- /dev/null
+++ b/runtime/src/iree/hal/cts/executable_layout_test.h
@@ -0,0 +1,97 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_EXECUTABLE_LAYOUT_TEST_H_
+#define IREE_HAL_CTS_EXECUTABLE_LAYOUT_TEST_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+class executable_layout_test : public CtsTestBase {};
+
+TEST_P(executable_layout_test, CreateWithNoLayouts) {
+  iree_hal_executable_layout_t* executable_layout = NULL;
+  IREE_ASSERT_OK(iree_hal_executable_layout_create(
+      device_, /*push_constants=*/0, /*set_layout_count=*/0, NULL,
+      &executable_layout));
+
+  iree_hal_executable_layout_release(executable_layout);
+}
+
+TEST_P(executable_layout_test, CreateWithPushConstants) {
+  iree_hal_executable_layout_t* executable_layout = NULL;
+  // Note: The Vulkan maxPushConstantsSize limit must be at least 128 bytes:
+  // https://www.khronos.org/registry/vulkan/specs/1.2/html/vkspec.html#limits-minmax
+  IREE_ASSERT_OK(iree_hal_executable_layout_create(
+      device_, /*push_constants=*/5, /*set_layout_count=*/0, NULL,
+      &executable_layout));
+
+  iree_hal_executable_layout_release(executable_layout);
+}
+
+TEST_P(executable_layout_test, CreateWithOneLayout) {
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  iree_hal_descriptor_set_layout_binding_t descriptor_set_layout_bindings[] = {
+      {/*binding=*/0, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+      {/*binding=*/1, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+  };
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE,
+      IREE_ARRAYSIZE(descriptor_set_layout_bindings),
+      descriptor_set_layout_bindings, &descriptor_set_layout));
+
+  iree_hal_executable_layout_t* executable_layout = NULL;
+  IREE_ASSERT_OK(iree_hal_executable_layout_create(
+      device_, /*push_constants=*/0, /*set_layout_count=*/1,
+      &descriptor_set_layout, &executable_layout));
+
+  iree_hal_executable_layout_release(executable_layout);
+  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
+}
+
+TEST_P(executable_layout_test, CreateWithTwoLayouts) {
+  iree_hal_descriptor_set_layout_t* descriptor_set_layouts[2] = {NULL};
+  iree_hal_descriptor_set_layout_binding_t layout_bindings_0[] = {
+      {/*binding=*/0, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+      {/*binding=*/1, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+  };
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE,
+      IREE_ARRAYSIZE(layout_bindings_0), layout_bindings_0,
+      &descriptor_set_layouts[0]));
+
+  iree_hal_descriptor_set_layout_binding_t layout_bindings_1[] = {
+      {/*binding=*/0, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+      {/*binding=*/1, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+      {/*binding=*/2, /*type=*/IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER},
+  };
+  IREE_ASSERT_OK(iree_hal_descriptor_set_layout_create(
+      device_, IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE,
+      IREE_ARRAYSIZE(layout_bindings_1), layout_bindings_1,
+      &descriptor_set_layouts[1]));
+
+  iree_hal_executable_layout_t* executable_layout = NULL;
+  IREE_ASSERT_OK(iree_hal_executable_layout_create(
+      device_, /*push_constants=*/0, IREE_ARRAYSIZE(descriptor_set_layouts),
+      descriptor_set_layouts, &executable_layout));
+
+  iree_hal_executable_layout_release(executable_layout);
+  iree_hal_descriptor_set_layout_release(descriptor_set_layouts[0]);
+  iree_hal_descriptor_set_layout_release(descriptor_set_layouts[1]);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_EXECUTABLE_LAYOUT_TEST_H_
diff --git a/runtime/src/iree/hal/cts/semaphore_submission_test.h b/runtime/src/iree/hal/cts/semaphore_submission_test.h
new file mode 100644
index 0000000..0c41c76
--- /dev/null
+++ b/runtime/src/iree/hal/cts/semaphore_submission_test.h
@@ -0,0 +1,209 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_SEMAPHORE_SUBMISSION_TEST_H_
+#define IREE_HAL_CTS_SEMAPHORE_SUBMISSION_TEST_H_
+
+#include <cstdint>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+class semaphore_submission_test : public CtsTestBase {};
+
+TEST_P(semaphore_submission_test, SubmitWithNoCommandBuffers) {
+  // No waits, one signal which we immediately wait on after submit.
+  iree_hal_submission_batch_t submission_batch;
+  submission_batch.wait_semaphores.count = 0;
+  submission_batch.wait_semaphores.semaphores = NULL;
+  submission_batch.wait_semaphores.payload_values = NULL;
+  submission_batch.command_buffer_count = 0;
+  submission_batch.command_buffers = NULL;
+  iree_hal_semaphore_t* signal_semaphore = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &signal_semaphore));
+  iree_hal_semaphore_t* signal_semaphore_ptrs[] = {signal_semaphore};
+  submission_batch.signal_semaphores.count =
+      IREE_ARRAYSIZE(signal_semaphore_ptrs);
+  submission_batch.signal_semaphores.semaphores = signal_semaphore_ptrs;
+  uint64_t payload_values[] = {1ull};
+  submission_batch.signal_semaphores.payload_values = payload_values;
+
+  IREE_ASSERT_OK(
+      iree_hal_device_queue_submit(device_, IREE_HAL_COMMAND_CATEGORY_DISPATCH,
+                                   /*queue_affinity=*/0,
+                                   /*batch_count=*/1, &submission_batch));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_wait(signal_semaphore, 1ull, iree_infinite_timeout()));
+
+  iree_hal_semaphore_release(signal_semaphore);
+}
+
+TEST_P(semaphore_submission_test, SubmitAndSignal) {
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  // No waits, one signal which we immediately wait on after submit.
+  iree_hal_submission_batch_t submission_batch;
+  submission_batch.wait_semaphores.count = 0;
+  submission_batch.wait_semaphores.semaphores = NULL;
+  submission_batch.wait_semaphores.payload_values = NULL;
+  submission_batch.command_buffer_count = 1;
+  submission_batch.command_buffers = &command_buffer;
+  iree_hal_semaphore_t* signal_semaphore = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &signal_semaphore));
+  iree_hal_semaphore_t* signal_semaphore_ptrs[] = {signal_semaphore};
+  submission_batch.signal_semaphores.count =
+      IREE_ARRAYSIZE(signal_semaphore_ptrs);
+  submission_batch.signal_semaphores.semaphores = signal_semaphore_ptrs;
+  uint64_t payload_values[] = {1ull};
+  submission_batch.signal_semaphores.payload_values = payload_values;
+
+  IREE_ASSERT_OK(
+      iree_hal_device_queue_submit(device_, IREE_HAL_COMMAND_CATEGORY_DISPATCH,
+                                   /*queue_affinity=*/0,
+                                   /*batch_count=*/1, &submission_batch));
+  IREE_ASSERT_OK(
+      iree_hal_semaphore_wait(signal_semaphore, 1ull, iree_infinite_timeout()));
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_semaphore_release(signal_semaphore);
+}
+
+TEST_P(semaphore_submission_test, SubmitWithWait) {
+  // Empty command buffer.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  // One wait and one signal semaphore.
+  iree_hal_submission_batch_t submission_batch;
+  iree_hal_semaphore_t* wait_semaphore = NULL;
+  iree_hal_semaphore_t* signal_semaphore = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &wait_semaphore));
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 100ull, &signal_semaphore));
+  iree_hal_semaphore_t* wait_semaphore_ptrs[] = {wait_semaphore};
+  iree_hal_semaphore_t* signal_semaphore_ptrs[] = {signal_semaphore};
+  uint64_t wait_payload_values[] = {1ull};
+  uint64_t signal_payload_values[] = {101ull};
+  submission_batch.wait_semaphores.count = IREE_ARRAYSIZE(wait_semaphore_ptrs);
+  submission_batch.wait_semaphores.semaphores = wait_semaphore_ptrs;
+  submission_batch.wait_semaphores.payload_values = wait_payload_values;
+  submission_batch.command_buffer_count = 1;
+  submission_batch.command_buffers = &command_buffer;
+  submission_batch.signal_semaphores.count =
+      IREE_ARRAYSIZE(signal_semaphore_ptrs);
+  submission_batch.signal_semaphores.semaphores = signal_semaphore_ptrs;
+  submission_batch.signal_semaphores.payload_values = signal_payload_values;
+
+  IREE_ASSERT_OK(
+      iree_hal_device_queue_submit(device_, IREE_HAL_COMMAND_CATEGORY_DISPATCH,
+                                   /*queue_affinity=*/0,
+                                   /*batch_count=*/1, &submission_batch));
+
+  // Work shouldn't start until the wait semaphore reaches its payload value.
+  uint64_t value;
+  IREE_ASSERT_OK(iree_hal_semaphore_query(signal_semaphore, &value));
+  EXPECT_EQ(100ull, value);
+
+  // Signal the wait semaphore, work should begin and complete.
+  IREE_ASSERT_OK(iree_hal_semaphore_signal(wait_semaphore, 1ull));
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(signal_semaphore, 101ull,
+                                         iree_infinite_timeout()));
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_semaphore_release(wait_semaphore);
+  iree_hal_semaphore_release(signal_semaphore);
+}
+
+TEST_P(semaphore_submission_test, SubmitWithMultipleSemaphores) {
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  iree_hal_submission_batch_t submission_batch;
+  iree_hal_semaphore_t* wait_semaphore_1 = NULL;
+  iree_hal_semaphore_t* wait_semaphore_2 = NULL;
+  iree_hal_semaphore_t* signal_semaphore_1 = NULL;
+  iree_hal_semaphore_t* signal_semaphore_2 = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &wait_semaphore_1));
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &wait_semaphore_2));
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &signal_semaphore_1));
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &signal_semaphore_2));
+  iree_hal_semaphore_t* wait_semaphore_ptrs[] = {wait_semaphore_1,
+                                                 wait_semaphore_2};
+  iree_hal_semaphore_t* signal_semaphore_ptrs[] = {signal_semaphore_1,
+                                                   signal_semaphore_2};
+  uint64_t wait_payload_values[] = {1ull, 1ull};
+  uint64_t signal_payload_values[] = {1ull, 1ull};
+  submission_batch.wait_semaphores.count = IREE_ARRAYSIZE(wait_semaphore_ptrs);
+  submission_batch.wait_semaphores.semaphores = wait_semaphore_ptrs;
+  submission_batch.wait_semaphores.payload_values = wait_payload_values;
+  submission_batch.command_buffer_count = 1;
+  submission_batch.command_buffers = &command_buffer;
+  submission_batch.signal_semaphores.count =
+      IREE_ARRAYSIZE(signal_semaphore_ptrs);
+  submission_batch.signal_semaphores.semaphores = signal_semaphore_ptrs;
+  submission_batch.signal_semaphores.payload_values = signal_payload_values;
+
+  IREE_ASSERT_OK(
+      iree_hal_device_queue_submit(device_, IREE_HAL_COMMAND_CATEGORY_DISPATCH,
+                                   /*queue_affinity=*/0,
+                                   /*batch_count=*/1, &submission_batch));
+
+  // Work shouldn't start until all wait semaphores reach their payload values.
+  uint64_t value;
+  IREE_ASSERT_OK(iree_hal_semaphore_query(signal_semaphore_1, &value));
+  EXPECT_EQ(0ull, value);
+  IREE_ASSERT_OK(iree_hal_semaphore_query(signal_semaphore_2, &value));
+  EXPECT_EQ(0ull, value);
+
+  // Signal the wait semaphores, work should begin and complete.
+  IREE_ASSERT_OK(iree_hal_semaphore_signal(wait_semaphore_1, 1ull));
+  IREE_ASSERT_OK(iree_hal_semaphore_signal(wait_semaphore_2, 1ull));
+
+  iree_hal_semaphore_list_t signal_semaphore_list;
+  signal_semaphore_list.count = IREE_ARRAYSIZE(signal_semaphore_ptrs);
+  signal_semaphore_list.semaphores = signal_semaphore_ptrs;
+  uint64_t payload_values[] = {1ull, 1ull};
+  signal_semaphore_list.payload_values = payload_values;
+  IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
+      device_, IREE_HAL_WAIT_MODE_ALL, &signal_semaphore_list,
+      iree_infinite_timeout()));
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_semaphore_release(wait_semaphore_1);
+  iree_hal_semaphore_release(wait_semaphore_2);
+  iree_hal_semaphore_release(signal_semaphore_1);
+  iree_hal_semaphore_release(signal_semaphore_2);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_SEMAPHORE_SUBMISSION_TEST_H_
diff --git a/runtime/src/iree/hal/cts/semaphore_test.h b/runtime/src/iree/hal/cts/semaphore_test.h
new file mode 100644
index 0000000..d44dd13
--- /dev/null
+++ b/runtime/src/iree/hal/cts/semaphore_test.h
@@ -0,0 +1,237 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_SEMAPHORE_TEST_H_
+#define IREE_HAL_CTS_SEMAPHORE_TEST_H_
+
+#include <cstdint>
+#include <thread>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace cts {
+
+class semaphore_test : public CtsTestBase {};
+
+// Tests that a semaphore that is unused properly cleans itself up.
+TEST_P(semaphore_test, NoOp) {
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 123ull, &semaphore));
+
+  uint64_t value;
+  IREE_ASSERT_OK(iree_hal_semaphore_query(semaphore, &value));
+  EXPECT_EQ(123ull, value);
+
+  iree_hal_semaphore_release(semaphore);
+}
+
+// Tests that a semaphore will accept new values as it is signaled.
+TEST_P(semaphore_test, NormalSignaling) {
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 2ull, &semaphore));
+
+  uint64_t value;
+  IREE_ASSERT_OK(iree_hal_semaphore_query(semaphore, &value));
+  EXPECT_EQ(2ull, value);
+  IREE_ASSERT_OK(iree_hal_semaphore_signal(semaphore, 3ull));
+  IREE_ASSERT_OK(iree_hal_semaphore_query(semaphore, &value));
+  EXPECT_EQ(3ull, value);
+  IREE_ASSERT_OK(iree_hal_semaphore_signal(semaphore, 40ull));
+  IREE_ASSERT_OK(iree_hal_semaphore_query(semaphore, &value));
+  EXPECT_EQ(40ull, value);
+
+  iree_hal_semaphore_release(semaphore);
+}
+
+// Note: Behavior is undefined when signaling with decreasing values, so we
+// can't reliably test it across backends. Some backends may return errors,
+// while others may accept the new, decreasing, values.
+
+// Tests semaphore failure handling.
+TEST_P(semaphore_test, Failure) {
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 2ull, &semaphore));
+
+  IREE_ASSERT_OK(iree_hal_semaphore_signal(semaphore, 3ull));
+  uint64_t value;
+  IREE_ASSERT_OK(iree_hal_semaphore_query(semaphore, &value));
+  EXPECT_EQ(3ull, value);
+
+  iree_hal_semaphore_fail(semaphore,
+                          iree_status_from_code(IREE_STATUS_UNKNOWN));
+  EXPECT_TRUE(
+      iree_status_is_unknown(iree_hal_semaphore_query(semaphore, &value)));
+
+  // Signaling again is undefined behavior. Some backends may return a sticky
+  // failure status while others may silently process new signal values.
+
+  iree_hal_semaphore_release(semaphore);
+}
+
+// Tests waiting on no semaphores.
+TEST_P(semaphore_test, EmptyWait) {
+  IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
+      device_, IREE_HAL_WAIT_MODE_ANY, NULL,
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+  IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
+      device_, IREE_HAL_WAIT_MODE_ALL, NULL,
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+
+  IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
+      device_, IREE_HAL_WAIT_MODE_ANY, NULL,
+      iree_make_timeout_ns(IREE_DURATION_INFINITE)));
+  IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
+      device_, IREE_HAL_WAIT_MODE_ALL, NULL,
+      iree_make_timeout_ns(IREE_DURATION_INFINITE)));
+}
+
+// Tests waiting on a semaphore that has already been signaled.
+// **Never completes when using SwiftShader**
+TEST_P(semaphore_test, DISABLED_WaitAlreadySignaled) {
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 2ull, &semaphore));
+
+  // Test both previous and current values.
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(
+      semaphore, 1ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(
+      semaphore, 2ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(
+      semaphore, 1ull, iree_make_timeout_ns(IREE_DURATION_INFINITE)));
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(
+      semaphore, 2ull, iree_make_timeout_ns(IREE_DURATION_INFINITE)));
+
+  iree_hal_semaphore_release(semaphore);
+}
+
+// Tests waiting on a semaphore that has not been signaled.
+TEST_P(semaphore_test, WaitUnsignaled) {
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 2ull, &semaphore));
+
+  // NOTE: we don't actually block here because otherwise we'd lock up.
+  // Result status is undefined - some backends may return DeadlineExceededError
+  // while others may return success.
+  IREE_IGNORE_ERROR(iree_hal_semaphore_wait(
+      semaphore, 3ull, iree_make_deadline(IREE_TIME_INFINITE_PAST)));
+
+  iree_hal_semaphore_release(semaphore);
+}
+
+// Waiting on a failed semaphore is undefined behavior. Some backends may
+// return UnknownError while others may succeed.
+
+// Tests IREE_HAL_WAIT_MODE_ALL when not all are signaled.
+TEST_P(semaphore_test, WaitAllButNotAllSignaled) {
+  iree_hal_semaphore_t* semaphore_a = NULL;
+  iree_hal_semaphore_t* semaphore_b = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &semaphore_a));
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 1ull, &semaphore_b));
+
+  iree_hal_semaphore_list_t semaphore_list;
+  iree_hal_semaphore_t* semaphore_ptrs[] = {semaphore_a, semaphore_b};
+  semaphore_list.count = IREE_ARRAYSIZE(semaphore_ptrs);
+  semaphore_list.semaphores = semaphore_ptrs;
+  uint64_t payload_values[] = {1ull, 1ull};
+  semaphore_list.payload_values = payload_values;
+
+  // NOTE: we don't actually block here because otherwise we'd lock up.
+  // Result status is undefined - some backends may return DeadlineExceededError
+  // while others may return success.
+  IREE_IGNORE_ERROR(iree_hal_device_wait_semaphores(
+      device_, IREE_HAL_WAIT_MODE_ALL, &semaphore_list,
+      iree_make_deadline(IREE_TIME_INFINITE_PAST)));
+
+  iree_hal_semaphore_release(semaphore_a);
+  iree_hal_semaphore_release(semaphore_b);
+}
+
+// Tests IREE_HAL_WAIT_MODE_ALL when all are signaled.
+TEST_P(semaphore_test, WaitAllAndAllSignaled) {
+  iree_hal_semaphore_t* semaphore_a = NULL;
+  iree_hal_semaphore_t* semaphore_b = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 1ull, &semaphore_a));
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 1ull, &semaphore_b));
+
+  iree_hal_semaphore_list_t semaphore_list;
+  iree_hal_semaphore_t* semaphore_ptrs[] = {semaphore_a, semaphore_b};
+  semaphore_list.count = IREE_ARRAYSIZE(semaphore_ptrs);
+  semaphore_list.semaphores = semaphore_ptrs;
+  uint64_t payload_values[] = {1ull, 1ull};
+  semaphore_list.payload_values = payload_values;
+
+  // Both semaphores are created already at their target payload values, so
+  // this IREE_HAL_WAIT_MODE_ALL wait should complete without blocking; the
+  // status is ignored only to tolerate backends with nonstandard semantics.
+  IREE_IGNORE_ERROR(iree_hal_device_wait_semaphores(
+      device_, IREE_HAL_WAIT_MODE_ALL, &semaphore_list,
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+
+  iree_hal_semaphore_release(semaphore_a);
+  iree_hal_semaphore_release(semaphore_b);
+}
+
+// Tests IREE_HAL_WAIT_MODE_ANY.
+// **Fails using timeline semaphore emulation**
+TEST_P(semaphore_test, DISABLED_WaitAny) {
+  iree_hal_semaphore_t* semaphore_a = NULL;
+  iree_hal_semaphore_t* semaphore_b = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &semaphore_a));
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 1ull, &semaphore_b));
+
+  iree_hal_semaphore_list_t semaphore_list;
+  iree_hal_semaphore_t* semaphore_ptrs[] = {semaphore_a, semaphore_b};
+  semaphore_list.count = IREE_ARRAYSIZE(semaphore_ptrs);
+  semaphore_list.semaphores = semaphore_ptrs;
+  uint64_t payload_values[] = {1ull, 1ull};
+  semaphore_list.payload_values = payload_values;
+
+  IREE_ASSERT_OK(iree_hal_device_wait_semaphores(
+      device_, IREE_HAL_WAIT_MODE_ANY, &semaphore_list,
+      iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+
+  iree_hal_semaphore_release(semaphore_a);
+  iree_hal_semaphore_release(semaphore_b);
+}
+
+// Tests threading behavior by ping-ponging between the test main thread and
+// a little thread.
+TEST_P(semaphore_test, PingPong) {
+  iree_hal_semaphore_t* a2b = NULL;
+  iree_hal_semaphore_t* b2a = NULL;
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &a2b));
+  IREE_ASSERT_OK(iree_hal_semaphore_create(device_, 0ull, &b2a));
+  std::thread thread([&]() {
+    // Should advance right past this because the value is already set.
+    IREE_ASSERT_OK(iree_hal_semaphore_wait(
+        a2b, 0ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+    IREE_ASSERT_OK(iree_hal_semaphore_signal(b2a, 1ull));
+    // Jump ahead (blocking at first).
+    IREE_ASSERT_OK(iree_hal_semaphore_wait(
+        a2b, 4ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+  });
+  // Block until thread signals.
+  IREE_ASSERT_OK(iree_hal_semaphore_wait(
+      b2a, 1ull, iree_make_deadline(IREE_TIME_INFINITE_FUTURE)));
+  IREE_ASSERT_OK(iree_hal_semaphore_signal(a2b, 4ull));
+  thread.join();
+
+  iree_hal_semaphore_release(a2b);
+  iree_hal_semaphore_release(b2a);
+}
+
+}  // namespace cts
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_CTS_SEMAPHORE_TEST_H_
diff --git a/runtime/src/iree/hal/cts/testdata/command_buffer_dispatch_test.mlir b/runtime/src/iree/hal/cts/testdata/command_buffer_dispatch_test.mlir
new file mode 100644
index 0000000..63aa1ea
--- /dev/null
+++ b/runtime/src/iree/hal/cts/testdata/command_buffer_dispatch_test.mlir
@@ -0,0 +1,37 @@
+// Bootstrapped from this source IR:
+//
+// func.func @abs(%input : tensor<f32>) -> (tensor<f32>) {
+//   %result = math.abs %input : tensor<f32>
+//   return %result : tensor<f32>
+// }
+
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+
+hal.executable.source public @executable {
+  hal.executable.entry_point public @abs layout(#executable_layout)
+
+  builtin.module {
+    func.func @abs() {
+      %c0 = arith.constant 0 : index
+
+      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:f32>
+      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<writeonly:f32>
+
+      %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
+      %3 = linalg.init_tensor [] : tensor<f32>
+      %4 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%2 : tensor<f32>) outs(%3 : tensor<f32>) {
+      ^bb0(%arg0: f32, %arg1: f32):
+        %5 = math.abs %arg0 : f32
+        linalg.yield %5 : f32
+      } -> tensor<f32>
+      flow.dispatch.tensor.store %4, %1, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
+
+      return
+    }
+  }
+}
diff --git a/runtime/src/iree/hal/cts/testdata/executable_cache_test.mlir b/runtime/src/iree/hal/cts/testdata/executable_cache_test.mlir
new file mode 100644
index 0000000..63aa1ea
--- /dev/null
+++ b/runtime/src/iree/hal/cts/testdata/executable_cache_test.mlir
@@ -0,0 +1,37 @@
+// Bootstrapped from this source IR:
+//
+// func.func @abs(%input : tensor<f32>) -> (tensor<f32>) {
+//   %result = math.abs %input : tensor<f32>
+//   return %result : tensor<f32>
+// }
+
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+
+hal.executable.source public @executable {
+  hal.executable.entry_point public @abs layout(#executable_layout)
+
+  builtin.module {
+    func.func @abs() {
+      %c0 = arith.constant 0 : index
+
+      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:f32>
+      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<writeonly:f32>
+
+      %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
+      %3 = linalg.init_tensor [] : tensor<f32>
+      %4 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%2 : tensor<f32>) outs(%3 : tensor<f32>) {
+      ^bb0(%arg0: f32, %arg1: f32):
+        %5 = math.abs %arg0 : f32
+        linalg.yield %5 : f32
+      } -> tensor<f32>
+      flow.dispatch.tensor.store %4, %1, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
+
+      return
+    }
+  }
+}
diff --git a/runtime/src/iree/hal/cuda/CMakeLists.txt b/runtime/src/iree/hal/cuda/CMakeLists.txt
new file mode 100644
index 0000000..76f3936
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/CMakeLists.txt
@@ -0,0 +1,98 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+if(NOT IREE_HAL_DRIVER_CUDA)
+  return()
+endif()
+
+if(NOT CUDAToolkit_INCLUDE_DIRS)
+  message(FATAL_ERROR "No CUDA SDK includes found: should have been set globally")
+endif()
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    cuda
+  HDRS
+    "api.h"
+  SRCS
+    "api.h"
+    "context_wrapper.h"
+    "cuda_allocator.c"
+    "cuda_allocator.h"
+    "cuda_buffer.c"
+    "cuda_buffer.h"
+    "cuda_device.c"
+    "cuda_device.h"
+    "cuda_driver.c"
+    "cuda_event.c"
+    "cuda_event.h"
+    "descriptor_set_layout.c"
+    "descriptor_set_layout.h"
+    "event_semaphore.c"
+    "event_semaphore.h"
+    "executable_layout.c"
+    "executable_layout.h"
+    "graph_command_buffer.c"
+    "graph_command_buffer.h"
+    "native_executable.c"
+    "native_executable.h"
+    "nop_executable_cache.c"
+    "nop_executable_cache.h"
+    "status_util.c"
+    "status_util.h"
+    "stream_command_buffer.c"
+    "stream_command_buffer.h"
+  DEPS
+    ::dynamic_symbols
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::arena
+    iree::base::internal::flatcc::parsing
+    iree::base::internal::synchronization
+    iree::base::tracing
+    iree::hal
+    iree::hal::utils::buffer_transfer
+    iree::hal::utils::deferred_command_buffer
+    iree::hal::utils::resource_set
+    iree::schemas::cuda_executable_def_c_fbs
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    dynamic_symbols
+  HDRS
+    "dynamic_symbols.h"
+  TEXTUAL_HDRS
+    "dynamic_symbol_tables.h"
+  SRCS
+    "cuda_headers.h"
+    "dynamic_symbols.c"
+  INCLUDES
+    ${CUDAToolkit_INCLUDE_DIRS}
+  DEPS
+    iree::base::core_headers
+    iree::base::internal::dynamic_library
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    dynamic_symbols_test
+  SRCS
+    "dynamic_symbols_test.cc"
+  DEPS
+    ::dynamic_symbols
+    iree::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+  LABELS
+    "driver=cuda"
+)
diff --git a/runtime/src/iree/hal/cuda/api.h b/runtime/src/iree/hal/cuda/api.h
new file mode 100644
index 0000000..b6a3402
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/api.h
@@ -0,0 +1,82 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// See iree/base/api.h for documentation on the API conventions used.
+
+#ifndef IREE_HAL_CUDA_API_H_
+#define IREE_HAL_CUDA_API_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Defines how command buffers are recorded and executed.
+typedef enum iree_hal_cuda_command_buffer_mode_e {
+  // Command buffers are recorded into CUDA graphs.
+  IREE_HAL_CUDA_COMMAND_BUFFER_MODE_GRAPH = 0,
+  // Command buffers are directly issued against a CUDA stream.
+  IREE_HAL_CUDA_COMMAND_BUFFER_MODE_STREAM = 1,
+} iree_hal_cuda_command_buffer_mode_t;
+
+// Parameters configuring an iree_hal_cuda_device_t.
+// Must be initialized with iree_hal_cuda_device_params_initialize prior to use.
+typedef struct iree_hal_cuda_device_params_t {
+  // Number of queues exposed on the device.
+  // Each queue acts as a separate synchronization scope where all work executes
+  // concurrently unless prohibited by semaphores.
+  iree_host_size_t queue_count;
+
+  // Total size of each block in the device shared block pool.
+  // Larger sizes will lower overhead and ensure the heap isn't hit for
+  // transient allocations while also increasing memory consumption.
+  iree_host_size_t arena_block_size;
+
+  // Specifies how command buffers are recorded and executed.
+  iree_hal_cuda_command_buffer_mode_t command_buffer_mode;
+
+  // Allow executing command buffers against CUDA streams as they are recorded.
+  // Only command buffers produced by the compiler that have the
+  // IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION bit set will use this.
+  bool allow_inline_execution;
+} iree_hal_cuda_device_params_t;
+
+// Initializes |out_params| to default values.
+void iree_hal_cuda_device_params_initialize(
+    iree_hal_cuda_device_params_t* out_params);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_cuda_driver_t
+//===----------------------------------------------------------------------===//
+
+// CUDA driver creation options.
+typedef struct iree_hal_cuda_driver_options_t {
+  // Index of the default CUDA device to use within the list of available
+  // devices.
+  int default_device_index;
+} iree_hal_cuda_driver_options_t;
+
+IREE_API_EXPORT void iree_hal_cuda_driver_options_initialize(
+    iree_hal_cuda_driver_options_t* out_options);
+
+// Creates a CUDA HAL driver that manages its own CUcontext.
+//
+// |out_driver| must be released by the caller (see |iree_hal_driver_release|).
+IREE_API_EXPORT iree_status_t iree_hal_cuda_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_cuda_device_params_t* default_params,
+    const iree_hal_cuda_driver_options_t* options,
+    iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);
+
+// TODO(thomasraoux): Support importing a CUcontext from app.
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_API_H_
diff --git a/runtime/src/iree/hal/cuda/context_wrapper.h b/runtime/src/iree/hal/cuda/context_wrapper.h
new file mode 100644
index 0000000..ab5281b
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/context_wrapper.h
@@ -0,0 +1,22 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_CONTEXT_WRAPPER_H_
+#define IREE_HAL_CUDA_CONTEXT_WRAPPER_H_
+
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/cuda_headers.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+
+// Structure to wrap all objects that are constant within a context. This makes it
+// simpler to pass it to the different objects and saves memory.
+typedef struct iree_hal_cuda_context_wrapper_t {
+  CUcontext cu_context;
+  iree_allocator_t host_allocator;
+  iree_hal_cuda_dynamic_symbols_t* syms;
+} iree_hal_cuda_context_wrapper_t;
+
+#endif  // IREE_HAL_CUDA_CONTEXT_WRAPPER_H_
diff --git a/runtime/src/iree/hal/cuda/cts/CMakeLists.txt b/runtime/src/iree/hal/cuda/cts/CMakeLists.txt
new file mode 100644
index 0000000..69754f7
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cts/CMakeLists.txt
@@ -0,0 +1,54 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_hal_cts_test_suite(
+  DRIVER_NAME
+    cuda
+  DRIVER_REGISTRATION_HDR
+    "runtime/src/iree/hal/cuda/registration/driver_module.h"
+  DRIVER_REGISTRATION_FN
+    "iree_hal_cuda_driver_module_register"
+  COMPILER_TARGET_BACKEND
+    "cuda"
+  EXECUTABLE_FORMAT
+    "\"PTXE\""
+  DEPS
+    iree::hal::cuda::registration
+  EXCLUDED_TESTS
+    # This test depends on iree_hal_cuda_stream_command_buffer_update_buffer
+    # via iree_hal_buffer_view_allocate_buffer, which is not implemented yet.
+    "command_buffer_dispatch"
+    # Non-push descriptor sets are not implemented in the CUDA backend yet.
+    "descriptor_set"
+    # Semaphores are not implemented in the CUDA backend yet.
+    "semaphore_submission"
+    "semaphore"
+)
+
+# Variant test suite using graph command buffers (--cuda_use_streams=0)
+iree_hal_cts_test_suite(
+  DRIVER_NAME
+    cuda
+  VARIANT_SUFFIX
+    graph
+  DRIVER_REGISTRATION_HDR
+    "runtime/src/iree/hal/cuda/registration/driver_module.h"
+  DRIVER_REGISTRATION_FN
+    "iree_hal_cuda_driver_module_register"
+  COMPILER_TARGET_BACKEND
+    "cuda"
+  EXECUTABLE_FORMAT
+    "\"PTXE\""
+  ARGS
+    "--cuda_use_streams=0"
+  DEPS
+    iree::hal::cuda::registration
+  INCLUDED_TESTS
+    "command_buffer"
+    # This test depends on iree_hal_cuda_stream_command_buffer_update_buffer
+    # via iree_hal_buffer_view_allocate_buffer, which is not implemented yet.
+    # "command_buffer_dispatch"
+)
diff --git a/runtime/src/iree/hal/cuda/cuda_allocator.c b/runtime/src/iree/hal/cuda/cuda_allocator.c
new file mode 100644
index 0000000..e0b6eaf
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_allocator.c
@@ -0,0 +1,327 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/cuda_allocator.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/cuda/cuda_buffer.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+#include "iree/hal/cuda/status_util.h"
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA";
+#endif
+
+typedef struct iree_hal_cuda_allocator_t {
+  iree_hal_resource_t resource;
+  iree_hal_device_t* base_device;
+  iree_hal_cuda_context_wrapper_t* context;
+  CUdevice device;
+  CUstream stream;
+  bool supports_concurrent_managed_access;
+
+  IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
+} iree_hal_cuda_allocator_t;
+
+static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable;
+
+static iree_hal_cuda_allocator_t* iree_hal_cuda_allocator_cast(
+    iree_hal_allocator_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_allocator_vtable);
+  return (iree_hal_cuda_allocator_t*)base_value;
+}
+
+iree_status_t iree_hal_cuda_allocator_create(
+    iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
+    CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator) {
+  IREE_ASSERT_ARGUMENT(base_device);
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // To support device-local + host-visible memory we need concurrent managed
+  // access indicating that the host and devices can concurrently access the
+  // device memory. If we don't have this feature then we fall back to forcing
+  // all device-local + host-visible memory into host-local + device-visible
+  // page-locked memory. The compiler tries to avoid this for high-traffic
+  // buffers except for readback staging buffers.
+  int supports_concurrent_managed_access = 0;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, CU_RESULT_TO_STATUS(
+              context->syms,
+              cuDeviceGetAttribute(
+                  &supports_concurrent_managed_access,
+                  CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device),
+              "cuDeviceGetAttribute"));
+
+  IREE_TRACE_ZONE_APPEND_TEXT(
+      z0, supports_concurrent_managed_access
+              ? "has CONCURRENT_MANAGED_ACCESS"
+              : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on "
+                "device-local + host-visible memory)");
+
+  iree_hal_cuda_allocator_t* allocator = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      context->host_allocator, sizeof(*allocator), (void**)&allocator);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_cuda_allocator_vtable,
+                                 &allocator->resource);
+    allocator->base_device = base_device;
+    allocator->context = context;
+    allocator->device = device;
+    allocator->stream = stream;
+    allocator->supports_concurrent_managed_access =
+        supports_concurrent_managed_access != 0;
+    *out_allocator = (iree_hal_allocator_t*)allocator;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_cuda_allocator_destroy(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  iree_hal_cuda_allocator_t* allocator =
+      iree_hal_cuda_allocator_cast(base_allocator);
+  iree_allocator_t host_allocator = allocator->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, allocator);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_allocator_t iree_hal_cuda_allocator_host_allocator(
+    const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  iree_hal_cuda_allocator_t* allocator =
+      (iree_hal_cuda_allocator_t*)base_allocator;
+  return allocator->context->host_allocator;
+}
+
+static iree_status_t iree_hal_cuda_allocator_trim(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  return iree_ok_status();
+}
+
+static void iree_hal_cuda_allocator_query_statistics(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
+  IREE_STATISTICS({
+    iree_hal_cuda_allocator_t* allocator =
+        iree_hal_cuda_allocator_cast(base_allocator);
+    memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
+  });
+}
+
+static iree_hal_buffer_compatibility_t
+iree_hal_cuda_allocator_query_compatibility(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size) {
+  iree_hal_cuda_allocator_t* allocator =
+      iree_hal_cuda_allocator_cast(base_allocator);
+
+  // If concurrent managed access is not supported then we disallow mapping of
+  // device local memory.
+  if (!allocator->supports_concurrent_managed_access &&
+      iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_MAPPING) &&
+      iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
+                                          IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+    return IREE_HAL_BUFFER_COMPATIBILITY_NONE;
+  }
+
+  // All buffers can be allocated on the heap.
+  iree_hal_buffer_compatibility_t compatibility =
+      IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE;
+
+  // CUDA supports host <-> device for all copies.
+  if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+    compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
+  }
+
+  // Buffers can only be used on the queue if they are device visible.
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+    if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_DISPATCH)) {
+      compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH;
+    }
+  }
+
+  return compatibility;
+}
+
+static void iree_hal_cuda_buffer_free(iree_hal_cuda_context_wrapper_t* context,
+                                      iree_hal_memory_type_t memory_type,
+                                      CUdeviceptr device_ptr, void* host_ptr) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  if (iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) {
+    // Device local.
+    CUDA_IGNORE_ERROR(context->syms, cuMemFree(device_ptr));
+  } else {
+    // Host local.
+    CUDA_IGNORE_ERROR(context->syms, cuMemFreeHost(host_ptr));
+  }
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_status_t iree_hal_cuda_allocator_allocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  iree_hal_cuda_allocator_t* allocator =
+      iree_hal_cuda_allocator_cast(base_allocator);
+  // Guard against the corner case where the requested buffer size is 0. The
+  // application is unlikely to do anything when requesting a 0-byte buffer; but
+  // it can happen in real world use cases. So we should at least not crash.
+  if (allocation_size == 0) allocation_size = 4;
+
+  // If concurrent managed access is not supported then make device-local +
+  // host-visible allocations fall back to host-local + device-visible
+  // page-locked memory. This will be significantly slower for the device to
+  // access but the compiler only uses this type for readback staging buffers
+  // and it's better to function than function fast.
+  iree_hal_memory_type_t memory_type = params->type;
+  if (!allocator->supports_concurrent_managed_access &&
+      iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
+                                         IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+    memory_type &= ~(IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
+                     IREE_HAL_MEMORY_TYPE_HOST_VISIBLE);
+    memory_type |=
+        IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+  }
+
+  iree_status_t status = iree_ok_status();
+  void* host_ptr = NULL;
+  CUdeviceptr device_ptr = 0;
+  IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda_buffer_allocate");
+  if (iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) {
+    // Device local case.
+    if (iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+      status =
+          CU_RESULT_TO_STATUS(allocator->context->syms,
+                              cuMemAllocManaged(&device_ptr, allocation_size,
+                                                CU_MEM_ATTACH_GLOBAL));
+      if (iree_status_is_ok(status)) {
+        // Prefetch the buffer on the GPU device.
+        status = CU_RESULT_TO_STATUS(
+            allocator->context->syms,
+            cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device,
+                               allocator->stream));
+      }
+      host_ptr = (void*)device_ptr;
+    } else {
+      // Device only.
+      status = CU_RESULT_TO_STATUS(allocator->context->syms,
+                                   cuMemAlloc(&device_ptr, allocation_size));
+    }
+  } else {
+    unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP;
+    if (!iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_HOST_CACHED)) {
+      flags |= CU_MEMHOSTALLOC_WRITECOMBINED;
+    }
+    status =
+        CU_RESULT_TO_STATUS(allocator->context->syms,
+                            cuMemHostAlloc(&host_ptr, allocation_size, flags));
+    if (iree_status_is_ok(status)) {
+      status = CU_RESULT_TO_STATUS(
+          allocator->context->syms,
+          cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0));
+    }
+  }
+  IREE_TRACE_ZONE_END(z0);
+
+  iree_hal_buffer_t* buffer = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda_buffer_wrap(
+        base_allocator, memory_type, params->access, params->usage,
+        allocation_size,
+        /*byte_offset=*/0,
+        /*byte_length=*/allocation_size, device_ptr, host_ptr, &buffer);
+  }
+
+  // Copy the initial contents into the buffer. This may require staging.
+  if (iree_status_is_ok(status) &&
+      !iree_const_byte_span_is_empty(initial_data)) {
+    status = iree_hal_device_transfer_range(
+        allocator->base_device,
+        iree_hal_make_host_transfer_buffer_span((void*)initial_data.data,
+                                                initial_data.data_length),
+        0, iree_hal_make_device_transfer_buffer(buffer), 0,
+        initial_data.data_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+        iree_infinite_timeout());
+  }
+
+  if (iree_status_is_ok(status)) {
+    IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID,
+                           (void*)iree_hal_cuda_buffer_device_pointer(buffer),
+                           allocation_size);
+    IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc(
+        &allocator->statistics, memory_type, allocation_size));
+    *out_buffer = buffer;
+  } else {
+    if (!buffer) {
+      iree_hal_cuda_buffer_free(allocator->context, memory_type, device_ptr,
+                                host_ptr);
+    } else {
+      iree_hal_buffer_release(buffer);
+    }
+  }
+  return status;
+}
+
+static void iree_hal_cuda_allocator_deallocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
+  iree_hal_cuda_allocator_t* allocator =
+      iree_hal_cuda_allocator_cast(base_allocator);
+  iree_hal_memory_type_t memory_type = iree_hal_buffer_memory_type(base_buffer);
+  iree_hal_cuda_buffer_free(allocator->context, memory_type,
+                            iree_hal_cuda_buffer_device_pointer(base_buffer),
+                            iree_hal_cuda_buffer_host_pointer(base_buffer));
+
+  IREE_TRACE_FREE_NAMED(
+      IREE_HAL_CUDA_ALLOCATOR_ID,
+      (void*)iree_hal_cuda_buffer_device_pointer(base_buffer));
+  IREE_STATISTICS(iree_hal_allocator_statistics_record_free(
+      &allocator->statistics, memory_type,
+      iree_hal_buffer_allocation_size(base_buffer)));
+
+  iree_hal_buffer_destroy(base_buffer);
+}
+
+static iree_status_t iree_hal_cuda_allocator_import_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
+    iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "importing from external buffers not supported");
+}
+
+static iree_status_t iree_hal_cuda_allocator_export_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_t* IREE_RESTRICT buffer,
+    iree_hal_external_buffer_type_t requested_type,
+    iree_hal_external_buffer_flags_t requested_flags,
+    iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "exporting to external buffers not supported");
+}
+
+static const iree_hal_allocator_vtable_t iree_hal_cuda_allocator_vtable = {
+    .destroy = iree_hal_cuda_allocator_destroy,
+    .host_allocator = iree_hal_cuda_allocator_host_allocator,
+    .trim = iree_hal_cuda_allocator_trim,
+    .query_statistics = iree_hal_cuda_allocator_query_statistics,
+    .query_compatibility = iree_hal_cuda_allocator_query_compatibility,
+    .allocate_buffer = iree_hal_cuda_allocator_allocate_buffer,
+    .deallocate_buffer = iree_hal_cuda_allocator_deallocate_buffer,
+    .import_buffer = iree_hal_cuda_allocator_import_buffer,
+    .export_buffer = iree_hal_cuda_allocator_export_buffer,
+};
diff --git a/runtime/src/iree/hal/cuda/cuda_allocator.h b/runtime/src/iree/hal/cuda/cuda_allocator.h
new file mode 100644
index 0000000..4f22579
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_allocator.h
@@ -0,0 +1,28 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_ALLOCATOR_H_
+#define IREE_HAL_CUDA_ALLOCATOR_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/context_wrapper.h"
+#include "iree/hal/cuda/status_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a CUDA allocator.
+iree_status_t iree_hal_cuda_allocator_create(
+    iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
+    CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_ALLOCATOR_H_
diff --git a/runtime/src/iree/hal/cuda/cuda_buffer.c b/runtime/src/iree/hal/cuda/cuda_buffer.c
new file mode 100644
index 0000000..b69241f
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_buffer.c
@@ -0,0 +1,136 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/cuda_buffer.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+typedef struct iree_hal_cuda_buffer_t {
+  iree_hal_buffer_t base;
+  void* host_ptr;
+  CUdeviceptr device_ptr;
+} iree_hal_cuda_buffer_t;
+
+static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable;
+
+static iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_cast(
+    iree_hal_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_buffer_vtable);
+  return (iree_hal_cuda_buffer_t*)base_value;
+}
+
+iree_status_t iree_hal_cuda_buffer_wrap(
+    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    CUdeviceptr device_ptr, void* host_ptr, iree_hal_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_t host_allocator =
+      iree_hal_allocator_host_allocator(allocator);
+  iree_hal_cuda_buffer_t* buffer = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base,
+                               allocation_size, byte_offset, byte_length,
+                               memory_type, allowed_access, allowed_usage,
+                               &iree_hal_cuda_buffer_vtable, &buffer->base);
+    buffer->host_ptr = host_ptr;
+    buffer->device_ptr = device_ptr;
+    *out_buffer = &buffer->base;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+  iree_allocator_t host_allocator = base_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_allocator_free(host_allocator, buffer);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_status_t iree_hal_cuda_buffer_map_range(
+    iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping) {
+  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+
+  // TODO(benvanik): add upload/download for unmapped buffers.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
+      iree_hal_buffer_memory_type(base_buffer),
+      IREE_HAL_MEMORY_TYPE_HOST_VISIBLE));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_validate_usage(iree_hal_buffer_allowed_usage(base_buffer),
+                                     IREE_HAL_BUFFER_USAGE_MAPPING));
+
+  uint8_t* data_ptr = (uint8_t*)(buffer->host_ptr) + local_byte_offset;
+  // If we mapped for discard, scribble over the bytes. This is not a mandated
+  // behavior but it will make debugging issues easier. Alternatively for
+  // heap buffers we could reallocate them such that ASAN yells, but that
+  // would only work if the entire buffer was discarded.
+#ifndef NDEBUG
+  if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) {
+    memset(data_ptr, 0xCD, local_byte_length);
+  }
+#endif  // !NDEBUG
+
+  mapping->contents = iree_make_byte_span(data_ptr, local_byte_length);
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_buffer_unmap_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) {
+  // Nothing to do (today).
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_buffer_invalidate_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  // Nothing to do.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_buffer_flush_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  // Nothing to do.
+  return iree_ok_status();
+}
+
+CUdeviceptr iree_hal_cuda_buffer_device_pointer(
+    iree_hal_buffer_t* base_buffer) {
+  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+  return buffer->device_ptr;
+}
+
+void* iree_hal_cuda_buffer_host_pointer(iree_hal_buffer_t* base_buffer) {
+  iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+  return buffer->host_ptr;
+}
+
+static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable = {
+    .recycle = iree_hal_buffer_recycle,
+    .destroy = iree_hal_cuda_buffer_destroy,
+    .map_range = iree_hal_cuda_buffer_map_range,
+    .unmap_range = iree_hal_cuda_buffer_unmap_range,
+    .invalidate_range = iree_hal_cuda_buffer_invalidate_range,
+    .flush_range = iree_hal_cuda_buffer_flush_range,
+};
diff --git a/runtime/src/iree/hal/cuda/cuda_buffer.h b/runtime/src/iree/hal/cuda/cuda_buffer.h
new file mode 100644
index 0000000..2aaf037
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_buffer.h
@@ -0,0 +1,38 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_BUFFER_H_
+#define IREE_HAL_CUDA_BUFFER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/cuda_headers.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Wraps a CUDA allocation in an iree_hal_buffer_t.
+iree_status_t iree_hal_cuda_buffer_wrap(
+    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    CUdeviceptr device_ptr, void* host_ptr, iree_hal_buffer_t** out_buffer);
+
+// Returns the CUDA base pointer for the given |buffer|.
+// This is the entire allocated_buffer and must be offset by the buffer
+// byte_offset and byte_length when used.
+CUdeviceptr iree_hal_cuda_buffer_device_pointer(iree_hal_buffer_t* buffer);
+
+// Returns the CUDA host pointer for the given |buffer|, if available.
+void* iree_hal_cuda_buffer_host_pointer(iree_hal_buffer_t* buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_BUFFER_H_
diff --git a/runtime/src/iree/hal/cuda/cuda_device.c b/runtime/src/iree/hal/cuda/cuda_device.c
new file mode 100644
index 0000000..4cd0290
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_device.c
@@ -0,0 +1,407 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/cuda_device.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/internal/arena.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/cuda/context_wrapper.h"
+#include "iree/hal/cuda/cuda_allocator.h"
+#include "iree/hal/cuda/cuda_event.h"
+#include "iree/hal/cuda/descriptor_set_layout.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+#include "iree/hal/cuda/event_semaphore.h"
+#include "iree/hal/cuda/executable_layout.h"
+#include "iree/hal/cuda/graph_command_buffer.h"
+#include "iree/hal/cuda/nop_executable_cache.h"
+#include "iree/hal/cuda/status_util.h"
+#include "iree/hal/cuda/stream_command_buffer.h"
+#include "iree/hal/utils/buffer_transfer.h"
+#include "iree/hal/utils/deferred_command_buffer.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_cuda_device_t
+//===----------------------------------------------------------------------===//
+
+// HAL device implementation backed by a single CUDA device/context/stream.
+typedef struct iree_hal_cuda_device_t {
+  iree_hal_resource_t resource;
+  // Device identifier; the string storage is appended to this allocation.
+  iree_string_view_t identifier;
+
+  // Block pool used for command buffers with a larger block size (as command
+  // buffers can contain inlined data uploads).
+  iree_arena_block_pool_t block_pool;
+
+  // Optional driver that owns the CUDA symbols. We retain it for our lifetime
+  // to ensure the symbols remains valid.
+  iree_hal_driver_t* driver;
+
+  // Parameters used to control device behavior.
+  iree_hal_cuda_device_params_t params;
+
+  // CUDA device handle this HAL device wraps.
+  CUdevice device;
+
+  // TODO: support multiple streams.
+  CUstream stream;
+  // Bundles the CUcontext, dynamic symbols, and host allocator shared with
+  // child resources (allocator, command buffers, etc).
+  iree_hal_cuda_context_wrapper_t context_wrapper;
+  iree_hal_allocator_t* device_allocator;
+
+  // Cache of the direct stream command buffer initialized when in stream mode.
+  // TODO: have one cached per stream once there are multiple streams.
+  iree_hal_command_buffer_t* stream_command_buffer;
+} iree_hal_cuda_device_t;
+
+static const iree_hal_device_vtable_t iree_hal_cuda_device_vtable;
+
+// Casts the opaque |base_value| to the concrete CUDA device type after
+// validating its vtable via IREE_HAL_ASSERT_TYPE.
+static iree_hal_cuda_device_t* iree_hal_cuda_device_cast(
+    iree_hal_device_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_device_vtable);
+  return (iree_hal_cuda_device_t*)base_value;
+}
+
+// Initializes |out_params| with the default device parameters: 32KiB arena
+// blocks, 8 queues, graph-based command buffers, and no inline execution.
+void iree_hal_cuda_device_params_initialize(
+    iree_hal_cuda_device_params_t* out_params) {
+  out_params->arena_block_size = 32 * 1024;
+  out_params->queue_count = 8;
+  out_params->command_buffer_mode = IREE_HAL_CUDA_COMMAND_BUFFER_MODE_GRAPH;
+  out_params->allow_inline_execution = false;
+}
+
+// Validates |params| prior to device creation; returns
+// IREE_STATUS_INVALID_ARGUMENT if the arena block size is under 4096 bytes or
+// no queues are requested.
+static iree_status_t iree_hal_cuda_device_check_params(
+    const iree_hal_cuda_device_params_t* params) {
+  if (params->arena_block_size < 4096) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "arena block size too small (< 4096 bytes)");
+  }
+  if (params->queue_count == 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "at least one queue is required");
+  }
+  return iree_ok_status();
+}
+
+// Allocates and initializes the device struct, taking ownership of |stream|
+// and |context| on success. On failure the partially-initialized device is
+// released (which tears down any child resources already created).
+static iree_status_t iree_hal_cuda_device_create_internal(
+    iree_hal_driver_t* driver, iree_string_view_t identifier,
+    const iree_hal_cuda_device_params_t* params, CUdevice cu_device,
+    CUstream stream, CUcontext context, iree_hal_cuda_dynamic_symbols_t* syms,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  iree_hal_cuda_device_t* device = NULL;
+  // The identifier string is stored inline after the struct in one allocation.
+  iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(host_allocator, total_size, (void**)&device));
+  memset(device, 0, total_size);
+  iree_hal_resource_initialize(&iree_hal_cuda_device_vtable, &device->resource);
+  device->driver = driver;
+  // Retain the driver so the dynamic symbols stay valid for our lifetime.
+  iree_hal_driver_retain(device->driver);
+  iree_string_view_append_to_buffer(
+      identifier, &device->identifier,
+      (char*)device + iree_sizeof_struct(*device));
+  device->params = *params;
+  device->device = cu_device;
+  device->stream = stream;
+  device->context_wrapper.cu_context = context;
+  device->context_wrapper.host_allocator = host_allocator;
+  iree_arena_block_pool_initialize(params->arena_block_size, host_allocator,
+                                   &device->block_pool);
+  device->context_wrapper.syms = syms;
+
+  iree_status_t status = iree_hal_cuda_allocator_create(
+      (iree_hal_device_t*)device, &device->context_wrapper, cu_device, stream,
+      &device->device_allocator);
+
+  // In stream mode a single direct stream command buffer is cached up front
+  // and reused for deferred command buffer replay at submission time.
+  if (iree_status_is_ok(status) &&
+      params->command_buffer_mode == IREE_HAL_CUDA_COMMAND_BUFFER_MODE_STREAM) {
+    status = iree_hal_cuda_stream_command_buffer_create(
+        (iree_hal_device_t*)device, &device->context_wrapper,
+        IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION,
+        IREE_HAL_COMMAND_CATEGORY_ANY, device->stream, /*block_pool=*/NULL,
+        &device->stream_command_buffer);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_device = (iree_hal_device_t*)device;
+  } else {
+    iree_hal_device_release((iree_hal_device_t*)device);
+  }
+  return status;
+}
+
+// Creates a CUDA HAL device that owns its own CUcontext and stream.
+// On failure the stream/context created here are destroyed before returning.
+iree_status_t iree_hal_cuda_device_create(
+    iree_hal_driver_t* driver, iree_string_view_t identifier,
+    const iree_hal_cuda_device_params_t* params,
+    iree_hal_cuda_dynamic_symbols_t* syms, CUdevice device,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0,
+                                    iree_hal_cuda_device_check_params(params));
+  CUcontext context;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, CU_RESULT_TO_STATUS(syms, cuCtxCreate(&context, 0, device)));
+  // NOTE: initialized to NULL so that the cleanup path below does not read an
+  // uninitialized handle if cuStreamCreate fails without writing |stream|.
+  CUstream stream = NULL;
+  iree_status_t status = CU_RESULT_TO_STATUS(
+      syms, cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+
+  if (iree_status_is_ok(status)) {
+    // Ownership of |stream| and |context| transfers to the device on success.
+    status = iree_hal_cuda_device_create_internal(driver, identifier, params,
+                                                  device, stream, context, syms,
+                                                  host_allocator, out_device);
+  }
+  if (!iree_status_is_ok(status)) {
+    if (stream) {
+      syms->cuStreamDestroy(stream);
+    }
+    syms->cuCtxDestroy(context);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Tears down the device and all owned resources in reverse creation order:
+// cached command buffer, device allocator, CUDA stream, block pool, driver.
+static void iree_hal_cuda_device_destroy(iree_hal_device_t* base_device) {
+  iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  // Grab the allocator before freeing |device| as it lives inside it.
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // There should be no more buffers live that use the allocator.
+  iree_hal_command_buffer_release(device->stream_command_buffer);
+  iree_hal_allocator_release(device->device_allocator);
+  CUDA_IGNORE_ERROR(device->context_wrapper.syms,
+                    cuStreamDestroy(device->stream));
+
+  iree_arena_block_pool_deinitialize(&device->block_pool);
+
+  // Finally, destroy the device.
+  iree_hal_driver_release(device->driver);
+
+  iree_allocator_free(host_allocator, device);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the identifier string captured when the device was created.
+static iree_string_view_t iree_hal_cuda_device_id(
+    iree_hal_device_t* base_device) {
+  return iree_hal_cuda_device_cast(base_device)->identifier;
+}
+
+// Returns the host allocator shared through the context wrapper.
+static iree_allocator_t iree_hal_cuda_device_host_allocator(
+    iree_hal_device_t* base_device) {
+  iree_hal_cuda_device_t* cuda_device = iree_hal_cuda_device_cast(base_device);
+  return cuda_device->context_wrapper.host_allocator;
+}
+
+// Returns the device-memory allocator created alongside the device.
+static iree_hal_allocator_t* iree_hal_cuda_device_allocator(
+    iree_hal_device_t* base_device) {
+  return iree_hal_cuda_device_cast(base_device)->device_allocator;
+}
+
+// Releases unused pooled memory: first the host-side arena block pool, then
+// whatever the device allocator can drop.
+static iree_status_t iree_hal_cuda_device_trim(iree_hal_device_t* base_device) {
+  iree_hal_cuda_device_t* cuda_device = iree_hal_cuda_device_cast(base_device);
+  iree_arena_block_pool_trim(&cuda_device->block_pool);
+  return iree_hal_allocator_trim(cuda_device->device_allocator);
+}
+
+// Queries an i32 device configuration value. Today only the
+// "hal.executable.format" category is supported, answering 1 for the
+// "cuda-nvptx-fb" executable format and 0 otherwise.
+static iree_status_t iree_hal_cuda_device_query_i32(
+    iree_hal_device_t* base_device, iree_string_view_t category,
+    iree_string_view_t key, int32_t* out_value) {
+  // iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  *out_value = 0;
+
+  if (iree_string_view_equal(category,
+                             iree_make_cstring_view("hal.executable.format"))) {
+    *out_value =
+        iree_string_view_equal(key, iree_make_cstring_view("cuda-nvptx-fb"))
+            ? 1
+            : 0;
+    return iree_ok_status();
+  }
+
+  // Unknown categories are NOT_FOUND so callers can probe optional keys.
+  return iree_make_status(
+      IREE_STATUS_NOT_FOUND,
+      "unknown device configuration key value '%.*s :: %.*s'",
+      (int)category.size, category.data, (int)key.size, key.data);
+}
+
+// Creates a command buffer whose concrete type depends on the requested mode
+// and the device parameters: direct stream recording for inline execution,
+// CUDA graphs in graph mode, or a deferred (record-then-replay) command
+// buffer in stream mode.
+static iree_status_t iree_hal_cuda_device_create_command_buffer(
+    iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  if (device->params.allow_inline_execution &&
+      iree_all_bits_set(mode,
+                        IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) {
+    // The caller has indicated the command buffer can be executed as it is
+    // recorded, implying that the command buffer cannot be reused and doesn't
+    // need to be persisted. This lets us lower the execution delay as we can
+    // directly route commands to a CUDA stream and let it eagerly flush.
+    return iree_hal_cuda_stream_command_buffer_create(
+        base_device, &device->context_wrapper, mode, command_categories,
+        device->stream, &device->block_pool, out_command_buffer);
+  }
+  switch (device->params.command_buffer_mode) {
+    case IREE_HAL_CUDA_COMMAND_BUFFER_MODE_GRAPH:
+      return iree_hal_cuda_graph_command_buffer_create(
+          base_device, &device->context_wrapper, mode, command_categories,
+          queue_affinity, &device->block_pool, out_command_buffer);
+    case IREE_HAL_CUDA_COMMAND_BUFFER_MODE_STREAM:
+      // Recorded commands are captured and replayed against the cached stream
+      // command buffer when submitted (see queue_submit).
+      return iree_hal_deferred_command_buffer_create(
+          base_device, mode, command_categories, &device->block_pool,
+          iree_hal_device_host_allocator(base_device), out_command_buffer);
+    default:
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "invalid command buffer mode");
+  }
+}
+
+// Non-push descriptor sets are not yet supported on the CUDA backend; this
+// always returns UNIMPLEMENTED.
+static iree_status_t iree_hal_cuda_device_create_descriptor_set(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_t* set_layout,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings,
+    iree_hal_descriptor_set_t** out_descriptor_set) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "non-push descriptor sets still need work");
+}
+
+// Forwards descriptor set layout creation to the CUDA implementation using
+// the shared context wrapper.
+static iree_status_t iree_hal_cuda_device_create_descriptor_set_layout(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  return iree_hal_cuda_descriptor_set_layout_create(
+      &device->context_wrapper, usage_type, binding_count, bindings,
+      out_descriptor_set_layout);
+}
+
+// Forwards event creation to the CUDA implementation (see cuda_event.h).
+static iree_status_t iree_hal_cuda_device_create_event(
+    iree_hal_device_t* base_device, iree_hal_event_t** out_event) {
+  iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  return iree_hal_cuda_event_create(&device->context_wrapper, out_event);
+}
+
+// Creates a no-op executable cache; |loop| is unused by this backend.
+static iree_status_t iree_hal_cuda_device_create_executable_cache(
+    iree_hal_device_t* base_device, iree_string_view_t identifier,
+    iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) {
+  iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  return iree_hal_cuda_nop_executable_cache_create(
+      &device->context_wrapper, identifier, out_executable_cache);
+}
+
+// Forwards executable layout creation to the CUDA implementation. Note that
+// the argument order differs between this HAL entry point and the CUDA
+// helper (set layouts before push constant count).
+static iree_status_t iree_hal_cuda_device_create_executable_layout(
+    iree_hal_device_t* base_device, iree_host_size_t push_constants,
+    iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_hal_executable_layout_t** out_executable_layout) {
+  iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  return iree_hal_cuda_executable_layout_create(
+      &device->context_wrapper, set_layout_count, set_layouts, push_constants,
+      out_executable_layout);
+}
+
+// Forwards semaphore creation to the CUDA implementation with the given
+// initial payload value.
+static iree_status_t iree_hal_cuda_device_create_semaphore(
+    iree_hal_device_t* base_device, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore) {
+  iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  return iree_hal_cuda_semaphore_create(&device->context_wrapper, initial_value,
+                                        out_semaphore);
+}
+
+// Submits command buffer batches to the device stream. Stream command
+// buffers have already executed inline, graph command buffers are launched
+// via cuGraphLaunch, and deferred command buffers are replayed against the
+// cached stream command buffer.
+static iree_status_t iree_hal_cuda_device_queue_submit(
+    iree_hal_device_t* base_device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
+  iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  // NOTE: counters use iree_host_size_t to match |batch_count| and
+  // |command_buffer_count| and avoid signed/unsigned comparison issues.
+  for (iree_host_size_t i = 0; i < batch_count; i++) {
+    for (iree_host_size_t j = 0; j < batches[i].command_buffer_count; j++) {
+      iree_hal_command_buffer_t* command_buffer = batches[i].command_buffers[j];
+      if (iree_hal_cuda_stream_command_buffer_isa(command_buffer)) {
+        // Nothing to do for an inline command buffer; all the work has already
+        // been submitted. When we support semaphores we'll still need to signal
+        // their completion but do not have to worry about any waits: if there
+        // were waits we wouldn't have been able to execute inline!
+      } else if (iree_hal_cuda_graph_command_buffer_isa(command_buffer)) {
+        CUgraphExec exec =
+            iree_hal_cuda_graph_command_buffer_exec(command_buffer);
+        CUDA_RETURN_IF_ERROR(device->context_wrapper.syms,
+                             cuGraphLaunch(exec, device->stream),
+                             "cuGraphLaunch");
+      } else {
+        IREE_RETURN_IF_ERROR(iree_hal_deferred_command_buffer_apply(
+            command_buffer, device->stream_command_buffer));
+      }
+    }
+  }
+  // TODO(thomasraoux): implement semaphores - for now this conservatively
+  // synchronizes after every submit.
+  CUDA_RETURN_IF_ERROR(device->context_wrapper.syms,
+                       cuStreamSynchronize(device->stream),
+                       "cuStreamSynchronize");
+  return iree_ok_status();
+}
+
+// Submits the batches and then blocks until |wait_semaphore| reaches
+// |wait_value| or |timeout| elapses.
+static iree_status_t iree_hal_cuda_device_submit_and_wait(
+    iree_hal_device_t* base_device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches,
+    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
+    iree_timeout_t timeout) {
+  // Submit...
+  IREE_RETURN_IF_ERROR(iree_hal_cuda_device_queue_submit(
+      base_device, command_categories, queue_affinity, batch_count, batches));
+
+  // ...and wait.
+  return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
+}
+
+// Semaphore waits are not yet implemented on the CUDA backend.
+static iree_status_t iree_hal_cuda_device_wait_semaphores(
+    iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "semaphore not implemented");
+}
+
+// Blocks until all work on the device stream has completed. |timeout| is
+// currently ignored (see TODO below).
+static iree_status_t iree_hal_cuda_device_wait_idle(
+    iree_hal_device_t* base_device, iree_timeout_t timeout) {
+  iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+  // Wait until the stream is done.
+  // TODO(thomasraoux): CUDA doesn't support a deadline for wait, figure out how
+  // to handle it better.
+  CUDA_RETURN_IF_ERROR(device->context_wrapper.syms,
+                       cuStreamSynchronize(device->stream),
+                       "cuStreamSynchronize");
+  return iree_ok_status();
+}
+
+// HAL device vtable mapping each device entry point to the CUDA
+// implementation above. transfer_range uses the shared submit-and-wait
+// utility from iree/hal/utils/buffer_transfer.h.
+static const iree_hal_device_vtable_t iree_hal_cuda_device_vtable = {
+    .destroy = iree_hal_cuda_device_destroy,
+    .id = iree_hal_cuda_device_id,
+    .host_allocator = iree_hal_cuda_device_host_allocator,
+    .device_allocator = iree_hal_cuda_device_allocator,
+    .trim = iree_hal_cuda_device_trim,
+    .query_i32 = iree_hal_cuda_device_query_i32,
+    .create_command_buffer = iree_hal_cuda_device_create_command_buffer,
+    .create_descriptor_set = iree_hal_cuda_device_create_descriptor_set,
+    .create_descriptor_set_layout =
+        iree_hal_cuda_device_create_descriptor_set_layout,
+    .create_event = iree_hal_cuda_device_create_event,
+    .create_executable_cache = iree_hal_cuda_device_create_executable_cache,
+    .create_executable_layout = iree_hal_cuda_device_create_executable_layout,
+    .create_semaphore = iree_hal_cuda_device_create_semaphore,
+    .transfer_range = iree_hal_device_submit_transfer_range_and_wait,
+    .queue_submit = iree_hal_cuda_device_queue_submit,
+    .submit_and_wait = iree_hal_cuda_device_submit_and_wait,
+    .wait_semaphores = iree_hal_cuda_device_wait_semaphores,
+    .wait_idle = iree_hal_cuda_device_wait_idle,
+};
diff --git a/runtime/src/iree/hal/cuda/cuda_device.h b/runtime/src/iree/hal/cuda/cuda_device.h
new file mode 100644
index 0000000..d7b5790
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_device.h
@@ -0,0 +1,30 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_CUDA_DEVICE_H_
+#define IREE_HAL_CUDA_CUDA_DEVICE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/api.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a device that owns and manages its own CUcontext.
+iree_status_t iree_hal_cuda_device_create(
+    iree_hal_driver_t* driver, iree_string_view_t identifier,
+    const iree_hal_cuda_device_params_t* params,
+    iree_hal_cuda_dynamic_symbols_t* syms, CUdevice device,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_CUDA_DEVICE_H_
diff --git a/runtime/src/iree/hal/cuda/cuda_driver.c b/runtime/src/iree/hal/cuda/cuda_driver.c
new file mode 100644
index 0000000..e78b4e9
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_driver.c
@@ -0,0 +1,228 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/api.h"
+#include "iree/hal/cuda/cuda_device.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+#include "iree/hal/cuda/status_util.h"
+
+// HAL driver wrapping the CUDA driver API loaded via dynamic symbols.
+typedef struct iree_hal_cuda_driver_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  // Identifier used for the driver in the IREE driver registry.
+  // We allow overriding so that multiple CUDA versions can be exposed in the
+  // same process.
+  iree_string_view_t identifier;
+  // Default parameters applied to every device this driver creates.
+  iree_hal_cuda_device_params_t default_params;
+  // Index of the device used when the caller does not specify a device id.
+  int default_device_index;
+  // CUDA symbols.
+  iree_hal_cuda_dynamic_symbols_t syms;
+} iree_hal_cuda_driver_t;
+
+// Pick a fixed length for device name strings.
+#define IREE_MAX_CUDA_DEVICE_NAME_LENGTH 100
+
+static const iree_hal_driver_vtable_t iree_hal_cuda_driver_vtable;
+
+// Casts the opaque |base_value| to the concrete CUDA driver type after
+// validating its vtable via IREE_HAL_ASSERT_TYPE.
+static iree_hal_cuda_driver_t* iree_hal_cuda_driver_cast(
+    iree_hal_driver_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_driver_vtable);
+  return (iree_hal_cuda_driver_t*)base_value;
+}
+
+// Initializes |out_options| to defaults: all fields zeroed and the first
+// enumerated device selected by default.
+IREE_API_EXPORT void iree_hal_cuda_driver_options_initialize(
+    iree_hal_cuda_driver_options_t* out_options) {
+  memset(out_options, 0, sizeof(*out_options));
+  out_options->default_device_index = 0;
+}
+
+// Allocates the driver struct and loads the CUDA dynamic symbols. On failure
+// the partially-initialized driver is released.
+static iree_status_t iree_hal_cuda_driver_create_internal(
+    iree_string_view_t identifier,
+    const iree_hal_cuda_device_params_t* default_params,
+    const iree_hal_cuda_driver_options_t* options,
+    iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) {
+  iree_hal_cuda_driver_t* driver = NULL;
+  // The identifier string is stored inline after the struct in one allocation.
+  iree_host_size_t total_size = iree_sizeof_struct(*driver) + identifier.size;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(host_allocator, total_size, (void**)&driver));
+
+  iree_hal_resource_initialize(&iree_hal_cuda_driver_vtable, &driver->resource);
+  driver->host_allocator = host_allocator;
+  iree_string_view_append_to_buffer(
+      identifier, &driver->identifier,
+      (char*)driver + iree_sizeof_struct(*driver));
+  memcpy(&driver->default_params, default_params,
+         sizeof(driver->default_params));
+  driver->default_device_index = options->default_device_index;
+
+  iree_status_t status =
+      iree_hal_cuda_dynamic_symbols_initialize(host_allocator, &driver->syms);
+  if (iree_status_is_ok(status)) {
+    *out_driver = (iree_hal_driver_t*)driver;
+  } else {
+    iree_hal_driver_release((iree_hal_driver_t*)driver);
+  }
+  return status;
+}
+
+// Unloads the CUDA symbols and frees the driver allocation.
+static void iree_hal_cuda_driver_destroy(iree_hal_driver_t* base_driver) {
+  iree_hal_cuda_driver_t* driver = iree_hal_cuda_driver_cast(base_driver);
+  // Grab the allocator before freeing |driver| as it lives inside it.
+  iree_allocator_t host_allocator = driver->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda_dynamic_symbols_deinitialize(&driver->syms);
+  iree_allocator_free(host_allocator, driver);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Public entry point: validates arguments and delegates to the internal
+// creation helper. |default_params| are applied to devices created later.
+IREE_API_EXPORT iree_status_t iree_hal_cuda_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_cuda_device_params_t* default_params,
+    const iree_hal_cuda_driver_options_t* options,
+    iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) {
+  IREE_ASSERT_ARGUMENT(default_params);
+  IREE_ASSERT_ARGUMENT(options);
+  IREE_ASSERT_ARGUMENT(out_driver);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_hal_cuda_driver_create_internal(
+      identifier, default_params, options, host_allocator, out_driver);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Populates device information from the given CUDA physical device handle.
+// |out_device_info| must point to valid memory and additional data will be
+// appended to |buffer_ptr| and the new pointer is returned.
+static uint8_t* iree_hal_cuda_populate_device_info(
+    CUdevice device, iree_hal_cuda_dynamic_symbols_t* syms, uint8_t* buffer_ptr,
+    iree_hal_device_info_t* out_device_info) {
+  char device_name[IREE_MAX_CUDA_DEVICE_NAME_LENGTH];
+  // Best-effort: on failure the name is left as whatever is in the buffer.
+  CUDA_IGNORE_ERROR(syms,
+                    cuDeviceGetName(device_name, sizeof(device_name), device));
+  memset(out_device_info, 0, sizeof(*out_device_info));
+  // The CUdevice handle doubles as the stable HAL device id.
+  out_device_info->device_id = (iree_hal_device_id_t)device;
+
+  iree_string_view_t device_name_string =
+      iree_make_string_view(device_name, strlen(device_name));
+  buffer_ptr += iree_string_view_append_to_buffer(
+      device_name_string, &out_device_info->name, (char*)buffer_ptr);
+  return buffer_ptr;
+}
+
+// Returns true if |device| supports all required extensions.
+// Currently every enumerated device is accepted.
+static bool iree_hal_cuda_is_valid_device(iree_hal_cuda_driver_t* driver,
+                                          CUdevice device) {
+  return true;
+}
+
+// Enumerates available CUDA devices into a single allocation containing the
+// info array followed by fixed-size name storage for each entry. The caller
+// frees |out_device_infos| with |host_allocator|.
+static iree_status_t iree_hal_cuda_driver_query_available_devices(
+    iree_hal_driver_t* base_driver, iree_allocator_t host_allocator,
+    iree_hal_device_info_t** out_device_infos,
+    iree_host_size_t* out_device_info_count) {
+  iree_hal_cuda_driver_t* driver = iree_hal_cuda_driver_cast(base_driver);
+  // Query the number of available CUDA devices.
+  int device_count = 0;
+  CUDA_RETURN_IF_ERROR(&driver->syms, cuDeviceGetCount(&device_count),
+                       "cuDeviceGetCount");
+
+  // Allocate the return infos and populate with the devices. Each entry
+  // reserves IREE_MAX_CUDA_DEVICE_NAME_LENGTH bytes of name storage after the
+  // array itself.
+  iree_hal_device_info_t* device_infos = NULL;
+  iree_host_size_t total_size =
+      device_count * (sizeof(iree_hal_device_info_t) +
+                      IREE_MAX_CUDA_DEVICE_NAME_LENGTH * sizeof(char));
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&device_infos);
+  int valid_device_count = 0;
+  if (iree_status_is_ok(status)) {
+    uint8_t* buffer_ptr =
+        (uint8_t*)device_infos + device_count * sizeof(iree_hal_device_info_t);
+    for (int i = 0; i < device_count; ++i) {
+      CUdevice device;
+      // Best-effort: stop enumerating on the first cuDeviceGet failure but
+      // still report the devices gathered so far. The per-device status must
+      // not shadow the outer |status| (that would leak the status object and
+      // flip the cleanup path below), so it is ignored explicitly.
+      iree_status_t device_status = CU_RESULT_TO_STATUS(
+          &driver->syms, cuDeviceGet(&device, i), "cuDeviceGet");
+      if (!iree_status_is_ok(device_status)) {
+        iree_status_ignore(device_status);
+        break;
+      }
+      if (!iree_hal_cuda_is_valid_device(driver, device)) continue;
+      buffer_ptr = iree_hal_cuda_populate_device_info(
+          device, &driver->syms, buffer_ptr, &device_infos[valid_device_count]);
+      valid_device_count++;
+    }
+  }
+  if (iree_status_is_ok(status)) {
+    *out_device_info_count = valid_device_count;
+    *out_device_infos = device_infos;
+  } else {
+    iree_allocator_free(host_allocator, device_infos);
+  }
+  return status;
+}
+
+// Resolves the driver's default device index to a concrete CUdevice by
+// enumerating available devices. Fails with UNAVAILABLE when no devices
+// exist and NOT_FOUND when the index is out of range.
+static iree_status_t iree_hal_cuda_driver_select_default_device(
+    iree_hal_driver_t* base_driver, iree_hal_cuda_dynamic_symbols_t* syms,
+    int default_device_index, iree_allocator_t host_allocator,
+    CUdevice* out_device) {
+  iree_hal_device_info_t* out_device_infos;
+  iree_host_size_t device_count;
+  IREE_RETURN_IF_ERROR(iree_hal_cuda_driver_query_available_devices(
+      base_driver, host_allocator, &out_device_infos, &device_count));
+  iree_status_t status = iree_ok_status();
+  if (device_count == 0) {
+    status = iree_make_status(IREE_STATUS_UNAVAILABLE,
+                              "no compatible CUDA devices were found");
+  } else if (default_device_index >= device_count) {
+    // NOTE: %d with an int cast rather than %ld: iree_host_size_t does not
+    // match long on all platforms and the mismatch is undefined behavior.
+    status = iree_make_status(IREE_STATUS_NOT_FOUND,
+                              "default device %d not found (of %d enumerated)",
+                              default_device_index, (int)device_count);
+  } else {
+    *out_device = (CUdevice)out_device_infos[default_device_index].device_id;
+  }
+  iree_allocator_free(host_allocator, out_device_infos);
+  return status;
+}
+
+// Creates a HAL device for |device_id| (a CUdevice handle from enumeration)
+// or, when |device_id| is 0, for the driver's configured default device.
+static iree_status_t iree_hal_cuda_driver_create_device(
+    iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  iree_hal_cuda_driver_t* driver = iree_hal_cuda_driver_cast(base_driver);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, CU_RESULT_TO_STATUS(&driver->syms, cuInit(0), "cuInit"));
+  // Use either the specified device (enumerated earlier) or whatever default
+  // one was specified when the driver was created.
+  CUdevice device = (CUdevice)device_id;
+  if (device == 0) {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_cuda_driver_select_default_device(
+                base_driver, &driver->syms, driver->default_device_index,
+                host_allocator, &device));
+  }
+
+  iree_string_view_t device_name = iree_make_cstring_view("cuda");
+
+  // Attempt to create the device.
+  iree_status_t status = iree_hal_cuda_device_create(
+      base_driver, device_name, &driver->default_params, &driver->syms, device,
+      host_allocator, out_device);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// HAL driver vtable mapping driver entry points to the CUDA implementation.
+static const iree_hal_driver_vtable_t iree_hal_cuda_driver_vtable = {
+    .destroy = iree_hal_cuda_driver_destroy,
+    .query_available_devices = iree_hal_cuda_driver_query_available_devices,
+    .create_device = iree_hal_cuda_driver_create_device,
+};
diff --git a/runtime/src/iree/hal/cuda/cuda_event.c b/runtime/src/iree/hal/cuda/cuda_event.c
new file mode 100644
index 0000000..ce4d5dd
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_event.c
@@ -0,0 +1,61 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/cuda_event.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+// Dummy events for now, don't do anything. Events are intended to be
+// represented by CUDA graph edges (see cuda_event.h) so no per-event CUDA
+// resource is held here.
+typedef struct iree_hal_cuda_event_t {
+  iree_hal_resource_t resource;
+  // Borrowed context; supplies the host allocator used to free this event.
+  iree_hal_cuda_context_wrapper_t* context_wrapper;
+} iree_hal_cuda_event_t;
+
+static const iree_hal_event_vtable_t iree_hal_cuda_event_vtable;
+
+// Casts the opaque |base_value| to the concrete CUDA event type after
+// validating its vtable via IREE_HAL_ASSERT_TYPE.
+static iree_hal_cuda_event_t* iree_hal_cuda_event_cast(
+    iree_hal_event_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_event_vtable);
+  return (iree_hal_cuda_event_t*)base_value;
+}
+
+// Creates a dummy event; only the host-side struct is allocated and no CUDA
+// resources are created.
+iree_status_t iree_hal_cuda_event_create(
+    iree_hal_cuda_context_wrapper_t* context_wrapper,
+    iree_hal_event_t** out_event) {
+  IREE_ASSERT_ARGUMENT(context_wrapper);
+  IREE_ASSERT_ARGUMENT(out_event);
+  // Cleared so callers never observe a stale pointer on failure.
+  *out_event = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda_event_t* event = NULL;
+  iree_status_t status = iree_allocator_malloc(context_wrapper->host_allocator,
+                                               sizeof(*event), (void**)&event);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_cuda_event_vtable, &event->resource);
+    event->context_wrapper = context_wrapper;
+    *out_event = (iree_hal_event_t*)event;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the event struct; there are no CUDA resources to release.
+static void iree_hal_cuda_event_destroy(iree_hal_event_t* base_event) {
+  iree_hal_cuda_event_t* event = iree_hal_cuda_event_cast(base_event);
+  // Grab the allocator before freeing |event| as it is reached through it.
+  iree_allocator_t host_allocator = event->context_wrapper->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, event);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Event vtable; only destruction is needed for this dummy implementation.
+static const iree_hal_event_vtable_t iree_hal_cuda_event_vtable = {
+    .destroy = iree_hal_cuda_event_destroy,
+};
diff --git a/runtime/src/iree/hal/cuda/cuda_event.h b/runtime/src/iree/hal/cuda/cuda_event.h
new file mode 100644
index 0000000..cf18b47
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_event.h
@@ -0,0 +1,31 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_EVENT_H_
+#define IREE_HAL_CUDA_EVENT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/context_wrapper.h"
+#include "iree/hal/cuda/cuda_headers.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a dummy event object. Object will be represented by CUDA Graph edges
+// so nothing is created at creation time. When an event is signaled in the
+// command buffer we will add the appropriate edges to enforce the right
+// synchronization.
+iree_status_t iree_hal_cuda_event_create(
+    iree_hal_cuda_context_wrapper_t* context_wrapper,
+    iree_hal_event_t** out_event);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_EVENT_H_
diff --git a/runtime/src/iree/hal/cuda/cuda_headers.h b/runtime/src/iree/hal/cuda/cuda_headers.h
new file mode 100644
index 0000000..cdfbff7
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/cuda_headers.h
@@ -0,0 +1,12 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_CUDA_HEADERS_H_
+#define IREE_HAL_CUDA_CUDA_HEADERS_H_
+
+#include "cuda.h"  // IWYU pragma: export
+
+#endif  // IREE_HAL_CUDA_CUDA_HEADERS_H_
diff --git a/runtime/src/iree/hal/cuda/descriptor_set_layout.c b/runtime/src/iree/hal/cuda/descriptor_set_layout.c
new file mode 100644
index 0000000..062cc7e
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/descriptor_set_layout.c
@@ -0,0 +1,81 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/descriptor_set_layout.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+typedef struct iree_hal_cuda_descriptor_set_layout_t {
+  iree_hal_resource_t resource;
+  iree_hal_cuda_context_wrapper_t* context;
+  iree_host_size_t binding_count;
+} iree_hal_cuda_descriptor_set_layout_t;
+
+static const iree_hal_descriptor_set_layout_vtable_t
+    iree_hal_cuda_descriptor_set_layout_vtable;
+
+static iree_hal_cuda_descriptor_set_layout_t*
+iree_hal_cuda_descriptor_set_layout_cast(
+    iree_hal_descriptor_set_layout_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_descriptor_set_layout_vtable);
+  return (iree_hal_cuda_descriptor_set_layout_t*)base_value;
+}
+
+iree_status_t iree_hal_cuda_descriptor_set_layout_create(
+    iree_hal_cuda_context_wrapper_t* context,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(!binding_count || bindings);
+  IREE_ASSERT_ARGUMENT(out_descriptor_set_layout);
+  *out_descriptor_set_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  iree_status_t status = iree_allocator_malloc(context->host_allocator,
+                                               sizeof(*descriptor_set_layout),
+                                               (void**)&descriptor_set_layout);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_cuda_descriptor_set_layout_vtable,
+                                 &descriptor_set_layout->resource);
+    descriptor_set_layout->context = context;
+    descriptor_set_layout->binding_count = binding_count;
+    *out_descriptor_set_layout =
+        (iree_hal_descriptor_set_layout_t*)descriptor_set_layout;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+iree_host_size_t iree_hal_cuda_descriptor_set_layout_binding_count(
+    iree_hal_descriptor_set_layout_t* base_descriptor_set_layout) {
+  iree_hal_cuda_descriptor_set_layout_t* descriptor_set_layout =
+      iree_hal_cuda_descriptor_set_layout_cast(base_descriptor_set_layout);
+  return descriptor_set_layout->binding_count;
+}
+
+static void iree_hal_cuda_descriptor_set_layout_destroy(
+    iree_hal_descriptor_set_layout_t* base_descriptor_set_layout) {
+  iree_hal_cuda_descriptor_set_layout_t* descriptor_set_layout =
+      iree_hal_cuda_descriptor_set_layout_cast(base_descriptor_set_layout);
+  iree_allocator_t host_allocator =
+      descriptor_set_layout->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, descriptor_set_layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static const iree_hal_descriptor_set_layout_vtable_t
+    iree_hal_cuda_descriptor_set_layout_vtable = {
+        .destroy = iree_hal_cuda_descriptor_set_layout_destroy,
+};
diff --git a/runtime/src/iree/hal/cuda/descriptor_set_layout.h b/runtime/src/iree/hal/cuda/descriptor_set_layout.h
new file mode 100644
index 0000000..c630d4c
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/descriptor_set_layout.h
@@ -0,0 +1,33 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_DESCRIPTOR_SET_LAYOUT_H_
+#define IREE_HAL_CUDA_DESCRIPTOR_SET_LAYOUT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/context_wrapper.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+iree_status_t iree_hal_cuda_descriptor_set_layout_create(
+    iree_hal_cuda_context_wrapper_t* context,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout);
+
+// Return the binding count for the given descriptor set layout.
+iree_host_size_t iree_hal_cuda_descriptor_set_layout_binding_count(
+    iree_hal_descriptor_set_layout_t* descriptor_set_layout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_DESCRIPTOR_SET_LAYOUT_H_
diff --git a/runtime/src/iree/hal/cuda/dynamic_symbol_tables.h b/runtime/src/iree/hal/cuda/dynamic_symbol_tables.h
new file mode 100644
index 0000000..9b3f5c9
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/dynamic_symbol_tables.h
@@ -0,0 +1,55 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+CU_PFN_DECL(cuCtxCreate, CUcontext*, unsigned int, CUdevice)
+CU_PFN_DECL(cuCtxDestroy, CUcontext)
+CU_PFN_DECL(cuDeviceGet, CUdevice*, int)
+CU_PFN_DECL(cuDeviceGetCount, int*)
+CU_PFN_DECL(cuDeviceGetName, char*, int, CUdevice)
+CU_PFN_DECL(cuDeviceGetAttribute, int*, CUdevice_attribute, CUdevice)
+CU_PFN_DECL(cuGetErrorName, CUresult, const char**)
+CU_PFN_DECL(cuGetErrorString, CUresult, const char**)
+CU_PFN_DECL(cuGraphAddMemcpyNode, CUgraphNode*, CUgraph, const CUgraphNode*,
+            size_t, const CUDA_MEMCPY3D*, CUcontext)
+CU_PFN_DECL(cuGraphAddMemsetNode, CUgraphNode*, CUgraph, const CUgraphNode*,
+            size_t, const CUDA_MEMSET_NODE_PARAMS*, CUcontext)
+CU_PFN_DECL(cuGraphAddKernelNode, CUgraphNode*, CUgraph, const CUgraphNode*,
+            size_t, const CUDA_KERNEL_NODE_PARAMS*)
+CU_PFN_DECL(cuGraphCreate, CUgraph*, unsigned int)
+CU_PFN_DECL(cuGraphDestroy, CUgraph)
+CU_PFN_DECL(cuGraphExecDestroy, CUgraphExec)
+CU_PFN_DECL(cuGraphGetNodes, CUgraph, CUgraphNode*, size_t*)
+CU_PFN_DECL(cuGraphInstantiate, CUgraphExec*, CUgraph, CUgraphNode*, char*,
+            size_t)
+CU_PFN_DECL(cuGraphLaunch, CUgraphExec, CUstream)
+CU_PFN_DECL(cuInit, unsigned int)
+CU_PFN_DECL(cuMemAllocManaged, CUdeviceptr*, size_t, unsigned int)
+CU_PFN_DECL(cuMemPrefetchAsync, CUdeviceptr, size_t, CUdevice, CUstream)
+CU_PFN_DECL(cuMemAlloc, CUdeviceptr*, size_t)
+CU_PFN_DECL(cuMemFree, CUdeviceptr)
+CU_PFN_DECL(cuMemFreeHost, void*)
+CU_PFN_DECL(cuMemHostAlloc, void**, size_t, unsigned int)
+CU_PFN_DECL(cuMemHostGetDevicePointer, CUdeviceptr*, void*, unsigned int)
+CU_PFN_DECL(cuModuleGetFunction, CUfunction*, CUmodule, const char*)
+CU_PFN_DECL(cuModuleLoadDataEx, CUmodule*, const void*, unsigned int,
+            CUjit_option*, void**)
+CU_PFN_DECL(cuModuleUnload, CUmodule)
+CU_PFN_DECL(cuStreamCreate, CUstream*, unsigned int)
+CU_PFN_DECL(cuStreamDestroy, CUstream)
+CU_PFN_DECL(cuStreamSynchronize, CUstream)
+CU_PFN_DECL(cuStreamWaitEvent, CUstream, CUevent, unsigned int)
+CU_PFN_DECL(cuMemsetD32Async, unsigned long long, unsigned int, size_t,
+            CUstream)
+CU_PFN_DECL(cuMemsetD16Async, unsigned long long, unsigned short, size_t,
+            CUstream)
+CU_PFN_DECL(cuMemsetD8Async, unsigned long long, unsigned char, size_t,
+            CUstream)
+CU_PFN_DECL(cuMemcpyAsync, CUdeviceptr, CUdeviceptr, size_t, CUstream)
+CU_PFN_DECL(cuMemcpyHtoDAsync_v2, CUdeviceptr, const void*, size_t, CUstream)
+CU_PFN_DECL(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
+CU_PFN_DECL(cuLaunchKernel, CUfunction, unsigned int, unsigned int,
+            unsigned int, unsigned int, unsigned int, unsigned int,
+            unsigned int, CUstream, void**, void**)
diff --git a/runtime/src/iree/hal/cuda/dynamic_symbols.c b/runtime/src/iree/hal/cuda/dynamic_symbols.c
new file mode 100644
index 0000000..84b93ad
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/dynamic_symbols.c
@@ -0,0 +1,72 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/dynamic_symbols.h"
+
+#include <string.h>
+
+#include "iree/base/internal/dynamic_library.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+static const char* kCUDALoaderSearchNames[] = {
+#if defined(IREE_PLATFORM_WINDOWS)
+    "nvcuda.dll",
+#else
+    "libcuda.so",
+#endif
+};
+
+#define concat(A, B) A B
+
+// Loads CUDA entry points, preferring the _v2 version when it exists.
+static iree_status_t iree_hal_cuda_dynamic_symbols_resolve_all(
+    iree_hal_cuda_dynamic_symbols_t* syms) {
+#define CU_PFN_DECL(cudaSymbolName, ...)                                       \
+  {                                                                            \
+    static const char* kName = #cudaSymbolName;                                \
+    IREE_RETURN_IF_ERROR(iree_dynamic_library_lookup_symbol(                   \
+        syms->loader_library, kName, (void**)&syms->cudaSymbolName));          \
+    static const char* kNameV2 = concat(#cudaSymbolName, "_v2");               \
+    void* funV2;                                                               \
+    iree_dynamic_library_lookup_symbol(syms->loader_library, kNameV2, &funV2); \
+    if (funV2) syms->cudaSymbolName = funV2;                                   \
+  }
+#include "iree/hal/cuda/dynamic_symbol_tables.h"  // IWYU pragma: keep
+#undef CU_PFN_DECL
+  return iree_ok_status();
+}
+
+iree_status_t iree_hal_cuda_dynamic_symbols_initialize(
+    iree_allocator_t allocator, iree_hal_cuda_dynamic_symbols_t* out_syms) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  memset(out_syms, 0, sizeof(*out_syms));
+  iree_status_t status = iree_dynamic_library_load_from_files(
+      IREE_ARRAYSIZE(kCUDALoaderSearchNames), kCUDALoaderSearchNames,
+      IREE_DYNAMIC_LIBRARY_FLAG_NONE, allocator, &out_syms->loader_library);
+  if (iree_status_is_not_found(status)) {
+    iree_status_ignore(status);
+    return iree_make_status(
+        IREE_STATUS_UNAVAILABLE,
+        "CUDA runtime library not available; ensure installed and on path");
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda_dynamic_symbols_resolve_all(out_syms);
+  }
+  if (!iree_status_is_ok(status)) {
+    iree_hal_cuda_dynamic_symbols_deinitialize(out_syms);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_hal_cuda_dynamic_symbols_deinitialize(
+    iree_hal_cuda_dynamic_symbols_t* syms) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_dynamic_library_release(syms->loader_library);
+  memset(syms, 0, sizeof(*syms));
+  IREE_TRACE_ZONE_END(z0);
+}
diff --git a/runtime/src/iree/hal/cuda/dynamic_symbols.h b/runtime/src/iree/hal/cuda/dynamic_symbols.h
new file mode 100644
index 0000000..ccdba6c
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/dynamic_symbols.h
@@ -0,0 +1,47 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_DYNAMIC_SYMBOLS_H_
+#define IREE_HAL_CUDA_DYNAMIC_SYMBOLS_H_
+
+#include "iree/base/api.h"
+#include "iree/base/internal/dynamic_library.h"
+#include "iree/hal/cuda/cuda_headers.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// DynamicSymbols allows dynamically loading a subset of the CUDA driver API.
+// It loads all the functions declared in `dynamic_symbol_tables.h` and fails
+// if any of the symbols are not available. The function signatures match the
+// declarations in `cuda.h`.
+typedef struct iree_hal_cuda_dynamic_symbols_t {
+  iree_dynamic_library_t* loader_library;
+
+#define CU_PFN_DECL(cudaSymbolName, ...) \
+  CUresult (*cudaSymbolName)(__VA_ARGS__);
+#include "iree/hal/cuda/dynamic_symbol_tables.h"  // IWYU pragma: export
+#undef CU_PFN_DECL
+} iree_hal_cuda_dynamic_symbols_t;
+
+// Initializes |out_syms| in-place with dynamically loaded CUDA symbols.
+// iree_hal_cuda_dynamic_symbols_deinitialize must be used to release the
+// library resources.
+iree_status_t iree_hal_cuda_dynamic_symbols_initialize(
+    iree_allocator_t allocator, iree_hal_cuda_dynamic_symbols_t* out_syms);
+
+// Deinitializes |syms| by unloading the backing library. All function pointers
+// will be invalidated. They _may_ still work if there are other reasons the
+// library remains loaded so be careful.
+void iree_hal_cuda_dynamic_symbols_deinitialize(
+    iree_hal_cuda_dynamic_symbols_t* syms);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_DYNAMIC_SYMBOLS_H_
diff --git a/runtime/src/iree/hal/cuda/dynamic_symbols_test.cc b/runtime/src/iree/hal/cuda/dynamic_symbols_test.cc
new file mode 100644
index 0000000..ab5136c
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/dynamic_symbols_test.cc
@@ -0,0 +1,50 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/dynamic_symbols.h"
+
+#include <iostream>
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+
+namespace iree {
+namespace hal {
+namespace cuda {
+namespace {
+
+#define CUDE_CHECK_ERRORS(expr)      \
+  {                                  \
+    CUresult status = expr;          \
+    ASSERT_EQ(CUDA_SUCCESS, status); \
+  }
+
+TEST(DynamicSymbolsTest, CreateFromSystemLoader) {
+  iree_hal_cuda_dynamic_symbols_t symbols;
+  iree_status_t status = iree_hal_cuda_dynamic_symbols_initialize(
+      iree_allocator_system(), &symbols);
+  if (!iree_status_is_ok(status)) {
+    iree_status_fprint(stderr, status);
+    iree_status_ignore(status);
+    std::cerr << "Symbols cannot be loaded, skipping test.";
+    GTEST_SKIP();
+  }
+
+  int device_count = 0;
+  CUDE_CHECK_ERRORS(symbols.cuInit(0));
+  CUDE_CHECK_ERRORS(symbols.cuDeviceGetCount(&device_count));
+  if (device_count > 0) {
+    CUdevice device;
+    CUDE_CHECK_ERRORS(symbols.cuDeviceGet(&device, /*ordinal=*/0));
+  }
+
+  iree_hal_cuda_dynamic_symbols_deinitialize(&symbols);
+}
+
+}  // namespace
+}  // namespace cuda
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/cuda/event_semaphore.c b/runtime/src/iree/hal/cuda/event_semaphore.c
new file mode 100644
index 0000000..17a5bfb
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/event_semaphore.c
@@ -0,0 +1,93 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/event_semaphore.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+typedef struct iree_hal_cuda_semaphore_t {
+  iree_hal_resource_t resource;
+  iree_hal_cuda_context_wrapper_t* context;
+  uint64_t initial_value;
+} iree_hal_cuda_semaphore_t;
+
+static const iree_hal_semaphore_vtable_t iree_hal_cuda_semaphore_vtable;
+
+static iree_hal_cuda_semaphore_t* iree_hal_cuda_semaphore_cast(
+    iree_hal_semaphore_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_semaphore_vtable);
+  return (iree_hal_cuda_semaphore_t*)base_value;
+}
+
+iree_status_t iree_hal_cuda_semaphore_create(
+    iree_hal_cuda_context_wrapper_t* context, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(out_semaphore);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda_semaphore_t* semaphore = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      context->host_allocator, sizeof(*semaphore), (void**)&semaphore);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_cuda_semaphore_vtable,
+                                 &semaphore->resource);
+    semaphore->context = context;
+    semaphore->initial_value = initial_value;
+    *out_semaphore = (iree_hal_semaphore_t*)semaphore;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_cuda_semaphore_destroy(
+    iree_hal_semaphore_t* base_semaphore) {
+  iree_hal_cuda_semaphore_t* semaphore =
+      iree_hal_cuda_semaphore_cast(base_semaphore);
+  iree_allocator_t host_allocator = semaphore->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_status_t iree_hal_cuda_semaphore_query(
+    iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) {
+  // TODO: Support semaphores completely.
+  *out_value = 0;
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "not implemented on CUDA");
+}
+
+static iree_status_t iree_hal_cuda_semaphore_signal(
+    iree_hal_semaphore_t* base_semaphore, uint64_t new_value) {
+  // TODO: Support semaphores completely. Return OK currently as everything is
+  // synchronized for each submit to allow things to run.
+  return iree_ok_status();
+}
+
+static void iree_hal_cuda_semaphore_fail(iree_hal_semaphore_t* base_semaphore,
+                                         iree_status_t status) {}
+
+static iree_status_t iree_hal_cuda_semaphore_wait(
+    iree_hal_semaphore_t* base_semaphore, uint64_t value,
+    iree_timeout_t timeout) {
+  // TODO: Support semaphores completely. Return OK currently as everything is
+  // synchronized for each submit to allow things to run.
+  return iree_ok_status();
+}
+
+static const iree_hal_semaphore_vtable_t iree_hal_cuda_semaphore_vtable = {
+    .destroy = iree_hal_cuda_semaphore_destroy,
+    .query = iree_hal_cuda_semaphore_query,
+    .signal = iree_hal_cuda_semaphore_signal,
+    .fail = iree_hal_cuda_semaphore_fail,
+    .wait = iree_hal_cuda_semaphore_wait,
+};
diff --git a/runtime/src/iree/hal/cuda/event_semaphore.h b/runtime/src/iree/hal/cuda/event_semaphore.h
new file mode 100644
index 0000000..3580bf2
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/event_semaphore.h
@@ -0,0 +1,30 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_SEMAPHORE_H_
+#define IREE_HAL_CUDA_SEMAPHORE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/context_wrapper.h"
+#include "iree/hal/cuda/status_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a CUDA semaphore with the given |initial_value|.
+iree_status_t iree_hal_cuda_semaphore_create(
+    iree_hal_cuda_context_wrapper_t* context, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_SEMAPHORE_H_
diff --git a/runtime/src/iree/hal/cuda/executable_layout.c b/runtime/src/iree/hal/cuda/executable_layout.c
new file mode 100644
index 0000000..892e03f
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/executable_layout.c
@@ -0,0 +1,126 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/executable_layout.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/cuda/descriptor_set_layout.h"
+
+typedef struct iree_hal_cuda_executable_layout_t {
+  iree_hal_resource_t resource;
+  iree_hal_cuda_context_wrapper_t* context;
+  iree_host_size_t push_constant_base_index;
+  iree_host_size_t push_constant_count;
+  iree_host_size_t set_layout_count;
+  iree_hal_descriptor_set_layout_t* set_layouts[];
+} iree_hal_cuda_executable_layout_t;
+
+static const iree_hal_executable_layout_vtable_t
+    iree_hal_cuda_executable_layout_vtable;
+
+static iree_hal_cuda_executable_layout_t* iree_hal_cuda_executable_layout_cast(
+    iree_hal_executable_layout_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_executable_layout_vtable);
+  return (iree_hal_cuda_executable_layout_t*)base_value;
+}
+
+static void iree_hal_cuda_executable_layout_destroy(
+    iree_hal_executable_layout_t* base_executable_layout) {
+  iree_hal_cuda_executable_layout_t* executable_layout =
+      iree_hal_cuda_executable_layout_cast(base_executable_layout);
+  iree_allocator_t host_allocator = executable_layout->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < executable_layout->set_layout_count; ++i) {
+    iree_hal_descriptor_set_layout_release(executable_layout->set_layouts[i]);
+  }
+  iree_allocator_free(host_allocator, executable_layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_hal_cuda_executable_layout_create(
+    iree_hal_cuda_context_wrapper_t* context, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_host_size_t push_constant_count,
+    iree_hal_executable_layout_t** out_executable_layout) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(!set_layout_count || set_layouts);
+  IREE_ASSERT_ARGUMENT(out_executable_layout);
+  *out_executable_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  if (push_constant_count > IREE_HAL_CUDA_MAX_PUSH_CONSTANT_COUNT) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "push constant count %zu over the limit of %d",
+                            push_constant_count,
+                            IREE_HAL_CUDA_MAX_PUSH_CONSTANT_COUNT);
+  }
+
+  // Currently the executable layout doesn't do anything.
+  // TODO: create the argument layout here, handling both push constants
+  // and buffers.
+  iree_hal_cuda_executable_layout_t* executable_layout = NULL;
+  iree_host_size_t total_size =
+      sizeof(*executable_layout) +
+      set_layout_count * sizeof(*executable_layout->set_layouts);
+  iree_status_t status = iree_allocator_malloc(
+      context->host_allocator, total_size, (void**)&executable_layout);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_cuda_executable_layout_vtable,
+                                 &executable_layout->resource);
+    executable_layout->context = context;
+    executable_layout->set_layout_count = set_layout_count;
+    iree_host_size_t binding_number = 0;
+    for (iree_host_size_t i = 0; i < set_layout_count; ++i) {
+      executable_layout->set_layouts[i] = set_layouts[i];
+      iree_hal_descriptor_set_layout_retain(set_layouts[i]);
+      binding_number +=
+          iree_hal_cuda_descriptor_set_layout_binding_count(set_layouts[i]);
+    }
+    executable_layout->push_constant_base_index = binding_number;
+    executable_layout->push_constant_count = push_constant_count;
+    *out_executable_layout = (iree_hal_executable_layout_t*)executable_layout;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+iree_host_size_t iree_hal_cuda_base_binding_index(
+    iree_hal_executable_layout_t* base_executable_layout, uint32_t set) {
+  iree_hal_cuda_executable_layout_t* executable_layout =
+      iree_hal_cuda_executable_layout_cast(base_executable_layout);
+  iree_host_size_t base_binding = 0;
+  for (iree_host_size_t i = 0; i < set; ++i) {
+    iree_host_size_t binding_count =
+        iree_hal_cuda_descriptor_set_layout_binding_count(
+            executable_layout->set_layouts[i]);
+    base_binding += binding_count;
+  }
+  return base_binding;
+}
+
+iree_host_size_t iree_hal_cuda_push_constant_index(
+    iree_hal_executable_layout_t* base_executable_layout) {
+  iree_hal_cuda_executable_layout_t* executable_layout =
+      iree_hal_cuda_executable_layout_cast(base_executable_layout);
+  return executable_layout->push_constant_base_index;
+}
+
+iree_host_size_t iree_hal_cuda_executable_layout_num_constants(
+    iree_hal_executable_layout_t* base_executable_layout) {
+  iree_hal_cuda_executable_layout_t* executable_layout =
+      iree_hal_cuda_executable_layout_cast(base_executable_layout);
+  return executable_layout->push_constant_count;
+}
+
+static const iree_hal_executable_layout_vtable_t
+    iree_hal_cuda_executable_layout_vtable = {
+        .destroy = iree_hal_cuda_executable_layout_destroy,
+};
diff --git a/runtime/src/iree/hal/cuda/executable_layout.h b/runtime/src/iree/hal/cuda/executable_layout.h
new file mode 100644
index 0000000..b7810e0
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/executable_layout.h
@@ -0,0 +1,43 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_EXECUTABLE_LAYOUT_H_
+#define IREE_HAL_CUDA_EXECUTABLE_LAYOUT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/context_wrapper.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#define IREE_HAL_CUDA_MAX_PUSH_CONSTANT_COUNT 64
+
+// Creates the kernel arguments.
+iree_status_t iree_hal_cuda_executable_layout_create(
+    iree_hal_cuda_context_wrapper_t* context, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_host_size_t push_constant_count,
+    iree_hal_executable_layout_t** out_executable_layout);
+
+// Return the base binding index for the given set.
+iree_host_size_t iree_hal_cuda_base_binding_index(
+    iree_hal_executable_layout_t* executable_layout, uint32_t set);
+
+// Return the base index for push constant data.
+iree_host_size_t iree_hal_cuda_push_constant_index(
+    iree_hal_executable_layout_t* base_executable_layout);
+
+// Return the number of constants in the executable layout.
+iree_host_size_t iree_hal_cuda_executable_layout_num_constants(
+    iree_hal_executable_layout_t* base_executable_layout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_EXECUTABLE_LAYOUT_H_
diff --git a/runtime/src/iree/hal/cuda/graph_command_buffer.c b/runtime/src/iree/hal/cuda/graph_command_buffer.c
new file mode 100644
index 0000000..d5ea450
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/graph_command_buffer.c
@@ -0,0 +1,583 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/graph_command_buffer.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/cuda/cuda_buffer.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+#include "iree/hal/cuda/executable_layout.h"
+#include "iree/hal/cuda/native_executable.h"
+#include "iree/hal/cuda/status_util.h"
+#include "iree/hal/utils/resource_set.h"
+
+// Maximum number of buffer bindings per pushed descriptor set.
+#define IREE_HAL_CUDA_MAX_BINDING_COUNT 64
+// Kernel arguments contain both bindings and push constants.
+#define IREE_HAL_CUDA_MAX_KERNEL_ARG 128
+
+// Command buffer implementation that directly maps to cuda graph.
+// This records the commands on the calling thread without additional threading
+// indirection.
+typedef struct iree_hal_cuda_graph_command_buffer_t {
+  iree_hal_command_buffer_t base;
+  iree_hal_cuda_context_wrapper_t* context;
+
+  // Maintains a reference to all resources used within the command buffer.
+  // Reset on each begin.
+  iree_hal_resource_set_t* resource_set;
+
+  // Staging arena used for host->device transfers.
+  // Used for when we need CUDA to be able to reference memory as it performs
+  // asynchronous operations.
+  iree_arena_allocator_t arena;
+
+  // Graph under construction (live between begin and end) and the executable
+  // graph instantiated from it at end.
+  CUgraph graph;
+  CUgraphExec exec;
+
+  // Keep track of the last node added to the command buffer as we are currently
+  // serializing all the nodes (each node depends on the previous one).
+  CUgraphNode last_node;
+  // Shadow copy of push constant values; patched into the kernel arguments at
+  // dispatch time.
+  int32_t push_constant[IREE_HAL_CUDA_MAX_PUSH_CONSTANT_COUNT];
+  // Keep track of the current set of kernel arguments. Trailing storage:
+  // IREE_HAL_CUDA_MAX_KERNEL_ARG void* entries, each pointing at a dedicated
+  // CUdeviceptr slot allocated immediately after this array (see _create).
+  void* current_descriptor[];
+} iree_hal_cuda_graph_command_buffer_t;
+
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_cuda_graph_command_buffer_vtable;
+
+// Downcasts |base_value| after asserting it is a CUDA graph command buffer.
+static iree_hal_cuda_graph_command_buffer_t*
+iree_hal_cuda_graph_command_buffer_cast(iree_hal_command_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_graph_command_buffer_vtable);
+  return (iree_hal_cuda_graph_command_buffer_t*)base_value;
+}
+
+// Creates a graph command buffer recording into |context|'s CUDA context.
+// |block_pool| backs the resource set and staging arena and must remain live
+// for the lifetime of the command buffer.
+iree_status_t iree_hal_cuda_graph_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_cuda_context_wrapper_t* context,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_arena_block_pool_t* block_pool,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(block_pool);
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda_graph_command_buffer_t* command_buffer = NULL;
+  // One allocation: struct + void* argument table + CUdeviceptr value slots.
+  size_t total_size = sizeof(*command_buffer) +
+                      IREE_HAL_CUDA_MAX_KERNEL_ARG * sizeof(void*) +
+                      IREE_HAL_CUDA_MAX_KERNEL_ARG * sizeof(CUdeviceptr);
+  iree_status_t status = iree_allocator_malloc(
+      context->host_allocator, total_size, (void**)&command_buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_command_buffer_initialize(
+        device, mode, command_categories, queue_affinity,
+        &iree_hal_cuda_graph_command_buffer_vtable, &command_buffer->base);
+    command_buffer->context = context;
+    iree_arena_initialize(block_pool, &command_buffer->arena);
+    command_buffer->graph = NULL;
+    command_buffer->exec = NULL;
+    command_buffer->last_node = NULL;
+
+    // Point each kernel argument entry at its dedicated CUdeviceptr slot in
+    // the trailing storage so bindings can later be written in place.
+    CUdeviceptr* device_ptrs =
+        (CUdeviceptr*)(command_buffer->current_descriptor +
+                       IREE_HAL_CUDA_MAX_KERNEL_ARG);
+    for (size_t i = 0; i < IREE_HAL_CUDA_MAX_KERNEL_ARG; i++) {
+      command_buffer->current_descriptor[i] = &device_ptrs[i];
+    }
+
+    status = iree_hal_resource_set_allocate(block_pool,
+                                            &command_buffer->resource_set);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_command_buffer = &command_buffer->base;
+  } else {
+    // NOTE(review): on allocation failure |command_buffer| is NULL here; this
+    // relies on &NULL->base == NULL (base is the first member) and on release
+    // tolerating NULL — confirm.
+    iree_hal_command_buffer_release(&command_buffer->base);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases any in-flight CUDA graph state and recycles the per-recording
+// allocations so the command buffer can be recorded again.
+static void iree_hal_cuda_graph_command_buffer_reset(
+    iree_hal_cuda_graph_command_buffer_t* command_buffer) {
+  CUgraph graph = command_buffer->graph;
+  command_buffer->graph = NULL;
+  if (graph) {
+    CUDA_IGNORE_ERROR(command_buffer->context->syms, cuGraphDestroy(graph));
+  }
+
+  CUgraphExec exec = command_buffer->exec;
+  command_buffer->exec = NULL;
+  if (exec) {
+    CUDA_IGNORE_ERROR(command_buffer->context->syms, cuGraphExecDestroy(exec));
+  }
+
+  command_buffer->last_node = NULL;
+
+  // Drop retained resources and return arena blocks to the pool.
+  iree_hal_resource_set_reset(command_buffer->resource_set);
+  iree_arena_reset(&command_buffer->arena);
+}
+
+static void iree_hal_cuda_graph_command_buffer_destroy(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Drop any live CUDA graph/exec handles before tearing down storage.
+  iree_hal_cuda_graph_command_buffer_reset(command_buffer);
+  iree_hal_resource_set_free(command_buffer->resource_set);
+  iree_arena_deinitialize(&command_buffer->arena);
+  iree_allocator_free(command_buffer->context->host_allocator, command_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the instantiated CUgraphExec (NULL until a successful end()).
+CUgraphExec iree_hal_cuda_graph_command_buffer_handle(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+  return command_buffer->exec;
+}
+
+// Returns true if |command_buffer| is a CUDA graph command buffer.
+bool iree_hal_cuda_graph_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer) {
+  return iree_hal_command_buffer_dyn_cast(
+      command_buffer, &iree_hal_cuda_graph_command_buffer_vtable);
+}
+
+// vtable dyn_cast hook: returns |command_buffer| when it matches our vtable,
+// NULL otherwise.
+static void* iree_hal_cuda_graph_command_buffer_dyn_cast(
+    iree_hal_command_buffer_t* command_buffer, const void* vtable) {
+  if (vtable == &iree_hal_cuda_graph_command_buffer_vtable) {
+    IREE_HAL_ASSERT_TYPE(command_buffer, vtable);
+    return command_buffer;
+  }
+  return NULL;
+}
+
+// Begins recording: discards any prior recording and opens a fresh CUgraph.
+static iree_status_t iree_hal_cuda_graph_command_buffer_begin(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+
+  // Reset any prior recorded commands.
+  iree_hal_cuda_graph_command_buffer_reset(command_buffer);
+
+  // Create a new empty graph to record into.
+  CUDA_RETURN_IF_ERROR(command_buffer->context->syms,
+                       cuGraphCreate(&command_buffer->graph, /*flags=*/0),
+                       "cuGraphCreate");
+
+  return iree_ok_status();
+}
+
+// Ends recording by instantiating the recorded CUgraph into an executable
+// CUgraphExec (stored in |exec| and retrieved via _handle/_exec).
+static iree_status_t iree_hal_cuda_graph_command_buffer_end(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+
+  // Reset state used during recording.
+  command_buffer->last_node = NULL;
+
+  // Compile the graph.
+  CUgraphNode error_node = NULL;
+  iree_status_t status =
+      CU_RESULT_TO_STATUS(command_buffer->context->syms,
+                          cuGraphInstantiate(&command_buffer->exec,
+                                             command_buffer->graph, &error_node,
+                                             /*logBuffer=*/NULL,
+                                             /*bufferSize=*/0));
+  if (iree_status_is_ok(status)) {
+    // No longer need the source graph used for construction.
+    CUDA_IGNORE_ERROR(command_buffer->context->syms,
+                      cuGraphDestroy(command_buffer->graph));
+    command_buffer->graph = NULL;
+  }
+
+  // FIX: previously returned iree_ok_status() unconditionally, which both
+  // leaked |status| and reported success even when cuGraphInstantiate failed.
+  return status;
+}
+
+// Debug group markers are not yet surfaced to profiling/tracing tools.
+static void iree_hal_cuda_graph_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  // TODO(benvanik): tracy event stack.
+}
+
+static void iree_hal_cuda_graph_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  // TODO(benvanik): tracy event stack.
+}
+
+// Barriers and events are currently no-ops: every recorded node depends on
+// the previous one (see |last_node|), which over-synchronizes but is always
+// correct.
+static iree_status_t iree_hal_cuda_graph_command_buffer_execution_barrier(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  // TODO: Implement barrier with Graph edges. Right now all the nodes are
+  // serialized.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_graph_command_buffer_signal_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // TODO: Implement barrier with Graph edges. Right now all the nodes are
+  // serialized.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_graph_command_buffer_reset_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // TODO: Implement barrier with Graph edges. Right now all the nodes are
+  // serialized.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_graph_command_buffer_wait_events(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_host_size_t event_count, const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  // TODO: Implement barrier with Graph edges. Right now all the nodes are
+  // serialized.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_graph_command_buffer_discard_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+  // We could mark the memory as invalidated so that managed CUDA does not try
+  // to copy it back to the host.
+  return iree_ok_status();
+}
+
+// Splats a pattern value of 1, 2, or 4 bytes out to a 4 byte value.
+static uint32_t iree_hal_cuda_splat_pattern(const void* pattern,
+                                            size_t pattern_length) {
+  if (pattern_length == 1) {
+    // Replicate the byte into all four lanes.
+    uint32_t byte_value = *(const uint8_t*)pattern;
+    return byte_value * 0x01010101u;
+  } else if (pattern_length == 2) {
+    // Replicate the halfword into both halves.
+    uint32_t halfword_value = *(const uint16_t*)pattern;
+    return (halfword_value << 16) | halfword_value;
+  } else if (pattern_length == 4) {
+    return *(const uint32_t*)pattern;
+  }
+  return 0;  // Already verified that this should not be possible.
+}
+
+// Records a memset node filling |length| bytes of |target_buffer| at
+// |target_offset| with the repeated |pattern| of |pattern_length| bytes.
+static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+
+  // Keep the buffer live until the command buffer is reset.
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &target_buffer));
+
+  CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+  // NOTE(review): |value| below is presumably interpreted at |elementSize|
+  // width by the driver, which would make the 4-byte splat redundant for
+  // 1/2-byte patterns — confirm against the driver docs before changing.
+  uint32_t dword_pattern = iree_hal_cuda_splat_pattern(pattern, pattern_length);
+  CUDA_MEMSET_NODE_PARAMS params = {
+      .dst = target_device_buffer + target_offset,
+      .elementSize = pattern_length,
+      // width in number of elements despite what driver documentation says.
+      .width = length / pattern_length,
+      .height = 1,
+      .value = dword_pattern,
+  };
+  // Serialize all the nodes for now.
+  CUgraphNode dep[] = {command_buffer->last_node};
+  size_t numNode = command_buffer->last_node ? 1 : 0;
+  CUDA_RETURN_IF_ERROR(
+      command_buffer->context->syms,
+      cuGraphAddMemsetNode(&command_buffer->last_node, command_buffer->graph,
+                           dep, numNode, &params,
+                           command_buffer->context->cu_context),
+      "cuGraphAddMemsetNode");
+  return iree_ok_status();
+}
+
+// Records a host->device copy of |length| bytes from |source_buffer| into
+// |target_buffer|, snapshotting the host data into the staging arena first.
+static iree_status_t iree_hal_cuda_graph_command_buffer_update_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+
+  // Allocate scratch space in the arena for the data and copy it in.
+  // The update buffer API requires that the command buffer capture the host
+  // memory at the time the method is called in case the caller wants to reuse
+  // the memory. Because CUDA memcpys are async if we didn't copy it's possible
+  // for the reused memory to change before the stream reaches the copy
+  // operation and get the wrong data.
+  uint8_t* storage = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_arena_allocate(&command_buffer->arena, length, (void**)&storage));
+  memcpy(storage, (const uint8_t*)source_buffer + source_offset, length);
+
+  // Keep the target buffer live until the command buffer is reset.
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &target_buffer));
+
+  CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+  CUDA_MEMCPY3D params = {
+      .srcMemoryType = CU_MEMORYTYPE_HOST,
+      .srcHost = storage,
+      .dstMemoryType = CU_MEMORYTYPE_DEVICE,
+      .dstDevice = target_device_buffer,
+      .dstXInBytes = iree_hal_buffer_byte_offset(target_buffer) + target_offset,
+      .WidthInBytes = length,
+      .Height = 1,
+      .Depth = 1,
+  };
+  // Serialize all the nodes for now.
+  CUgraphNode dep[] = {command_buffer->last_node};
+  size_t numNode = command_buffer->last_node ? 1 : 0;
+  CUDA_RETURN_IF_ERROR(
+      command_buffer->context->syms,
+      cuGraphAddMemcpyNode(&command_buffer->last_node, command_buffer->graph,
+                           dep, numNode, &params,
+                           command_buffer->context->cu_context),
+      "cuGraphAddMemcpyNode");
+  return iree_ok_status();
+}
+
+// Records a device->device copy of |length| bytes between two HAL buffers.
+static iree_status_t iree_hal_cuda_graph_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+
+  // Keep both buffers live until the command buffer is reset.
+  const iree_hal_buffer_t* resources[2] = {source_buffer, target_buffer};
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 2, resources));
+
+  // Resolve device pointers; subspan byte offsets are folded into the X
+  // coordinates of the memcpy below.
+  CUdeviceptr source_device_ptr = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(source_buffer));
+  CUdeviceptr target_device_ptr = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+  CUDA_MEMCPY3D params = {
+      .srcMemoryType = CU_MEMORYTYPE_DEVICE,
+      .srcDevice = source_device_ptr,
+      .srcXInBytes = iree_hal_buffer_byte_offset(source_buffer) + source_offset,
+      .dstMemoryType = CU_MEMORYTYPE_DEVICE,
+      .dstDevice = target_device_ptr,
+      .dstXInBytes = iree_hal_buffer_byte_offset(target_buffer) + target_offset,
+      .WidthInBytes = length,
+      .Height = 1,
+      .Depth = 1,
+  };
+  // Serialize all the nodes for now: each new node depends on the previous.
+  CUgraphNode dependencies[] = {command_buffer->last_node};
+  size_t dependency_count = command_buffer->last_node ? 1 : 0;
+  CUDA_RETURN_IF_ERROR(
+      command_buffer->context->syms,
+      cuGraphAddMemcpyNode(&command_buffer->last_node, command_buffer->graph,
+                           dependencies, dependency_count, &params,
+                           command_buffer->context->cu_context),
+      "cuGraphAddMemcpyNode");
+  return iree_ok_status();
+}
+
+// Stages push constant values into the shadow array that dispatch() later
+// patches into the kernel arguments. |offset| and |values_length| are in
+// bytes and must cover whole 32-bit values.
+static iree_status_t iree_hal_cuda_graph_command_buffer_push_constants(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+  iree_host_size_t constant_base_index = offset / sizeof(int32_t);
+  iree_host_size_t constant_count = values_length / sizeof(int32_t);
+  // FIX: bounds-check before writing; previously an out-of-range
+  // offset/length would write past the fixed-size push_constant array.
+  if (constant_base_index + constant_count >
+      IREE_HAL_CUDA_MAX_PUSH_CONSTANT_COUNT) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "push constant range exceeds capacity");
+  }
+  for (iree_host_size_t i = 0; i < constant_count; i++) {
+    command_buffer->push_constant[i + constant_base_index] =
+        ((uint32_t*)values)[i];
+  }
+  return iree_ok_status();
+}
+
+// Tie together the binding index and its index in |bindings| array.
+typedef struct {
+  uint32_t index;    // position within the caller-provided |bindings| array
+  uint32_t binding;  // shader binding index used for ordering
+} iree_hal_cuda_binding_mapping_t;
+
+// Helper to sort the binding based on their binding index.
+// NOTE(review): never returns 0, so equal binding indices get an unspecified
+// (qsort is unstable) but still valid relative order.
+static int compare_binding_index(const void* a, const void* b) {
+  const iree_hal_cuda_binding_mapping_t buffer_a =
+      *(const iree_hal_cuda_binding_mapping_t*)a;
+  const iree_hal_cuda_binding_mapping_t buffer_b =
+      *(const iree_hal_cuda_binding_mapping_t*)b;
+  return buffer_a.binding < buffer_b.binding ? -1 : 1;
+}
+
+// Writes the buffer bindings of descriptor |set| into the kernel argument
+// slots starting at the set's base binding index.
+static iree_status_t iree_hal_cuda_graph_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+  iree_host_size_t base_binding =
+      iree_hal_cuda_base_binding_index(executable_layout, set);
+  // Convention with the compiler side. We map bindings to kernel argument.
+  // We compact the bindings to get a dense set of arguments and keep them order
+  // based on the binding index.
+  // Sort the binding based on the binding index and map the array index to the
+  // argument index.
+  // FIX: check capacity *before* filling the fixed-size scratch array below;
+  // previously the writes ran first and could overflow the stack before the
+  // assert fired.
+  IREE_ASSERT_LT(binding_count, IREE_HAL_CUDA_MAX_BINDING_COUNT,
+                 "binding count larger than the max expected");
+  iree_hal_cuda_binding_mapping_t binding_used[IREE_HAL_CUDA_MAX_BINDING_COUNT];
+  for (iree_host_size_t i = 0; i < binding_count; i++) {
+    iree_hal_cuda_binding_mapping_t buffer = {i, bindings[i].binding};
+    binding_used[i] = buffer;
+  }
+  qsort(binding_used, binding_count, sizeof(iree_hal_cuda_binding_mapping_t),
+        compare_binding_index);
+  for (iree_host_size_t i = 0; i < binding_count; i++) {
+    const iree_hal_descriptor_set_binding_t* binding =
+        &bindings[binding_used[i].index];
+    // Fold allocation offset, subspan offset, and binding offset into the
+    // final device pointer used as the kernel argument.
+    CUdeviceptr device_ptr =
+        iree_hal_cuda_buffer_device_pointer(
+            iree_hal_buffer_allocated_buffer(binding->buffer)) +
+        iree_hal_buffer_byte_offset(binding->buffer) + binding->offset;
+    *((CUdeviceptr*)command_buffer->current_descriptor[i + base_binding]) =
+        device_ptr;
+    IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+        command_buffer->resource_set, 1, &binding->buffer));
+  }
+  return iree_ok_status();
+}
+
+// Pre-recorded descriptor sets are not supported; use push_descriptor_set.
+static iree_status_t iree_hal_cuda_graph_command_buffer_bind_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "need cuda implementation");
+}
+
+// Records a kernel dispatch node using the kernel argument and push constant
+// state captured by push_descriptor_set/push_constants.
+static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &executable));
+  iree_hal_executable_layout_t* layout =
+      iree_hal_cuda_executable_get_layout(executable, entry_point);
+  iree_host_size_t num_constants =
+      iree_hal_cuda_executable_layout_num_constants(layout);
+  iree_host_size_t constant_base_index =
+      iree_hal_cuda_push_constant_index(layout);
+  // Patch the push constants in the kernel arguments.
+  for (iree_host_size_t i = 0; i < num_constants; i++) {
+    *((uint32_t*)command_buffer->current_descriptor[i + constant_base_index]) =
+        command_buffer->push_constant[i];
+  }
+  // FIX: uint32_t (not int32_t) to match the uint32_t* out-params of the
+  // iree_hal_cuda_native_executable_* queries; previously this passed
+  // incompatible pointer types.
+  uint32_t block_size_x, block_size_y, block_size_z;
+  uint32_t shared_memory_size;
+  IREE_RETURN_IF_ERROR(iree_hal_cuda_native_executable_block_size(
+      executable, entry_point, &block_size_x, &block_size_y, &block_size_z));
+  IREE_RETURN_IF_ERROR(iree_hal_cuda_native_executable_shared_memory_size(
+      executable, entry_point, &shared_memory_size));
+  CUDA_KERNEL_NODE_PARAMS params = {
+      .func = iree_hal_cuda_native_executable_for_entry_point(executable,
+                                                              entry_point),
+      .blockDimX = block_size_x,
+      .blockDimY = block_size_y,
+      .blockDimZ = block_size_z,
+      .gridDimX = workgroup_x,
+      .gridDimY = workgroup_y,
+      .gridDimZ = workgroup_z,
+      .kernelParams = command_buffer->current_descriptor,
+      .sharedMemBytes = shared_memory_size,
+  };
+  // Serialize all the nodes for now.
+  CUgraphNode dep[] = {command_buffer->last_node};
+  size_t numNodes = command_buffer->last_node ? 1 : 0;
+  CUDA_RETURN_IF_ERROR(
+      command_buffer->context->syms,
+      cuGraphAddKernelNode(&command_buffer->last_node, command_buffer->graph,
+                           dep, numNodes, &params),
+      "cuGraphAddKernelNode");
+  return iree_ok_status();
+}
+
+// Indirect dispatch is not supported by the graph command buffer yet.
+static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch_indirect(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer,
+    iree_device_size_t workgroups_offset) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "need cuda implementation");
+}
+
+// Public accessor for the instantiated CUgraphExec; asserts (via dyn_cast
+// returning NULL) that |base_command_buffer| is a CUDA graph command buffer.
+// NOTE(review): appears to duplicate _handle() above — confirm whether one
+// can be removed.
+CUgraphExec iree_hal_cuda_graph_command_buffer_exec(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_cuda_graph_command_buffer_t* command_buffer =
+      (iree_hal_cuda_graph_command_buffer_t*)iree_hal_command_buffer_dyn_cast(
+          base_command_buffer, &iree_hal_cuda_graph_command_buffer_vtable);
+  IREE_ASSERT_TRUE(command_buffer);
+  return command_buffer->exec;
+}
+
+// vtable wiring the HAL command buffer interface to the graph implementation.
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_cuda_graph_command_buffer_vtable = {
+        .destroy = iree_hal_cuda_graph_command_buffer_destroy,
+        .dyn_cast = iree_hal_cuda_graph_command_buffer_dyn_cast,
+        .begin = iree_hal_cuda_graph_command_buffer_begin,
+        .end = iree_hal_cuda_graph_command_buffer_end,
+        .begin_debug_group =
+            iree_hal_cuda_graph_command_buffer_begin_debug_group,
+        .end_debug_group = iree_hal_cuda_graph_command_buffer_end_debug_group,
+        .execution_barrier =
+            iree_hal_cuda_graph_command_buffer_execution_barrier,
+        .signal_event = iree_hal_cuda_graph_command_buffer_signal_event,
+        .reset_event = iree_hal_cuda_graph_command_buffer_reset_event,
+        .wait_events = iree_hal_cuda_graph_command_buffer_wait_events,
+        .discard_buffer = iree_hal_cuda_graph_command_buffer_discard_buffer,
+        .fill_buffer = iree_hal_cuda_graph_command_buffer_fill_buffer,
+        .update_buffer = iree_hal_cuda_graph_command_buffer_update_buffer,
+        .copy_buffer = iree_hal_cuda_graph_command_buffer_copy_buffer,
+        .push_constants = iree_hal_cuda_graph_command_buffer_push_constants,
+        .push_descriptor_set =
+            iree_hal_cuda_graph_command_buffer_push_descriptor_set,
+        .bind_descriptor_set =
+            iree_hal_cuda_graph_command_buffer_bind_descriptor_set,
+        .dispatch = iree_hal_cuda_graph_command_buffer_dispatch,
+        .dispatch_indirect =
+            iree_hal_cuda_graph_command_buffer_dispatch_indirect,
+};
diff --git a/runtime/src/iree/hal/cuda/graph_command_buffer.h b/runtime/src/iree/hal/cuda/graph_command_buffer.h
new file mode 100644
index 0000000..8ef4fda
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/graph_command_buffer.h
@@ -0,0 +1,46 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_GRAPH_COMMAND_BUFFER_H_
+#define IREE_HAL_CUDA_GRAPH_COMMAND_BUFFER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/context_wrapper.h"
+#include "iree/hal/cuda/cuda_headers.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_arena_block_pool_t iree_arena_block_pool_t;
+
+// Creates a command buffer that records into a CUDA graph.
+//
+// NOTE: the |block_pool| must remain live for the lifetime of the command
+// buffers that use it.
+iree_status_t iree_hal_cuda_graph_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_cuda_context_wrapper_t* context,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_arena_block_pool_t* block_pool,
+    iree_hal_command_buffer_t** out_command_buffer);
+
+// Returns true if |command_buffer| is a CUDA graph-based command buffer.
+bool iree_hal_cuda_graph_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer);
+
+// Returns the instantiated CUgraphExec for the command buffer; only valid
+// after a successful end() and asserts if |command_buffer| is not a CUDA
+// graph command buffer.
+CUgraphExec iree_hal_cuda_graph_command_buffer_exec(
+    iree_hal_command_buffer_t* command_buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_GRAPH_COMMAND_BUFFER_H_
diff --git a/runtime/src/iree/hal/cuda/native_executable.c b/runtime/src/iree/hal/cuda/native_executable.c
new file mode 100644
index 0000000..5046595
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/native_executable.c
@@ -0,0 +1,188 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/native_executable.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+#include "iree/hal/cuda/executable_layout.h"
+#include "iree/hal/cuda/status_util.h"
+
+// flatcc schemas:
+#include "iree/base/internal/flatcc/parsing.h"
+#include "iree/schemas/cuda_executable_def_reader.h"
+#include "iree/schemas/cuda_executable_def_verifier.h"
+
+// Per-entry-point kernel handle and launch metadata extracted from the
+// flatbuffer at load time.
+typedef struct iree_hal_cuda_native_executable_function_t {
+  CUfunction cu_function;
+  uint32_t block_size_x;
+  uint32_t block_size_y;
+  uint32_t block_size_z;
+  uint32_t shared_memory_size;
+} iree_hal_cuda_native_executable_function_t;
+
+// Executable backed by a loaded CUmodule; entry_functions[] is trailing
+// storage sized by |entry_count| and executable_layouts points just past it
+// in the same allocation.
+typedef struct iree_hal_cuda_native_executable_t {
+  iree_hal_resource_t resource;
+  iree_hal_cuda_context_wrapper_t* context;
+  iree_hal_executable_layout_t** executable_layouts;
+  iree_host_size_t entry_count;
+  CUmodule module;
+  iree_hal_cuda_native_executable_function_t entry_functions[];
+} iree_hal_cuda_native_executable_t;
+
+static const iree_hal_executable_vtable_t
+    iree_hal_cuda_native_executable_vtable;
+
+// Downcasts |base_value| after asserting it is a CUDA native executable.
+static iree_hal_cuda_native_executable_t* iree_hal_cuda_native_executable_cast(
+    iree_hal_executable_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_native_executable_vtable);
+  return (iree_hal_cuda_native_executable_t*)base_value;
+}
+
+// Loads the PTX module from |executable_params| and resolves each entry point
+// function plus its launch metadata. Retains the per-entry executable layouts
+// for the lifetime of the executable.
+iree_status_t iree_hal_cuda_native_executable_create(
+    iree_hal_cuda_context_wrapper_t* context,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(executable_params);
+  IREE_ASSERT_ARGUMENT(out_executable);
+  *out_executable = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda_native_executable_t* executable = NULL;
+
+  // TODO: Verify the flat buffer.
+  iree_CUDAExecutableDef_table_t executable_def =
+      iree_CUDAExecutableDef_as_root(executable_params->executable_data.data);
+
+  // Create the kernel module.
+  flatbuffers_string_t ptx_image =
+      iree_CUDAExecutableDef_ptx_image_get(executable_def);
+  flatbuffers_uint32_vec_t shared_memory_sizes =
+      iree_CUDAExecutableDef_shared_memory_size_get(executable_def);
+  flatbuffers_string_vec_t entry_points_vec =
+      iree_CUDAExecutableDef_entry_points_get(executable_def);
+  iree_CUDABlockSizeDef_vec_t block_sizes_vec =
+      iree_CUDAExecutableDef_block_sizes_get(executable_def);
+  iree_host_size_t entry_count = flatbuffers_string_vec_len(entry_points_vec);
+  // Single allocation: struct + entry function table + layout pointer table.
+  iree_host_size_t total_size =
+      sizeof(*executable) +
+      entry_count * sizeof(iree_hal_cuda_native_executable_function_t) +
+      entry_count * sizeof(iree_hal_executable_layout_t*);
+  iree_status_t status = iree_allocator_malloc(context->host_allocator,
+                                               total_size, (void**)&executable);
+  CUmodule module = NULL;
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_cuda_native_executable_vtable,
+                                 &executable->resource);
+    executable->context = context;
+    // FIX: previously set unconditionally after this block, dereferencing a
+    // NULL |executable| when the allocation above failed.
+    executable->entry_count = entry_count;
+    executable->executable_layouts =
+        (void*)((char*)executable + sizeof(*executable) +
+                entry_count *
+                    sizeof(iree_hal_cuda_native_executable_function_t));
+    status = CU_RESULT_TO_STATUS(
+        context->syms, cuModuleLoadDataEx(&module, ptx_image, 0, NULL, NULL),
+        "cuModuleLoadDataEx");
+    // FIX: previously |module| was stored *before* cuModuleLoadDataEx ran,
+    // leaving executable->module permanently NULL.
+    executable->module = module;
+  }
+
+  // Resolve each entry point and its metadata; the loop condition guards
+  // against dereferencing |executable| after a failure.
+  for (iree_host_size_t i = 0; iree_status_is_ok(status) && i < entry_count;
+       i++) {
+    CUfunction function = NULL;
+    const char* entry_name = flatbuffers_string_vec_at(entry_points_vec, i);
+    status = CU_RESULT_TO_STATUS(
+        context->syms, cuModuleGetFunction(&function, module, entry_name),
+        "cuModuleGetFunction");
+    if (iree_status_is_ok(status)) {
+      status = CU_RESULT_TO_STATUS(
+          context->syms,
+          cuFuncSetAttribute(function,
+                             CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+                             shared_memory_sizes[i]),
+          "cuFuncSetAttribute");
+    }
+    executable->entry_functions[i].cu_function = function;
+    executable->entry_functions[i].block_size_x = block_sizes_vec[i].x;
+    executable->entry_functions[i].block_size_y = block_sizes_vec[i].y;
+    executable->entry_functions[i].block_size_z = block_sizes_vec[i].z;
+    executable->entry_functions[i].shared_memory_size = shared_memory_sizes[i];
+    executable->executable_layouts[i] =
+        executable_params->executable_layouts[i];
+    iree_hal_executable_layout_retain(executable_params->executable_layouts[i]);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_executable = (iree_hal_executable_t*)executable;
+  } else if (executable) {
+    iree_hal_executable_destroy((iree_hal_executable_t*)executable);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases all retained executable layouts, unloads the CUDA module, and
+// frees the executable storage.
+static void iree_hal_cuda_native_executable_destroy(
+    iree_hal_executable_t* base_executable) {
+  iree_hal_cuda_native_executable_t* executable =
+      iree_hal_cuda_native_executable_cast(base_executable);
+  iree_allocator_t host_allocator = executable->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < executable->entry_count; ++i) {
+    iree_hal_executable_layout_release(executable->executable_layouts[i]);
+  }
+  // FIX: unload the module loaded in _create; it was previously leaked.
+  if (executable->module) {
+    CUDA_IGNORE_ERROR(executable->context->syms,
+                      cuModuleUnload(executable->module));
+  }
+  iree_allocator_free(host_allocator, executable);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the CUfunction handle for |entry_point|; the executable must stay
+// live while the handle is in use.
+CUfunction iree_hal_cuda_native_executable_for_entry_point(
+    iree_hal_executable_t* base_executable, int32_t entry_point) {
+  iree_hal_cuda_native_executable_t* executable =
+      iree_hal_cuda_native_executable_cast(base_executable);
+  const iree_hal_cuda_native_executable_function_t* entry =
+      &executable->entry_functions[entry_point];
+  return entry->cu_function;
+}
+
+// Queries the launch block dimensions recorded for |entry_point|.
+iree_status_t iree_hal_cuda_native_executable_block_size(
+    iree_hal_executable_t* base_executable, int32_t entry_point, uint32_t* x,
+    uint32_t* y, uint32_t* z) {
+  iree_hal_cuda_native_executable_t* executable =
+      iree_hal_cuda_native_executable_cast(base_executable);
+  const iree_hal_cuda_native_executable_function_t* entry =
+      &executable->entry_functions[entry_point];
+  *x = entry->block_size_x;
+  *y = entry->block_size_y;
+  *z = entry->block_size_z;
+  return iree_ok_status();
+}
+
+// Queries the dynamic shared memory size recorded for |entry_point|.
+iree_status_t iree_hal_cuda_native_executable_shared_memory_size(
+    iree_hal_executable_t* base_executable, int32_t entry_point,
+    uint32_t* shared_memory_size) {
+  iree_hal_cuda_native_executable_t* executable =
+      iree_hal_cuda_native_executable_cast(base_executable);
+  const iree_hal_cuda_native_executable_function_t* entry =
+      &executable->entry_functions[entry_point];
+  *shared_memory_size = entry->shared_memory_size;
+  return iree_ok_status();
+}
+
+// Returns the (retained-by-the-executable) layout for |entry_point|.
+iree_hal_executable_layout_t* iree_hal_cuda_executable_get_layout(
+    iree_hal_executable_t* base_executable, int32_t entry_point) {
+  iree_hal_cuda_native_executable_t* executable =
+      iree_hal_cuda_native_executable_cast(base_executable);
+  iree_hal_executable_layout_t* layout =
+      executable->executable_layouts[entry_point];
+  return layout;
+}
+
+// vtable wiring the HAL executable interface to the native implementation.
+static const iree_hal_executable_vtable_t
+    iree_hal_cuda_native_executable_vtable = {
+        .destroy = iree_hal_cuda_native_executable_destroy,
+};
diff --git a/runtime/src/iree/hal/cuda/native_executable.h b/runtime/src/iree/hal/cuda/native_executable.h
new file mode 100644
index 0000000..8c19376
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/native_executable.h
@@ -0,0 +1,50 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_NATIVE_EXECUTABLE_H_
+#define IREE_HAL_CUDA_NATIVE_EXECUTABLE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/context_wrapper.h"
+#include "iree/hal/cuda/cuda_headers.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates an executable from a PTX module. The module may contain several
+// kernels that can be extracted along with the associated block size.
+iree_status_t iree_hal_cuda_native_executable_create(
+    iree_hal_cuda_context_wrapper_t* context,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable);
+
+CUfunction iree_hal_cuda_native_executable_for_entry_point(
+    iree_hal_executable_t* executable, int32_t entry_point);
+
+// Return the block size of the given |entry_point| within the executable.
+iree_status_t iree_hal_cuda_native_executable_block_size(
+    iree_hal_executable_t* executable, int32_t entry_point, uint32_t* x,
+    uint32_t* y, uint32_t* z);
+
+// Return the shared memory size of the given |entry_point| within the
+// executable.
+iree_status_t iree_hal_cuda_native_executable_shared_memory_size(
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t* shared_memory_size);
+
+// Returns the layout associated with the entry point.
+iree_hal_executable_layout_t* iree_hal_cuda_executable_get_layout(
+    iree_hal_executable_t* executable, int32_t entry_point);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_NATIVE_EXECUTABLE_H_
diff --git a/runtime/src/iree/hal/cuda/nop_executable_cache.c b/runtime/src/iree/hal/cuda/nop_executable_cache.c
new file mode 100644
index 0000000..c65795f
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/nop_executable_cache.c
@@ -0,0 +1,90 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/nop_executable_cache.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/cuda/native_executable.h"
+
+typedef struct iree_hal_cuda_nop_executable_cache_t {
+  iree_hal_resource_t resource;
+  iree_hal_cuda_context_wrapper_t* context;
+} iree_hal_cuda_nop_executable_cache_t;
+
+static const iree_hal_executable_cache_vtable_t
+    iree_hal_cuda_nop_executable_cache_vtable;
+
+static iree_hal_cuda_nop_executable_cache_t*
+iree_hal_cuda_nop_executable_cache_cast(
+    iree_hal_executable_cache_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_nop_executable_cache_vtable);
+  return (iree_hal_cuda_nop_executable_cache_t*)base_value;
+}
+
+iree_status_t iree_hal_cuda_nop_executable_cache_create(
+    iree_hal_cuda_context_wrapper_t* context, iree_string_view_t identifier,
+    iree_hal_executable_cache_t** out_executable_cache) {
+  IREE_ASSERT_ARGUMENT(out_executable_cache);
+  *out_executable_cache = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda_nop_executable_cache_t* executable_cache = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(context->host_allocator, sizeof(*executable_cache),
+                            (void**)&executable_cache);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_cuda_nop_executable_cache_vtable,
+                                 &executable_cache->resource);
+    executable_cache->context = context;
+
+    *out_executable_cache = (iree_hal_executable_cache_t*)executable_cache;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_cuda_nop_executable_cache_destroy(
+    iree_hal_executable_cache_t* base_executable_cache) {
+  iree_hal_cuda_nop_executable_cache_t* executable_cache =
+      iree_hal_cuda_nop_executable_cache_cast(base_executable_cache);
+  iree_allocator_t host_allocator = executable_cache->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, executable_cache);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static bool iree_hal_cuda_nop_executable_cache_can_prepare_format(
+    iree_hal_executable_cache_t* base_executable_cache,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  return iree_string_view_equal(executable_format,
+                                iree_make_cstring_view("PTXE"));
+}
+
+static iree_status_t iree_hal_cuda_nop_executable_cache_prepare_executable(
+    iree_hal_executable_cache_t* base_executable_cache,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  iree_hal_cuda_nop_executable_cache_t* executable_cache =
+      iree_hal_cuda_nop_executable_cache_cast(base_executable_cache);
+  return iree_hal_cuda_native_executable_create(
+      executable_cache->context, executable_params, out_executable);
+}
+
+static const iree_hal_executable_cache_vtable_t
+    iree_hal_cuda_nop_executable_cache_vtable = {
+        .destroy = iree_hal_cuda_nop_executable_cache_destroy,
+        .can_prepare_format =
+            iree_hal_cuda_nop_executable_cache_can_prepare_format,
+        .prepare_executable =
+            iree_hal_cuda_nop_executable_cache_prepare_executable,
+};
diff --git a/runtime/src/iree/hal/cuda/nop_executable_cache.h b/runtime/src/iree/hal/cuda/nop_executable_cache.h
new file mode 100644
index 0000000..dcb38a9
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/nop_executable_cache.h
@@ -0,0 +1,29 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_NOP_EXECUTABLE_CACHE_H_
+#define IREE_HAL_CUDA_NOP_EXECUTABLE_CACHE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/context_wrapper.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a no-op executable cache that does not cache at all.
+// This is useful to isolate pipeline caching behavior and verify compilation
+// behavior.
+iree_status_t iree_hal_cuda_nop_executable_cache_create(
+    iree_hal_cuda_context_wrapper_t* context, iree_string_view_t identifier,
+    iree_hal_executable_cache_t** out_executable_cache);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_NOP_EXECUTABLE_CACHE_H_
diff --git a/runtime/src/iree/hal/cuda/registration/CMakeLists.txt b/runtime/src/iree/hal/cuda/registration/CMakeLists.txt
new file mode 100644
index 0000000..a26a0e9
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/registration/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_add_all_subdirs()
+
+if(NOT IREE_HAL_DRIVER_CUDA)
+  return()
+endif()
+
+iree_cc_library(
+  NAME
+    registration
+  HDRS
+    "driver_module.h"
+  SRCS
+    "driver_module.c"
+  DEPS
+    iree::base
+    iree::base::cc
+    iree::base::core_headers
+    iree::base::internal::flags
+    iree::base::tracing
+    iree::hal
+    iree::hal::cuda
+  DEFINES
+    "IREE_HAL_HAVE_CUDA_DRIVER_MODULE=1"
+  PUBLIC
+)
diff --git a/runtime/src/iree/hal/cuda/registration/driver_module.c b/runtime/src/iree/hal/cuda/registration/driver_module.c
new file mode 100644
index 0000000..8215dfb
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/registration/driver_module.c
@@ -0,0 +1,85 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/registration/driver_module.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/flags.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/cuda/api.h"
+
+#define IREE_HAL_CUDA_DRIVER_ID 0x43554441u  // CUDA
+
+// Force using CUDA streams until we support command buffer caching to avoid the
+// overhead of graph creation.
+IREE_FLAG(
+    bool, cuda_use_streams, true,
+    "Use CUDA streams for executing command buffers (instead of graphs).");
+
+IREE_FLAG(bool, cuda_allow_inline_execution, false,
+          "Allow command buffers to execute inline against CUDA streams when "
+          "possible.");
+
+IREE_FLAG(int32_t, cuda_default_index, 0, "Index of the default CUDA device.");
+
+static iree_status_t iree_hal_cuda_driver_factory_enumerate(
+    void* self, const iree_hal_driver_info_t** out_driver_infos,
+    iree_host_size_t* out_driver_info_count) {
+  // NOTE: we could query supported cuda versions or featuresets here.
+  static const iree_hal_driver_info_t driver_infos[1] = {{
+      .driver_id = IREE_HAL_CUDA_DRIVER_ID,
+      .driver_name = iree_string_view_literal("cuda"),
+      .full_name = iree_string_view_literal("CUDA (dynamic)"),
+  }};
+  *out_driver_info_count = IREE_ARRAYSIZE(driver_infos);
+  *out_driver_infos = driver_infos;
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_driver_factory_try_create(
+    void* self, iree_hal_driver_id_t driver_id, iree_allocator_t allocator,
+    iree_hal_driver_t** out_driver) {
+  IREE_ASSERT_ARGUMENT(out_driver);
+  *out_driver = NULL;
+  if (driver_id != IREE_HAL_CUDA_DRIVER_ID) {
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "no driver with ID %016" PRIx64
+                            " is provided by this factory",
+                            driver_id);
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda_device_params_t default_params;
+  iree_hal_cuda_device_params_initialize(&default_params);
+  if (FLAG_cuda_use_streams) {
+    default_params.command_buffer_mode =
+        IREE_HAL_CUDA_COMMAND_BUFFER_MODE_STREAM;
+  }
+  default_params.allow_inline_execution = FLAG_cuda_allow_inline_execution;
+
+  iree_hal_cuda_driver_options_t driver_options;
+  iree_hal_cuda_driver_options_initialize(&driver_options);
+  driver_options.default_device_index = FLAG_cuda_default_index;
+
+  iree_string_view_t identifier = iree_make_cstring_view("cuda");
+  iree_status_t status = iree_hal_cuda_driver_create(
+      identifier, &default_params, &driver_options, allocator, out_driver);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t
+iree_hal_cuda_driver_module_register(iree_hal_driver_registry_t* registry) {
+  static const iree_hal_driver_factory_t factory = {
+      .self = NULL,
+      .enumerate = iree_hal_cuda_driver_factory_enumerate,
+      .try_create = iree_hal_cuda_driver_factory_try_create,
+  };
+  return iree_hal_driver_registry_register_factory(registry, &factory);
+}
diff --git a/runtime/src/iree/hal/cuda/registration/driver_module.h b/runtime/src/iree/hal/cuda/registration/driver_module.h
new file mode 100644
index 0000000..1de341e
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/registration/driver_module.h
@@ -0,0 +1,24 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_REGISTRATION_DRIVER_MODULE_H_
+#define IREE_HAL_CUDA_REGISTRATION_DRIVER_MODULE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+IREE_API_EXPORT iree_status_t
+iree_hal_cuda_driver_module_register(iree_hal_driver_registry_t* registry);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_REGISTRATION_DRIVER_MODULE_H_
diff --git a/runtime/src/iree/hal/cuda/status_util.c b/runtime/src/iree/hal/cuda/status_util.c
new file mode 100644
index 0000000..7532ecd
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/status_util.c
@@ -0,0 +1,32 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/status_util.h"
+
+#include <stddef.h>
+
+#include "iree/hal/cuda/dynamic_symbols.h"
+
+iree_status_t iree_hal_cuda_result_to_status(
+    iree_hal_cuda_dynamic_symbols_t* syms, CUresult result, const char* file,
+    uint32_t line) {
+  if (IREE_LIKELY(result == CUDA_SUCCESS)) {
+    return iree_ok_status();
+  }
+
+  const char* error_name = NULL;
+  if (syms->cuGetErrorName(result, &error_name) != CUDA_SUCCESS) {
+    error_name = "UNKNOWN";
+  }
+
+  const char* error_string = NULL;
+  if (syms->cuGetErrorString(result, &error_string) != CUDA_SUCCESS) {
+    error_string = "Unknown error.";
+  }
+  return iree_make_status_with_location(file, line, IREE_STATUS_INTERNAL,
+                                        "CUDA driver error '%s' (%d): %s",
+                                        error_name, result, error_string);
+}
diff --git a/runtime/src/iree/hal/cuda/status_util.h b/runtime/src/iree/hal/cuda/status_util.h
new file mode 100644
index 0000000..270048e
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/status_util.h
@@ -0,0 +1,54 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_STATUS_UTIL_H_
+#define IREE_HAL_CUDA_STATUS_UTIL_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Converts a CUresult to an iree_status_t.
+//
+// Usage:
+//   iree_status_t status = CU_RESULT_TO_STATUS(cuDoThing(...));
+#define CU_RESULT_TO_STATUS(syms, expr, ...) \
+  iree_hal_cuda_result_to_status((syms), ((syms)->expr), __FILE__, __LINE__)
+
+// IREE_RETURN_IF_ERROR but implicitly converts the CUresult return value to
+// a Status.
+//
+// Usage:
+//   CUDA_RETURN_IF_ERROR(cuDoThing(...), "message");
+#define CUDA_RETURN_IF_ERROR(syms, expr, ...)                                 \
+  IREE_RETURN_IF_ERROR(iree_hal_cuda_result_to_status((syms), ((syms)->expr), \
+                                                      __FILE__, __LINE__),    \
+                       __VA_ARGS__)
+
+// IREE_IGNORE_ERROR but implicitly converts the CUresult return value to a
+// Status.
+//
+// Usage:
+//   CUDA_IGNORE_ERROR(cuDoThing(...));
+#define CUDA_IGNORE_ERROR(syms, expr)                                      \
+  IREE_IGNORE_ERROR(iree_hal_cuda_result_to_status((syms), ((syms)->expr), \
+                                                   __FILE__, __LINE__))
+
+// Converts a CUresult to a Status object.
+iree_status_t iree_hal_cuda_result_to_status(
+    iree_hal_cuda_dynamic_symbols_t* syms, CUresult result, const char* file,
+    uint32_t line);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_STATUS_UTIL_H_
diff --git a/runtime/src/iree/hal/cuda/stream_command_buffer.c b/runtime/src/iree/hal/cuda/stream_command_buffer.c
new file mode 100644
index 0000000..08b908b
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/stream_command_buffer.c
@@ -0,0 +1,411 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/cuda/stream_command_buffer.h"
+
+#include "iree/base/tracing.h"
+#include "iree/hal/cuda/cuda_buffer.h"
+#include "iree/hal/cuda/cuda_event.h"
+#include "iree/hal/cuda/executable_layout.h"
+#include "iree/hal/cuda/native_executable.h"
+#include "iree/hal/cuda/status_util.h"
+
+#define IREE_HAL_CUDA_MAX_BINDING_COUNT 64
+// Kernel arguments contain bindings and push constants.
+#define IREE_HAL_CUDA_MAX_KERNEL_ARG 128
+// This records the commands on the calling thread without additional threading
+// indirection.
+
+typedef struct {
+  iree_hal_command_buffer_t base;
+  iree_hal_cuda_context_wrapper_t* context;
+  CUstream stream;
+
+  // Staging arena used for host->device transfers.
+  // Used for when we need CUDA to be able to reference memory as it performs
+  // asynchronous operations.
+  iree_arena_allocator_t arena;
+
+  int32_t push_constant[IREE_HAL_CUDA_MAX_PUSH_CONSTANT_COUNT];
+  // Keep track of the current set of kernel arguments.
+  void* current_descriptor[IREE_HAL_CUDA_MAX_KERNEL_ARG];
+  CUdeviceptr* device_ptrs[IREE_HAL_CUDA_MAX_KERNEL_ARG];
+} iree_hal_cuda_stream_command_buffer_t;
+
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_cuda_stream_command_buffer_vtable;
+
+static iree_hal_cuda_stream_command_buffer_t*
+iree_hal_cuda_stream_command_buffer_cast(
+    iree_hal_command_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda_stream_command_buffer_vtable);
+  return (iree_hal_cuda_stream_command_buffer_t*)base_value;
+}
+
+iree_status_t iree_hal_cuda_stream_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_cuda_context_wrapper_t* context,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories, CUstream stream,
+    iree_arena_block_pool_t* block_pool,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  *out_command_buffer = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda_stream_command_buffer_t* command_buffer = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(context->host_allocator, sizeof(*command_buffer),
+                            (void**)&command_buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_command_buffer_initialize(
+        device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
+        &iree_hal_cuda_stream_command_buffer_vtable, &command_buffer->base);
+    command_buffer->context = context;
+    command_buffer->stream = stream;
+    iree_arena_initialize(block_pool, &command_buffer->arena);
+    for (size_t i = 0; i < IREE_HAL_CUDA_MAX_KERNEL_ARG; i++) {
+      command_buffer->current_descriptor[i] = &command_buffer->device_ptrs[i];
+    }
+    // Publish only on success; on allocation failure |command_buffer| is NULL
+    // and |*out_command_buffer| must stay NULL.
+    *out_command_buffer = &command_buffer->base;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_cuda_stream_command_buffer_destroy(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_cuda_stream_command_buffer_t* command_buffer =
+      iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_arena_deinitialize(&command_buffer->arena);
+  iree_allocator_free(command_buffer->context->host_allocator, command_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+bool iree_hal_cuda_stream_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer) {
+  return iree_hal_command_buffer_dyn_cast(
+      command_buffer, &iree_hal_cuda_stream_command_buffer_vtable);
+}
+
+static void* iree_hal_cuda_stream_command_buffer_dyn_cast(
+    iree_hal_command_buffer_t* command_buffer, const void* vtable) {
+  if (vtable == &iree_hal_cuda_stream_command_buffer_vtable) {
+    IREE_HAL_ASSERT_TYPE(command_buffer, vtable);
+    return command_buffer;
+  }
+  return NULL;
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_begin(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_cuda_stream_command_buffer_t* command_buffer =
+      iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
+  iree_arena_reset(&command_buffer->arena);
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_end(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_execution_barrier(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  // TODO(jinchen62): implement CUDA barrier
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_signal_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // TODO(jinchen62): implement CUDA barrier
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_reset_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // TODO(jinchen62): implement CUDA barrier
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_wait_events(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_host_size_t event_count, const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  // TODO(jinchen62): implement CUDA barrier
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_discard_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+  // We could mark the memory as invalidated so that, if it is managed memory,
+  // CUDA does not try to copy it back to the host.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length) {
+  iree_hal_cuda_stream_command_buffer_t* command_buffer =
+      iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
+
+  CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+  CUdeviceptr dst = target_device_buffer + target_offset;
+  size_t num_elements = length / pattern_length;
+  switch (pattern_length) {
+    case 4: {
+      CUDA_RETURN_IF_ERROR(
+          command_buffer->context->syms,
+          cuMemsetD32Async(dst, *(const uint32_t*)(pattern), num_elements,
+                           command_buffer->stream),
+          "cuMemsetD32Async");
+      break;
+    }
+    case 2: {
+      CUDA_RETURN_IF_ERROR(
+          command_buffer->context->syms,
+          cuMemsetD16Async(dst, *(const uint16_t*)(pattern), num_elements,
+                           command_buffer->stream),
+          "cuMemsetD16Async");
+      break;
+    }
+    case 1: {
+      CUDA_RETURN_IF_ERROR(
+          command_buffer->context->syms,
+          cuMemsetD8Async(dst, *(const uint8_t*)(pattern), num_elements,
+                          command_buffer->stream),
+          "cuMemsetD8Async");
+      break;
+    }
+    default:
+      return iree_make_status(IREE_STATUS_INTERNAL,
+                              "unsupported fill pattern length");
+  }
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_update_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  iree_hal_cuda_stream_command_buffer_t* command_buffer =
+      iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
+
+  // Allocate scratch space in the arena for the data and copy it in.
+  // The update buffer API requires that the command buffer capture the host
+  // memory at the time the method is called in case the caller wants to reuse
+  // the memory. Because CUDA memcpys are async if we didn't copy it's possible
+  // for the reused memory to change before the stream reaches the copy
+  // operation and get the wrong data.
+  const uint8_t* src = (const uint8_t*)source_buffer + source_offset;
+  if (command_buffer->arena.block_pool) {
+    uint8_t* storage = NULL;
+    IREE_RETURN_IF_ERROR(
+        iree_arena_allocate(&command_buffer->arena, length, (void**)&storage));
+    memcpy(storage, src, length);
+    src = storage;
+  }
+
+  // Issue the copy using the scratch memory as the source.
+  CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+  CUdeviceptr dst = target_device_buffer +
+                    iree_hal_buffer_byte_offset(target_buffer) + target_offset;
+  CUDA_RETURN_IF_ERROR(
+      command_buffer->context->syms,
+      cuMemcpyHtoDAsync_v2(dst, src, length, command_buffer->stream),
+      "cuMemcpyHtoDAsync_v2");
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length) {
+  iree_hal_cuda_stream_command_buffer_t* command_buffer =
+      iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
+
+  CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+  CUdeviceptr source_device_buffer = iree_hal_cuda_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(source_buffer));
+  source_offset += iree_hal_buffer_byte_offset(source_buffer);
+  CUdeviceptr dst = target_device_buffer + target_offset;
+  CUdeviceptr src = source_device_buffer + source_offset;
+  CUDA_RETURN_IF_ERROR(command_buffer->context->syms,
+                       cuMemcpyAsync(dst, src, length, command_buffer->stream),
+                       "cuMemcpyAsync");
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_push_constants(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
+  iree_hal_cuda_stream_command_buffer_t* command_buffer =
+      iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
+  iree_host_size_t constant_base_index = offset / sizeof(int32_t);
+  for (iree_host_size_t i = 0; i < values_length / sizeof(int32_t); i++) {
+    command_buffer->push_constant[i + constant_base_index] =
+        ((uint32_t*)values)[i];
+  }
+  return iree_ok_status();
+}
+
+// Tie together the binding index and its index in |bindings| array.
+typedef struct {
+  uint32_t index;
+  uint32_t binding;
+} iree_hal_cuda_binding_mapping_t;
+
+// Helper to sort the binding based on their binding index.
+static int compare_binding_index(const void* a, const void* b) {
+  const iree_hal_cuda_binding_mapping_t buffer_a =
+      *(const iree_hal_cuda_binding_mapping_t*)a;
+  const iree_hal_cuda_binding_mapping_t buffer_b =
+      *(const iree_hal_cuda_binding_mapping_t*)b;
+  return buffer_a.binding < buffer_b.binding ? -1 : 1;
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  iree_hal_cuda_stream_command_buffer_t* command_buffer =
+      iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
+  iree_host_size_t base_binding =
+      iree_hal_cuda_base_binding_index(executable_layout, set);
+  // Convention with the compiler side: we map bindings to kernel arguments.
+  // We compact the bindings to get a dense set of arguments and keep them
+  // ordered based on the binding index.
+  // Sort the binding based on the binding index and map the array index to the
+  // argument index.
+  iree_hal_cuda_binding_mapping_t binding_used[IREE_HAL_CUDA_MAX_BINDING_COUNT];
+  for (iree_host_size_t i = 0; i < binding_count; i++) {
+    iree_hal_cuda_binding_mapping_t buffer = {i, bindings[i].binding};
+    binding_used[i] = buffer;
+  }
+  qsort(binding_used, binding_count, sizeof(iree_hal_cuda_binding_mapping_t),
+        compare_binding_index);
+  assert(binding_count < IREE_HAL_CUDA_MAX_BINDING_COUNT &&
+         "binding count larger than the max expected.");
+  for (iree_host_size_t i = 0; i < binding_count; i++) {
+    iree_hal_descriptor_set_binding_t binding = bindings[binding_used[i].index];
+    CUdeviceptr device_ptr =
+        iree_hal_cuda_buffer_device_pointer(
+            iree_hal_buffer_allocated_buffer(binding.buffer)) +
+        iree_hal_buffer_byte_offset(binding.buffer) + binding.offset;
+    *((CUdeviceptr*)command_buffer->current_descriptor[i + base_binding]) =
+        device_ptr;
+  }
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_bind_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "need cuda implementation of bind descriptor set");
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+  iree_hal_cuda_stream_command_buffer_t* command_buffer =
+      iree_hal_cuda_stream_command_buffer_cast(base_command_buffer);
+  iree_hal_executable_layout_t* layout =
+      iree_hal_cuda_executable_get_layout(executable, entry_point);
+  iree_host_size_t num_constants =
+      iree_hal_cuda_executable_layout_num_constants(layout);
+  iree_host_size_t constant_base_index =
+      iree_hal_cuda_push_constant_index(layout);
+  // Patch the push constants in the kernel arguments.
+  for (iree_host_size_t i = 0; i < num_constants; i++) {
+    *((uint32_t*)command_buffer->current_descriptor[i + constant_base_index]) =
+        command_buffer->push_constant[i];
+  }
+
+  uint32_t block_size_x, block_size_y, block_size_z;
+  uint32_t shared_memory_size;
+  IREE_RETURN_IF_ERROR(iree_hal_cuda_native_executable_block_size(
+      executable, entry_point, &block_size_x, &block_size_y, &block_size_z));
+  IREE_RETURN_IF_ERROR(iree_hal_cuda_native_executable_shared_memory_size(
+      executable, entry_point, &shared_memory_size));
+  CUfunction func =
+      iree_hal_cuda_native_executable_for_entry_point(executable, entry_point);
+  CUDA_RETURN_IF_ERROR(
+      command_buffer->context->syms,
+      cuLaunchKernel(func, workgroup_x, workgroup_y, workgroup_z, block_size_x,
+                     block_size_y, block_size_z, shared_memory_size,
+                     command_buffer->stream, command_buffer->current_descriptor,
+                     NULL),
+      "cuLaunchKernel");
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch_indirect(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer,
+    iree_device_size_t workgroups_offset) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "need cuda implementation of dispatch indirect");
+}
+
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_cuda_stream_command_buffer_vtable = {
+        .destroy = iree_hal_cuda_stream_command_buffer_destroy,
+        .dyn_cast = iree_hal_cuda_stream_command_buffer_dyn_cast,
+        .begin = iree_hal_cuda_stream_command_buffer_begin,
+        .end = iree_hal_cuda_stream_command_buffer_end,
+        .execution_barrier =
+            iree_hal_cuda_stream_command_buffer_execution_barrier,
+        .signal_event = iree_hal_cuda_stream_command_buffer_signal_event,
+        .reset_event = iree_hal_cuda_stream_command_buffer_reset_event,
+        .wait_events = iree_hal_cuda_stream_command_buffer_wait_events,
+        .discard_buffer = iree_hal_cuda_stream_command_buffer_discard_buffer,
+        .fill_buffer = iree_hal_cuda_stream_command_buffer_fill_buffer,
+        .update_buffer = iree_hal_cuda_stream_command_buffer_update_buffer,
+        .copy_buffer = iree_hal_cuda_stream_command_buffer_copy_buffer,
+        .push_constants = iree_hal_cuda_stream_command_buffer_push_constants,
+        .push_descriptor_set =
+            iree_hal_cuda_stream_command_buffer_push_descriptor_set,
+        .bind_descriptor_set =
+            iree_hal_cuda_stream_command_buffer_bind_descriptor_set,
+        .dispatch = iree_hal_cuda_stream_command_buffer_dispatch,
+        .dispatch_indirect =
+            iree_hal_cuda_stream_command_buffer_dispatch_indirect,
+};
diff --git a/runtime/src/iree/hal/cuda/stream_command_buffer.h b/runtime/src/iree/hal/cuda/stream_command_buffer.h
new file mode 100644
index 0000000..d22d3ff
--- /dev/null
+++ b/runtime/src/iree/hal/cuda/stream_command_buffer.h
@@ -0,0 +1,46 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CUDA_STREAM_COMMAND_BUFFER_H_
+#define IREE_HAL_CUDA_STREAM_COMMAND_BUFFER_H_
+
+#include "iree/base/internal/arena.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/context_wrapper.h"
+#include "iree/hal/cuda/cuda_headers.h"
+#include "iree/hal/cuda/dynamic_symbols.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a cuda stream command buffer that immediately issues commands against
+// the given |stream|. Access to |stream| must be synchronized by the user.
+//
+// If |block_pool| is non-NULL then the stream command buffer will retain copies
+// of input data until reset. If NULL then the caller must ensure the lifetime
+// of input data outlives the command buffer.
+//
+// This command buffer is used to both replay deferred command buffers and
+// perform inline execution. When replaying, the scratch data required for
+// things like buffer updates is retained by the source deferred command
+// buffer and as such the |block_pool| can be NULL to avoid a double copy.
+iree_status_t iree_hal_cuda_stream_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_cuda_context_wrapper_t* context,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories, CUstream stream,
+    iree_arena_block_pool_t* block_pool,
+    iree_hal_command_buffer_t** out_command_buffer);
+
+// Returns true if |command_buffer| is a CUDA stream-based command buffer.
+bool iree_hal_cuda_stream_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_CUDA_STREAM_COMMAND_BUFFER_H_
diff --git a/runtime/src/iree/hal/descriptor_set.c b/runtime/src/iree/hal/descriptor_set.c
new file mode 100644
index 0000000..65bdd6d
--- /dev/null
+++ b/runtime/src/iree/hal/descriptor_set.c
@@ -0,0 +1,37 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/descriptor_set.h"
+
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/device.h"
+#include "iree/hal/resource.h"
+
+#define _VTABLE_DISPATCH(descriptor_set, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(descriptor_set, iree_hal_descriptor_set, method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(descriptor_set);
+
+IREE_API_EXPORT iree_status_t iree_hal_descriptor_set_create(
+    iree_hal_device_t* device, iree_hal_descriptor_set_layout_t* set_layout,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings,
+    iree_hal_descriptor_set_t** out_descriptor_set) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(set_layout);
+  IREE_ASSERT_ARGUMENT(!binding_count || bindings);
+  IREE_ASSERT_ARGUMENT(out_descriptor_set);
+  *out_descriptor_set = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status =
+      IREE_HAL_VTABLE_DISPATCH(device, iree_hal_device, create_descriptor_set)(
+          device, set_layout, binding_count, bindings, out_descriptor_set);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/descriptor_set.h b/runtime/src/iree/hal/descriptor_set.h
new file mode 100644
index 0000000..11c7957
--- /dev/null
+++ b/runtime/src/iree/hal/descriptor_set.h
@@ -0,0 +1,102 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DESCRIPTOR_SET_H_
+#define IREE_HAL_DESCRIPTOR_SET_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/descriptor_set_layout.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_device_t iree_hal_device_t;
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// Specifies a descriptor set binding.
+// The range specified by [offset, length) will be made available to executables
+// on the given binding. If the descriptor type is dynamic then the range will
+// be [offset + dynamic_offset, length).
+//
+// The IREE HAL buffer type may internally be offset; such offset is applied
+// here as if it were the base address of the buffer. Note that the offset will
+// be applied at the time the binding is recorded into the command buffer.
+//
+// Maps to VkDescriptorSetBinding.
+typedef struct iree_hal_descriptor_set_binding_t {
+  // The binding number of this entry; it corresponds to a resource of the
+  // same binding number in the executable interface.
+  uint32_t binding;
+  // Buffer bound to the binding number.
+  // May be NULL if the binding is not used by the executable.
+  iree_hal_buffer_t* buffer;
+  // Offset, in bytes, into the buffer that the binding starts at.
+  // If the descriptor type is dynamic this will be added to the dynamic
+  // offset provided during binding.
+  iree_device_size_t offset;
+  // Length, in bytes, of the buffer that is available to the executable.
+  // This can be IREE_WHOLE_BUFFER, however note that if the entire buffer
+  // contents are larger than supported by the device (~128MiB, usually) this
+  // will fail. If the descriptor type is dynamic this will be used for all
+  // ranges regardless of offset.
+  iree_device_size_t length;
+} iree_hal_descriptor_set_binding_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_descriptor_set_t
+//===----------------------------------------------------------------------===//
+
+// Opaque handle to a descriptor set object.
+// A "descriptor" is effectively a bound memory range and each dispatch can use
+// one or more "descriptor sets" to access their I/O memory. Each descriptor set
+// conforms to a template "descriptor set layout".
+//
+// Maps to VkDescriptorSet:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkDescriptorSet.html
+typedef struct iree_hal_descriptor_set_t iree_hal_descriptor_set_t;
+
+// Creates a descriptor set of the given layout and bindings.
+// Descriptor sets are immutable and retain their bindings.
+IREE_API_EXPORT iree_status_t iree_hal_descriptor_set_create(
+    iree_hal_device_t* device, iree_hal_descriptor_set_layout_t* set_layout,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings,
+    iree_hal_descriptor_set_t** out_descriptor_set);
+
+// Retains the given |descriptor_set| for the caller.
+IREE_API_EXPORT void iree_hal_descriptor_set_retain(
+    iree_hal_descriptor_set_t* descriptor_set);
+
+// Releases the given |descriptor_set| from the caller.
+IREE_API_EXPORT void iree_hal_descriptor_set_release(
+    iree_hal_descriptor_set_t* descriptor_set);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_descriptor_set_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_descriptor_set_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_descriptor_set_t* descriptor_set);
+} iree_hal_descriptor_set_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_descriptor_set_vtable_t);
+
+IREE_API_EXPORT void iree_hal_descriptor_set_destroy(
+    iree_hal_descriptor_set_t* descriptor_set);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DESCRIPTOR_SET_H_
diff --git a/runtime/src/iree/hal/descriptor_set_layout.c b/runtime/src/iree/hal/descriptor_set_layout.c
new file mode 100644
index 0000000..76a3893
--- /dev/null
+++ b/runtime/src/iree/hal/descriptor_set_layout.c
@@ -0,0 +1,38 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/descriptor_set_layout.h"
+
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/device.h"
+#include "iree/hal/resource.h"
+
+#define _VTABLE_DISPATCH(descriptor_set_layout, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(descriptor_set_layout,            \
+                           iree_hal_descriptor_set_layout, method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(descriptor_set_layout);
+
+IREE_API_EXPORT iree_status_t iree_hal_descriptor_set_layout_create(
+    iree_hal_device_t* device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(!binding_count || bindings);
+  IREE_ASSERT_ARGUMENT(out_descriptor_set_layout);
+  *out_descriptor_set_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = IREE_HAL_VTABLE_DISPATCH(device, iree_hal_device,
+                                                  create_descriptor_set_layout)(
+      device, usage_type, binding_count, bindings, out_descriptor_set_layout);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/descriptor_set_layout.h b/runtime/src/iree/hal/descriptor_set_layout.h
new file mode 100644
index 0000000..36e3940
--- /dev/null
+++ b/runtime/src/iree/hal/descriptor_set_layout.h
@@ -0,0 +1,104 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DESCRIPTOR_SET_LAYOUT_H_
+#define IREE_HAL_DESCRIPTOR_SET_LAYOUT_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_device_t iree_hal_device_t;
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// Specifies the type of a descriptor in a descriptor set.
+typedef enum iree_hal_descriptor_type_e {
+  IREE_HAL_DESCRIPTOR_TYPE_UNIFORM_BUFFER = 6,
+  IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER = 7,
+  IREE_HAL_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC = 8,
+  IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC = 9,
+} iree_hal_descriptor_type_t;
+
+// Specifies the usage type of the descriptor set.
+typedef enum iree_hal_descriptor_set_layout_usage_type_e {
+  // Descriptor set will be initialized once and never changed.
+  IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE = 0,
+  // Descriptor set is never created and instead used with push descriptors.
+  IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_PUSH_ONLY = 1,
+} iree_hal_descriptor_set_layout_usage_type_t;
+
+// Specifies a descriptor set layout binding.
+//
+// Maps to VkDescriptorSetLayoutBinding.
+typedef struct iree_hal_descriptor_set_layout_binding_t {
+  // The binding number of this entry; it corresponds to a resource of the
+  // same binding number in the executable interface.
+  uint32_t binding;
+  // Specifies which type of resource descriptors are used for this binding.
+  iree_hal_descriptor_type_t type;
+} iree_hal_descriptor_set_layout_binding_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_descriptor_set_layout_t
+//===----------------------------------------------------------------------===//
+
+// Opaque handle to a descriptor set layout object.
+// A "descriptor" is effectively a bound memory range and each dispatch can use
+// one or more "descriptor sets" to access their I/O memory. A "descriptor set
+// layout" defines the types and usage semantics of the descriptors that make up
+// one set. Implementations can use this to verify program correctness and
+// accelerate reservation/allocation/computation of descriptor-related
+// operations.
+//
+// Maps to VkDescriptorSetLayout:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkDescriptorSetLayout.html
+typedef struct iree_hal_descriptor_set_layout_t
+    iree_hal_descriptor_set_layout_t;
+
+// Creates a descriptor set layout with the given bindings.
+IREE_API_EXPORT iree_status_t iree_hal_descriptor_set_layout_create(
+    iree_hal_device_t* device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout);
+
+// Retains the given |descriptor_set_layout| for the caller.
+IREE_API_EXPORT void iree_hal_descriptor_set_layout_retain(
+    iree_hal_descriptor_set_layout_t* descriptor_set_layout);
+
+// Releases the given |descriptor_set_layout| from the caller.
+IREE_API_EXPORT void iree_hal_descriptor_set_layout_release(
+    iree_hal_descriptor_set_layout_t* descriptor_set_layout);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_descriptor_set_layout_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_descriptor_set_layout_vtable_t {
+  void(IREE_API_PTR* destroy)(
+      iree_hal_descriptor_set_layout_t* descriptor_set_layout);
+} iree_hal_descriptor_set_layout_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_descriptor_set_layout_vtable_t);
+
+IREE_API_EXPORT void iree_hal_descriptor_set_layout_destroy(
+    iree_hal_descriptor_set_layout_t* descriptor_set_layout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DESCRIPTOR_SET_LAYOUT_H_
diff --git a/runtime/src/iree/hal/detail.h b/runtime/src/iree/hal/detail.h
new file mode 100644
index 0000000..b4387e2
--- /dev/null
+++ b/runtime/src/iree/hal/detail.h
@@ -0,0 +1,67 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DETAIL_H_
+#define IREE_HAL_DETAIL_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Dispatches a method on a HAL object vtable.
+//
+// In the future we can use this to compile in a mode where all indirect
+// dispatches are replaced by direct calls to static methods. For example,
+// by changing the macro to resolve to `iree_hal_[resource]_[method_name]` we
+// can rely on LTO to perform cross-compilation unit inlining/strip unused HAL
+// calls/etc. This will be particularly useful for super tiny builds
+// (web/embedded) where there's only ever one usable backend and debugging
+// features like command buffer validation aren't required.
+//
+// Some changes (mostly whack-a-mole) are still required to fully support this
+// it's critical there's a CI building with the setting as it's not hard to keep
+// working but very easy to accidentally break (by not routing through this
+// interface, using the vtable for object instance comparison, etc).
+#define IREE_HAL_VTABLE_DISPATCH(resource, type_prefix, method_name)       \
+  ((const type_prefix##_vtable_t*)((const iree_hal_resource_t*)(resource)) \
+       ->vtable)                                                           \
+      ->method_name
+
+// Defines the iree_hal_<type_name>_retain/_release methods.
+#define IREE_HAL_API_RETAIN_RELEASE(type_name)                           \
+  IREE_API_EXPORT void iree_hal_##type_name##_destroy(                   \
+      iree_hal_##type_name##_t* type_name) {                             \
+    if (IREE_LIKELY(type_name)) {                                        \
+      IREE_HAL_VTABLE_DISPATCH(type_name, iree_hal_##type_name, destroy) \
+      (type_name);                                                       \
+    }                                                                    \
+  }                                                                      \
+  IREE_API_EXPORT void iree_hal_##type_name##_retain(                    \
+      iree_hal_##type_name##_t* type_name) {                             \
+    if (IREE_LIKELY(type_name)) {                                        \
+      iree_atomic_ref_count_inc(                                         \
+          &((iree_hal_resource_t*)(type_name))->ref_count);              \
+    }                                                                    \
+  }                                                                      \
+  IREE_API_EXPORT void iree_hal_##type_name##_release(                   \
+      iree_hal_##type_name##_t* type_name) {                             \
+    if (IREE_LIKELY(type_name) &&                                        \
+        iree_atomic_ref_count_dec(                                       \
+            &((iree_hal_resource_t*)(type_name))->ref_count) == 1) {     \
+      iree_hal_##type_name##_destroy(type_name);                         \
+    }                                                                    \
+  }
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DETAIL_H_
diff --git a/runtime/src/iree/hal/device.c b/runtime/src/iree/hal/device.c
new file mode 100644
index 0000000..d389906
--- /dev/null
+++ b/runtime/src/iree/hal/device.c
@@ -0,0 +1,287 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/device.h"
+
+#include "iree/base/tracing.h"
+#include "iree/hal/allocator.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/command_buffer.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/resource.h"
+
+#define _VTABLE_DISPATCH(device, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(device, iree_hal_device, method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(device);
+
+IREE_API_EXPORT iree_string_view_t
+iree_hal_device_id(iree_hal_device_t* device) {
+  IREE_ASSERT_ARGUMENT(device);
+  return _VTABLE_DISPATCH(device, id)(device);
+}
+
+IREE_API_EXPORT iree_allocator_t
+iree_hal_device_host_allocator(iree_hal_device_t* device) {
+  IREE_ASSERT_ARGUMENT(device);
+  return _VTABLE_DISPATCH(device, host_allocator)(device);
+}
+
+IREE_API_EXPORT iree_hal_allocator_t* iree_hal_device_allocator(
+    iree_hal_device_t* device) {
+  IREE_ASSERT_ARGUMENT(device);
+  return _VTABLE_DISPATCH(device, device_allocator)(device);
+}
+
+IREE_API_EXPORT
+iree_status_t iree_hal_device_trim(iree_hal_device_t* device) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = _VTABLE_DISPATCH(device, trim)(device);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_query_i32(
+    iree_hal_device_t* device, iree_string_view_t category,
+    iree_string_view_t key, int32_t* out_value) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(out_value);
+
+  if (iree_string_view_equal(category,
+                             iree_make_cstring_view("hal.device.id"))) {
+    *out_value =
+        iree_string_view_match_pattern(iree_hal_device_id(device), key) ? 1 : 0;
+    return iree_ok_status();
+  }
+
+  return _VTABLE_DISPATCH(device, query_i32)(device, category, key, out_value);
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_range(
+    iree_hal_device_t* device, iree_hal_transfer_buffer_t source,
+    iree_device_size_t source_offset, iree_hal_transfer_buffer_t target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout) {
+  if (data_length == 0) {
+    return iree_ok_status();  // No-op.
+  }
+
+  // host->host is not allowed. We may want to support this one day to allow for
+  // parallelized copies and such, however the validation code differs quite a
+  // bit and it'd be better to have this as part of a task system API.
+  bool is_source_host = source.device_buffer == NULL;
+  bool is_target_host = target.device_buffer == NULL;
+  if (is_source_host && is_target_host) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "cannot perform host->host transfers via this API, use memcpy/memmove");
+  }
+
+  // Check for overlap - like memcpy we require that the two ranges don't have
+  // any overlap as we may use memcpy. This only matters if the buffers are
+  // both device buffers - host and device should never alias: behavior is
+  // undefined if a user tries to pass a mapped device pointer as if it was a
+  // host pointer.
+  if (!is_source_host && !is_target_host &&
+      iree_hal_buffer_test_overlap(source.device_buffer, source_offset,
+                                   data_length, target.device_buffer,
+                                   target_offset, data_length) !=
+          IREE_HAL_BUFFER_OVERLAP_DISJOINT) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "source and target ranges must not overlap within the same buffer");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(
+      z0, is_source_host ? "h2d" : (is_target_host ? "d2h" : "d2d"));
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, data_length);
+
+  // Defer to the backing implementation.
+  iree_status_t status = _VTABLE_DISPATCH(device, transfer_range)(
+      device, source, source_offset, target, target_offset, data_length, flags,
+      timeout);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_h2d(
+    iree_hal_device_t* device, const void* source, iree_hal_buffer_t* target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout) {
+  return iree_hal_device_transfer_range(
+      device,
+      iree_hal_make_host_transfer_buffer_span((void*)source, data_length), 0,
+      iree_hal_make_device_transfer_buffer(target), target_offset, data_length,
+      flags, timeout);
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_d2h(
+    iree_hal_device_t* device, iree_hal_buffer_t* source,
+    iree_device_size_t source_offset, void* target,
+    iree_device_size_t data_length, iree_hal_transfer_buffer_flags_t flags,
+    iree_timeout_t timeout) {
+  return iree_hal_device_transfer_range(
+      device, iree_hal_make_device_transfer_buffer(source), source_offset,
+      iree_hal_make_host_transfer_buffer_span(target, data_length), 0,
+      data_length, flags, timeout);
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_d2d(
+    iree_hal_device_t* device, iree_hal_buffer_t* source,
+    iree_device_size_t source_offset, iree_hal_buffer_t* target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout) {
+  return iree_hal_device_transfer_range(
+      device, iree_hal_make_device_transfer_buffer(source), source_offset,
+      iree_hal_make_device_transfer_buffer(target), target_offset, data_length,
+      flags, timeout);
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_and_wait(
+    iree_hal_device_t* device, iree_hal_semaphore_t* wait_semaphore,
+    uint64_t wait_value, iree_host_size_t transfer_count,
+    const iree_hal_transfer_command_t* transfer_commands,
+    iree_timeout_t timeout) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(!transfer_count || transfer_commands);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // We only want to allow inline execution if we have not been instructed to
+  // wait on a semaphore and it hasn't yet been signaled.
+  iree_hal_command_buffer_mode_t mode = IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
+  if (wait_semaphore) {
+    uint64_t current_value = 0ull;
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_semaphore_query(wait_semaphore, &current_value));
+    if (current_value >= wait_value) {
+      mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
+    }
+  } else {
+    mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
+  }
+
+  // Create a command buffer performing all of the transfer operations.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_create_transfer_command_buffer(
+              device, mode, IREE_HAL_QUEUE_AFFINITY_ANY, transfer_count,
+              transfer_commands, &command_buffer));
+
+  // Perform a full submit-and-wait. On devices with multiple queues this can
+  // run out-of-order/overlapped with other work and return earlier than device
+  // idle.
+  iree_hal_semaphore_t* fence_semaphore = NULL;
+  iree_status_t status =
+      iree_hal_semaphore_create(device, 0ull, &fence_semaphore);
+  uint64_t signal_value = 1ull;
+  if (iree_status_is_ok(status)) {
+    iree_hal_submission_batch_t batch = {
+        .wait_semaphores =
+            {
+                .count = wait_semaphore != NULL ? 1 : 0,
+                .semaphores = &wait_semaphore,
+                .payload_values = &wait_value,
+            },
+        .command_buffer_count = 1,
+        .command_buffers = &command_buffer,
+        .signal_semaphores =
+            {
+                .count = 1,
+                .semaphores = &fence_semaphore,
+                .payload_values = &signal_value,
+            },
+    };
+    status = iree_hal_device_submit_and_wait(
+        device, IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_HAL_QUEUE_AFFINITY_ANY,
+        1, &batch, fence_semaphore, signal_value, timeout);
+  }
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_semaphore_release(fence_semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Validates that the submission is well-formed.
+static iree_status_t iree_hal_device_validate_submission(
+    iree_host_size_t batch_count, const iree_hal_submission_batch_t* batches) {
+  for (iree_host_size_t i = 0; i < batch_count; ++i) {
+    for (iree_host_size_t j = 0; j < batches[i].command_buffer_count; ++j) {
+      if (batches[i].wait_semaphores.count > 0 &&
+          iree_all_bits_set(
+              iree_hal_command_buffer_mode(batches[i].command_buffers[j]),
+              IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) {
+        // Inline command buffers are not allowed to wait (as they could have
+        // already been executed!). This is a requirement of the API so we
+        // validate it across all backends even if they don't support inline
+        // execution and ignore it.
+        return iree_make_status(
+            IREE_STATUS_INVALID_ARGUMENT,
+            "inline command buffer submitted with a wait; inline command "
+            "buffers must be ready to execute immediately");
+      }
+    }
+  }
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_submit(
+    iree_hal_device_t* device, iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(!batch_count || batches);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_device_validate_submission(batch_count, batches));
+  iree_status_t status = _VTABLE_DISPATCH(device, queue_submit)(
+      device, command_categories, queue_affinity, batch_count, batches);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_submit_and_wait(
+    iree_hal_device_t* device, iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches,
+    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
+    iree_timeout_t timeout) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(!batch_count || batches);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_device_validate_submission(batch_count, batches));
+  iree_status_t status = _VTABLE_DISPATCH(device, submit_and_wait)(
+      device, command_categories, queue_affinity, batch_count, batches,
+      wait_semaphore, wait_value, timeout);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_wait_semaphores(
+    iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
+  IREE_ASSERT_ARGUMENT(device);
+  if (!semaphore_list || semaphore_list->count == 0) return iree_ok_status();
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = _VTABLE_DISPATCH(device, wait_semaphores)(
+      device, wait_mode, semaphore_list, timeout);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t
+iree_hal_device_wait_idle(iree_hal_device_t* device, iree_timeout_t timeout) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = _VTABLE_DISPATCH(device, wait_idle)(device, timeout);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/device.h b/runtime/src/iree/hal/device.h
new file mode 100644
index 0000000..7c97107
--- /dev/null
+++ b/runtime/src/iree/hal/device.h
@@ -0,0 +1,441 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DEVICE_H_
+#define IREE_HAL_DEVICE_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/command_buffer.h"
+#include "iree/hal/descriptor_set.h"
+#include "iree/hal/descriptor_set_layout.h"
+#include "iree/hal/event.h"
+#include "iree/hal/executable_cache.h"
+#include "iree/hal/executable_layout.h"
+#include "iree/hal/resource.h"
+#include "iree/hal/semaphore.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// An opaque driver-specific handle to identify different devices.
+typedef uintptr_t iree_hal_device_id_t;
+
+#define IREE_HAL_DEVICE_ID_INVALID 0ull
+
+// Describes features supported by a device.
+// These flags indicate the availability of features that may be enabled at the
+// request of the calling application. Note that certain features may disable
+// runtime optimizations or require compilation flags to ensure the required
+// metadata is present in executables.
+enum iree_hal_device_feature_bits_t {
+  IREE_HAL_DEVICE_FEATURE_NONE = 0u,
+
+  // Device supports executable debugging.
+  // When present executables *may* be compiled with
+  // IREE_HAL_EXECUTABLE_CACHING_MODE_ENABLE_DEBUGGING and will have usable
+  // debugging related methods. Note that if the input executables do not have
+  // embedded debugging information they still may not be able to perform
+  // disassembly or fine-grained breakpoint insertion.
+  IREE_HAL_DEVICE_FEATURE_SUPPORTS_DEBUGGING = 1u << 0,
+
+  // Device supports executable coverage information.
+  // When present executables *may* be compiled with
+  // IREE_HAL_EXECUTABLE_CACHING_MODE_ENABLE_COVERAGE and will produce
+  // coverage buffers during dispatch. Note that input executables must have
+  // partial embedded debug information to allow mapping back to source offsets.
+  IREE_HAL_DEVICE_FEATURE_SUPPORTS_COVERAGE = 1u << 1,
+
+  // Device supports executable and command queue profiling.
+  // When present executables *may* be compiled with
+  // IREE_HAL_EXECUTABLE_CACHING_MODE_ENABLE_PROFILING and will produce
+  // profiling buffers during dispatch. Note that input executables must have
+  // partial embedded debug information to allow mapping back to source offsets.
+  IREE_HAL_DEVICE_FEATURE_SUPPORTS_PROFILING = 1u << 2,
+};
+typedef uint32_t iree_hal_device_feature_t;
+
+// Describes an enumerated HAL device.
+typedef struct iree_hal_device_info_t {
+  // Opaque handle used by drivers. Not valid across driver instances.
+  iree_hal_device_id_t device_id;
+  // Name of the device as returned by the API.
+  iree_string_view_t name;
+} iree_hal_device_info_t;
+
+// A transfer source or destination.
+typedef struct iree_hal_transfer_buffer_t {
+  // A host-allocated void* buffer.
+  iree_byte_span_t host_buffer;
+  // A device-allocated buffer (may be of any memory type).
+  iree_hal_buffer_t* device_buffer;
+} iree_hal_transfer_buffer_t;
+
+// Returns a transfer buffer referencing host memory |host_buffer|.
+// The device-side buffer is left NULL.
+static inline iree_hal_transfer_buffer_t iree_hal_make_host_transfer_buffer(
+    iree_byte_span_t host_buffer) {
+  iree_hal_transfer_buffer_t transfer_buffer = {
+      host_buffer,
+      NULL,
+  };
+  return transfer_buffer;
+}
+
+// Returns a transfer buffer referencing the host memory range
+// [|ptr|, |ptr|+|length|). The device-side buffer is left NULL.
+static inline iree_hal_transfer_buffer_t
+iree_hal_make_host_transfer_buffer_span(void* ptr, iree_host_size_t length) {
+  iree_hal_transfer_buffer_t transfer_buffer = {
+      iree_make_byte_span(ptr, length),
+      NULL,
+  };
+  return transfer_buffer;
+}
+
+// Returns a transfer buffer referencing |device_buffer|.
+// The host-side span is left empty.
+static inline iree_hal_transfer_buffer_t iree_hal_make_device_transfer_buffer(
+    iree_hal_buffer_t* device_buffer) {
+  iree_hal_transfer_buffer_t transfer_buffer = {
+      iree_byte_span_empty(),
+      device_buffer,
+  };
+  return transfer_buffer;
+}
+
+// A list of semaphores and their corresponding payloads.
+// When signaling each semaphore will be set to the new payload value provided.
+// When waiting each semaphore must reach or exceed the payload value.
+typedef struct iree_hal_semaphore_list_t {
+  iree_host_size_t count;
+  iree_hal_semaphore_t** semaphores;
+  uint64_t* payload_values;
+} iree_hal_semaphore_list_t;
+
+// A single batch of command buffers submitted to a device queue.
+// All of the wait semaphores must reach or exceed the given payload value prior
+// to the batch beginning execution. Each command buffer begins execution in the
+// order it is present in the list, though note that the command buffers
+// execute concurrently and require internal synchronization via events if there
+// are any dependencies between them. Only after all command buffers have
+// completed will the signal semaphores be updated to the provided payload
+// values.
+//
+// Matches Vulkan's VkSubmitInfo:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkSubmitInfo.html
+// Note that as the HAL only models timeline semaphores we take the payload
+// values directly in this struct; see:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkTimelineSemaphoreSubmitInfo.html
+typedef struct iree_hal_submission_batch_t {
+  // Semaphores to wait on prior to executing any command buffer.
+  iree_hal_semaphore_list_t wait_semaphores;
+
+  // Command buffers to execute, in order.
+  iree_host_size_t command_buffer_count;
+  iree_hal_command_buffer_t** command_buffers;
+
+  // Semaphores to signal once all command buffers have completed execution.
+  iree_hal_semaphore_list_t signal_semaphores;
+} iree_hal_submission_batch_t;
+
+// Defines how a multi-wait operation treats the results of multiple semaphores.
+typedef enum iree_hal_wait_mode_e {
+  // Waits for all semaphores to reach or exceed their specified values.
+  IREE_HAL_WAIT_MODE_ALL = 0,
+  // Waits for one or more semaphores to reach or exceed their specified values.
+  IREE_HAL_WAIT_MODE_ANY = 1,
+} iree_hal_wait_mode_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_device_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_device_t iree_hal_device_t;
+
+// Retains the given |device| for the caller.
+IREE_API_EXPORT void iree_hal_device_retain(iree_hal_device_t* device);
+
+// Releases the given |device| from the caller.
+IREE_API_EXPORT void iree_hal_device_release(iree_hal_device_t* device);
+
+// Returns the device identifier.
+// This identifier may vary based on the runtime device type; for example, a
+// Vulkan device may return `vulkan-v1.1` or `vulkan-v1.2-spec1`.
+IREE_API_EXPORT iree_string_view_t
+iree_hal_device_id(iree_hal_device_t* device);
+
+// Returns the host allocator used for objects.
+IREE_API_EXPORT iree_allocator_t
+iree_hal_device_host_allocator(iree_hal_device_t* device);
+
+// Returns a reference to the allocator of the device that can be used for
+// allocating buffers.
+IREE_API_EXPORT iree_hal_allocator_t* iree_hal_device_allocator(
+    iree_hal_device_t* device);
+
+// Trims pools and caches used by the HAL to the minimum required for live
+// allocations. This can be used on low-memory conditions or when
+// suspending/parking instances.
+IREE_API_EXPORT
+iree_status_t iree_hal_device_trim(iree_hal_device_t* device);
+
+// Queries a configuration value as an int32_t.
+// The |category| and |key| will be provided to the device driver to interpret
+// in a device-specific way and if recognized the value will be converted to an
+// int32_t and returned in |out_value|. Fails if the value represented by the
+// key is not convertible (overflows a 32-bit integer, not a number, etc).
+//
+// This is roughly equivalent to the `sysconf` linux syscall
+// (https://man7.org/linux/man-pages/man3/sysconf.3.html) in that the exact
+// set of categories and keys available and their interpretation is
+// target-dependent.
+//
+// Well-known queries (category :: key):
+//   hal.device.id :: some-pattern-*
+//   hal.device.feature :: some-pattern-*
+//   hal.device.architecture :: some-pattern-*
+//   hal.executable.format :: some-pattern-*
+//
+// Returned values must remain the same for the lifetime of the device as
+// callers may cache them to avoid redundant calls.
+IREE_API_EXPORT iree_status_t iree_hal_device_query_i32(
+    iree_hal_device_t* device, iree_string_view_t category,
+    iree_string_view_t key, int32_t* out_value);
+
+// Synchronously copies data from |source| into |target|.
+//
+// Supports host->device, device->host, and device->device transfer,
+// including across devices. This method will never fail based on device
+// capabilities but may incur some extreme transient allocations and copies in
+// order to perform the transfer.
+//
+// The ordering of the transfer is undefined with respect to queue execution on
+// the source or target device; some may require full device flushes in order to
+// perform this operation while others may immediately perform it while there is
+// still work outstanding.
+//
+// It is strongly recommended that buffer operations are performed on transfer
+// queues; using this synchronous function may incur additional cache flushes
+// and synchronous blocking behavior and is not supported on all buffer types.
+// See iree_hal_command_buffer_copy_buffer.
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_range(
+    iree_hal_device_t* device, iree_hal_transfer_buffer_t source,
+    iree_device_size_t source_offset, iree_hal_transfer_buffer_t target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout);
+
+// Synchronously copies data from host |source| into device |target|.
+// Convenience wrapper around iree_hal_device_transfer_range.
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_h2d(
+    iree_hal_device_t* device, const void* source, iree_hal_buffer_t* target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout);
+
+// Synchronously copies data from device |source| into host |target|.
+// Convenience wrapper around iree_hal_device_transfer_range.
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_d2h(
+    iree_hal_device_t* device, iree_hal_buffer_t* source,
+    iree_device_size_t source_offset, void* target,
+    iree_device_size_t data_length, iree_hal_transfer_buffer_flags_t flags,
+    iree_timeout_t timeout);
+
+// Synchronously copies data from device |source| into device |target|.
+// Convenience wrapper around iree_hal_device_transfer_range.
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_d2d(
+    iree_hal_device_t* device, iree_hal_buffer_t* source,
+    iree_device_size_t source_offset, iree_hal_buffer_t* target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout);
+
+// Synchronously executes one or more transfer operations against a queue.
+// All buffers must be compatible with |device| and ranges must not overlap
+// (same as with memcpy).
+//
+// This is a blocking operation and may incur significant overheads as
+// internally it issues a command buffer with the transfer operations and waits
+// for it to complete. Users should do that themselves so that the work can be
+// issued concurrently and batched effectively. This is only useful as a
+// fallback for implementations that require it or tools where things like I/O
+// are transferred without worrying about performance. When submitting other
+// work it's preferable to use iree_hal_create_transfer_command_buffer and a
+// normal queue submission that allows for more fine-grained sequencing and
+// amortizes the submission cost by batching other work.
+//
+// The transfer will begin after the optional |wait_semaphore| reaches
+// |wait_value|. Behavior is undefined if no semaphore is provided and there are
+// in-flight operations concurrently using the buffer ranges.
+// Returns only after all transfers have completed and been flushed.
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_and_wait(
+    iree_hal_device_t* device, iree_hal_semaphore_t* wait_semaphore,
+    uint64_t wait_value, iree_host_size_t transfer_count,
+    const iree_hal_transfer_command_t* transfer_commands,
+    iree_timeout_t timeout);
+
+// Submits one or more batches of work to a device queue.
+//
+// The queue is selected based on the flags set in |command_categories| and the
+// |queue_affinity|. As the number of available queues can vary the
+// |queue_affinity| is used to hash into the available queues for the required
+// categories. For example if 2 queues support transfer commands and the
+// affinity is 5 the resulting queue could be index hash(5)=1. The affinity can
+// thus be treated as just a way to indicate whether two submissions must be
+// placed on to the same queue. Note that the exact hashing function is
+// implementation dependent.
+//
+// The submission behavior matches Vulkan's vkQueueSubmit, with each batch
+// executing its command buffers in the order they are defined but allowing the
+// command buffers to complete out-of-order. See:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/vkQueueSubmit.html
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_submit(
+    iree_hal_device_t* device, iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches);
+
+// Submits batches of work and waits until |wait_semaphore| reaches or exceeds
+// |wait_value|.
+//
+// This is equivalent to following iree_hal_device_queue_submit with an
+// iree_hal_semaphore_wait on |wait_semaphore|/|wait_value| but
+// may help to reduce overhead by preventing thread wakeups, kernel calls, and
+// internal tracking.
+//
+// See iree_hal_device_queue_submit for more information about the queuing
+// behavior and iree_hal_semaphore_wait for the waiting behavior.
+IREE_API_EXPORT iree_status_t iree_hal_device_submit_and_wait(
+    iree_hal_device_t* device, iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches,
+    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
+    iree_timeout_t timeout);
+
+// Blocks the caller until the semaphores reach or exceed the specified payload
+// values or the |timeout| elapses. All semaphores in |semaphore_list| must be
+// created from this device (or be imported into it).
+//
+// |wait_mode| can be used to decide when the wait will proceed; whether *all*
+// semaphores in |semaphore_list| must be signaled or whether *any* (one or
+// more) can be signaled before an early return.
+//
+// Returns success if the wait is successful and semaphores have been signaled
+// satisfying the |wait_mode|.
+//
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if the |timeout| elapses without the
+// |wait_mode| being satisfied. Note that even on success only a subset of the
+// semaphores may have been signaled and each can be queried to see which ones.
+//
+// Returns IREE_STATUS_ABORTED if one or more semaphores has failed. Callers can
+// use iree_hal_semaphore_query on the semaphores to find the ones that have
+// failed and get the status.
+IREE_API_EXPORT iree_status_t iree_hal_device_wait_semaphores(
+    iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout);
+
+// Blocks the caller until all outstanding requests on all queues have been
+// completed or the |timeout| elapses. This is equivalent to having waited
+// on all semaphores outstanding at the time of the call, meaning that if new
+// work is submitted by another thread it may not be waited on prior to this
+// call returning.
+//
+// Returns success if the device reaches an idle point during the call.
+//
+// Returns DEADLINE_EXCEEDED if the |timeout| elapses without the device having
+// become idle.
+IREE_API_EXPORT iree_status_t
+iree_hal_device_wait_idle(iree_hal_device_t* device, iree_timeout_t timeout);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_device_t implementation details
+//===----------------------------------------------------------------------===//
+
+// Virtual function table implemented by each concrete iree_hal_device_t
+// backend; entries are invoked through the IREE_HAL_VTABLE_DISPATCH wrappers
+// above. Layout is checked by IREE_HAL_ASSERT_VTABLE_LAYOUT below.
+typedef struct iree_hal_device_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_device_t* device);
+
+  iree_string_view_t(IREE_API_PTR* id)(iree_hal_device_t* device);
+
+  iree_allocator_t(IREE_API_PTR* host_allocator)(iree_hal_device_t* device);
+  iree_hal_allocator_t*(IREE_API_PTR* device_allocator)(
+      iree_hal_device_t* device);
+
+  iree_status_t(IREE_API_PTR* trim)(iree_hal_device_t* device);
+
+  iree_status_t(IREE_API_PTR* query_i32)(iree_hal_device_t* device,
+                                         iree_string_view_t category,
+                                         iree_string_view_t key,
+                                         int32_t* out_value);
+
+  iree_status_t(IREE_API_PTR* create_command_buffer)(
+      iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+      iree_hal_command_category_t command_categories,
+      iree_hal_queue_affinity_t queue_affinity,
+      iree_hal_command_buffer_t** out_command_buffer);
+
+  iree_status_t(IREE_API_PTR* create_descriptor_set)(
+      iree_hal_device_t* device, iree_hal_descriptor_set_layout_t* set_layout,
+      iree_host_size_t binding_count,
+      const iree_hal_descriptor_set_binding_t* bindings,
+      iree_hal_descriptor_set_t** out_descriptor_set);
+
+  iree_status_t(IREE_API_PTR* create_descriptor_set_layout)(
+      iree_hal_device_t* device,
+      iree_hal_descriptor_set_layout_usage_type_t usage_type,
+      iree_host_size_t binding_count,
+      const iree_hal_descriptor_set_layout_binding_t* bindings,
+      iree_hal_descriptor_set_layout_t** out_descriptor_set_layout);
+
+  iree_status_t(IREE_API_PTR* create_event)(iree_hal_device_t* device,
+                                            iree_hal_event_t** out_event);
+
+  iree_status_t(IREE_API_PTR* create_executable_cache)(
+      iree_hal_device_t* device, iree_string_view_t identifier,
+      iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache);
+
+  iree_status_t(IREE_API_PTR* create_executable_layout)(
+      iree_hal_device_t* device, iree_host_size_t push_constants,
+      iree_host_size_t set_layout_count,
+      iree_hal_descriptor_set_layout_t** set_layouts,
+      iree_hal_executable_layout_t** out_executable_layout);
+
+  iree_status_t(IREE_API_PTR* create_semaphore)(
+      iree_hal_device_t* device, uint64_t initial_value,
+      iree_hal_semaphore_t** out_semaphore);
+
+  iree_status_t(IREE_API_PTR* transfer_range)(
+      iree_hal_device_t* device, iree_hal_transfer_buffer_t source,
+      iree_device_size_t source_offset, iree_hal_transfer_buffer_t target,
+      iree_device_size_t target_offset, iree_device_size_t data_length,
+      iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout);
+
+  iree_status_t(IREE_API_PTR* queue_submit)(
+      iree_hal_device_t* device, iree_hal_command_category_t command_categories,
+      iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+      const iree_hal_submission_batch_t* batches);
+
+  iree_status_t(IREE_API_PTR* submit_and_wait)(
+      iree_hal_device_t* device, iree_hal_command_category_t command_categories,
+      iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+      const iree_hal_submission_batch_t* batches,
+      iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
+      iree_timeout_t timeout);
+
+  iree_status_t(IREE_API_PTR* wait_semaphores)(
+      iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
+      const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout);
+
+  iree_status_t(IREE_API_PTR* wait_idle)(iree_hal_device_t* device,
+                                         iree_timeout_t timeout);
+} iree_hal_device_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_device_vtable_t);
+
+IREE_API_EXPORT void iree_hal_device_destroy(iree_hal_device_t* device);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DEVICE_H_
diff --git a/runtime/src/iree/hal/driver.c b/runtime/src/iree/hal/driver.c
new file mode 100644
index 0000000..778aaed
--- /dev/null
+++ b/runtime/src/iree/hal/driver.c
@@ -0,0 +1,59 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/driver.h"
+
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/resource.h"
+
+#define _VTABLE_DISPATCH(driver, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(driver, iree_hal_driver, method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(driver);
+
+// Validated wrapper that dispatches device enumeration to the driver vtable.
+IREE_API_EXPORT iree_status_t iree_hal_driver_query_available_devices(
+    iree_hal_driver_t* driver, iree_allocator_t allocator,
+    iree_hal_device_info_t** out_device_infos,
+    iree_host_size_t* out_device_info_count) {
+  IREE_ASSERT_ARGUMENT(driver);
+  IREE_ASSERT_ARGUMENT(out_device_infos);
+  IREE_ASSERT_ARGUMENT(out_device_info_count);
+  // Pre-clear the count so callers see 0 devices if the dispatch fails.
+  *out_device_info_count = 0;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = _VTABLE_DISPATCH(driver, query_available_devices)(
+      driver, allocator, out_device_infos, out_device_info_count);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Validated wrapper that dispatches device creation to the driver vtable.
+IREE_API_EXPORT iree_status_t iree_hal_driver_create_device(
+    iree_hal_driver_t* driver, iree_hal_device_id_t device_id,
+    iree_allocator_t allocator, iree_hal_device_t** out_device) {
+  IREE_ASSERT_ARGUMENT(driver);
+  IREE_ASSERT_ARGUMENT(out_device);
+  // Pre-clear so callers never observe a stale pointer on failure.
+  *out_device = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = _VTABLE_DISPATCH(driver, create_device)(
+      driver, device_id, allocator, out_device);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Creates the driver-defined default device by dispatching create_device
+// with the invalid device ID sentinel, letting the driver pick.
+IREE_API_EXPORT iree_status_t iree_hal_driver_create_default_device(
+    iree_hal_driver_t* driver, iree_allocator_t allocator,
+    iree_hal_device_t** out_device) {
+  IREE_ASSERT_ARGUMENT(driver);
+  IREE_ASSERT_ARGUMENT(out_device);
+  // Pre-clear so callers never observe a stale pointer on failure.
+  *out_device = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // The create_device parameter is an iree_hal_device_id_t, so use the
+  // device-ID sentinel rather than IREE_HAL_DRIVER_ID_INVALID (same 0ull
+  // value, but the wrong constant category).
+  iree_status_t status = _VTABLE_DISPATCH(driver, create_device)(
+      driver, IREE_HAL_DEVICE_ID_INVALID, allocator, out_device);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/driver.h b/runtime/src/iree/hal/driver.h
new file mode 100644
index 0000000..65cbd66
--- /dev/null
+++ b/runtime/src/iree/hal/driver.h
@@ -0,0 +1,117 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVER_H_
+#define IREE_HAL_DRIVER_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/device.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// An opaque factory-specific handle to identify different drivers.
+typedef uint64_t iree_hal_driver_id_t;
+
+#define IREE_HAL_DRIVER_ID_INVALID 0ull
+
+// Describes a driver providing device enumeration and creation.
+// The lifetime of memory referenced by this structure (such as strings) is
+// dependent on where it originated.
+//
+// * When using iree_hal_driver_registry_enumerate the driver info is copied
+//   into memory owned by the caller.
+// * When queried from a live driver with iree_hal_driver_info the memory is
+//   only guaranteed to live for as long as the driver is.
+// * When enumerating via factories the information may be valid only while the
+//   driver registry lock is held.
+typedef struct iree_hal_driver_info_t {
+  IREE_API_UNSTABLE
+
+  // Opaque handle used by factories. Unique across all factories.
+  iree_hal_driver_id_t driver_id;
+
+  // Canonical name of the driver as used in command lines, documentation, etc.
+  // Examples: 'metal', 'vulkan'
+  iree_string_view_t driver_name;
+
+  // Full human-readable name of the driver for display.
+  // Examples: 'Vulkan 1.2 (NVIDIA)'.
+  iree_string_view_t full_name;
+
+  // TODO(benvanik): version information; useful if wanting to expose multiple
+  // versions that may have completely different implementations (like vulkan
+  // 1.0, 1.1, and 1.2) but allow a nice sort/selection process.
+  // TODO(benvanik): triple, feature flags, etc.
+} iree_hal_driver_info_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_driver_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_driver_t iree_hal_driver_t;
+
+// Retains the given |driver| for the caller.
+IREE_API_EXPORT void iree_hal_driver_retain(iree_hal_driver_t* driver);
+
+// Releases the given |driver| from the caller.
+IREE_API_EXPORT void iree_hal_driver_release(iree_hal_driver_t* driver);
+
+// Queries available devices and returns them as a list.
+// The provided |allocator| will be used to allocate the returned list and after
+// the caller is done with it |out_device_infos| must be freed with that same
+// allocator by the caller.
+IREE_API_EXPORT iree_status_t iree_hal_driver_query_available_devices(
+    iree_hal_driver_t* driver, iree_allocator_t allocator,
+    iree_hal_device_info_t** out_device_infos,
+    iree_host_size_t* out_device_info_count);
+
+// Creates a device as queried with iree_hal_driver_query_available_devices.
+IREE_API_EXPORT iree_status_t iree_hal_driver_create_device(
+    iree_hal_driver_t* driver, iree_hal_device_id_t device_id,
+    iree_allocator_t allocator, iree_hal_device_t** out_device);
+
+// Creates the driver-defined "default" device. This may simply be the first
+// device enumerated.
+IREE_API_EXPORT iree_status_t iree_hal_driver_create_default_device(
+    iree_hal_driver_t* driver, iree_allocator_t allocator,
+    iree_hal_device_t** out_device);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_driver_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_driver_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_driver_t* driver);
+
+  iree_status_t(IREE_API_PTR* query_available_devices)(
+      iree_hal_driver_t* driver, iree_allocator_t allocator,
+      iree_hal_device_info_t** out_device_infos,
+      iree_host_size_t* out_device_info_count);
+
+  iree_status_t(IREE_API_PTR* create_device)(iree_hal_driver_t* driver,
+                                             iree_hal_device_id_t device_id,
+                                             iree_allocator_t allocator,
+                                             iree_hal_device_t** out_device);
+} iree_hal_driver_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_driver_vtable_t);
+
+IREE_API_EXPORT void iree_hal_driver_destroy(iree_hal_driver_t* driver);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVER_H_
diff --git a/runtime/src/iree/hal/driver_registry.c b/runtime/src/iree/hal/driver_registry.c
new file mode 100644
index 0000000..d949e48
--- /dev/null
+++ b/runtime/src/iree/hal/driver_registry.c
@@ -0,0 +1,361 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/driver_registry.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/internal/call_once.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_driver_registry_t
+//===----------------------------------------------------------------------===//
+
+// 8 factories is enough for anyone, right?
+// But really this is here to prevent the need for dynamically allocated memory.
+// Because it's an implementation detail it's easy to grow in the future if we
+// want to support additional factories.
+//
+// An alternative would be to keep factories in an intrusive list - that way
+// there is no storage beyond the factory itself. This is less ideal as it would
+// force all factory storage to be in writeable memory and limit the ability for
+// the same factory to be registered with multiple registries (useful when
+// isolating/sandboxing/multi-versioning).
+#define IREE_HAL_MAX_DRIVER_FACTORY_COUNT 8
+
// A driver registry: a fixed-capacity, mutex-guarded table of driver
// factories. See IREE_HAL_MAX_DRIVER_FACTORY_COUNT above for the capacity
// rationale.
struct iree_hal_driver_registry_t {
  // Allocator this registry was allocated from (zeroed for the static
  // default registry, which is never heap-allocated).
  iree_allocator_t host_allocator;
  // Guards factory_count/factories for both mutation and enumeration.
  iree_slim_mutex_t mutex;

  // Factories in registration order. As factories are unregistered the list is
  // shifted to be kept dense.
  iree_host_size_t factory_count;
  const iree_hal_driver_factory_t* factories[IREE_HAL_MAX_DRIVER_FACTORY_COUNT];
};
+
// Storage for the process-wide default registry; lazily initialized on first
// use via iree_call_once so there is no static-initializer ordering hazard.
static iree_hal_driver_registry_t iree_hal_driver_registry_default_;
static iree_once_flag iree_hal_driver_registry_default_flag_ =
    IREE_ONCE_FLAG_INIT;
// One-time initializer: zeroes the registry storage (host_allocator stays
// zeroed — the default registry is not allocated) and sets up its mutex.
static void iree_hal_driver_registry_default_initialize(void) {
  memset(&iree_hal_driver_registry_default_, 0,
         sizeof(iree_hal_driver_registry_default_));
  iree_slim_mutex_initialize(&iree_hal_driver_registry_default_.mutex);
}

// Returns the process-wide default registry, initializing it on first call.
IREE_API_EXPORT iree_hal_driver_registry_t* iree_hal_driver_registry_default(
    void) {
  iree_call_once(&iree_hal_driver_registry_default_flag_,
                 iree_hal_driver_registry_default_initialize);
  return &iree_hal_driver_registry_default_;
}
+
+IREE_API_EXPORT iree_status_t
+iree_hal_driver_registry_allocate(iree_allocator_t host_allocator,
+                                  iree_hal_driver_registry_t** out_registry) {
+  IREE_ASSERT_ARGUMENT(out_registry);
+  *out_registry = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_driver_registry_t* registry = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, sizeof(*registry),
+                                (void**)&registry));
+  registry->host_allocator = host_allocator;
+  iree_slim_mutex_initialize(&registry->mutex);
+
+  *out_registry = registry;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT void iree_hal_driver_registry_free(
+    iree_hal_driver_registry_t* registry) {
+  if (!registry) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_allocator_t host_allocator = registry->host_allocator;
+
+  iree_slim_mutex_deinitialize(&registry->mutex);
+  iree_allocator_free(host_allocator, registry);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_register_factory(
+    iree_hal_driver_registry_t* registry,
+    const iree_hal_driver_factory_t* factory) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_slim_mutex_lock(&registry->mutex);
+
+  // Fail if already present; not because having it in there would harm anything
+  // but because we can't then balance with unregisters if we were to skip it
+  // when present and want to keep the list small and not have callers fill it
+  // with tons of duplicate entries.
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < registry->factory_count; ++i) {
+    if (registry->factories[i] == factory) {
+      status = iree_make_status(IREE_STATUS_ALREADY_EXISTS,
+                                "factory has already been registered");
+      break;
+    }
+  }
+
+  // Note that we check the capacity limit *after* checking for dupes so that
+  // callers will find issues with duplicate registrations easier. Otherwise,
+  // they'd just get a RESOURCE_EXHAUSTED and think there were too many unique
+  // factories registered already.
+  if (iree_status_is_ok(status) &&
+      registry->factory_count + 1 >= IREE_ARRAYSIZE(registry->factories)) {
+    status = iree_make_status(
+        IREE_STATUS_RESOURCE_EXHAUSTED,
+        "the maximum number of factories (%zu) have been registered",
+        IREE_ARRAYSIZE(registry->factories));
+  }
+
+  if (iree_status_is_ok(status)) {
+    registry->factories[registry->factory_count++] = factory;
+  }
+
+  iree_slim_mutex_unlock(&registry->mutex);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_unregister_factory(
+    iree_hal_driver_registry_t* registry,
+    const iree_hal_driver_factory_t* factory) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_slim_mutex_lock(&registry->mutex);
+
+  iree_status_t status = iree_ok_status();
+  iree_host_size_t index = -1;
+  for (iree_host_size_t i = 0; i < registry->factory_count; ++i) {
+    if (registry->factories[i] != factory) continue;
+    index = i;
+    break;
+  }
+  if (index == -1) {
+    status =
+        iree_make_status(IREE_STATUS_NOT_FOUND,
+                         "factory to remove is not registered at this time");
+  }
+
+  if (iree_status_is_ok(status)) {
+    // Compact list. Note that registration order is preserved.
+    // C4090 bug in MSVC: https://tinyurl.com/y46hlogx
+    memmove((void*)&registry->factories[index], &registry->factories[index + 1],
+            registry->factory_count - index - 1);
+    registry->factories[--registry->factory_count] = NULL;
+  }
+
+  iree_slim_mutex_unlock(&registry->mutex);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Computes the total byte size required to store driver info strings.
+static iree_host_size_t iree_hal_driver_info_compute_storage_size(
+    const iree_hal_driver_info_t* driver_info) {
+  iree_host_size_t storage_size = 0;
+  storage_size += driver_info->driver_name.size;
+  storage_size += driver_info->full_name.size;
+  return storage_size;
+}
+
+// Copies |source_driver_info| into |target_driver_info| using |string_storage|
+// for the nested strings. Returns the total number of bytes added to
+// string_storage.
+static iree_host_size_t iree_hal_driver_info_copy(
+    const iree_hal_driver_info_t* source_driver_info,
+    iree_hal_driver_info_t* target_driver_info, char* string_storage) {
+  // Copy everything by default (primitive fields, etc).
+  memcpy(target_driver_info, source_driver_info, sizeof(*target_driver_info));
+
+  // Copy in each string field to the string storage and set the ptr.
+  iree_host_size_t storage_size = 0;
+  storage_size += iree_string_view_append_to_buffer(
+      source_driver_info->driver_name, &target_driver_info->driver_name,
+      string_storage + storage_size);
+  storage_size += iree_string_view_append_to_buffer(
+      source_driver_info->full_name, &target_driver_info->full_name,
+      string_storage + storage_size);
+  return storage_size;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_enumerate(
+    iree_hal_driver_registry_t* registry, iree_allocator_t allocator,
+    iree_hal_driver_info_t** out_driver_infos,
+    iree_host_size_t* out_driver_info_count) {
+  IREE_ASSERT_ARGUMENT(registry);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  *out_driver_info_count = 0;
+  *out_driver_infos = NULL;
+
+  iree_status_t status = iree_ok_status();
+  iree_slim_mutex_lock(&registry->mutex);
+
+  // Enumerate each factory and figure out how much memory we need to fully
+  // store all data we need to clone.
+  iree_host_size_t total_driver_info_count = 0;
+  iree_host_size_t total_storage_size = 0;
+  for (iree_host_size_t i = 0; i < registry->factory_count; ++i) {
+    const iree_hal_driver_factory_t* factory = registry->factories[i];
+    const iree_hal_driver_info_t* driver_infos = NULL;
+    iree_host_size_t driver_info_count = 0;
+    status =
+        factory->enumerate(factory->self, &driver_infos, &driver_info_count);
+    if (!iree_status_is_ok(status)) break;
+    total_driver_info_count += driver_info_count;
+    for (iree_host_size_t j = 0; j < driver_info_count; j++) {
+      total_storage_size +=
+          iree_hal_driver_info_compute_storage_size(&driver_infos[j]);
+    }
+  }
+
+  // Allocate the required memory for both the driver infos and the string
+  // storage in a single block.
+  iree_host_size_t total_driver_infos_size =
+      total_driver_info_count * sizeof(iree_hal_driver_info_t);
+  if (iree_status_is_ok(status)) {
+    status = iree_allocator_malloc(allocator,
+                                   total_driver_infos_size + total_storage_size,
+                                   (void**)out_driver_infos);
+  }
+
+  // Write driver info and associated nested resources to the output. We have
+  // to enumerate again but enumeration is expected to be immutable for a given
+  // registration and we hold the lock so we're safe.
+  if (iree_status_is_ok(status)) {
+    iree_hal_driver_info_t* driver_info_storage_ptr = *out_driver_infos;
+    char* string_storage_ptr =
+        (char*)(*out_driver_infos) + total_driver_infos_size;
+    for (iree_host_size_t i = 0; i < registry->factory_count; ++i) {
+      const iree_hal_driver_factory_t* factory = registry->factories[i];
+      const iree_hal_driver_info_t* driver_infos = NULL;
+      iree_host_size_t driver_info_count = 0;
+      status =
+          factory->enumerate(factory->self, &driver_infos, &driver_info_count);
+      if (!iree_status_is_ok(status)) break;
+      for (iree_host_size_t j = 0; j < driver_info_count; j++) {
+        string_storage_ptr += iree_hal_driver_info_copy(
+            &driver_infos[j], driver_info_storage_ptr, string_storage_ptr);
+        ++driver_info_storage_ptr;
+      }
+    }
+    *out_driver_info_count = total_driver_info_count;
+  }
+
+  iree_slim_mutex_unlock(&registry->mutex);
+
+  // Cleanup memory if we failed.
+  if (!iree_status_is_ok(status) && *out_driver_infos) {
+    iree_allocator_free(allocator, *out_driver_infos);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_try_create(
+    iree_hal_driver_registry_t* registry, iree_hal_driver_id_t driver_id,
+    iree_allocator_t allocator, iree_hal_driver_t** out_driver) {
+  IREE_ASSERT_ARGUMENT(registry);
+  if (driver_id == IREE_HAL_DRIVER_ID_INVALID) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "invalid driver id");
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, driver_id);
+
+  *out_driver = NULL;
+
+  iree_status_t status = iree_ok_status();
+  iree_slim_mutex_lock(&registry->mutex);
+
+  // TODO(benvanik): figure out a good way of lining this up. The issue is that
+  // the driver_id is something we return during enumeration but we really
+  // want it to be something dynamic. We could pack an epoch into it that is
+  // bumped each time the registry factory list is modified so we could tell
+  // when a factory was added/removed, etc. So:
+  //   driver_id = [3 byte epoch] [1 byte index into factory list] [4 byte id]
+  // Not sure which status code to return if the epoch is a mismatch, maybe
+  // IREE_STATUS_UNAVAILABLE? If you are mutating the registry from multiple
+  // threads while also enumerating, that may just be enough of a footgun to
+  // bail and force the caller to resolve :)
+  status =
+      iree_make_status(IREE_STATUS_UNIMPLEMENTED, "driver creation by id nyi");
+
+  iree_slim_mutex_unlock(&registry->mutex);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
// Finds a driver named |driver_name| across all registered factories and
// creates it via the owning factory. Factories and their drivers are both
// scanned newest-first so later registrations override earlier ones.
IREE_API_EXPORT iree_status_t iree_hal_driver_registry_try_create_by_name(
    iree_hal_driver_registry_t* registry, iree_string_view_t driver_name,
    iree_allocator_t allocator, iree_hal_driver_t** out_driver) {
  IREE_ASSERT_ARGUMENT(registry);
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_TRACE_ZONE_APPEND_TEXT(z0, driver_name.data, driver_name.size);

  *out_driver = NULL;

  // NOTE: we hold the lock the entire time here so that we can avoid
  // allocations and avoid spurious failures by outside mutation of the
  // registry.
  iree_status_t status = iree_ok_status();
  iree_slim_mutex_lock(&registry->mutex);

  // Enumerate each factory and scan for the requested driver.
  // NOTE: we scan in reverse so that we prefer the first hit in the most
  // recently registered factory.
  const iree_hal_driver_factory_t* hit_factory = NULL;
  iree_hal_driver_id_t hit_driver_id = IREE_HAL_DRIVER_ID_INVALID;
  for (iree_host_size_t i = 0; i < registry->factory_count; ++i) {
    // Reach inside and grab the internal factory data structures.
    const iree_hal_driver_factory_t* factory =
        registry->factories[registry->factory_count - i - 1];
    const iree_hal_driver_info_t* driver_infos = NULL;
    iree_host_size_t driver_info_count = 0;
    status =
        factory->enumerate(factory->self, &driver_infos, &driver_info_count);
    if (!iree_status_is_ok(status)) break;

    // Scan for the specific driver by name.
    // NOTE: we scan in reverse here too so multiple drivers with the same name
    // from the same factory prefer the later drivers in the list.
    for (iree_host_size_t j = 0; j < driver_info_count; j++) {
      const iree_hal_driver_info_t* driver_info =
          &driver_infos[driver_info_count - j - 1];
      if (iree_string_view_equal(driver_name, driver_info->driver_name)) {
        hit_factory = factory;
        hit_driver_id = driver_info->driver_id;
        break;
      }
    }
    // Since we are scanning in reverse we stop searching when we find the first
    // hit (aka the most recently added driver).
    if (hit_driver_id != IREE_HAL_DRIVER_ID_INVALID) break;
  }

  // If we found a driver during the scan try to create it now.
  // This may block the caller (with the lock held!), and may fail if for
  // example a delay-loaded driver cannot be created even if it was enumerated.
  if (hit_driver_id != IREE_HAL_DRIVER_ID_INVALID) {
    status = hit_factory->try_create(hit_factory->self, hit_driver_id,
                                     allocator, out_driver);
  } else {
    status =
        iree_make_status(IREE_STATUS_NOT_FOUND, "no driver '%.*s' registered",
                         (int)driver_name.size, driver_name.data);
  }

  iree_slim_mutex_unlock(&registry->mutex);

  IREE_TRACE_ZONE_END(z0);
  return status;
}
diff --git a/runtime/src/iree/hal/driver_registry.h b/runtime/src/iree/hal/driver_registry.h
new file mode 100644
index 0000000..fad02b4
--- /dev/null
+++ b/runtime/src/iree/hal/driver_registry.h
@@ -0,0 +1,168 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVER_REGISTRY_H_
+#define IREE_HAL_DRIVER_REGISTRY_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/driver.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
// Factory interface used for driver enumeration and creation.
// The factory is designed so that in many cases it can live in rodata by not
// requiring any real code or processing when the driver is statically known to
// be available. When drivers may be dynamically available based on system
// configuration a factory can discover them and provide them during
// enumeration.
//
// Delay-loaded drivers that may require non-trivial setup time (such as those
// implemented in dynamic libraries or over RPC) can be speculatively enumerated
// by a factory and then rely on the try_create to actually perform the slow
// work once the user has explicitly signaled that they are willing to pay the
// cost (and deal with the consequences).
//
// WARNING: this API is unstable until the HAL is fully ported. Do not use.
typedef struct iree_hal_driver_factory_t {
  // TODO(benvanik): version field.
  IREE_API_UNSTABLE

  // User-defined pointer passed to all functions.
  void* self;

  // Queries the list of available drivers provided by the factory, if any.
  // |out_driver_infos| will be populated with a *reference* to factory data
  // structures (such as the driver name) that callers may choose to clone if
  // needed.
  //
  // Implementers must make their factory enumeration results immutable for the
  // duration they are registered, though the behavior of try_create is allowed
  // to change call-to-call. If a factory needs to mutate its set of enumerated
  // devices then it must do so by first unregistering itself and re-registering
  // only after the changes have been made.
  //
  // Called with the driver registry lock held; may be called from any thread.
  iree_status_t(IREE_API_PTR* enumerate)(
      void* self, const iree_hal_driver_info_t** out_driver_infos,
      iree_host_size_t* out_driver_info_count);

  // Tries to create a driver as previously queried with enumerate.
  // |driver_id| is the opaque ID returned from enumeration; note that there may
  // be a significant amount of time between enumeration and creation and the
  // driver registry lock may have been released between then and now.
  //
  // Delay-loaded drivers may still fail here if - for example - required system
  // resources are unavailable or permission is denied.
  //
  // Called with the driver registry lock held; may be called from any thread.
  iree_status_t(IREE_API_PTR* try_create)(void* self,
                                          iree_hal_driver_id_t driver_id,
                                          iree_allocator_t allocator,
                                          iree_hal_driver_t** out_driver);
} iree_hal_driver_factory_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_driver_registry_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_driver_registry_t iree_hal_driver_registry_t;
+
+// Returns the default per-process driver registry.
+// In simple applications this is usually where you want to go to register and
+// create drivers. More sophisticated applications that want tighter control
+// over the visibility of drivers to certain callers such as when dealing with
+// requests from multiple users may choose to allocate their own registries and
+// manage their lifetime as desired.
+IREE_API_EXPORT iree_hal_driver_registry_t* iree_hal_driver_registry_default(
+    void);
+
+// Allocates a driver registry that can be used to register and enumerate
+// HAL drivers.
+//
+// Callers must free the registry with iree_hal_driver_registry_free when it is
+// no longer needed.
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_allocate(
+    iree_allocator_t host_allocator, iree_hal_driver_registry_t** out_registry);
+
+// Frees a driver registry.
+// All factories will be implicitly unregistered.
+IREE_API_EXPORT void iree_hal_driver_registry_free(
+    iree_hal_driver_registry_t* registry);
+
+// Registers a driver factory to serve future queries/requests for drivers.
+// See iree_hal_driver_registry_t for more information.
+//
+// Thread-safe. The factory is not retained and must be kept alive by the caller
+// until it is unregistered (or the application terminates).
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_register_factory(
+    iree_hal_driver_registry_t* registry,
+    const iree_hal_driver_factory_t* factory);
+
+// Unregisters a driver factory.
+// Unregistering a factory only prevents new drivers from being created;
+// existing drivers may remain live even after unregistering. Factories can
+// expect that no new drivers will be created via the factory after the call
+// returns.
+//
+// Thread-safe. As the factory is not retained by the registry the caller must
+// release its memory (if needed) after this call returns.
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_unregister_factory(
+    iree_hal_driver_registry_t* registry,
+    const iree_hal_driver_factory_t* factory);
+
+// Enumerates all drivers from registered factories and returns them as a list.
+// The provided |allocator| will be used to allocate the returned list and after
+// the caller is done with it |out_driver_infos| must be freed with that same
+// allocator by the caller.
+//
+// The set of drivers returned should be considered the superset of those that
+// may be available for successful creation as it's possible that delay-loaded
+// drivers may fail even if they appear in this list.
+//
+// Thread-safe. Note that the factory may be unregistered between the query
+// completing and any attempt to instantiate the driver.
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_enumerate(
+    iree_hal_driver_registry_t* registry, iree_allocator_t allocator,
+    iree_hal_driver_info_t** out_driver_infos,
+    iree_host_size_t* out_driver_info_count);
+
+// Attempts to create a driver registered with the driver registry by a specific
+// ID as returned during enumeration in iree_hal_driver_info_t::driver_id.
+// This can be used to specify the exact driver to create in cases where there
+// may be multiple factories providing drivers with the same name.
+//
+// Thread-safe. May block the caller if the driver is delay-loaded and needs to
+// perform additional loading/verification/etc before returning.
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_try_create(
+    iree_hal_driver_registry_t* registry, iree_hal_driver_id_t driver_id,
+    iree_allocator_t allocator, iree_hal_driver_t** out_driver);
+
+// Attempts to create a driver registered with the given canonical driver name.
+// Effectively enumerate + find by name + try_create if found. Factories are
+// searched in most-recently-added order such that it's possible to override
+// drivers with newer registrations when multiple factories provide the same
+// driver name.
+//
+// Thread-safe. May block the caller if the driver is delay-loaded and needs to
+// perform additional loading/verification/etc before returning.
+IREE_API_EXPORT iree_status_t iree_hal_driver_registry_try_create_by_name(
+    iree_hal_driver_registry_t* registry, iree_string_view_t driver_name,
+    iree_allocator_t allocator, iree_hal_driver_t** out_driver);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVER_REGISTRY_H_
diff --git a/runtime/src/iree/hal/drivers/BUILD b/runtime/src/iree/hal/drivers/BUILD
new file mode 100644
index 0000000..cdf2b78
--- /dev/null
+++ b/runtime/src/iree/hal/drivers/BUILD
@@ -0,0 +1,77 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("@bazel_skylib//rules:common_settings.bzl", "string_list_flag")
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
# All in-tree HAL drivers that can be toggled via the :enabled_drivers flag.
ALL_DRIVERS = [
    "dylib",
    "dylib-sync",
    "vmvx",
    "vmvx-sync",
    "vulkan",
    "cuda",
]

# Command-line selectable set of drivers linked into :drivers.
# NOTE(review): "cuda" is the only driver excluded from the defaults —
# presumably because it requires the CUDA SDK; confirm before relying on it.
string_list_flag(
    name = "enabled_drivers",
    build_setting_default = [
        "dylib",
        "dylib-sync",
        "vmvx",
        "vmvx-sync",
        "vulkan",
    ],
)

# One "<driver>_enabled" config_setting per driver, true when that driver's
# name appears in the :enabled_drivers list flag.
[
    config_setting(
        name = "{}_enabled".format(driver),
        flag_values = {
            ":enabled_drivers": driver,
        },
    )
    for driver in ALL_DRIVERS
]

# Umbrella library bundling the registration code for every enabled driver;
# each select() below adds a driver's registration dep only when its
# config_setting above is active.
iree_runtime_cc_library(
    name = "drivers",
    srcs = ["init.c"],
    hdrs = ["init.h"],
    deps = [
               "//runtime/src/iree/base",
               "//runtime/src/iree/base:tracing",
           ] + select({
               ":dylib_enabled": ["//runtime/src/iree/hal/dylib/registration"],
               "//conditions:default": [],
           }) +
           select({
               ":dylib-sync_enabled": ["//runtime/src/iree/hal/dylib/registration:sync"],
               "//conditions:default": [],
           }) +
           select({
               ":vmvx_enabled": ["//runtime/src/iree/hal/vmvx/registration"],
               "//conditions:default": [],
           }) +
           select({
               ":vmvx-sync_enabled": ["//runtime/src/iree/hal/vmvx/registration:sync"],
               "//conditions:default": [],
           }) +
           select({
               ":vulkan_enabled": ["//runtime/src/iree/hal/vulkan/registration"],
               "//conditions:default": [],
           }) +
           select({
               ":cuda_enabled": ["//runtime/src/iree/hal/cuda/registration"],
               "//conditions:default": [],
           }),
)
diff --git a/runtime/src/iree/hal/drivers/CMakeLists.txt b/runtime/src/iree/hal/drivers/CMakeLists.txt
new file mode 100644
index 0000000..5dadc57
--- /dev/null
+++ b/runtime/src/iree/hal/drivers/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Doesn't use bazel_to_cmake because of custom configuration vars
+
# Collect the registration target for every driver enabled in the CMake
# configuration; the resulting list feeds the DEPS of the drivers library
# below (mirroring the Bazel select() chain, plus the CMake-only ROCm driver).
set(IREE_HAL_DRIVER_MODULES)
if(IREE_HAL_DRIVER_CUDA)
  list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::cuda::registration)
endif()
if(IREE_HAL_DRIVER_DYLIB)
  list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::dylib::registration)
endif()
if(IREE_HAL_DRIVER_DYLIB_SYNC)
  list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::dylib::registration::sync)
endif()
if(IREE_HAL_DRIVER_VMVX)
  list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::vmvx::registration)
endif()
if(IREE_HAL_DRIVER_VMVX_SYNC)
  list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::vmvx::registration::sync)
endif()
if(IREE_HAL_DRIVER_VULKAN)
  list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::vulkan::registration)
endif()
if(IREE_HAL_DRIVER_EXPERIMENTAL_ROCM)
  list(APPEND IREE_HAL_DRIVER_MODULES experimental::rocm::registration)
endif()

# Umbrella library exposing iree_hal_register_all_available_drivers(); links
# in whichever driver registration modules were collected above.
iree_cc_library(
  NAME
    drivers
  HDRS
    "init.h"
  SRCS
    "init.c"
  DEPS
    iree::base
    iree::base::tracing
    ${IREE_HAL_DRIVER_MODULES}
  PUBLIC
)
diff --git a/runtime/src/iree/hal/drivers/init.c b/runtime/src/iree/hal/drivers/init.c
new file mode 100644
index 0000000..71f9d20
--- /dev/null
+++ b/runtime/src/iree/hal/drivers/init.c
@@ -0,0 +1,80 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/drivers/init.h"
+
+#include "iree/base/tracing.h"
+
+#if defined(IREE_HAL_HAVE_CUDA_DRIVER_MODULE)
+#include "iree/hal/cuda/registration/driver_module.h"
+#endif  // IREE_HAL_HAVE_CUDA_DRIVER_MODULE
+
+#if defined(IREE_HAL_HAVE_DYLIB_DRIVER_MODULE)
+#include "iree/hal/dylib/registration/driver_module.h"
+#endif  // IREE_HAL_HAVE_DYLIB_DRIVER_MODULE
+
+#if defined(IREE_HAL_HAVE_DYLIB_SYNC_DRIVER_MODULE)
+#include "iree/hal/dylib/registration/driver_module_sync.h"
+#endif  // IREE_HAL_HAVE_DYLIB_SYNC_DRIVER_MODULE
+
+#if defined(IREE_HAL_HAVE_VMVX_DRIVER_MODULE)
+#include "iree/hal/vmvx/registration/driver_module.h"
+#endif  // IREE_HAL_HAVE_VMVX_DRIVER_MODULE
+
+#if defined(IREE_HAL_HAVE_VMVX_SYNC_DRIVER_MODULE)
+#include "iree/hal/vmvx/registration/driver_module_sync.h"
+#endif  // IREE_HAL_HAVE_VMVX_SYNC_DRIVER_MODULE
+
+#if defined(IREE_HAL_HAVE_VULKAN_DRIVER_MODULE)
+#include "iree/hal/vulkan/registration/driver_module.h"
+#endif  // IREE_HAL_HAVE_VULKAN_DRIVER_MODULE
+
+#if defined(IREE_HAL_HAVE_EXPERIMENTAL_ROCM_DRIVER_MODULE)
+#include "experimental/rocm/registration/driver_module.h"
+#endif  // IREE_HAL_HAVE_EXPERIMENTAL_ROCM_DRIVER_MODULE
+
// Registers every HAL driver module compiled into this binary with |registry|.
// Each IREE_HAL_HAVE_*_DRIVER_MODULE macro gates on the corresponding
// registration library being linked in (see the includes above); registration
// stops at the first driver module that fails to register.
IREE_API_EXPORT iree_status_t
iree_hal_register_all_available_drivers(iree_hal_driver_registry_t* registry) {
  IREE_TRACE_ZONE_BEGIN(z0);

#if defined(IREE_HAL_HAVE_CUDA_DRIVER_MODULE)
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_cuda_driver_module_register(registry));
#endif  // IREE_HAL_HAVE_CUDA_DRIVER_MODULE

#if defined(IREE_HAL_HAVE_DYLIB_DRIVER_MODULE)
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_dylib_driver_module_register(registry));
#endif  // IREE_HAL_HAVE_DYLIB_DRIVER_MODULE

#if defined(IREE_HAL_HAVE_DYLIB_SYNC_DRIVER_MODULE)
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_dylib_sync_driver_module_register(registry));
#endif  // IREE_HAL_HAVE_DYLIB_SYNC_DRIVER_MODULE

#if defined(IREE_HAL_HAVE_VMVX_DRIVER_MODULE)
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_vmvx_driver_module_register(registry));
#endif  // IREE_HAL_HAVE_VMVX_DRIVER_MODULE

#if defined(IREE_HAL_HAVE_VMVX_SYNC_DRIVER_MODULE)
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_vmvx_sync_driver_module_register(registry));
#endif  // IREE_HAL_HAVE_VMVX_SYNC_DRIVER_MODULE

#if defined(IREE_HAL_HAVE_VULKAN_DRIVER_MODULE)
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_vulkan_driver_module_register(registry));
#endif  // IREE_HAL_HAVE_VULKAN_DRIVER_MODULE

#if defined(IREE_HAL_HAVE_EXPERIMENTAL_ROCM_DRIVER_MODULE)
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_rocm_driver_module_register(registry));
#endif  // IREE_HAL_HAVE_EXPERIMENTAL_ROCM_DRIVER_MODULE

  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
diff --git a/runtime/src/iree/hal/drivers/init.h b/runtime/src/iree/hal/drivers/init.h
new file mode 100644
index 0000000..849816c
--- /dev/null
+++ b/runtime/src/iree/hal/drivers/init.h
@@ -0,0 +1,31 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_INIT_H_
+#define IREE_HAL_DRIVERS_INIT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Registers all drivers that were linked into the current binary based on the
+// build configuration. Note that there may be no drivers available.
+//
+// This only registers IREE core drivers (those under iree/hal/). User-provided
+// drivers must be directly registered or directly created, though a user could
+// create their own user_register_all_available_drivers() that calls this as
+// well as registering their drivers.
+IREE_API_EXPORT iree_status_t
+iree_hal_register_all_available_drivers(iree_hal_driver_registry_t* registry);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVERS_INIT_H_
diff --git a/runtime/src/iree/hal/dylib/BUILD b/runtime/src/iree/hal/dylib/BUILD
new file mode 100644
index 0000000..236a474
--- /dev/null
+++ b/runtime/src/iree/hal/dylib/BUILD
@@ -0,0 +1,11 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
diff --git a/runtime/src/iree/hal/dylib/CMakeLists.txt b/runtime/src/iree/hal/dylib/CMakeLists.txt
new file mode 100644
index 0000000..c6326d7
--- /dev/null
+++ b/runtime/src/iree/hal/dylib/CMakeLists.txt
@@ -0,0 +1,13 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/dylib/BUILD                                             #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/dylib/cts/CMakeLists.txt b/runtime/src/iree/hal/dylib/cts/CMakeLists.txt
new file mode 100644
index 0000000..5bc7537
--- /dev/null
+++ b/runtime/src/iree/hal/dylib/cts/CMakeLists.txt
@@ -0,0 +1,48 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+if(IREE_BYTECODE_MODULE_FORCE_SYSTEM_DYLIB_LINKER)
+  set(EXECUTABLE_FORMAT_PREFIX "system")
+else()
+  set(EXECUTABLE_FORMAT_PREFIX "embedded")
+endif()
+
+set(EXECUTABLE_FORMAT "\"${EXECUTABLE_FORMAT_PREFIX}-elf-\" IREE_ARCH")
+
+iree_hal_cts_test_suite(
+  DRIVER_NAME
+    dylib
+  DRIVER_REGISTRATION_HDR
+    "runtime/src/iree/hal/dylib/registration/driver_module.h"
+  DRIVER_REGISTRATION_FN
+    "iree_hal_dylib_driver_module_register"
+  COMPILER_TARGET_BACKEND
+    "dylib-llvm-aot"
+  EXECUTABLE_FORMAT
+    "${EXECUTABLE_FORMAT}"
+  DEPS
+    iree::hal::dylib::registration
+)
+
+iree_hal_cts_test_suite(
+  DRIVER_NAME
+    dylib-sync
+  DRIVER_REGISTRATION_HDR
+    "runtime/src/iree/hal/dylib/registration/driver_module_sync.h"
+  DRIVER_REGISTRATION_FN
+    "iree_hal_dylib_sync_driver_module_register"
+  COMPILER_TARGET_BACKEND
+    "dylib-llvm-aot"
+  EXECUTABLE_FORMAT
+    "${EXECUTABLE_FORMAT}"
+  DEPS
+    iree::hal::dylib::registration::sync
+  EXCLUDED_TESTS
+    # TODO(#4680): command buffer recording so that these can run on sync HAL
+    "command_buffer"
+    "event"
+    "semaphore_submission"
+)
diff --git a/runtime/src/iree/hal/dylib/registration/BUILD b/runtime/src/iree/hal/dylib/registration/BUILD
new file mode 100644
index 0000000..44bda46
--- /dev/null
+++ b/runtime/src/iree/hal/dylib/registration/BUILD
@@ -0,0 +1,71 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_cmake_extra_content(
+    content = """
+if(${IREE_HAL_DRIVER_DYLIB})
+""",
+    inline = True,
+)
+
+iree_runtime_cc_library(
+    name = "registration",
+    srcs = ["driver_module.c"],
+    hdrs = ["driver_module.h"],
+    defines = [
+        "IREE_HAL_HAVE_DYLIB_DRIVER_MODULE=1",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base/internal:flags",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:task_driver",
+        "//runtime/src/iree/hal/local/loaders:embedded_library_loader",
+        "//runtime/src/iree/hal/local/loaders:system_library_loader",
+        "//runtime/src/iree/task:api",
+    ],
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+
+if(${IREE_HAL_DRIVER_DYLIB_SYNC})
+""",
+    inline = True,
+)
+
+iree_runtime_cc_library(
+    name = "sync",
+    srcs = ["driver_module_sync.c"],
+    hdrs = ["driver_module_sync.h"],
+    defines = [
+        "IREE_HAL_HAVE_DYLIB_SYNC_DRIVER_MODULE=1",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:sync_driver",
+        "//runtime/src/iree/hal/local/loaders:embedded_library_loader",
+    ],
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+""",
+    inline = True,
+)
diff --git a/runtime/src/iree/hal/dylib/registration/CMakeLists.txt b/runtime/src/iree/hal/dylib/registration/CMakeLists.txt
new file mode 100644
index 0000000..edee5dc
--- /dev/null
+++ b/runtime/src/iree/hal/dylib/registration/CMakeLists.txt
@@ -0,0 +1,60 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/dylib/registration/BUILD                                #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+if(${IREE_HAL_DRIVER_DYLIB})
+
+iree_cc_library(
+  NAME
+    registration
+  HDRS
+    "driver_module.h"
+  SRCS
+    "driver_module.c"
+  DEPS
+    iree::base
+    iree::base::internal::flags
+    iree::hal
+    iree::hal::local
+    iree::hal::local::loaders::embedded_library_loader
+    iree::hal::local::loaders::system_library_loader
+    iree::hal::local::task_driver
+    iree::task::api
+  DEFINES
+    "IREE_HAL_HAVE_DYLIB_DRIVER_MODULE=1"
+  PUBLIC
+)
+
+endif()
+
+if(${IREE_HAL_DRIVER_DYLIB_SYNC})
+
+iree_cc_library(
+  NAME
+    sync
+  HDRS
+    "driver_module_sync.h"
+  SRCS
+    "driver_module_sync.c"
+  DEPS
+    iree::base
+    iree::hal
+    iree::hal::local
+    iree::hal::local::loaders::embedded_library_loader
+    iree::hal::local::sync_driver
+  DEFINES
+    "IREE_HAL_HAVE_DYLIB_SYNC_DRIVER_MODULE=1"
+  PUBLIC
+)
+
+endif()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/dylib/registration/driver_module.c b/runtime/src/iree/hal/dylib/registration/driver_module.c
new file mode 100644
index 0000000..836db33
--- /dev/null
+++ b/runtime/src/iree/hal/dylib/registration/driver_module.c
@@ -0,0 +1,108 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/dylib/registration/driver_module.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/flags.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/loaders/embedded_library_loader.h"
+#include "iree/hal/local/loaders/system_library_loader.h"
+#include "iree/hal/local/task_device.h"
+#include "iree/hal/local/task_driver.h"
+#include "iree/task/api.h"
+
+// TODO(#4298): remove this driver registration and wrapper.
+// By having a single iree/hal/local/registration that then has the loaders
+// added to it based on compilation settings we can have a single set of flags
+// for everything. We can also have API helper methods that register the driver
+// using an existing executor so that we can entirely externalize the task
+// system configuration from the HAL.
+
+#define IREE_HAL_DYLIB_DRIVER_ID 0x58444C4Cu  // XDLL
+
+static iree_status_t iree_hal_dylib_driver_factory_enumerate(
+    void* self, const iree_hal_driver_info_t** out_driver_infos,
+    iree_host_size_t* out_driver_info_count) {
+  static const iree_hal_driver_info_t driver_infos[1] = {
+      {
+          .driver_id = IREE_HAL_DYLIB_DRIVER_ID,
+          .driver_name = iree_string_view_literal("dylib"),
+          .full_name =
+              iree_string_view_literal("AOT compiled dynamic libraries"),
+      },
+  };
+  *out_driver_info_count = IREE_ARRAYSIZE(driver_infos);
+  *out_driver_infos = driver_infos;
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_dylib_driver_factory_try_create(
+    void* self, iree_hal_driver_id_t driver_id, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver) {
+  if (driver_id != IREE_HAL_DYLIB_DRIVER_ID) {
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "no driver with ID %016" PRIu64
+                            " is provided by this factory",
+                            driver_id);
+  }
+
+  iree_hal_task_device_params_t default_params;
+  iree_hal_task_device_params_initialize(&default_params);
+
+  iree_status_t status = iree_ok_status();
+
+  iree_hal_executable_loader_t* loaders[2] = {NULL, NULL};
+  iree_host_size_t loader_count = 0;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_embedded_library_loader_create(
+        iree_hal_executable_import_provider_null(), host_allocator,
+        &loaders[loader_count++]);
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_system_library_loader_create(
+        iree_hal_executable_import_provider_null(), host_allocator,
+        &loaders[loader_count++]);
+  }
+
+  iree_task_executor_t* executor = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_task_executor_create_from_flags(host_allocator, &executor);
+  }
+
+  iree_hal_allocator_t* device_allocator = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_allocator_create_heap(iree_make_cstring_view("cpu"),
+                                            host_allocator, host_allocator,
+                                            &device_allocator);
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_task_driver_create(
+        iree_make_cstring_view("cpu"), &default_params, executor, loader_count,
+        loaders, device_allocator, host_allocator, out_driver);
+  }
+
+  iree_hal_allocator_release(device_allocator);
+  iree_task_executor_release(executor);
+  for (iree_host_size_t i = 0; i < loader_count; ++i) {
+    iree_hal_executable_loader_release(loaders[i]);
+  }
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t
+iree_hal_dylib_driver_module_register(iree_hal_driver_registry_t* registry) {
+  static const iree_hal_driver_factory_t factory = {
+      .self = NULL,
+      .enumerate = iree_hal_dylib_driver_factory_enumerate,
+      .try_create = iree_hal_dylib_driver_factory_try_create,
+  };
+  return iree_hal_driver_registry_register_factory(registry, &factory);
+}
diff --git a/runtime/src/iree/hal/dylib/registration/driver_module.h b/runtime/src/iree/hal/dylib/registration/driver_module.h
new file mode 100644
index 0000000..7c13188
--- /dev/null
+++ b/runtime/src/iree/hal/dylib/registration/driver_module.h
@@ -0,0 +1,26 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DYLIB_REGISTRATION_DRIVER_MODULE_H_
+#define IREE_HAL_DYLIB_REGISTRATION_DRIVER_MODULE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// DEPRECATED: this entire driver will be removed soon.
+// TODO(#3580): remove this entire driver w/ iree_hal_executable_library_t.
+IREE_API_EXPORT iree_status_t
+iree_hal_dylib_driver_module_register(iree_hal_driver_registry_t* registry);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DYLIB_REGISTRATION_DRIVER_MODULE_H_
diff --git a/runtime/src/iree/hal/dylib/registration/driver_module_sync.c b/runtime/src/iree/hal/dylib/registration/driver_module_sync.c
new file mode 100644
index 0000000..29f0a69
--- /dev/null
+++ b/runtime/src/iree/hal/dylib/registration/driver_module_sync.c
@@ -0,0 +1,86 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/dylib/registration/driver_module_sync.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/loaders/embedded_library_loader.h"
+#include "iree/hal/local/sync_device.h"
+#include "iree/hal/local/sync_driver.h"
+
+// TODO(#4298): remove this driver registration and wrapper.
+// By having a single iree/hal/local/registration that then has the loaders
+// added to it based on compilation settings we can have a single set of flags
+// for everything.
+
+#define IREE_HAL_DYLIB_SYNC_DRIVER_ID 0x53444C4Cu  // SDLL
+
+static iree_status_t iree_hal_dylib_sync_driver_factory_enumerate(
+    void* self, const iree_hal_driver_info_t** out_driver_infos,
+    iree_host_size_t* out_driver_info_count) {
+  static const iree_hal_driver_info_t default_driver_info = {
+      .driver_id = IREE_HAL_DYLIB_SYNC_DRIVER_ID,
+      .driver_name = iree_string_view_literal("dylib-sync"),
+      .full_name = iree_string_view_literal(
+          "synchronous AOT compiled dynamic embedded libraries"),
+  };
+  *out_driver_info_count = 1;
+  *out_driver_infos = &default_driver_info;
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_dylib_sync_driver_factory_try_create(
+    void* self, iree_hal_driver_id_t driver_id, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver) {
+  if (driver_id != IREE_HAL_DYLIB_SYNC_DRIVER_ID) {
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "no driver with ID %016" PRIu64
+                            " is provided by this factory",
+                            driver_id);
+  }
+
+  iree_hal_sync_device_params_t default_params;
+  iree_hal_sync_device_params_initialize(&default_params);
+
+  iree_status_t status = iree_ok_status();
+  iree_hal_executable_loader_t* loaders[1] = {NULL};
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_embedded_library_loader_create(
+        iree_hal_executable_import_provider_null(), host_allocator,
+        &loaders[0]);
+  }
+
+  iree_hal_allocator_t* device_allocator = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_allocator_create_heap(iree_make_cstring_view("cpu"),
+                                            host_allocator, host_allocator,
+                                            &device_allocator);
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_sync_driver_create(
+        iree_make_cstring_view("cpu"), &default_params, IREE_ARRAYSIZE(loaders),
+        loaders, device_allocator, host_allocator, out_driver);
+  }
+
+  iree_hal_allocator_release(device_allocator);
+  iree_hal_executable_loader_release(loaders[0]);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_dylib_sync_driver_module_register(
+    iree_hal_driver_registry_t* registry) {
+  static const iree_hal_driver_factory_t factory = {
+      .self = NULL,
+      .enumerate = iree_hal_dylib_sync_driver_factory_enumerate,
+      .try_create = iree_hal_dylib_sync_driver_factory_try_create,
+  };
+  return iree_hal_driver_registry_register_factory(registry, &factory);
+}
diff --git a/runtime/src/iree/hal/dylib/registration/driver_module_sync.h b/runtime/src/iree/hal/dylib/registration/driver_module_sync.h
new file mode 100644
index 0000000..2f8139f
--- /dev/null
+++ b/runtime/src/iree/hal/dylib/registration/driver_module_sync.h
@@ -0,0 +1,26 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DYLIB_REGISTRATION_DRIVER_MODULE_SYNC_H_
+#define IREE_HAL_DYLIB_REGISTRATION_DRIVER_MODULE_SYNC_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// DEPRECATED: this entire driver will be removed soon.
+// TODO(#3580): remove this entire driver w/ iree_hal_executable_library_t.
+IREE_API_EXPORT iree_status_t iree_hal_dylib_sync_driver_module_register(
+    iree_hal_driver_registry_t* registry);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DYLIB_REGISTRATION_DRIVER_MODULE_SYNC_H_
diff --git a/runtime/src/iree/hal/event.c b/runtime/src/iree/hal/event.c
new file mode 100644
index 0000000..95bda1f
--- /dev/null
+++ b/runtime/src/iree/hal/event.c
@@ -0,0 +1,31 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/event.h"
+
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/device.h"
+#include "iree/hal/resource.h"
+
+#define _VTABLE_DISPATCH(event, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(event, iree_hal_event, method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(event);
+
+IREE_API_EXPORT iree_status_t
+iree_hal_event_create(iree_hal_device_t* device, iree_hal_event_t** out_event) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(out_event);
+  *out_event = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = IREE_HAL_VTABLE_DISPATCH(
+      device, iree_hal_device, create_event)(device, out_event);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/event.h b/runtime/src/iree/hal/event.h
new file mode 100644
index 0000000..a6ea312
--- /dev/null
+++ b/runtime/src/iree/hal/event.h
@@ -0,0 +1,64 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_EVENT_H_
+#define IREE_HAL_EVENT_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_device_t iree_hal_device_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_event_t
+//===----------------------------------------------------------------------===//
+
+// Events are used for defining synchronization scopes within command buffers.
+// An event only exists within a single CommandBuffer and must not be used
+// across command buffers from the same device or others.
+//
+// See iree_hal_command_buffer_signal_event and
+// iree_hal_command_buffer_wait_events for more info.
+//
+// Maps to VkEvent:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkEvent.html
+typedef struct iree_hal_event_t iree_hal_event_t;
+
+// Creates an event for recording into command buffers.
+// The returned event object is only usable with this device and events must
+// only be used to synchronize within the same queue.
+IREE_API_EXPORT iree_status_t
+iree_hal_event_create(iree_hal_device_t* device, iree_hal_event_t** out_event);
+
+// Retains the given |event| for the caller.
+IREE_API_EXPORT void iree_hal_event_retain(iree_hal_event_t* event);
+
+// Releases the given |event| from the caller.
+IREE_API_EXPORT void iree_hal_event_release(iree_hal_event_t* event);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_event_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_event_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_event_t* event);
+} iree_hal_event_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_event_vtable_t);
+
+IREE_API_EXPORT void iree_hal_event_destroy(iree_hal_event_t* event);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_EVENT_H_
diff --git a/runtime/src/iree/hal/executable.c b/runtime/src/iree/hal/executable.c
new file mode 100644
index 0000000..00a7c9b
--- /dev/null
+++ b/runtime/src/iree/hal/executable.c
@@ -0,0 +1,15 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/executable.h"
+
+#include "iree/hal/detail.h"
+#include "iree/hal/resource.h"
+
+#define _VTABLE_DISPATCH(executable, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(executable, iree_hal_executable, method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(executable);
diff --git a/runtime/src/iree/hal/executable.h b/runtime/src/iree/hal/executable.h
new file mode 100644
index 0000000..561ed3a
--- /dev/null
+++ b/runtime/src/iree/hal/executable.h
@@ -0,0 +1,65 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_EXECUTABLE_H_
+#define IREE_HAL_EXECUTABLE_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_device_t iree_hal_device_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_t
+//===----------------------------------------------------------------------===//
+
+// Handle to a loaded executable.
+// Loading of executables routes through an executable cache, allowing for
+// context-aware scoped caches. HAL implementations can use this to preserve
+// JIT'ed executables across processes or reuse executables across device
+// instances.
+//
+// Executables provide one or more entry points that can be dispatched via
+// iree_hal_command_buffer_dispatch. Some entry points may represent the same
+// computation but specialized in different ways such that the runtime can
+// switch strategies and choose between them per-dispatch.
+//
+//
+// Maps (roughly) to vkShaderModule + VkPipeline[].
+typedef struct iree_hal_executable_t iree_hal_executable_t;
+
+// Retains the given |executable| for the caller.
+IREE_API_EXPORT void iree_hal_executable_retain(
+    iree_hal_executable_t* executable);
+
+// Releases the given |executable| from the caller.
+IREE_API_EXPORT void iree_hal_executable_release(
+    iree_hal_executable_t* executable);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_executable_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_executable_t* executable);
+} iree_hal_executable_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_executable_vtable_t);
+
+IREE_API_EXPORT void iree_hal_executable_destroy(
+    iree_hal_executable_t* executable);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_EXECUTABLE_H_
diff --git a/runtime/src/iree/hal/executable_cache.c b/runtime/src/iree/hal/executable_cache.c
new file mode 100644
index 0000000..73bd9bb
--- /dev/null
+++ b/runtime/src/iree/hal/executable_cache.c
@@ -0,0 +1,69 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/executable_cache.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/device.h"
+#include "iree/hal/resource.h"
+
+void iree_hal_executable_params_initialize(
+    iree_hal_executable_params_t* out_executable_params) {
+  memset(out_executable_params, 0, sizeof(*out_executable_params));
+  out_executable_params->caching_mode =
+      IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_PERSISTENT_CACHING |
+      IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION;
+}
+
+#define _VTABLE_DISPATCH(executable_cache, method_name)                 \
+  IREE_HAL_VTABLE_DISPATCH(executable_cache, iree_hal_executable_cache, \
+                           method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(executable_cache);
+
+IREE_API_EXPORT iree_status_t iree_hal_executable_cache_create(
+    iree_hal_device_t* device, iree_string_view_t identifier, iree_loop_t loop,
+    iree_hal_executable_cache_t** out_executable_cache) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(out_executable_cache);
+  *out_executable_cache = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = IREE_HAL_VTABLE_DISPATCH(device, iree_hal_device,
+                                                  create_executable_cache)(
+      device, identifier, loop, out_executable_cache);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT bool iree_hal_executable_cache_can_prepare_format(
+    iree_hal_executable_cache_t* executable_cache,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  IREE_ASSERT_ARGUMENT(executable_cache);
+  return _VTABLE_DISPATCH(executable_cache, can_prepare_format)(
+      executable_cache, caching_mode, executable_format);
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_executable_cache_prepare_executable(
+    iree_hal_executable_cache_t* executable_cache,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  IREE_ASSERT_ARGUMENT(executable_cache);
+  IREE_ASSERT_ARGUMENT(executable_params);
+  IREE_ASSERT_ARGUMENT(!executable_params->executable_layout_count ||
+                       executable_params->executable_layouts);
+  IREE_ASSERT_ARGUMENT(out_executable);
+  *out_executable = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = _VTABLE_DISPATCH(executable_cache, prepare_executable)(
+      executable_cache, executable_params, out_executable);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/executable_cache.h b/runtime/src/iree/hal/executable_cache.h
new file mode 100644
index 0000000..9fd53f0
--- /dev/null
+++ b/runtime/src/iree/hal/executable_cache.h
@@ -0,0 +1,217 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_EXECUTABLE_CACHE_H_
+#define IREE_HAL_EXECUTABLE_CACHE_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/executable.h"
+#include "iree/hal/executable_layout.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_device_t iree_hal_device_t;
+
+//===----------------------------------------------------------------------===//
+// Types and Enums
+//===----------------------------------------------------------------------===//
+
+// Defines how the executable cache performs preparation.
+enum iree_hal_executable_caching_mode_bits_t {
+  // Allows the cache to reference the provided executable_data after it has
+  // prepared the executable. Callers must ensure the data remains valid for the
+  // lifetime of the cache. If memory mapping constant executable data from
+  // disk this can be used to avoid copies.
+  IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA = 1u << 0,
+  // Allows the prepared executable to be cached persistently (on disk/etc).
+  // Enable for any executable that is likely to be used in future runs.
+  // Note that not all caches support persistent serialization and this is just
+  // a hint.
+  IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_PERSISTENT_CACHING = 1u << 1,
+  // Allows the cache to optimize the executable as much as it can.
+  // This may cause preparation to take significantly longer while (hopefully)
+  // improving runtime performance. Avoid for one-shot executables.
+  IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION = 1u << 2,
+  // Enables Executable debugging methods if supported by the device and
+  // executable. This may disable certain optimizations or retain additional
+  // data to allow disassembly, stepping, etc.
+  //
+  // Device must support the IREE_HAL_DEVICE_FEATURE_SUPPORTS_DEBUGGING feature
+  // and executables must support the ExecutableFeature::kDebugging feature.
+  IREE_HAL_EXECUTABLE_CACHING_MODE_ENABLE_DEBUGGING = 1u << 3,
+  // Enables Executable coverage if supported by the device and executable.
+  // Depending on the optimization mode this may produce partial coverage
+  // results (for example, when certain source operations were optimized away).
+  //
+  // Device must support the IREE_HAL_DEVICE_FEATURE_SUPPORTS_COVERAGE feature
+  // and executables must support the ExecutableFeature::kCoverage feature.
+  IREE_HAL_EXECUTABLE_CACHING_MODE_ENABLE_COVERAGE = 1u << 4,
+  // Enables Executable profiling if supported by the device and executable.
+  // Depending on the optimization mode this may produce partial profiling
+  // results. Profiling attribution (whether to the entire executable or
+  // specific operations) depends on the implementation.
+  //
+  // Device must support the IREE_HAL_DEVICE_FEATURE_SUPPORTS_PROFILING feature
+  // and executables must support the ExecutableFeature::kProfiling feature.
+  IREE_HAL_EXECUTABLE_CACHING_MODE_ENABLE_PROFILING = 1u << 5,
+  // Disables verification of executable layouts and modes.
+  // This is useful when debugging with partial information but should never
+  // be enabled for real usage as the verification is the best way to catch
+  // API misuse.
+  IREE_HAL_EXECUTABLE_CACHING_MODE_DISABLE_VERIFICATION = 1u << 6,
+};
+typedef uint32_t iree_hal_executable_caching_mode_t;
+
+// Defines an executable compilation specification.
+typedef struct iree_hal_executable_params_t {
+  // Specifies what caching the executable cache is allowed to perform and
+  // (if supported) which transformations on the executable contents are
+  // allowed.
+  iree_hal_executable_caching_mode_t caching_mode;
+
+  // Indicates the format of the data in |executable_data|.
+  iree_string_view_t executable_format;
+
+  // Opaque compiler-generated executable data.
+  // By default the memory storing the executable data is owned by the caller
+  // and not guaranteed to live beyond the preparation call.
+  //
+  // Callers can indicate that they guarantee the lifetime of the memory
+  // outlives the executable that will be created from it with the
+  // IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA flag, in which case
+  // the cache is allowed to retain the data for as long as there is a reference
+  // to any executable created using it still held by the caller.
+  iree_const_byte_span_t executable_data;
+
+  // A set of executable layouts for each entry point in the executable.
+  // The order matches that produced by the compiler. As multiple entry points
+  // may share the same layout some entries in this list may reference the same
+  // executable layout objects.
+  iree_host_size_t executable_layout_count;
+  iree_hal_executable_layout_t* const* executable_layouts;
+
+  // Executable-level constants table used to perform runtime specialization
+  // when information is not available statically during compilation. The
+  // compiler defines the contents of the table, how they are populated, and
+  // their usage in the executable.
+  //
+  // For targets that natively support specialization these directly map down:
+  //   Metal: function constants
+  //   WGSL: pipeline overrides
+  //   Vulkan/SPIR-V: specialization constants
+  // Other targets may present these as constant tables or uniform buffers.
+  // Since the values cannot change after initialization targets that JIT may
+  // perform substitution during initialization to inline the values
+  // immediately (via CUDA PTX linking, etc).
+  iree_host_size_t constant_count;
+  const uint32_t* constants;
+} iree_hal_executable_params_t;
+
+// Initializes |out_executable_params| to the default values for normal
+// executables. Callers must override the fields as required.
+void iree_hal_executable_params_initialize(
+    iree_hal_executable_params_t* out_executable_params);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_cache_t
+//===----------------------------------------------------------------------===//
+
+// A cache of prepared executables for a particular device.
+// Caches may be shared across multiple devices from the same driver or specific
+// to individual devices. Caches may persist prepared executables across process
+// launches or re-prepare them each run. Callers should assume that the cache is
+// a no-op and the returned Executables only live for as long as the cache does.
+//
+// The term 'cache' here is rather optimistic - it's perfectly acceptable for
+// implementations to not cache at all and return new Executables for each
+// iree_hal_executable_cache_prepare_executable called (even for the same
+// executable). Callers should expect such behavior and try to retain the
+// results of the iree_hal_executable_cache_prepare_executable calls to reduce
+// overhead in re-preparing executables.
+//
+// Thread-safe - multiple threads may prepare executables (including the *same*
+// executable) simultaneously.
+typedef struct iree_hal_executable_cache_t iree_hal_executable_cache_t;
+
+// Creates an executable cache using the given identifier.
+// The identifier is provided to the backing cache API as a way to partition
+// caches between different groups of executables (from different modules, etc).
+//
+// Any host-side work that needs to be performed will be scheduled on |loop|.
+// This enables JITs, device-specific translation, and verification to be
+// parallelized using a shared scheduler. The loop must remain valid for the
+// lifetime of the executable cache.
+IREE_API_EXPORT iree_status_t iree_hal_executable_cache_create(
+    iree_hal_device_t* device, iree_string_view_t identifier, iree_loop_t loop,
+    iree_hal_executable_cache_t** out_executable_cache);
+
+// Retains the given |executable_cache| for the caller.
+IREE_API_EXPORT void iree_hal_executable_cache_retain(
+    iree_hal_executable_cache_t* executable_cache);
+
+// Releases the given |executable_cache| from the caller.
+IREE_API_EXPORT void iree_hal_executable_cache_release(
+    iree_hal_executable_cache_t* executable_cache);
+
+// Returns true if the executable cache can prepare the given executable input
+// format. Preparation may still fail if the particular version or features
+// required by the executable are not supported.
+IREE_API_EXPORT bool iree_hal_executable_cache_can_prepare_format(
+    iree_hal_executable_cache_t* executable_cache,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format);
+
+// Prepares the executable defined by |executable_params| for use.
+// The provided |executable_data| (in a format defined by |executable_format|)
+// will be used to either lookup a previously prepared executable in the cache
+// or prepare a new one.
+//
+// Each entry point in the executable requires a corresponding value in
+// |executable_layouts| defining the layout used by the entry point. If multiple
+// entry points use the same layouts they can reuse the same values.
+//
+// Depending on the driver preparation may take a non-trivial amount of time
+// (such as when JITing/etc). As the cache is internally synchronized callers
+// can issue preparation requests from multiple threads - even for the same
+// executables - and calls will block until preparation completes.
+IREE_API_EXPORT iree_status_t iree_hal_executable_cache_prepare_executable(
+    iree_hal_executable_cache_t* executable_cache,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_cache_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_executable_cache_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_executable_cache_t* executable_cache);
+
+  bool(IREE_API_PTR* can_prepare_format)(
+      iree_hal_executable_cache_t* executable_cache,
+      iree_hal_executable_caching_mode_t caching_mode,
+      iree_string_view_t executable_format);
+
+  iree_status_t(IREE_API_PTR* prepare_executable)(
+      iree_hal_executable_cache_t* executable_cache,
+      const iree_hal_executable_params_t* executable_params,
+      iree_hal_executable_t** out_executable);
+} iree_hal_executable_cache_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_executable_cache_vtable_t);
+
+IREE_API_EXPORT void iree_hal_executable_cache_destroy(
+    iree_hal_executable_cache_t* executable_cache);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_EXECUTABLE_CACHE_H_
diff --git a/runtime/src/iree/hal/executable_layout.c b/runtime/src/iree/hal/executable_layout.c
new file mode 100644
index 0000000..5755b8a
--- /dev/null
+++ b/runtime/src/iree/hal/executable_layout.c
@@ -0,0 +1,38 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/executable_layout.h"
+
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/device.h"
+#include "iree/hal/resource.h"
+
+#define _VTABLE_DISPATCH(executable_layout, method_name)                  \
+  IREE_HAL_VTABLE_DISPATCH(executable_layout, iree_hal_executable_layout, \
+                           method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(executable_layout);
+
+IREE_API_EXPORT iree_status_t iree_hal_executable_layout_create(
+    iree_hal_device_t* device, iree_host_size_t push_constants,
+    iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_hal_executable_layout_t** out_executable_layout) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(!set_layout_count || set_layouts);
+  IREE_ASSERT_ARGUMENT(out_executable_layout);
+  *out_executable_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = IREE_HAL_VTABLE_DISPATCH(device, iree_hal_device,
+                                                  create_executable_layout)(
+      device, push_constants, set_layout_count, set_layouts,
+      out_executable_layout);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/executable_layout.h b/runtime/src/iree/hal/executable_layout.h
new file mode 100644
index 0000000..7fa1a21
--- /dev/null
+++ b/runtime/src/iree/hal/executable_layout.h
@@ -0,0 +1,78 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_EXECUTABLE_LAYOUT_H_
+#define IREE_HAL_EXECUTABLE_LAYOUT_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/descriptor_set_layout.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_device_t iree_hal_device_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_layout_t
+//===----------------------------------------------------------------------===//
+
+// Defines the resource binding layout used by an executable.
+// A "descriptor" is effectively a bound memory range and each dispatch can use
+// one or more "descriptor sets" to access their I/O memory. A "descriptor set
+// layout" defines the types and usage semantics of the descriptors that make up
+// one set. An "executable layout" defines all of the set layouts that will be
+// used when dispatching. Implementations can use this to verify program
+// correctness and accelerate reservation/allocation/computation of
+// descriptor-related operations.
+//
+// Executables can share the same layout even if they do not use all of the
+// resources referenced by descriptor sets referenced by the layout. Doing so
+// allows for more efficient binding as bound descriptor sets can be reused when
+// command buffer executable bindings change.
+//
+// Maps to VkPipelineLayout:
+// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkPipelineLayout.html
+typedef struct iree_hal_executable_layout_t iree_hal_executable_layout_t;
+
+// Creates an executable layout composed of the given descriptor set layouts.
+// The returned executable layout can be used by multiple executables with the
+// same compatible resource binding layouts.
+IREE_API_EXPORT iree_status_t iree_hal_executable_layout_create(
+    iree_hal_device_t* device, iree_host_size_t push_constants,
+    iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_hal_executable_layout_t** out_executable_layout);
+
+// Retains the given |executable_layout| for the caller.
+IREE_API_EXPORT void iree_hal_executable_layout_retain(
+    iree_hal_executable_layout_t* executable_layout);
+
+// Releases the given |executable_layout| from the caller.
+IREE_API_EXPORT void iree_hal_executable_layout_release(
+    iree_hal_executable_layout_t* executable_layout);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_layout_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_executable_layout_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_executable_layout_t* executable_layout);
+} iree_hal_executable_layout_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_executable_layout_vtable_t);
+
+IREE_API_EXPORT void iree_hal_executable_layout_destroy(
+    iree_hal_executable_layout_t* executable_layout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_EXECUTABLE_LAYOUT_H_
diff --git a/runtime/src/iree/hal/local/BUILD b/runtime/src/iree/hal/local/BUILD
new file mode 100644
index 0000000..c2927ef
--- /dev/null
+++ b/runtime/src/iree/hal/local/BUILD
@@ -0,0 +1,181 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Default implementations for HAL types that use the host resources.
+# These are generally just wrappers around host heap memory and host threads.
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library", "iree_runtime_cc_test")
+load("//build_tools/bazel:cc_binary_benchmark.bzl", "cc_binary_benchmark")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_runtime_cc_library(
+    name = "executable_environment",
+    srcs = ["executable_environment.c"],
+    hdrs = ["executable_environment.h"],
+    deps = [
+        ":executable_library",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal:cpu",
+        "//runtime/src/iree/hal",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "executable_library",
+    hdrs = ["executable_library.h"],
+)
+
+cc_binary_benchmark(
+    name = "executable_library_benchmark",
+    srcs = ["executable_library_benchmark.c"],
+    deps = [
+        ":executable_environment",
+        ":executable_library",
+        ":local",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal:file_io",
+        "//runtime/src/iree/base/internal:flags",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local/loaders:embedded_library_loader",
+        "//runtime/src/iree/testing:benchmark",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "executable_library_test",
+    srcs = [
+        "executable_library_demo.c",
+        "executable_library_demo.h",
+        "executable_library_test.c",
+    ],
+    deps = [
+        ":executable_environment",
+        ":executable_library",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "local",
+    srcs = [
+        "executable_loader.c",
+        "inline_command_buffer.c",
+        "local_descriptor_set.c",
+        "local_descriptor_set_layout.c",
+        "local_executable.c",
+        "local_executable_cache.c",
+        "local_executable_layout.c",
+    ],
+    hdrs = [
+        "executable_loader.h",
+        "inline_command_buffer.h",
+        "local_descriptor_set.h",
+        "local_descriptor_set_layout.h",
+        "local_executable.h",
+        "local_executable_cache.h",
+        "local_executable_layout.h",
+    ],
+    deps = [
+        ":executable_environment",
+        ":executable_library",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:fpu_state",
+        "//runtime/src/iree/hal",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "sync_driver",
+    srcs = [
+        "sync_device.c",
+        "sync_driver.c",
+        "sync_event.c",
+        "sync_semaphore.c",
+    ],
+    hdrs = [
+        "sync_device.h",
+        "sync_driver.h",
+        "sync_event.h",
+        "sync_semaphore.h",
+    ],
+    deps = [
+        ":local",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:arena",
+        "//runtime/src/iree/base/internal:synchronization",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/utils:buffer_transfer",
+    ],
+)
+
+#===------------------------------------------------------------------------===#
+# Thread dependent packages
+#===------------------------------------------------------------------------===#
+
+iree_cmake_extra_content(
+    content = """
+# task_driver is used by asynchronous drivers.
+# TODO(scotttodd): refactor this - code depending on threading should be
+#   possible to declare in the build system but conditionally link in
+if(NOT EMSCRIPTEN AND NOT (${IREE_HAL_DRIVER_DYLIB} OR ${IREE_HAL_DRIVER_VMVX}))
+  return()
+endif()
+""",
+    inline = True,
+)
+
+iree_runtime_cc_library(
+    name = "task_driver",
+    srcs = [
+        "task_command_buffer.c",
+        "task_device.c",
+        "task_driver.c",
+        "task_event.c",
+        "task_queue.c",
+        "task_queue_state.c",
+        "task_semaphore.c",
+    ],
+    hdrs = [
+        "task_command_buffer.h",
+        "task_device.h",
+        "task_driver.h",
+        "task_event.h",
+        "task_queue.h",
+        "task_queue_state.h",
+        "task_semaphore.h",
+    ],
+    deps = [
+        ":executable_environment",
+        ":executable_library",
+        ":local",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:arena",
+        "//runtime/src/iree/base/internal:event_pool",
+        "//runtime/src/iree/base/internal:synchronization",
+        "//runtime/src/iree/base/internal:wait_handle",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/utils:buffer_transfer",
+        "//runtime/src/iree/hal/utils:resource_set",
+        "//runtime/src/iree/task",
+    ],
+)
diff --git a/runtime/src/iree/hal/local/CMakeLists.txt b/runtime/src/iree/hal/local/CMakeLists.txt
new file mode 100644
index 0000000..693f722
--- /dev/null
+++ b/runtime/src/iree/hal/local/CMakeLists.txt
@@ -0,0 +1,174 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/local/BUILD                                             #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    executable_environment
+  HDRS
+    "executable_environment.h"
+  SRCS
+    "executable_environment.c"
+  DEPS
+    ::executable_library
+    iree::base
+    iree::base::internal::cpu
+    iree::base::tracing
+    iree::hal
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    executable_library
+  HDRS
+    "executable_library.h"
+  DEPS
+
+  PUBLIC
+)
+
+iree_cc_binary_benchmark(
+  NAME
+    executable_library_benchmark
+  SRCS
+    "executable_library_benchmark.c"
+  DEPS
+    ::executable_environment
+    ::executable_library
+    ::local
+    iree::base
+    iree::base::internal::file_io
+    iree::base::internal::flags
+    iree::base::tracing
+    iree::hal
+    iree::hal::local::loaders::embedded_library_loader
+    iree::testing::benchmark
+  TESTONLY
+)
+
+iree_cc_test(
+  NAME
+    executable_library_test
+  SRCS
+    "executable_library_demo.c"
+    "executable_library_demo.h"
+    "executable_library_test.c"
+  DEPS
+    ::executable_environment
+    ::executable_library
+    iree::base
+    iree::base::core_headers
+)
+
+iree_cc_library(
+  NAME
+    local
+  HDRS
+    "executable_loader.h"
+    "inline_command_buffer.h"
+    "local_descriptor_set.h"
+    "local_descriptor_set_layout.h"
+    "local_executable.h"
+    "local_executable_cache.h"
+    "local_executable_layout.h"
+  SRCS
+    "executable_loader.c"
+    "inline_command_buffer.c"
+    "local_descriptor_set.c"
+    "local_descriptor_set_layout.c"
+    "local_executable.c"
+    "local_executable_cache.c"
+    "local_executable_layout.c"
+  DEPS
+    ::executable_environment
+    ::executable_library
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::fpu_state
+    iree::base::tracing
+    iree::hal
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    sync_driver
+  HDRS
+    "sync_device.h"
+    "sync_driver.h"
+    "sync_event.h"
+    "sync_semaphore.h"
+  SRCS
+    "sync_device.c"
+    "sync_driver.c"
+    "sync_event.c"
+    "sync_semaphore.c"
+  DEPS
+    ::local
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::arena
+    iree::base::internal::synchronization
+    iree::base::tracing
+    iree::hal
+    iree::hal::utils::buffer_transfer
+  PUBLIC
+)
+
+# task_driver is used by asynchronous drivers.
+# TODO(scotttodd): refactor this - code depending on threading should be
+#   possible to declare in the build system but conditionally link in
+if(NOT EMSCRIPTEN AND NOT (${IREE_HAL_DRIVER_DYLIB} OR ${IREE_HAL_DRIVER_VMVX}))
+  return()
+endif()
+
+iree_cc_library(
+  NAME
+    task_driver
+  HDRS
+    "task_command_buffer.h"
+    "task_device.h"
+    "task_driver.h"
+    "task_event.h"
+    "task_queue.h"
+    "task_queue_state.h"
+    "task_semaphore.h"
+  SRCS
+    "task_command_buffer.c"
+    "task_device.c"
+    "task_driver.c"
+    "task_event.c"
+    "task_queue.c"
+    "task_queue_state.c"
+    "task_semaphore.c"
+  DEPS
+    ::executable_environment
+    ::executable_library
+    ::local
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::arena
+    iree::base::internal::event_pool
+    iree::base::internal::synchronization
+    iree::base::internal::wait_handle
+    iree::base::tracing
+    iree::hal
+    iree::hal::utils::buffer_transfer
+    iree::hal::utils::resource_set
+    iree::task
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/local/elf/BUILD b/runtime/src/iree/hal/local/elf/BUILD
new file mode 100644
index 0000000..f5400f9
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/BUILD
@@ -0,0 +1,96 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/bazel:native_binary.bzl", "native_test")
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+#===------------------------------------------------------------------------===#
+# Runtime ELF module loader/linker
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "elf_module",
+    srcs = [
+        "elf_module.c",
+    ],
+    hdrs = [
+        "elf_module.h",
+        "elf_types.h",
+    ],
+    deps = [
+        ":arch",
+        ":platform",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+cc_binary(
+    name = "elf_module_test_binary",
+    srcs = ["elf_module_test_main.c"],
+    deps = [
+        ":elf_module",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/hal/local:executable_environment",
+        "//runtime/src/iree/hal/local:executable_library",
+        "//runtime/src/iree/hal/local/elf/testdata:elementwise_mul",
+    ],
+)
+
+native_test(
+    name = "elf_module_test",
+    src = ":elf_module_test_binary",
+)
+
+#===------------------------------------------------------------------------===#
+# Architecture and platform support
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "arch",
+    srcs = [
+        "arch/arm_32.c",
+        "arch/arm_64.c",
+        "arch/riscv.c",
+        "arch/x86_32.c",
+        "arch/x86_64.c",
+        "elf_types.h",
+    ],
+    hdrs = [
+        "arch.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "platform",
+    srcs = [
+        "platform/apple.c",
+        "platform/generic.c",
+        "platform/linux.c",
+        "platform/windows.c",
+    ],
+    hdrs = [
+        "platform.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+    ],
+)
diff --git a/runtime/src/iree/hal/local/elf/CMakeLists.txt b/runtime/src/iree/hal/local/elf/CMakeLists.txt
new file mode 100644
index 0000000..67e67b2
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/CMakeLists.txt
@@ -0,0 +1,107 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/local/elf/BUILD                                         #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    elf_module
+  HDRS
+    "elf_module.h"
+    "elf_types.h"
+  SRCS
+    "elf_module.c"
+  DEPS
+    ::arch
+    ::platform
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_binary(
+  NAME
+    elf_module_test_binary
+  SRCS
+    "elf_module_test_main.c"
+  DEPS
+    ::elf_module
+    iree::base
+    iree::base::core_headers
+    iree::hal::local::elf::testdata::elementwise_mul
+    iree::hal::local::executable_environment
+    iree::hal::local::executable_library
+)
+
+iree_native_test(
+  NAME
+    "elf_module_test"
+  SRC
+    ::elf_module_test_binary
+)
+
+iree_cc_library(
+  NAME
+    arch
+  HDRS
+    "arch.h"
+  SRCS
+    "arch/arm_32.c"
+    "arch/arm_64.c"
+    "arch/riscv.c"
+    "arch/x86_32.c"
+    "arch/x86_64.c"
+    "elf_types.h"
+  DEPS
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    platform
+  HDRS
+    "platform.h"
+  SRCS
+    "platform/apple.c"
+    "platform/generic.c"
+    "platform/linux.c"
+    "platform/windows.c"
+  DEPS
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+
+# TODO(*): figure out how to make this work on Bazel+Windows.
+if(${MSVC})
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+    set_source_files_properties(
+      arch/x86_64_msvc.asm
+      PROPERTIES
+      LANGUAGE ASM_MASM
+    )
+    # CMake + MASM does not work well and CMake ends up passing all our C/C++
+    # flags confusing MASM. We invoke MASM directly (ml64.exe) to keep it quiet.
+    target_sources(iree_hal_local_elf_arch PRIVATE "arch/x86_64_msvc.obj")
+    add_custom_command(
+      OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/arch/x86_64_msvc.obj
+      DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/arch/x86_64_msvc.asm
+      COMMAND ml64 /nologo /Zi /c /Fo ${CMAKE_CURRENT_BINARY_DIR}/arch/x86_64_msvc.obj ${CMAKE_CURRENT_SOURCE_DIR}/arch/x86_64_msvc.asm
+      VERBATIM
+    )
+  endif()
+endif()
diff --git a/runtime/src/iree/hal/local/elf/arch.h b/runtime/src/iree/hal/local/elf/arch.h
new file mode 100644
index 0000000..3933c95
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/arch.h
@@ -0,0 +1,65 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_ELF_ARCH_H_
+#define IREE_HAL_LOCAL_ELF_ARCH_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/local/elf/elf_types.h"
+
+//==============================================================================
+// ELF machine type/ABI
+//==============================================================================
+
+// Returns true if the reported ELF machine specification is valid.
+bool iree_elf_arch_is_valid(const iree_elf_ehdr_t* ehdr);
+
+//==============================================================================
+// ELF relocations
+//==============================================================================
+
+// State used during relocation.
+typedef struct iree_elf_relocation_state_t {
+  // Bias applied to all relative addresses (from the string table, etc) in the
+  // loaded module. This is an offset from the vaddr_base that may not be 0 if
+  // host page granularity was larger than the ELF's defined granularity.
+  uint8_t* vaddr_bias;
+
+  // PT_DYNAMIC table.
+  iree_host_size_t dyn_table_count;
+  const iree_elf_dyn_t* dyn_table;
+} iree_elf_relocation_state_t;
+
+// Applies architecture-specific relocations.
+iree_status_t iree_elf_arch_apply_relocations(
+    iree_elf_relocation_state_t* state);
+
+//==============================================================================
+// Cross-ABI function calls
+//==============================================================================
+
+// TODO(benvanik): add thunk functions (iree_elf_thunk_*) to be used by imports
+// for marshaling from linux ABI in the ELF to host ABI.
+
+// Host -> ELF: void(*)(void)
+void iree_elf_call_v_v(const void* symbol_ptr);
+
+// Host -> ELF: void*(*)(int)
+void* iree_elf_call_p_i(const void* symbol_ptr, int a0);
+
+// Host -> ELF: void*(*)(int, void*)
+void* iree_elf_call_p_ip(const void* symbol_ptr, int a0, void* a1);
+
+// Host -> ELF: int(*)(void*)
+int iree_elf_call_i_p(const void* symbol_ptr, void* a0);
+
+// Host -> ELF: int(*)(void*, void*, void*)
+int iree_elf_call_i_ppp(const void* symbol_ptr, void* a0, void* a1, void* a2);
+
+// ELF -> Host: int(*)(void*)
+int iree_elf_thunk_i_p(const void* symbol_ptr, void* a0);
+
+#endif  // IREE_HAL_LOCAL_ELF_ARCH_H_
diff --git a/runtime/src/iree/hal/local/elf/arch/arm_32.c b/runtime/src/iree/hal/local/elf/arch/arm_32.c
new file mode 100644
index 0000000..4044fbf
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/arch/arm_32.c
@@ -0,0 +1,152 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/hal/local/elf/arch.h"
+#include "iree/hal/local/elf/elf_types.h"
+
+#if defined(IREE_ARCH_ARM_32)
+
+// Documentation:
+// https://developer.arm.com/documentation/ihi0044/h/
+
+//==============================================================================
+// ELF machine type/ABI
+//==============================================================================
+
+bool iree_elf_arch_is_valid(const iree_elf_ehdr_t* ehdr) {
+  return ehdr->e_machine == 0x28;  // EM_ARM / 40
+}
+
+//==============================================================================
+// ELF relocations
+//==============================================================================
+
+enum {
+  IREE_ELF_R_ARM_NONE = 0,
+  IREE_ELF_R_ARM_ABS32 = 2,
+  IREE_ELF_R_ARM_REL32 = 3,
+  IREE_ELF_R_ARM_GLOB_DAT = 21,
+  IREE_ELF_R_ARM_JUMP_SLOT = 22,
+  IREE_ELF_R_ARM_RELATIVE = 23,
+};
+
+static iree_status_t iree_elf_arch_arm_apply_rel(
+    iree_elf_relocation_state_t* state, iree_host_size_t rel_count,
+    const iree_elf_rel_t* rel_table) {
+  for (iree_host_size_t i = 0; i < rel_count; ++i) {
+    const iree_elf_rel_t* rel = &rel_table[i];
+    uint32_t type = IREE_ELF_R_TYPE(rel->r_info);
+    if (type == 0) continue;
+
+    // TODO(benvanik): support imports by resolving from the import table.
+    iree_elf_addr_t sym_addr = 0;
+    if (IREE_ELF_R_SYM(rel->r_info) != 0) {
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "symbol-relative relocations not implemented");
+    }
+
+    iree_elf_addr_t instr_ptr =
+        (iree_elf_addr_t)state->vaddr_bias + rel->r_offset;
+    switch (type) {
+      case IREE_ELF_R_ARM_NONE:
+        break;
+      case IREE_ELF_R_ARM_ABS32:
+        *(uint32_t*)instr_ptr += (uint32_t)sym_addr;
+        break;
+      case IREE_ELF_R_ARM_REL32:
+        *(uint32_t*)instr_ptr += (uint32_t)sym_addr - rel->r_offset;
+        break;
+      case IREE_ELF_R_ARM_GLOB_DAT:
+      case IREE_ELF_R_ARM_JUMP_SLOT:
+        *(uint32_t*)instr_ptr = (uint32_t)sym_addr;
+        break;
+      case IREE_ELF_R_ARM_RELATIVE:
+        *(uint32_t*)instr_ptr += (uint32_t)state->vaddr_bias;
+        break;
+      default:
+        return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                "unimplemented ARM relocation type %08X", type);
+    }
+  }
+  return iree_ok_status();
+}
+
+iree_status_t iree_elf_arch_apply_relocations(
+    iree_elf_relocation_state_t* state) {
+  // Gather the relevant relocation tables.
+  iree_host_size_t rel_count = 0;
+  const iree_elf_rel_t* rel_table = NULL;
+  for (iree_host_size_t i = 0; i < state->dyn_table_count; ++i) {
+    const iree_elf_dyn_t* dyn = &state->dyn_table[i];
+    switch (dyn->d_tag) {
+      case IREE_ELF_DT_REL:
+        rel_table =
+            (const iree_elf_rel_t*)(state->vaddr_bias + dyn->d_un.d_ptr);
+        break;
+      case IREE_ELF_DT_RELSZ:
+        rel_count = dyn->d_un.d_val / sizeof(iree_elf_rel_t);
+        break;
+
+      case IREE_ELF_DT_RELA:
+      case IREE_ELF_DT_RELASZ:
+        return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                "unsupported DT_RELA relocations");
+      default:
+        // Ignored.
+        break;
+    }
+  }
+  if (!rel_table) rel_count = 0;
+
+  if (rel_count > 0) {
+    IREE_RETURN_IF_ERROR(
+        iree_elf_arch_arm_apply_rel(state, rel_count, rel_table));
+  }
+
+  return iree_ok_status();
+}
+
+//==============================================================================
+// Cross-ABI function calls
+//==============================================================================
+
+void iree_elf_call_v_v(const void* symbol_ptr) {
+  typedef void (*ptr_t)(void);
+  ((ptr_t)symbol_ptr)();
+}
+
+void* iree_elf_call_p_i(const void* symbol_ptr, int a0) {
+  typedef void* (*ptr_t)(int);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+void* iree_elf_call_p_ip(const void* symbol_ptr, int a0, void* a1) {
+  typedef void* (*ptr_t)(int, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1);
+}
+
+int iree_elf_call_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+int iree_elf_call_i_ppp(const void* symbol_ptr, void* a0, void* a1, void* a2) {
+  typedef int (*ptr_t)(void*, void*, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1, a2);
+}
+
+int iree_elf_thunk_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+#endif  // IREE_ARCH_ARM_32
diff --git a/runtime/src/iree/hal/local/elf/arch/arm_64.c b/runtime/src/iree/hal/local/elf/arch/arm_64.c
new file mode 100644
index 0000000..cc8398a
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/arch/arm_64.c
@@ -0,0 +1,149 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/hal/local/elf/arch.h"
+#include "iree/hal/local/elf/elf_types.h"
+
+#if defined(IREE_ARCH_ARM_64)
+
+// Documentation:
+// https://developer.arm.com/documentation/ihi0056/g/
+
+//==============================================================================
+// ELF machine type/ABI
+//==============================================================================
+
+bool iree_elf_arch_is_valid(const iree_elf_ehdr_t* ehdr) {
+  return ehdr->e_machine == 0xB7;  // EM_AARCH64 / 183
+}
+
+//==============================================================================
+// ELF relocations
+//==============================================================================
+
+enum {
+  IREE_ELF_R_AARCH64_NONE = 0,
+  IREE_ELF_R_AARCH64_ABS64 = 257,
+  IREE_ELF_R_AARCH64_GLOB_DAT = 1025,   // S + A
+  IREE_ELF_R_AARCH64_JUMP_SLOT = 1026,  // S + A
+  IREE_ELF_R_AARCH64_RELATIVE = 1027,   // Delta(S) + A
+};
+
+static iree_status_t iree_elf_arch_aarch64_apply_rela(
+    iree_elf_relocation_state_t* state, iree_host_size_t rela_count,
+    const iree_elf_rela_t* rela_table) {
+  for (iree_host_size_t i = 0; i < rela_count; ++i) {
+    const iree_elf_rela_t* rela = &rela_table[i];
+    uint32_t type = IREE_ELF_R_TYPE(rela->r_info);
+    if (type == 0) continue;
+
+    // TODO(benvanik): support imports by resolving from the import table.
+    iree_elf_addr_t sym_addr = 0;
+    if (IREE_ELF_R_SYM(rela->r_info) != 0) {
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "symbol-relative relocations not implemented");
+    }
+
+    iree_elf_addr_t instr_ptr =
+        (iree_elf_addr_t)state->vaddr_bias + rela->r_offset;
+    switch (type) {
+      case IREE_ELF_R_AARCH64_NONE:
+        break;
+      case IREE_ELF_R_AARCH64_ABS64:
+        *(uint64_t*)instr_ptr += (uint64_t)(sym_addr + rela->r_addend);
+        break;
+      case IREE_ELF_R_AARCH64_GLOB_DAT:
+      case IREE_ELF_R_AARCH64_JUMP_SLOT:
+        *(uint64_t*)instr_ptr = (uint64_t)(sym_addr + rela->r_addend);
+        break;
+      case IREE_ELF_R_AARCH64_RELATIVE:
+        *(uint64_t*)instr_ptr = (uint64_t)(state->vaddr_bias + rela->r_addend);
+        break;
+      default:
+        return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                "unimplemented aarch64 relocation type %08X",
+                                type);
+    }
+  }
+  return iree_ok_status();
+}
+
+iree_status_t iree_elf_arch_apply_relocations(
+    iree_elf_relocation_state_t* state) {
+  // Gather the relevant relocation tables.
+  iree_host_size_t rela_count = 0;
+  const iree_elf_rela_t* rela_table = NULL;
+  for (iree_host_size_t i = 0; i < state->dyn_table_count; ++i) {
+    const iree_elf_dyn_t* dyn = &state->dyn_table[i];
+    switch (dyn->d_tag) {
+      case IREE_ELF_DT_RELA:
+        rela_table =
+            (const iree_elf_rela_t*)(state->vaddr_bias + dyn->d_un.d_ptr);
+        break;
+      case IREE_ELF_DT_RELASZ:
+        rela_count = dyn->d_un.d_val / sizeof(iree_elf_rela_t);
+        break;
+
+      case IREE_ELF_DT_REL:
+      case IREE_ELF_DT_RELSZ:
+        return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                "unsupported DT_REL relocations");
+      default:
+        // Ignored.
+        break;
+    }
+  }
+  if (!rela_table) rela_count = 0;
+
+  if (rela_count > 0) {
+    IREE_RETURN_IF_ERROR(
+        iree_elf_arch_aarch64_apply_rela(state, rela_count, rela_table));
+  }
+
+  return iree_ok_status();
+}
+
+//==============================================================================
+// Cross-ABI function calls
+//==============================================================================
+
+void iree_elf_call_v_v(const void* symbol_ptr) {
+  typedef void (*ptr_t)(void);
+  ((ptr_t)symbol_ptr)();
+}
+
+void* iree_elf_call_p_i(const void* symbol_ptr, int a0) {
+  typedef void* (*ptr_t)(int);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+void* iree_elf_call_p_ip(const void* symbol_ptr, int a0, void* a1) {
+  typedef void* (*ptr_t)(int, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1);
+}
+
+int iree_elf_call_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+int iree_elf_call_i_ppp(const void* symbol_ptr, void* a0, void* a1, void* a2) {
+  typedef int (*ptr_t)(void*, void*, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1, a2);
+}
+
+int iree_elf_thunk_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+#endif  // IREE_ARCH_ARM_64
diff --git a/runtime/src/iree/hal/local/elf/arch/riscv.c b/runtime/src/iree/hal/local/elf/arch/riscv.c
new file mode 100644
index 0000000..807b62d
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/arch/riscv.c
@@ -0,0 +1,192 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/hal/local/elf/arch.h"
+#include "iree/hal/local/elf/elf_types.h"
+
+#if defined(IREE_ARCH_RISCV_32) || defined(IREE_ARCH_RISCV_64)
+
+// Documentation:
+// https://github.com/riscv/riscv-elf-psabi-doc/blob/master/riscv-elf.md
+
+//==============================================================================
+// ELF machine type/ABI
+//==============================================================================
+
+bool iree_elf_arch_is_valid(const iree_elf_ehdr_t* ehdr) {
+  return ehdr->e_machine == 0xF3;  // EM_RISCV / 243
+}
+
+//==============================================================================
+// ELF relocations
+//==============================================================================
+
+enum {
+  IREE_ELF_R_RISCV_NONE = 0,
+  IREE_ELF_R_RISCV_32 = 1,
+  IREE_ELF_R_RISCV_64 = 2,
+  IREE_ELF_R_RISCV_RELATIVE = 3,
+  IREE_ELF_R_RISCV_COPY = 4,
+  IREE_ELF_R_RISCV_JUMP_SLOT = 5,
+};
+
+#if defined(IREE_ARCH_RISCV_32)
+static iree_status_t iree_elf_arch_riscv_apply_rela(
+    iree_elf_relocation_state_t* state, iree_host_size_t rela_count,
+    const iree_elf_rela_t* rela_table) {
+  for (iree_host_size_t i = 0; i < rela_count; ++i) {
+    const iree_elf_rela_t* rela = &rela_table[i];
+    uint32_t type = IREE_ELF_R_TYPE(rela->r_info);
+    if (type == 0) continue;
+
+    // TODO(benvanik): support imports by resolving from the import table.
+    iree_elf_addr_t sym_addr = 0;
+    if (IREE_ELF_R_SYM(rela->r_info) != 0) {
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "symbol-relative relocations not implemented");
+    }
+
+    iree_elf_addr_t instr_ptr =
+        (iree_elf_addr_t)state->vaddr_bias + rela->r_offset;
+    switch (type) {
+      case IREE_ELF_R_RISCV_NONE:
+        break;
+      case IREE_ELF_R_RISCV_32:
+        *(uint32_t*)instr_ptr = (uint32_t)(sym_addr + rela->r_addend);
+        break;
+      case IREE_ELF_R_RISCV_JUMP_SLOT:
+        *(uint32_t*)instr_ptr = (uint32_t)sym_addr;
+        break;
+      case IREE_ELF_R_RISCV_RELATIVE:
+        *(uint32_t*)instr_ptr = (uint32_t)(state->vaddr_bias + rela->r_addend);
+        break;
+      default:
+        return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                "unimplemented riscv32 relocation type %08X",
+                                type);
+    }
+  }
+  return iree_ok_status();
+}
+#else   // IREE_ARCH_RISCV_64
+static iree_status_t iree_elf_arch_riscv_apply_rela(
+    iree_elf_relocation_state_t* state, iree_host_size_t rela_count,
+    const iree_elf_rela_t* rela_table) {
+  for (iree_host_size_t i = 0; i < rela_count; ++i) {
+    const iree_elf_rela_t* rela = &rela_table[i];
+    uint32_t type = IREE_ELF_R_TYPE(rela->r_info);
+    if (type == 0) continue;
+
+    // TODO(benvanik): support imports by resolving from the import table.
+    iree_elf_addr_t sym_addr = 0;
+    if (IREE_ELF_R_SYM(rela->r_info) != 0) {
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "symbol-relative relocations not implemented");
+    }
+
+    iree_elf_addr_t instr_ptr =
+        (iree_elf_addr_t)state->vaddr_bias + rela->r_offset;
+    switch (type) {
+      case IREE_ELF_R_RISCV_NONE:
+        break;
+      case IREE_ELF_R_RISCV_32:
+        *(uint32_t*)instr_ptr = (uint32_t)(sym_addr + rela->r_addend);
+        break;
+      case IREE_ELF_R_RISCV_64:
+        *(uint64_t*)instr_ptr = (uint64_t)(sym_addr + rela->r_addend);
+        break;
+      case IREE_ELF_R_RISCV_JUMP_SLOT:
+        *(uint64_t*)instr_ptr = (uint64_t)sym_addr;
+        break;
+      case IREE_ELF_R_RISCV_RELATIVE:
+        *(uint64_t*)instr_ptr = (uint64_t)(state->vaddr_bias + rela->r_addend);
+        break;
+      default:
+        return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                "unimplemented riscv64 relocation type %08X",
+                                type);
+    }
+  }
+  return iree_ok_status();
+}
+#endif  // IREE_ARCH_RISCV_*
+
+iree_status_t iree_elf_arch_apply_relocations(
+    iree_elf_relocation_state_t* state) {
+  // Gather the relevant relocation tables.
+  iree_host_size_t rela_count = 0;
+  const iree_elf_rela_t* rela_table = NULL;
+  for (iree_host_size_t i = 0; i < state->dyn_table_count; ++i) {
+    const iree_elf_dyn_t* dyn = &state->dyn_table[i];
+    switch (dyn->d_tag) {
+      case IREE_ELF_DT_RELA:
+        rela_table =
+            (const iree_elf_rela_t*)(state->vaddr_bias + dyn->d_un.d_ptr);
+        break;
+      case IREE_ELF_DT_RELASZ:
+        rela_count = dyn->d_un.d_val / sizeof(iree_elf_rela_t);
+        break;
+
+      case IREE_ELF_DT_REL:
+      case IREE_ELF_DT_RELSZ:
+        return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                "unsupported DT_REL relocations");
+      default:
+        // Ignored.
+        break;
+    }
+  }
+  if (!rela_table) rela_count = 0;
+
+  if (rela_count > 0) {
+    IREE_RETURN_IF_ERROR(
+        iree_elf_arch_riscv_apply_rela(state, rela_count, rela_table));
+  }
+
+  return iree_ok_status();
+}
+
+//==============================================================================
+// Cross-ABI function calls
+//==============================================================================
+
+void iree_elf_call_v_v(const void* symbol_ptr) {
+  typedef void (*ptr_t)(void);
+  ((ptr_t)symbol_ptr)();
+}
+
+void* iree_elf_call_p_i(const void* symbol_ptr, int a0) {
+  typedef void* (*ptr_t)(int);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+void* iree_elf_call_p_ip(const void* symbol_ptr, int a0, void* a1) {
+  typedef void* (*ptr_t)(int, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1);
+}
+
+int iree_elf_call_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+int iree_elf_call_i_ppp(const void* symbol_ptr, void* a0, void* a1, void* a2) {
+  typedef int (*ptr_t)(void*, void*, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1, a2);
+}
+
+int iree_elf_thunk_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+#endif  // IREE_ARCH_RISCV_*
diff --git a/runtime/src/iree/hal/local/elf/arch/x86_32.c b/runtime/src/iree/hal/local/elf/arch/x86_32.c
new file mode 100644
index 0000000..9d8d885
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/arch/x86_32.c
@@ -0,0 +1,175 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/hal/local/elf/arch.h"
+#include "iree/hal/local/elf/elf_types.h"
+
+#if defined(IREE_ARCH_X86_32)
+
+// Documentation:
+// https://uclibc.org/docs/psABI-i386.pdf
+
+//==============================================================================
+// ELF machine type/ABI
+//==============================================================================
+
+bool iree_elf_arch_is_valid(const iree_elf_ehdr_t* ehdr) {
+  return ehdr->e_machine == 0x03;  // EM_386 / 3
+}
+
+//==============================================================================
+// ELF relocations
+//==============================================================================
+
+enum {
+  IREE_ELF_R_386_NONE = 0,
+  IREE_ELF_R_386_32 = 1,
+  IREE_ELF_R_386_PC32 = 2,
+  IREE_ELF_R_386_GLOB_DAT = 6,
+  IREE_ELF_R_386_JMP_SLOT = 7,
+  IREE_ELF_R_386_RELATIVE = 8,
+};
+
+static iree_status_t iree_elf_arch_x86_32_apply_rel(
+    iree_elf_relocation_state_t* state, iree_host_size_t rel_count,
+    const iree_elf_rel_t* rel_table) {
+  for (iree_host_size_t i = 0; i < rel_count; ++i) {
+    const iree_elf_rel_t* rel = &rel_table[i];
+    uint32_t type = IREE_ELF_R_TYPE(rel->r_info);
+    if (type == IREE_ELF_R_386_NONE) continue;
+
+    // TODO(benvanik): support imports by resolving from the import table.
+    iree_elf_addr_t sym_addr = 0;
+    if (IREE_ELF_R_SYM(rel->r_info) != 0) {
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "symbol-relative relocations not implemented");
+    }
+
+    iree_elf_addr_t instr_ptr =
+        (iree_elf_addr_t)state->vaddr_bias + rel->r_offset;
+    switch (type) {
+        // case IREE_ELF_R_386_NONE: early-exit above
+      case IREE_ELF_R_386_JMP_SLOT:
+        *(uint32_t*)instr_ptr = (uint32_t)sym_addr;
+        break;
+      case IREE_ELF_R_386_GLOB_DAT:
+        *(uint32_t*)instr_ptr = (uint32_t)sym_addr;
+        break;
+      case IREE_ELF_R_386_RELATIVE:
+        *(uint32_t*)instr_ptr += (uint32_t)state->vaddr_bias;
+        break;
+      case IREE_ELF_R_386_32:
+        *(uint32_t*)instr_ptr += (uint32_t)sym_addr;
+        break;
+      case IREE_ELF_R_386_PC32:
+        *(uint32_t*)instr_ptr += (uint32_t)(sym_addr - instr_ptr);
+        break;
+      default:
+        return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                "unimplemented x86 relocation type %08X", type);
+    }
+  }
+  return iree_ok_status();
+}
+
+iree_status_t iree_elf_arch_apply_relocations(
+    iree_elf_relocation_state_t* state) {
+  // Gather the relevant relocation tables.
+  iree_host_size_t rel_count = 0;
+  const iree_elf_rel_t* rel_table = NULL;
+  for (iree_host_size_t i = 0; i < state->dyn_table_count; ++i) {
+    const iree_elf_dyn_t* dyn = &state->dyn_table[i];
+    switch (dyn->d_tag) {
+      case IREE_ELF_DT_REL:
+        rel_table =
+            (const iree_elf_rel_t*)(state->vaddr_bias + dyn->d_un.d_ptr);
+        break;
+      case IREE_ELF_DT_RELSZ:
+        rel_count = dyn->d_un.d_val / sizeof(iree_elf_rel_t);
+        break;
+
+      case IREE_ELF_DT_RELA:
+      case IREE_ELF_DT_RELASZ:
+        return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                "unsupported DT_RELA relocations");
+      default:
+        // Ignored.
+        break;
+    }
+  }
+  if (!rel_table) rel_count = 0;
+
+  if (rel_count > 0) {
+    IREE_RETURN_IF_ERROR(
+        iree_elf_arch_x86_32_apply_rel(state, rel_count, rel_table));
+  }
+
+  return iree_ok_status();
+}
+
+//==============================================================================
+// Cross-ABI function calls
+//==============================================================================
+
+// System V i386 ABI (used in IREE):
+// https://uclibc.org/docs/psABI-i386.pdf
+// Arguments:
+//   (reverse order on the stack; last arg furthest from stack pointer)
+//
+// Results:
+//   EAX
+//
+// Non-volatile:
+//   EBX, ESP, EBP, ESI, EDI
+//
+// Everything but Windows uses this convention (linux/bsd/mac/etc) and as such
+// we can just use nice little C thunks.
+
+#if defined(IREE_PLATFORM_WINDOWS)
+
+#error "TODO(#6554): need cdecl -> sysv ABI shims in x86_32_msvc.asm"
+
+#else
+
+void iree_elf_call_v_v(const void* symbol_ptr) {
+  typedef void (*ptr_t)(void);
+  ((ptr_t)symbol_ptr)();
+}
+
+void* iree_elf_call_p_i(const void* symbol_ptr, int a0) {
+  typedef void* (*ptr_t)(int);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+void* iree_elf_call_p_ip(const void* symbol_ptr, int a0, void* a1) {
+  typedef void* (*ptr_t)(int, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1);
+}
+
+int iree_elf_call_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+int iree_elf_call_i_ppp(const void* symbol_ptr, void* a0, void* a1, void* a2) {
+  typedef int (*ptr_t)(void*, void*, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1, a2);
+}
+
+int iree_elf_thunk_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+#endif  // IREE_PLATFORM_WINDOWS
+
+#endif  // IREE_ARCH_X86_32
diff --git a/runtime/src/iree/hal/local/elf/arch/x86_64.c b/runtime/src/iree/hal/local/elf/arch/x86_64.c
new file mode 100644
index 0000000..1e3adfc
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/arch/x86_64.c
@@ -0,0 +1,216 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/hal/local/elf/arch.h"
+#include "iree/hal/local/elf/elf_types.h"
+
+#if defined(IREE_ARCH_X86_64)
+
+// Documentation:
+// https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf
+
+//==============================================================================
+// ELF machine type/ABI
+//==============================================================================
+
+bool iree_elf_arch_is_valid(const iree_elf_ehdr_t* ehdr) {
+  return ehdr->e_machine == 0x3E;  // EM_X86_64 / 62
+}
+
+//==============================================================================
+// ELF relocations
+//==============================================================================
+
+enum {
+  IREE_ELF_R_X86_64_NONE = 0,       // No reloc
+  IREE_ELF_R_X86_64_64 = 1,         // Direct 64 bit
+  IREE_ELF_R_X86_64_PC32 = 2,       // PC relative 32 bit signed
+  IREE_ELF_R_X86_64_GOT32 = 3,      // 32 bit GOT entry
+  IREE_ELF_R_X86_64_PLT32 = 4,      // 32 bit PLT address
+  IREE_ELF_R_X86_64_COPY = 5,       // Copy symbol at runtime
+  IREE_ELF_R_X86_64_GLOB_DAT = 6,   // Create GOT entry
+  IREE_ELF_R_X86_64_JUMP_SLOT = 7,  // Create PLT entry
+  IREE_ELF_R_X86_64_RELATIVE = 8,   // Adjust by program base
+  IREE_ELF_R_X86_64_GOTPCREL = 9,   // 32 bit signed pc relative offset to GOT
+  IREE_ELF_R_X86_64_32 = 10,        // Direct 32 bit zero extended
+  IREE_ELF_R_X86_64_32S = 11,       // Direct 32 bit sign extended
+  IREE_ELF_R_X86_64_16 = 12,        // Direct 16 bit zero extended
+  IREE_ELF_R_X86_64_PC16 = 13,      // 16 bit sign extended pc relative
+  IREE_ELF_R_X86_64_8 = 14,         // Direct 8 bit sign extended
+  IREE_ELF_R_X86_64_PC8 = 15,       // 8 bit sign extended pc relative
+  IREE_ELF_R_X86_64_PC64 = 24,      // Place relative 64-bit signed
+};
+
+static iree_status_t iree_elf_arch_x86_64_apply_rela(
+    iree_elf_relocation_state_t* state, iree_host_size_t rela_count,
+    const iree_elf_rela_t* rela_table) {
+  for (iree_host_size_t i = 0; i < rela_count; ++i) {
+    const iree_elf_rela_t* rela = &rela_table[i];
+    uint32_t type = IREE_ELF_R_TYPE(rela->r_info);
+    if (type == IREE_ELF_R_X86_64_NONE) continue;
+
+    // TODO(benvanik): support imports by resolving from the import table.
+    iree_elf_addr_t sym_addr = 0;
+    if (IREE_ELF_R_SYM(rela->r_info) != 0) {
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "symbol-relative relocations not implemented");
+    }
+
+    iree_elf_addr_t instr_ptr =
+        (iree_elf_addr_t)state->vaddr_bias + rela->r_offset;
+    switch (type) {
+      // case IREE_ELF_R_X86_64_NONE: early-exit above
+      case IREE_ELF_R_X86_64_RELATIVE:
+        *(uint64_t*)instr_ptr = (uint64_t)(state->vaddr_bias + rela->r_addend);
+        break;
+      case IREE_ELF_R_X86_64_JUMP_SLOT:
+        *(uint64_t*)instr_ptr = (uint64_t)sym_addr;
+        break;
+      case IREE_ELF_R_X86_64_GLOB_DAT:
+        *(uint64_t*)instr_ptr = (uint64_t)sym_addr;
+        break;
+      case IREE_ELF_R_X86_64_COPY:
+        *(uint64_t*)instr_ptr = (uint64_t)sym_addr;
+        break;
+      case IREE_ELF_R_X86_64_64:
+        *(uint64_t*)instr_ptr = (uint64_t)(sym_addr + rela->r_addend);
+        break;
+      case IREE_ELF_R_X86_64_32:
+        *(uint32_t*)instr_ptr = (uint32_t)(sym_addr + rela->r_addend);
+        break;
+      case IREE_ELF_R_X86_64_32S:
+        *(int32_t*)instr_ptr = (int32_t)(sym_addr + rela->r_addend);
+        break;
+      case IREE_ELF_R_X86_64_PC32:
+        *(uint32_t*)instr_ptr =
+            (uint32_t)(sym_addr + rela->r_addend - instr_ptr);
+        break;
+      default:
+        return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                "unimplemented x86_64 relocation type %08X",
+                                type);
+    }
+  }
+  return iree_ok_status();
+}
+
+iree_status_t iree_elf_arch_apply_relocations(
+    iree_elf_relocation_state_t* state) {
+  // Gather the relevant relocation tables.
+  iree_host_size_t rela_count = 0;
+  const iree_elf_rela_t* rela_table = NULL;
+  iree_host_size_t plt_rela_count = 0;
+  const iree_elf_rela_t* plt_rela_table = NULL;
+  for (iree_host_size_t i = 0; i < state->dyn_table_count; ++i) {
+    const iree_elf_dyn_t* dyn = &state->dyn_table[i];
+    switch (dyn->d_tag) {
+      case IREE_ELF_DT_RELA:
+        rela_table =
+            (const iree_elf_rela_t*)(state->vaddr_bias + dyn->d_un.d_ptr);
+        break;
+      case IREE_ELF_DT_RELASZ:
+        rela_count = dyn->d_un.d_val / sizeof(iree_elf_rela_t);
+        break;
+
+      case IREE_ELF_DT_PLTREL:
+        // Type of reloc in PLT; we expect DT_RELA right now.
+        if (dyn->d_un.d_val != IREE_ELF_DT_RELA) {
+          return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                  "unsupported DT_PLTREL != DT_RELA");
+        }
+        break;
+      case IREE_ELF_DT_JMPREL:
+        plt_rela_table =
+            (const iree_elf_rela_t*)(state->vaddr_bias + dyn->d_un.d_ptr);
+        break;
+      case IREE_ELF_DT_PLTRELSZ:
+        plt_rela_count = dyn->d_un.d_val / sizeof(iree_elf_rela_t);
+        break;
+
+      case IREE_ELF_DT_REL:
+      case IREE_ELF_DT_RELSZ:
+        return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                "unsupported DT_REL relocations");
+
+      default:
+        // Ignored.
+        break;
+    }
+  }
+  if (!rela_table) rela_count = 0;
+  if (!plt_rela_table) plt_rela_count = 0;
+
+  if (rela_count > 0) {
+    IREE_RETURN_IF_ERROR(
+        iree_elf_arch_x86_64_apply_rela(state, rela_count, rela_table));
+  }
+  if (plt_rela_count > 0) {
+    IREE_RETURN_IF_ERROR(
+        iree_elf_arch_x86_64_apply_rela(state, plt_rela_count, plt_rela_table));
+  }
+
+  return iree_ok_status();
+}
+
+//==============================================================================
+// Cross-ABI function calls
+//==============================================================================
+
+// System V AMD64 ABI (used in IREE):
+// https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf
+// Arguments:
+//   RDI, RSI, RDX, RCX, R8, R9, [stack]...
+// Results:
+//   RAX, RDX
+//
+// Everything but Windows uses this convention (linux/bsd/mac/etc) and as such
+// we can just use nice little C thunks.
+
+#if defined(IREE_PLATFORM_WINDOWS)
+// Host is using the Microsoft x64 calling convention and we need to translate
+// to the System V AMD64 ABI conventions. Unfortunately MSVC does not support
+// inline assembly and we have to outline the calls in x86_64_msvc.asm.
+#else
+
+void iree_elf_call_v_v(const void* symbol_ptr) {
+  typedef void (*ptr_t)(void);
+  ((ptr_t)symbol_ptr)();
+}
+
+void* iree_elf_call_p_i(const void* symbol_ptr, int a0) {
+  typedef void* (*ptr_t)(int);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+void* iree_elf_call_p_ip(const void* symbol_ptr, int a0, void* a1) {
+  typedef void* (*ptr_t)(int, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1);
+}
+
+int iree_elf_call_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+int iree_elf_call_i_ppp(const void* symbol_ptr, void* a0, void* a1, void* a2) {
+  typedef int (*ptr_t)(void*, void*, void*);
+  return ((ptr_t)symbol_ptr)(a0, a1, a2);
+}
+
+int iree_elf_thunk_i_p(const void* symbol_ptr, void* a0) {
+  typedef int (*ptr_t)(void*);
+  return ((ptr_t)symbol_ptr)(a0);
+}
+
+#endif  // IREE_PLATFORM_WINDOWS
+
+#endif  // IREE_ARCH_X86_64
diff --git a/runtime/src/iree/hal/local/elf/arch/x86_64_msvc.asm b/runtime/src/iree/hal/local/elf/arch/x86_64_msvc.asm
new file mode 100644
index 0000000..6e25c29
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/arch/x86_64_msvc.asm
@@ -0,0 +1,202 @@
+; Copyright 2021 The IREE Authors
+;
+; Licensed under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Microsoft x64 calling convention:
+; https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention
+; Arguments:
+;   RCX, RDX, R8, R9, [stack]...
+; Results:
+;   RAX
+; Non-volatile:
+;   RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-XMM15
+;
+; System V AMD64 ABI (used in IREE):
+; https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf
+; Arguments:
+;   RDI, RSI, RDX, RCX, R8, R9, [stack]...
+; Results:
+;   RAX, RDX
+
+; Total size of non-volatile XMM registers.
+_SYSV_INTEROP_STACK_SIZE = 10 * 10h
+
+; Function prolog that saves registers that we may clobber while in code
+; following the SYS-V x64 ABI.
+;
+; This also encodes unwind table information (.xdata/.pdata) that is used by
+; debuggers/backtrace/etc to be able to look through the function on the stack.
+; Though the debugger will be totally confused by the function we call into
+; (it'll be expecting the Microsoft conventions and won't find them) it'll at
+; least let us see the leaf guest function instead of just a bunch of our
+; iree_elf_call_* thunks.
+; Docs suck but we are in black magic territory so it's expected:
+; https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160#unwind-helpers-for-masm
+; NOTE: must be paired with _sysv_interop_epilog, which restores these same
+; registers in reverse order and tears down the RBP frame.
+_sysv_interop_prolog MACRO
+  ; Save volatile general purpose registers to the stack.
+  ; (These are the registers a Microsoft-convention caller expects preserved.)
+  push rbp
+  .pushreg rbp
+  mov rbp, rsp
+  .setframe rbp, 0
+  push rbx
+  .pushreg rbx
+  push rdi
+  .pushreg rdi
+  push rsi
+  .pushreg rsi
+  push r12
+  .pushreg r12
+  push r13
+  .pushreg r13
+  push r14
+  .pushreg r14
+  push r15
+  .pushreg r15
+
+  ; Setup stack space for storing the SIMD registers.
+  ; NOTE: we adjust this by 8 bytes to get on a 16-byte alignment so we can
+  ; use the aligned movaps instruction.
+  sub rsp, _SYSV_INTEROP_STACK_SIZE + 8
+  .allocstack _SYSV_INTEROP_STACK_SIZE + 8
+
+  ; Save volatile SIMD registers to the stack.
+  movaps [rsp + 00h], xmm6
+  .savexmm128 xmm6, 00h
+  movaps [rsp + 10h], xmm7
+  .savexmm128 xmm7, 10h
+  movaps [rsp + 20h], xmm8
+  .savexmm128 xmm8, 20h
+  movaps [rsp + 30h], xmm9
+  .savexmm128 xmm9, 30h
+  movaps [rsp + 40h], xmm10
+  .savexmm128 xmm10, 40h
+  movaps [rsp + 50h], xmm11
+  .savexmm128 xmm11, 50h
+  movaps [rsp + 60h], xmm12
+  .savexmm128 xmm12, 60h
+  movaps [rsp + 70h], xmm13
+  .savexmm128 xmm13, 70h
+  movaps [rsp + 80h], xmm14
+  .savexmm128 xmm14, 80h
+  movaps [rsp + 90h], xmm15
+  .savexmm128 xmm15, 90h
+
+  .endprolog
+ENDM
+
+; Function epilog that restores registers that we may have clobbered while in
+; code following the SYS-V x64 ABI.
+; NOTE: RAX (and RDX, for two-register SysV returns) are deliberately left
+; untouched so the callee's return value survives into the caller.
+_sysv_interop_epilog MACRO
+  ; Restore volatile SIMD registers from the stack.
+  movaps xmm6, [rsp + 00h]
+  movaps xmm7, [rsp + 10h]
+  movaps xmm8, [rsp + 20h]
+  movaps xmm9, [rsp + 30h]
+  movaps xmm10, [rsp + 40h]
+  movaps xmm11, [rsp + 50h]
+  movaps xmm12, [rsp + 60h]
+  movaps xmm13, [rsp + 70h]
+  movaps xmm14, [rsp + 80h]
+  movaps xmm15, [rsp + 90h]
+  add rsp, _SYSV_INTEROP_STACK_SIZE + 8
+
+  ; Restore volatile general purpose registers from the stack.
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+  pop rsi
+  pop rdi
+  pop rbx
+  leave  ; mov rsp, rbp + pop rbp
+ENDM
+
+_TEXT SEGMENT
+ALIGN 16
+
+; void iree_elf_call_v_v(const void* symbol_ptr)
+; Calls a SysV-convention function taking no arguments and returning nothing;
+; no argument registers need translating.
+iree_elf_call_v_v PROC FRAME
+  _sysv_interop_prolog
+
+  ; RCX = symbol_ptr
+  call rcx
+
+  _sysv_interop_epilog
+  ret
+iree_elf_call_v_v ENDP
+
+; void* iree_elf_call_p_i(const void* symbol_ptr, int a0)
+iree_elf_call_p_i PROC FRAME
+  _sysv_interop_prolog
+
+  ; RCX = symbol_ptr
+  ; RDX = a0
+  ; First SysV integer argument goes in RDI.
+  mov rdi, rdx
+  call rcx
+
+  ; RAX = return value (same register in both conventions).
+  _sysv_interop_epilog
+  ret
+iree_elf_call_p_i ENDP
+
+; void* iree_elf_call_p_ip(const void* symbol_ptr, int a0, void* a1)
+iree_elf_call_p_ip PROC FRAME
+  _sysv_interop_prolog
+
+  ; RCX = symbol_ptr
+  ; RDX = a0
+  ; R8 = a1
+  ; RDI/RSI are the first two SysV integer argument registers.
+  mov rdi, rdx
+  mov rsi, r8
+  call rcx
+
+  _sysv_interop_epilog
+  ret
+iree_elf_call_p_ip ENDP
+
+; int iree_elf_call_i_p(const void* symbol_ptr, void* a0)
+iree_elf_call_i_p PROC FRAME
+  _sysv_interop_prolog
+
+  ; RCX = symbol_ptr
+  ; RDX = a0
+  ; First SysV integer argument goes in RDI.
+  mov rdi, rdx
+  call rcx
+
+  ; RAX = return value (same register in both conventions).
+  _sysv_interop_epilog
+  ret
+iree_elf_call_i_p ENDP
+
+; int iree_elf_call_i_ppp(const void* symbol_ptr, void* a0, void* a1, void* a2)
+iree_elf_call_i_ppp PROC FRAME
+  _sysv_interop_prolog
+
+  ; RCX = symbol_ptr
+  ; RDX = a0
+  ; R8 = a1
+  ; R9 = a2
+  ; NOTE: a0 must be moved out of RDX before RDX is reused for a2 below.
+  mov rdi, rdx
+  mov rsi, r8
+  mov rdx, r9
+  call rcx
+
+  _sysv_interop_epilog
+  ret
+iree_elf_call_i_ppp ENDP
+
+; int iree_elf_thunk_i_p(const void* symbol_ptr, void* a0)
+; Reverse direction: invoked from SysV-convention code (the loaded module), so
+; arguments arrive in SysV registers and are translated to the Microsoft
+; convention before calling back into host code.
+iree_elf_thunk_i_p PROC FRAME
+  _sysv_interop_prolog
+
+  ; RDI = symbol_ptr
+  ; RSI = a0
+  ; First Microsoft-convention integer argument goes in RCX.
+  mov rcx, rsi
+  call rdi
+
+  _sysv_interop_epilog
+  ret
+iree_elf_thunk_i_p ENDP
+
+_TEXT ENDS
+END
diff --git a/runtime/src/iree/hal/local/elf/elf_module.c b/runtime/src/iree/hal/local/elf/elf_module.c
new file mode 100644
index 0000000..61f68e9
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/elf_module.c
@@ -0,0 +1,660 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/elf/elf_module.h"
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/local/elf/arch.h"
+#include "iree/hal/local/elf/platform.h"
+
+//==============================================================================
+// Verification and section/info caching
+//==============================================================================
+
+// Fields taken from the ELF headers used only during verification and loading.
+// Zero-initialized by iree_elf_module_parse_headers; a 0 value in the init
+// fields below therefore means "not present in the ELF".
+typedef struct iree_elf_module_load_state_t {
+  iree_memory_info_t memory_info;
+  const iree_elf_ehdr_t* ehdr;
+  const iree_elf_phdr_t* phdr_table;  // ehdr.e_phnum has count
+  const iree_elf_shdr_t* shdr_table;  // ehdr.e_shnum has count
+
+  const iree_elf_dyn_t* dyn_table;  // PT_DYNAMIC
+  iree_host_size_t dyn_table_count;
+
+  iree_elf_addr_t init;               // DT_INIT
+  const iree_elf_addr_t* init_array;  // DT_INIT_ARRAY
+  iree_host_size_t init_array_count;  // DT_INIT_ARRAYSZ
+} iree_elf_module_load_state_t;
+
+// Verifies the ELF file header and machine class.
+// |raw_data| is untrusted; only the ehdr itself and the bounds of the
+// phdr/shdr tables are validated here - all other contents remain unverified.
+static iree_status_t iree_elf_module_verify_ehdr(
+    iree_const_byte_span_t raw_data) {
+  // Size must be larger than the header we are trying to load.
+  if (raw_data.data_length < sizeof(iree_elf_ehdr_t)) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "ELF data provided (%zu) is smaller than ehdr (%zu)",
+        raw_data.data_length, sizeof(iree_elf_ehdr_t));
+  }
+
+  // Check for ELF identifier.
+  const iree_elf_ehdr_t* ehdr = (const iree_elf_ehdr_t*)raw_data.data;
+  static const iree_elf_byte_t elf_magic[4] = {0x7F, 'E', 'L', 'F'};
+  if (memcmp(ehdr->e_ident, elf_magic, sizeof(elf_magic)) != 0) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "data provided does not contain the ELF identifier");
+  }
+
+  // Check critical identifier bytes before attempting to deal with any more of
+  // the header; the class determines the size of the header fields and the
+  // endianness determines how multi-byte fields are interpreted.
+
+#if defined(IREE_PTR_SIZE_32)
+  if (ehdr->e_ident[IREE_ELF_EI_CLASS] != IREE_ELF_ELFCLASS32) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "system/ELF class mismatch: expected 32-bit");
+  }
+#elif defined(IREE_PTR_SIZE_64)
+  if (ehdr->e_ident[IREE_ELF_EI_CLASS] != IREE_ELF_ELFCLASS64) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "system/ELF class mismatch: expected 64-bit");
+  }
+#endif  // IREE_PTR_SIZE_*
+
+#if defined(IREE_ENDIANNESS_LITTLE)
+  if (ehdr->e_ident[IREE_ELF_EI_DATA] != IREE_ELF_ELFDATA2LSB) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "system/ELF endianness mismatch: expected little-endian");
+  }
+#else
+  if (ehdr->e_ident[IREE_ELF_EI_DATA] != IREE_ELF_ELFDATA2MSB) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "system/ELF endianness mismatch: expected big-endian");
+  }
+#endif  // IREE_ENDIANNESS_*
+
+  // ELF version == EV_CURRENT (1) is all we handle.
+  // Check this before other fields as they could change meaning in other
+  // versions.
+  if (ehdr->e_version != 1) {
+    // NOTE: the version argument was previously missing, making the %u in the
+    // message format undefined behavior when the status was formatted.
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "ELF version %u unsupported; expected 1",
+                            (uint32_t)ehdr->e_version);
+  }
+
+  // Ensure we have the right architecture compiled in.
+  if (!iree_elf_arch_is_valid(ehdr)) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "ELF machine specification (%04X) does not match the "
+        "running architecture",
+        (uint32_t)ehdr->e_machine);
+  }
+
+  // We could probably support non-shared object types but no need today and it
+  // allows us to make assumptions about the sections that are present (all
+  // those marked as 'mandatory' in the spec).
+  if (ehdr->e_type != IREE_ELF_ET_DYN) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "only shared object ELFs are supported");
+  }
+
+  // Sanity checks on entity sizes - they can be larger than what we expect,
+  // but overlaying our structs onto them is not going to work if they are
+  // smaller. For now we aren't doing pointer walks based on dynamic sizes so
+  // we need equality, but if we ever have a reason to do so we could change all
+  // array-style accesses to scale out based on the ehdr values
+  if (ehdr->e_ehsize != sizeof(iree_elf_ehdr_t) ||
+      ehdr->e_phentsize != sizeof(iree_elf_phdr_t) ||
+      ehdr->e_shentsize != sizeof(iree_elf_shdr_t)) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "ELF entity size mismatch");
+  }
+
+  // Verify the phdr table properties. This doesn't validate each phdr but just
+  // ensures that the table is constructed correctly and within bounds.
+  // NOTE: the offset and the table size are checked separately against the
+  // total length so that hostile header values cannot overflow the combined
+  // expression and bypass the bounds check.
+  iree_host_size_t phdr_table_size =
+      (iree_host_size_t)ehdr->e_phnum * ehdr->e_phentsize;
+  if (ehdr->e_phoff == 0 || ehdr->e_phnum == 0 ||
+      ehdr->e_phoff > raw_data.data_length ||
+      phdr_table_size > raw_data.data_length - ehdr->e_phoff) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "invalid mandatory phdr table");
+  }
+
+  // Verify the shdr table properties (same overflow-safe form as above).
+  iree_host_size_t shdr_table_size =
+      (iree_host_size_t)ehdr->e_shnum * ehdr->e_shentsize;
+  if (ehdr->e_shoff == 0 || ehdr->e_shnum == 0 ||
+      ehdr->e_shoff > raw_data.data_length ||
+      shdr_table_size > raw_data.data_length - ehdr->e_shoff) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "invalid mandatory shdr table");
+  }
+
+  return iree_ok_status();
+}
+
+// Verifies the phdr table for supported types and in-bounds file references.
+// Only PT_LOAD segments are checked here; other segment types are ignored.
+static iree_status_t iree_elf_module_verify_phdr_table(
+    iree_const_byte_span_t raw_data, iree_elf_module_load_state_t* load_state) {
+  for (iree_elf_half_t i = 0; i < load_state->ehdr->e_phnum; ++i) {
+    const iree_elf_phdr_t* phdr = &load_state->phdr_table[i];
+    if (phdr->p_type != IREE_ELF_PT_LOAD) continue;
+    // NOTE: checked without computing p_offset + p_filesz directly so that
+    // hostile values cannot overflow and bypass the bounds test.
+    if (phdr->p_filesz > raw_data.data_length ||
+        phdr->p_offset > raw_data.data_length - phdr->p_filesz) {
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "phdr reference outside of file extents: %" PRIu64
+                              "-%" PRIu64 " of max %" PRIu64,
+                              (uint64_t)phdr->p_offset,
+                              (uint64_t)(phdr->p_offset + phdr->p_filesz),
+                              (uint64_t)raw_data.data_length);
+    }
+  }
+  return iree_ok_status();
+}
+
+// Parses the ELF to populate fields used during loading and runtime and verify
+// that the ELF matches our very, very low expectations.
+// NOTE: pointers stored into |out_load_state| alias |raw_data| directly and
+// remain valid only while the caller keeps that memory live.
+static iree_status_t iree_elf_module_parse_headers(
+    iree_const_byte_span_t raw_data,
+    iree_elf_module_load_state_t* out_load_state,
+    iree_elf_module_t* out_module) {
+  // Both outputs start zeroed so failure paths can safely deinitialize.
+  memset(out_module, 0, sizeof(*out_module));
+  memset(out_load_state, 0, sizeof(*out_load_state));
+
+  // Query the host memory information that we can use to verify we are able to
+  // meet the alignment requirements of the ELF.
+  iree_memory_query_info(&out_load_state->memory_info);
+
+  // Verify the ELF is an ELF and that it's for the current machine.
+  // NOTE: this only verifies the ehdr is as expected and nothing else: the ELF
+  // is still untrusted and may be missing mandatory sections.
+  IREE_RETURN_IF_ERROR(iree_elf_module_verify_ehdr(raw_data));
+
+  // Get the primary tables (locations verified above).
+  const iree_elf_ehdr_t* ehdr = (const iree_elf_ehdr_t*)raw_data.data;
+  const iree_elf_phdr_t* phdr_table =
+      (const iree_elf_phdr_t*)(raw_data.data + ehdr->e_phoff);
+  const iree_elf_shdr_t* shdr_table =
+      (const iree_elf_shdr_t*)(raw_data.data + ehdr->e_shoff);
+  out_load_state->ehdr = ehdr;
+  out_load_state->phdr_table = phdr_table;
+  out_load_state->shdr_table = shdr_table;
+
+  // Verify the phdr table to ensure all bounds are in range of the file.
+  IREE_RETURN_IF_ERROR(
+      iree_elf_module_verify_phdr_table(raw_data, out_load_state));
+
+  return iree_ok_status();
+}
+
+//==============================================================================
+// Allocation and layout
+//==============================================================================
+
+// Calculates the in-memory layout of the ELF module as defined by its segments.
+// Returns a byte range representing the minimum virtual address offset of any
+// segment that can be used to offset the vaddr from the host allocation and the
+// total length of the required range. The alignment will meet the requirements
+// of the ELF but is yet unadjusted for host requirements. The range will have
+// zero length if there are no segments to load (which would be weird).
+// NOTE(review): iree_page_align_start/end presumably round down/up to
+// phdr.p_align - confirm against their definitions in platform.h.
+static iree_byte_range_t iree_elf_module_calculate_vaddr_range(
+    iree_elf_module_load_state_t* load_state) {
+  // Min/max virtual addresses of any allocated segment.
+  iree_elf_addr_t vaddr_min = IREE_ELF_ADDR_MAX;
+  iree_elf_addr_t vaddr_max = IREE_ELF_ADDR_MIN;
+  for (iree_elf_half_t i = 0; i < load_state->ehdr->e_phnum; ++i) {
+    const iree_elf_phdr_t* phdr = &load_state->phdr_table[i];
+    if (phdr->p_type != IREE_ELF_PT_LOAD) continue;
+    iree_elf_addr_t p_vaddr_min =
+        iree_page_align_start(phdr->p_vaddr, phdr->p_align);
+    iree_elf_addr_t p_vaddr_max =
+        iree_page_align_end(phdr->p_vaddr + phdr->p_memsz, phdr->p_align);
+    vaddr_min = iree_min(vaddr_min, p_vaddr_min);
+    vaddr_max = iree_max(vaddr_max, p_vaddr_max);
+  }
+  if (vaddr_min == IREE_ELF_ADDR_MAX) {
+    // Did not find any segments to load.
+    vaddr_min = IREE_ELF_ADDR_MIN;
+    vaddr_max = IREE_ELF_ADDR_MIN;
+  }
+  iree_byte_range_t byte_range = {
+      .offset = (iree_host_size_t)vaddr_min,
+      .length = (iree_host_size_t)(vaddr_max - vaddr_min),
+  };
+  return byte_range;
+}
+
+// Allocates space for and loads all PT_LOAD segments into the host virtual
+// address space.
+// On success module->vaddr_base/vaddr_size hold the reservation and
+// module->vaddr_bias is the value added to ELF vaddrs to form host pointers.
+static iree_status_t iree_elf_module_load_segments(
+    iree_const_byte_span_t raw_data, iree_elf_module_load_state_t* load_state,
+    iree_elf_module_t* module) {
+  // Calculate the total internally-aligned vaddr range.
+  iree_byte_range_t vaddr_range =
+      iree_elf_module_calculate_vaddr_range(load_state);
+
+  // Reserve virtual address space in the host memory space. This memory is
+  // uncommitted by default as the ELF may only sparsely use the address space.
+  module->vaddr_size = iree_page_align_end(
+      vaddr_range.length, load_state->memory_info.normal_page_size);
+  IREE_RETURN_IF_ERROR(iree_memory_view_reserve(
+      IREE_MEMORY_VIEW_FLAG_MAY_EXECUTE, module->vaddr_size,
+      module->host_allocator, (void**)&module->vaddr_base));
+  module->vaddr_bias = module->vaddr_base - vaddr_range.offset;
+
+  // Commit and load all of the segments.
+  for (iree_elf_half_t i = 0; i < load_state->ehdr->e_phnum; ++i) {
+    const iree_elf_phdr_t* phdr = &load_state->phdr_table[i];
+    if (phdr->p_type != IREE_ELF_PT_LOAD) continue;
+
+    // Commit the range of pages used by this segment, initially with write
+    // access so that we can modify the pages.
+    iree_byte_range_t byte_range = {
+        .offset = phdr->p_vaddr,
+        .length = phdr->p_memsz,
+    };
+    IREE_RETURN_IF_ERROR(iree_memory_view_commit_ranges(
+        module->vaddr_bias, 1, &byte_range,
+        IREE_MEMORY_ACCESS_READ | IREE_MEMORY_ACCESS_WRITE));
+
+    // Copy data present in the file.
+    // TODO(benvanik): infra for being able to detect if the source model is in
+    // a mapped file - if it is, we can remap the page and directly reference it
+    // here for read-only segments and setup copy-on-write for writeable ones.
+    // We'd need a way to pass in the underlying mapping and some guarantees on
+    // the lifetime of it. Today we are just always committing above and copying
+    // here because it keeps this all super simple (you know, as simple as an
+    // entire custom ELF loader can be :).
+    if (phdr->p_filesz > 0) {
+      memcpy(module->vaddr_bias + phdr->p_vaddr, raw_data.data + phdr->p_offset,
+             phdr->p_filesz);
+    }
+
+    // NOTE: p_memsz may be larger than p_filesz - if so, the extra memory bytes
+    // must be zeroed. We require that the initial allocation is zeroed anyway
+    // so this is a no-op.
+    // NOTE(review): relies on iree_memory_view_commit_ranges producing
+    // zero-filled pages - confirm in platform.h/platform impls.
+
+    // NOTE: the pages are still writeable; we need to apply relocations before
+    // we can go back through and remove write access from read-only/executable
+    // pages in iree_elf_module_protect_segments.
+  }
+
+  return iree_ok_status();
+}
+
+// Applies segment memory protection attributes.
+// This will make pages read-only and must only be performed after relocation
+// (which writes to pages of all types). Executable pages will be flushed from
+// the instruction cache.
+// Enforces W^X: segments requesting both write and execute are rejected.
+static iree_status_t iree_elf_module_protect_segments(
+    iree_elf_module_load_state_t* load_state, iree_elf_module_t* module) {
+  // PT_LOAD segments (the bulk of progbits):
+  for (iree_elf_half_t i = 0; i < load_state->ehdr->e_phnum; ++i) {
+    const iree_elf_phdr_t* phdr = &load_state->phdr_table[i];
+    if (phdr->p_type != IREE_ELF_PT_LOAD) continue;
+
+    // Interpret the access bits and widen to the implicit allowable
+    // permissions. See Table 7-37:
+    // https://docs.oracle.com/cd/E19683-01/816-1386/6m7qcoblk/index.html#chapter6-34713
+    iree_memory_access_t access = 0;
+    if (phdr->p_flags & IREE_ELF_PF_R) access |= IREE_MEMORY_ACCESS_READ;
+    if (phdr->p_flags & IREE_ELF_PF_W) access |= IREE_MEMORY_ACCESS_WRITE;
+    if (phdr->p_flags & IREE_ELF_PF_X) access |= IREE_MEMORY_ACCESS_EXECUTE;
+    // W and X each imply R per the psABI's allowable-access widening.
+    if (access & IREE_MEMORY_ACCESS_WRITE) access |= IREE_MEMORY_ACCESS_READ;
+    if (access & IREE_MEMORY_ACCESS_EXECUTE) access |= IREE_MEMORY_ACCESS_READ;
+
+    // We only support R+X (no W).
+    if ((phdr->p_flags & IREE_ELF_PF_X) && (phdr->p_flags & IREE_ELF_PF_W)) {
+      return iree_make_status(IREE_STATUS_PERMISSION_DENIED,
+                              "unable to create a writable executable segment");
+    }
+
+    // Apply new access protection.
+    iree_byte_range_t byte_range = {
+        .offset = phdr->p_vaddr,
+        .length = phdr->p_memsz,
+    };
+    IREE_RETURN_IF_ERROR(iree_memory_view_protect_ranges(module->vaddr_bias, 1,
+                                                         &byte_range, access));
+
+    // Flush the instruction cache if we are going to execute these pages.
+    if (access & IREE_MEMORY_ACCESS_EXECUTE) {
+      iree_memory_view_flush_icache(module->vaddr_bias + phdr->p_vaddr,
+                                    phdr->p_memsz);
+    }
+  }
+
+  // PT_GNU_RELRO: hardening of post-relocation segments.
+  // These may alias with segments above and must be processed afterward.
+  for (iree_elf_half_t i = 0; i < load_state->ehdr->e_phnum; ++i) {
+    const iree_elf_phdr_t* phdr = &load_state->phdr_table[i];
+    if (phdr->p_type != IREE_ELF_PT_GNU_RELRO) continue;
+    iree_byte_range_t byte_range = {
+        .offset = phdr->p_vaddr,
+        .length = phdr->p_memsz,
+    };
+    IREE_RETURN_IF_ERROR(iree_memory_view_protect_ranges(
+        module->vaddr_bias, 1, &byte_range, IREE_MEMORY_ACCESS_READ));
+  }
+
+  return iree_ok_status();
+}
+
+// Unloads the ELF segments from memory and releases the host virtual address
+// space reservation.
+// Safe to call on a partially-initialized module: a NULL vaddr_base (nothing
+// reserved yet) is a no-op aside from clearing the fields.
+static void iree_elf_module_unload_segments(iree_elf_module_t* module) {
+  // Decommit/unreserve the entire memory space.
+  if (module->vaddr_base != NULL) {
+    iree_memory_view_release(module->vaddr_base, module->vaddr_size,
+                             module->host_allocator);
+  }
+  module->vaddr_base = NULL;
+  module->vaddr_bias = NULL;
+  module->vaddr_size = 0;
+}
+
+//==============================================================================
+// Dynamic library handling
+//==============================================================================
+// NOTE: this happens *after* allocation and loading as the .dynsym and related
+// segments are allocated and loaded in virtual address space.
+
+// Parses, verifies, and populates dynamic symbol related tables for runtime
+// use. These tables are all in allocated memory and use fully rebased virtual
+// addresses.
+// On success module->dynstr/dynsym (+sizes) are valid; load_state gains the
+// DT_INIT/DT_INIT_ARRAY information consumed by the initializer pass.
+static iree_status_t iree_elf_module_parse_dynamic_tables(
+    iree_elf_module_load_state_t* load_state, iree_elf_module_t* module) {
+  // By the spec there must only be one PT_DYNAMIC.
+  // Note that we are getting the one in the loaded virtual address space.
+  const iree_elf_dyn_t* dyn_table = NULL;
+  iree_host_size_t dyn_table_count = 0;
+  for (iree_elf_half_t i = 0; i < load_state->ehdr->e_phnum; ++i) {
+    const iree_elf_phdr_t* phdr = &load_state->phdr_table[i];
+    if (phdr->p_type == IREE_ELF_PT_DYNAMIC) {
+      dyn_table = (const iree_elf_dyn_t*)(module->vaddr_bias + phdr->p_vaddr);
+      dyn_table_count = phdr->p_filesz / sizeof(iree_elf_dyn_t);
+      break;
+    }
+  }
+  if (!dyn_table || !dyn_table_count) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "no PT_DYNAMIC/.dynamic segment");
+  }
+  load_state->dyn_table = dyn_table;
+  load_state->dyn_table_count = dyn_table_count;
+
+  for (iree_host_size_t i = 0; i < dyn_table_count; ++i) {
+    const iree_elf_dyn_t* dyn = &dyn_table[i];
+    switch (dyn->d_tag) {
+      case IREE_ELF_DT_STRTAB:
+        // .dynstr table for runtime symbol lookup.
+        module->dynstr = (const char*)(module->vaddr_bias + dyn->d_un.d_ptr);
+        break;
+      case IREE_ELF_DT_STRSZ:
+        module->dynstr_size = dyn->d_un.d_val;
+        break;
+
+      case IREE_ELF_DT_SYMTAB:
+        // .dynsym table for runtime symbol lookup.
+        module->dynsym =
+            (const iree_elf_sym_t*)(module->vaddr_bias + dyn->d_un.d_ptr);
+        break;
+      case IREE_ELF_DT_SYMENT:
+        if (dyn->d_un.d_val != sizeof(iree_elf_sym_t)) {
+          return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                  "DT_SYMENT size mismatch");
+        }
+        break;
+      case IREE_ELF_DT_HASH: {
+        // NOTE: we don't care about the hash table (yet), but it is the only
+        // way to get the total symbol count: hash[1] is nchain, which in the
+        // SysV hash table layout equals the number of .dynsym entries.
+        const iree_elf_word_t* hash =
+            (const iree_elf_word_t*)(module->vaddr_bias + dyn->d_un.d_ptr);
+        module->dynsym_count = hash[1];  // symbol count, obviously~
+        break;
+      }
+
+      case IREE_ELF_DT_INIT:
+        // .init initializer function (runs before .init_array).
+        load_state->init = dyn->d_un.d_ptr;
+        break;
+      case IREE_ELF_DT_INIT_ARRAY:
+        // .init_array list of initializer functions.
+        load_state->init_array =
+            (const iree_elf_addr_t*)(module->vaddr_bias + dyn->d_un.d_ptr);
+        break;
+      case IREE_ELF_DT_INIT_ARRAYSZ:
+        // DT_INIT_ARRAYSZ is the total size of .init_array in *bytes* per the
+        // SysV ABI; convert to an element count so that initializer iteration
+        // indexes whole iree_elf_addr_t entries instead of reading far past
+        // the end of the array.
+        load_state->init_array_count =
+            dyn->d_un.d_val / sizeof(iree_elf_addr_t);
+        break;
+
+      case IREE_ELF_DT_RELENT:
+        if (dyn->d_un.d_val != sizeof(iree_elf_rel_t)) {
+          return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                  "DT_RELENT size mismatch");
+        }
+        break;
+      case IREE_ELF_DT_RELAENT:
+        if (dyn->d_un.d_val != sizeof(iree_elf_rela_t)) {
+          return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                  "DT_RELAENT size mismatch");
+        }
+        break;
+
+      default:
+        // Ignored.
+        break;
+    }
+  }
+
+  // Must have .dynsym/.dynstr to perform lookups.
+  if (!module->dynstr || !module->dynstr_size || !module->dynsym ||
+      !module->dynsym_count) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "missing .dynsym/.dynstr in ELF .dynamic segment");
+  }
+
+  // NOTE: we could try to verify ranges here but no one seems to do that and
+  // it's somewhat annoying. You're loading untrusted code into your memory
+  // space - this is the least of your concerns :)
+
+  return iree_ok_status();
+}
+
+// Verifies that there are no dynamic imports in the module as we don't support
+// them yet.
+// Returns IREE_STATUS_UNAVAILABLE naming the first undefined symbol found.
+static iree_status_t iree_elf_module_verify_no_imports(
+    iree_elf_module_load_state_t* load_state, iree_elf_module_t* module) {
+  // NOTE: slot 0 is always the 0 placeholder.
+  for (iree_host_size_t i = 1; i < module->dynsym_count; ++i) {
+    const iree_elf_sym_t* sym = &module->dynsym[i];
+    if (sym->st_shndx == IREE_ELF_SHN_UNDEF) {
+      // NOTE: never pass NULL through %s below (undefined behavior in
+      // printf-style formatting); unnamed symbols get a placeholder instead.
+      const char* symname IREE_ATTRIBUTE_UNUSED =
+          sym->st_name ? module->dynstr + sym->st_name : "<unnamed>";
+      return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                              "ELF imports one or more symbols (trying "
+                              "'%s'); imports are not supported in the "
+                              "platform-agnostic loader",
+                              symname);
+    }
+  }
+  return iree_ok_status();
+}
+
+//==============================================================================
+// Relocation
+//==============================================================================
+
+// Applies symbol and address base relocations to the loaded sections.
+// The actual relocation work is architecture-specific; the handler is declared
+// in arch.h and implemented per-architecture under arch/.
+static iree_status_t iree_elf_module_apply_relocations(
+    iree_elf_module_load_state_t* load_state, iree_elf_module_t* module) {
+  // Redirect to the architecture-specific handler.
+  iree_elf_relocation_state_t reloc_state;
+  memset(&reloc_state, 0, sizeof(reloc_state));
+  reloc_state.vaddr_bias = module->vaddr_bias;
+  reloc_state.dyn_table = load_state->dyn_table;
+  reloc_state.dyn_table_count = load_state->dyn_table_count;
+  return iree_elf_arch_apply_relocations(&reloc_state);
+}
+
+//==============================================================================
+// Initialization/finalization
+//==============================================================================
+
+// Runs initializers defined within the module, if any.
+// .init is run first and then .init_array is run in array order.
+static iree_status_t iree_elf_module_run_initializers(
+    iree_elf_module_load_state_t* load_state, iree_elf_module_t* module) {
+  // NOTE(review): relies on the zeroed load state meaning "no DT_INIT";
+  // assumes IREE_ELF_ADDR_MIN == 0 - confirm in elf_types.h.
+  if (load_state->init != IREE_ELF_ADDR_MIN) {
+    iree_elf_call_v_v((void*)(module->vaddr_bias + load_state->init));
+  }
+
+  // NOTE: entries with values of 0 or -1 must be ignored.
+  for (iree_host_size_t i = 0; i < load_state->init_array_count; ++i) {
+    iree_elf_addr_t symbol_ptr = load_state->init_array[i];
+    if (symbol_ptr == 0 || symbol_ptr == IREE_ELF_ADDR_MAX) continue;
+    iree_elf_call_v_v((void*)(module->vaddr_bias + symbol_ptr));
+  }
+
+  return iree_ok_status();
+}
+
+// Intentionally a no-op today; |module| is kept for symmetry with
+// iree_elf_module_run_initializers and future use.
+static void iree_elf_module_run_finalizers(iree_elf_module_t* module) {
+  // NOT IMPLEMENTED
+  // Android doesn't do this for its loader and nothing we do should ever need
+  // them: we're not doing IO or (hopefully) anything stateful inside of our
+  // HAL executables that has correctness depend on them executing.
+}
+
+//==============================================================================
+// Symbol lookup
+//==============================================================================
+
+// Resolves a global symbol within the module by symbol name.
+// Currently we don't support any hashing as we have a single exported symbol
+// and this is a simple linear scan.
+//
+// If we start to get a few dozen then it may be worth it to implement the sysv
+// style as it is smallest both in code size and ELF binary size. This can be
+// specified using --hash-style=sysv with ld/lld. By default most linkers
+// (including lld, which is what we care about) will use
+// --hash-style=both and emit both `.hash` and `.gnu.hash`, but that's silly for
+// us as ideally we'd have none. If we ever try to use this for larger libraries
+// with many exported symbols (we shouldn't!) we can add support:
+// https://docs.oracle.com/cd/E23824_01/html/819-0690/chapter6-48031.html
+// https://blogs.oracle.com/solaris/gnu-hash-elf-sections-v2
+// Returns the GLOBAL/WEAK .dynsym entry whose name matches |symbol_name|, or
+// NULL when no such symbol exists.
+static const iree_elf_sym_t* iree_elf_module_lookup_global_symbol(
+    iree_elf_module_t* module, const char* symbol_name) {
+  // NOTE: symtab[0] is always STN_UNDEF so we skip it.
+  // NOTE: symtab has local symbols before global ones and since we are looking
+  // for global symbols we iterate in reverse.
+  for (int i = (int)module->dynsym_count - 1; i > 0; i--) {
+    const iree_elf_sym_t* sym = &module->dynsym[i];
+    iree_elf_byte_t bind = IREE_ELF_ST_BIND(sym->st_info);
+    if (bind != IREE_ELF_STB_GLOBAL && bind != IREE_ELF_STB_WEAK) continue;
+    if (sym->st_name == 0) continue;
+    if (strcmp(module->dynstr + sym->st_name, symbol_name) == 0) {
+      return sym;
+    }
+  }
+  return NULL;
+}
+
+//==============================================================================
+// API
+//==============================================================================
+
+// Loads, relocates, protects, and initializes the ELF in |raw_data|.
+// NOTE(review): |import_table| is currently unused - verify_no_imports rejects
+// any module that would need it (see the TODO below).
+iree_status_t iree_elf_module_initialize_from_memory(
+    iree_const_byte_span_t raw_data,
+    const iree_elf_import_table_t* import_table,
+    iree_allocator_t host_allocator, iree_elf_module_t* out_module) {
+  IREE_ASSERT_ARGUMENT(raw_data.data);
+  IREE_ASSERT_ARGUMENT(out_module);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Parse the ELF headers and verify that it's something we can handle.
+  // Temporary state required during loading such as references to subtables
+  // within the ELF are tracked here on the stack while persistent fields are
+  // initialized on |out_module|.
+  iree_elf_module_load_state_t load_state;
+  iree_status_t status =
+      iree_elf_module_parse_headers(raw_data, &load_state, out_module);
+  // NOTE: assigned after the call because parse_headers memsets |out_module|;
+  // set unconditionally so the failure-path deinitialize below is safe.
+  out_module->host_allocator = host_allocator;
+
+  // Allocate and load the ELF into memory.
+  iree_memory_jit_context_begin();
+  if (iree_status_is_ok(status)) {
+    status = iree_elf_module_load_segments(raw_data, &load_state, out_module);
+  }
+
+  // Parse required dynamic symbol tables in loaded memory. These are used for
+  // runtime symbol resolution and relocation.
+  if (iree_status_is_ok(status)) {
+    status = iree_elf_module_parse_dynamic_tables(&load_state, out_module);
+  }
+
+  // TODO(benvanik): imports would happen here. For now we just ensure there are
+  // no imports as otherwise things will fail with obscure messages later on.
+  if (iree_status_is_ok(status)) {
+    status = iree_elf_module_verify_no_imports(&load_state, out_module);
+  }
+
+  // Apply relocations to the loaded pages.
+  if (iree_status_is_ok(status)) {
+    status = iree_elf_module_apply_relocations(&load_state, out_module);
+  }
+
+  // Apply final protections to the loaded pages now that relocations have been
+  // performed.
+  if (iree_status_is_ok(status)) {
+    status = iree_elf_module_protect_segments(&load_state, out_module);
+  }
+  iree_memory_jit_context_end();
+
+  // Run initializers prior to returning to the caller.
+  if (iree_status_is_ok(status)) {
+    status = iree_elf_module_run_initializers(&load_state, out_module);
+  }
+
+  if (!iree_status_is_ok(status)) {
+    // On failure gracefully clean up the module by releasing any allocated
+    // memory during the partial initialization.
+    iree_elf_module_deinitialize(out_module);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Unloads the module and resets |module| to all-zero.
+// Safe on partially-initialized modules; initialize_from_memory relies on this
+// for its failure path.
+void iree_elf_module_deinitialize(iree_elf_module_t* module) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_elf_module_run_finalizers(module);
+  iree_elf_module_unload_segments(module);
+  memset(module, 0, sizeof(*module));
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Resolves |symbol_name| to a host pointer inside the loaded module.
+// The returned pointer aliases module memory and is invalidated by
+// iree_elf_module_deinitialize.
+iree_status_t iree_elf_module_lookup_export(iree_elf_module_t* module,
+                                            const char* symbol_name,
+                                            void** out_export) {
+  IREE_ASSERT_ARGUMENT(module);
+  IREE_ASSERT_ARGUMENT(out_export);
+  *out_export = NULL;
+
+  const iree_elf_sym_t* sym =
+      iree_elf_module_lookup_global_symbol(module, symbol_name);
+  if (IREE_UNLIKELY(!sym)) {
+    return iree_make_status(
+        IREE_STATUS_NOT_FOUND,
+        "exported symbol with name '%s' not found in module", symbol_name);
+  }
+
+  // st_value is a module-relative vaddr; rebase into host address space.
+  *out_export = module->vaddr_bias + sym->st_value;
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree/hal/local/elf/elf_module.h b/runtime/src/iree/hal/local/elf/elf_module.h
new file mode 100644
index 0000000..326673d
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/elf_module.h
@@ -0,0 +1,92 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_ELF_ELF_LINKER_H_
+#define IREE_HAL_LOCAL_ELF_ELF_LINKER_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/local/elf/arch.h"       // IWYU pragma: export
+#include "iree/hal/local/elf/elf_types.h"  // IWYU pragma: export
+
+//==============================================================================
+// ELF symbol import table
+//==============================================================================
+
+typedef struct iree_elf_import_t {
+  const char* sym_name;
+  void* thunk_ptr;
+} iree_elf_import_t;
+
+typedef struct iree_elf_import_table_t {
+  iree_host_size_t import_count;
+  const iree_elf_import_t* imports;
+} iree_elf_import_table_t;
+
+// TODO(benvanik): add import declaration macros that setup a unique thunk like
+// IREE_ELF_DEFINE_IMPORT(foo).
+
+//==============================================================================
+// Runtime ELF module loader/linker
+//==============================================================================
+
+// An ELF module mapped directly from memory.
+typedef struct iree_elf_module_t {
+  // Allocator used for additional dynamic memory when needed.
+  iree_allocator_t host_allocator;
+
+  // Base host virtual address the module is loaded into.
+  uint8_t* vaddr_base;
+  // Total size, in bytes, of the virtual address space reservation.
+  iree_host_size_t vaddr_size;
+
+  // Bias applied to all relative addresses (from the string table, etc) in the
+  // loaded module. This is an offset from the vaddr_base that may not be 0 if
+  // host page granularity was larger than the ELF's defined granularity.
+  uint8_t* vaddr_bias;
+
+  // Dynamic symbol string table (.dynstr).
+  const char* dynstr;            // DT_STRTAB
+  iree_host_size_t dynstr_size;  // DT_STRSZ (bytes)
+
+  // Dynamic symbol table (.dynsym).
+  const iree_elf_sym_t* dynsym;   // DT_SYMTAB
+  iree_host_size_t dynsym_count;  // DT_SYMENT (bytes) / sizeof(iree_elf_sym_t)
+} iree_elf_module_t;
+
+// Initializes an ELF module from the ELF |raw_data| in memory.
+// |raw_data| only needs to remain valid for the initialization of the module
+// and may be discarded afterward.
+//
+// An optional |import_table| may be specified to provide a set of symbols that
+// the module may import. Strong imports will not be resolved from the host
+// system and initialization will fail if any are not present in the provided
+// table.
+//
+// Upon return |out_module| is initialized and ready for use with any present
+// .init initialization functions having been executed. To release memory
+// allocated by the module during loading iree_elf_module_deinitialize must be
+// called to unload when it is safe (no more outstanding pointers into the
+// loaded module, etc).
+iree_status_t iree_elf_module_initialize_from_memory(
+    iree_const_byte_span_t raw_data,
+    const iree_elf_import_table_t* import_table,
+    iree_allocator_t host_allocator, iree_elf_module_t* out_module);
+
+// Deinitializes a |module|, releasing any allocated executable or data pages.
+// Invalidates all symbol pointers previous retrieved from the module and any
+// pointer to data that may have been in the module text or rwdata.
+//
+// NOTE: .fini finalizers will not be executed.
+void iree_elf_module_deinitialize(iree_elf_module_t* module);
+
+// Returns the host pointer of an exported symbol with the given |symbol_name|.
+iree_status_t iree_elf_module_lookup_export(iree_elf_module_t* module,
+                                            const char* symbol_name,
+                                            void** out_export);
+
+#endif  // IREE_HAL_LOCAL_ELF_ELF_LINKER_H_
diff --git a/runtime/src/iree/hal/local/elf/elf_module_test_main.c b/runtime/src/iree/hal/local/elf/elf_module_test_main.c
new file mode 100644
index 0000000..1a30698
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/elf_module_test_main.c
@@ -0,0 +1,166 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/hal/local/elf/elf_module.h"
+#include "iree/hal/local/executable_environment.h"
+#include "iree/hal/local/executable_library.h"
+
+// ELF modules for various platforms embedded in the binary:
+#include "iree/hal/local/elf/testdata/elementwise_mul.h"
+
+static iree_status_t query_arch_test_file_data(
+    iree_const_byte_span_t* out_file_data) {
+  *out_file_data = iree_make_const_byte_span(NULL, 0);
+
+  iree_string_view_t pattern = iree_string_view_empty();
+#if defined(IREE_ARCH_ARM_32)
+  pattern = iree_make_cstring_view("*_arm_32.so");
+#elif defined(IREE_ARCH_ARM_64)
+  pattern = iree_make_cstring_view("*_arm_64.so");
+#elif defined(IREE_ARCH_RISCV_32)
+  pattern = iree_make_cstring_view("*_riscv_32.so");
+#elif defined(IREE_ARCH_RISCV_64)
+  pattern = iree_make_cstring_view("*_riscv_64.so");
+#elif defined(IREE_ARCH_X86_32)
+  pattern = iree_make_cstring_view("*_x86_32.so");
+#elif defined(IREE_ARCH_X86_64)
+  pattern = iree_make_cstring_view("*_x86_64.so");
+#else
+#warning "No architecture pattern specified; ELF linker will not be tested"
+#endif  // IREE_ARCH_*
+
+  if (!iree_string_view_is_empty(pattern)) {
+    for (size_t i = 0; i < elementwise_mul_size(); ++i) {
+      const struct iree_file_toc_t* file_toc = &elementwise_mul_create()[i];
+      if (iree_string_view_match_pattern(iree_make_cstring_view(file_toc->name),
+                                         pattern)) {
+        *out_file_data =
+            iree_make_const_byte_span(file_toc->data, file_toc->size);
+        return iree_ok_status();
+      }
+    }
+  }
+
+  return iree_make_status(IREE_STATUS_NOT_FOUND,
+                          "no architecture-specific ELF binary embedded into "
+                          "the application for the current target platform");
+}
+
+static iree_status_t run_test() {
+  iree_const_byte_span_t file_data;
+  IREE_RETURN_IF_ERROR(query_arch_test_file_data(&file_data));
+
+  iree_elf_import_table_t import_table;
+  memset(&import_table, 0, sizeof(import_table));
+  iree_elf_module_t module;
+  IREE_RETURN_IF_ERROR(iree_elf_module_initialize_from_memory(
+      file_data, &import_table, iree_allocator_system(), &module));
+
+  iree_hal_executable_environment_v0_t environment;
+  iree_hal_executable_environment_initialize(iree_allocator_system(),
+                                             &environment);
+
+  void* query_fn_ptr = NULL;
+  IREE_RETURN_IF_ERROR(iree_elf_module_lookup_export(
+      &module, IREE_HAL_EXECUTABLE_LIBRARY_EXPORT_NAME, &query_fn_ptr));
+
+  union {
+    const iree_hal_executable_library_header_t** header;
+    const iree_hal_executable_library_v0_t* v0;
+  } library;
+  library.header =
+      (const iree_hal_executable_library_header_t**)iree_elf_call_p_ip(
+          query_fn_ptr, IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST,
+          &environment);
+  if (library.header == NULL) {
+    return iree_make_status(IREE_STATUS_NOT_FOUND,
+                            "library header is empty (version mismatch?)");
+  }
+
+  const iree_hal_executable_library_header_t* header = *library.header;
+  if (header->version != IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "library version error");
+  }
+
+  if (strncmp(header->name, "ex", strlen(header->name)) != 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "library name mismatches");
+  }
+
+  if (library.v0->exports.count != 1) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "entry point count mismatches");
+  }
+
+  // ret0 = arg0 * arg1
+  float arg0[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+  float arg1[4] = {100.0f, 200.0f, 300.0f, 400.0f};
+  float ret0[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+  const float expected[4] = {100.0f, 400.0f, 900.0f, 1600.0f};
+
+  size_t binding_lengths[3] = {
+      sizeof(arg0),
+      sizeof(arg1),
+      sizeof(ret0),
+  };
+  void* binding_ptrs[3] = {
+      arg0,
+      arg1,
+      ret0,
+  };
+  const iree_hal_executable_dispatch_state_v0_t dispatch_state = {
+      .workgroup_size_x = 1,
+      .workgroup_size_y = 1,
+      .workgroup_size_z = 1,
+      .workgroup_count_x = 1,
+      .workgroup_count_y = 1,
+      .workgroup_count_z = 1,
+      .max_concurrency = 1,
+      .binding_count = 1,
+      .binding_lengths = binding_lengths,
+      .binding_ptrs = binding_ptrs,
+  };
+  const iree_hal_executable_workgroup_state_v0_t workgroup_state = {
+      .workgroup_id_x = 0,
+      .workgroup_id_y = 0,
+      .workgroup_id_z = 0,
+      .processor_id = iree_cpu_query_processor_id(),
+  };
+  int ret = iree_elf_call_i_ppp((const void*)library.v0->exports.ptrs[0],
+                                (void*)&environment, (void*)&dispatch_state,
+                                (void*)&workgroup_state);
+  if (ret != 0) {
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "dispatch function returned failure: %d", ret);
+  }
+
+  iree_status_t status = iree_ok_status();
+  for (int i = 0; i < IREE_ARRAYSIZE(expected); ++i) {
+    if (ret0[i] != expected[i]) {
+      status =
+          iree_make_status(IREE_STATUS_INTERNAL,
+                           "output mismatch: ret[%d] = %.1f, expected %.1f", i,
+                           ret0[i], expected[i]);
+      break;
+    }
+  }
+
+  iree_elf_module_deinitialize(&module);
+  return status;
+}
+
+int main() {
+  const iree_status_t result = run_test();
+  int ret = (int)iree_status_code(result);
+  if (!iree_status_is_ok(result)) {
+    iree_status_fprint(stderr, result);
+    iree_status_free(result);
+  }
+  return ret;
+}
diff --git a/runtime/src/iree/hal/local/elf/elf_types.h b/runtime/src/iree/hal/local/elf/elf_types.h
new file mode 100644
index 0000000..3952786
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/elf_types.h
@@ -0,0 +1,420 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_ELF_ELF_TYPES_H_
+#define IREE_HAL_LOCAL_ELF_ELF_TYPES_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+
+// This file contains the ELF data structures we use in our runtime linker and
+// the definitions to support them. The structure definitions are taken from
+// the System V ABI:
+//   http://www.sco.com/developers/gabi/latest/contents.html
+// LLVM's BinaryFormat ELF headers:
+//   third_party/llvm-project/llvm/include/llvm/BinaryFormat/ELF.h
+// And the Linux specification:
+//   https://linux.die.net/man/5/elf
+//   https://refspecs.linuxbase.org/LSB_3.1.1/LSB-Core-generic/LSB-Core-generic.html
+// (among others)
+//
+// We define both 32-bit and 64-bit variants of the structures as we support
+// both; however we only ever use one at a time based on the target
+// configuration so that we are only including the code for the
+// architecture-native integer width.
+//
+// We purposefully avoid inserting a large number of enums that we never use:
+// this implementation is just to load our own compiled HAL executables and as
+// such we control both the linker configuration used to produce the inputs we
+// load.
+//
+// Code can generally be written using only the iree_elf_* types and IREE_ELF_*
+// macros; if used consistently then only one source code definition is required
+// and it'll get compiled into the appropriate form with no additional
+// configuration.
+
// Primitive ELF field types per the System V ABI "Data Representation":
// iree_elf32_* mirror Elf32_* and iree_elf64_* mirror Elf64_*.
typedef uint8_t iree_elf32_byte_t;
typedef uint32_t iree_elf32_addr_t;
typedef uint16_t iree_elf32_half_t;
typedef uint32_t iree_elf32_off_t;
typedef int32_t iree_elf32_sword_t;
typedef uint32_t iree_elf32_word_t;

// NOTE: Elf64_Sword/Elf64_Word are 32-bit even in 64-bit ELF; only the
// xword/sxword variants widen to 64 bits.
typedef uint8_t iree_elf64_byte_t;
typedef uint64_t iree_elf64_addr_t;
typedef uint16_t iree_elf64_half_t;
typedef uint64_t iree_elf64_off_t;
typedef int32_t iree_elf64_sword_t;
typedef uint32_t iree_elf64_word_t;
typedef uint64_t iree_elf64_xword_t;
typedef int64_t iree_elf64_sxword_t;

// Indices into the e_ident identification bytes at the start of the header.
enum {
  IREE_ELF_EI_CLASS = 4,       // IREE_ELF_ELFCLASS*
  IREE_ELF_EI_DATA = 5,        // IREE_ELF_ELFDATA*
  IREE_ELF_EI_VERSION = 6,     // File version (1 expected)
  IREE_ELF_EI_OSABI = 7,       // Operating system/ABI identification
  IREE_ELF_EI_ABIVERSION = 8,  // ABI version
  IREE_ELF_EI_PAD = 9,         // Start of padding bytes
  IREE_ELF_EI_NIDENT = 16,     // Size of e_ident[]
};

// e_ident[IREE_ELF_EI_CLASS] values.
enum {
  IREE_ELF_ELFCLASSNONE = 0,  // Invalid class
  IREE_ELF_ELFCLASS32 = 1,    // 32-bit objects
  IREE_ELF_ELFCLASS64 = 2,    // 64-bit objects
};

// e_ident[IREE_ELF_EI_DATA] values.
enum {
  IREE_ELF_ELFDATANONE = 0,  // Invalid data encoding
  IREE_ELF_ELFDATA2LSB = 1,  // Little-endian
  IREE_ELF_ELFDATA2MSB = 2,  // Big-endian
};

// e_type values identifying the object file kind.
enum {
  IREE_ELF_ET_NONE = 0,  // No file type
  IREE_ELF_ET_REL = 1,   // Relocatable file
  IREE_ELF_ET_EXEC = 2,  // Executable file
  IREE_ELF_ET_DYN = 3,   // Shared object file
  IREE_ELF_ET_CORE = 4,  // Core file
};
+
// ELF file header (Elf32_Ehdr).
typedef struct {
  iree_elf32_byte_t e_ident[IREE_ELF_EI_NIDENT];
  iree_elf32_half_t e_type;  // IREE_ELF_ET_*
  iree_elf32_half_t e_machine;
  iree_elf32_word_t e_version;
  iree_elf32_addr_t e_entry;
  iree_elf32_off_t e_phoff;
  iree_elf32_off_t e_shoff;
  iree_elf32_word_t e_flags;
  iree_elf32_half_t e_ehsize;
  iree_elf32_half_t e_phentsize;
  iree_elf32_half_t e_phnum;
  iree_elf32_half_t e_shentsize;
  iree_elf32_half_t e_shnum;
  iree_elf32_half_t e_shstrndx;
} iree_elf32_ehdr_t;

// ELF file header (Elf64_Ehdr).
typedef struct {
  iree_elf64_byte_t e_ident[IREE_ELF_EI_NIDENT];
  iree_elf64_half_t e_type;  // IREE_ELF_ET_*
  iree_elf64_half_t e_machine;
  iree_elf64_word_t e_version;
  iree_elf64_addr_t e_entry;
  iree_elf64_off_t e_phoff;
  iree_elf64_off_t e_shoff;
  iree_elf64_word_t e_flags;
  iree_elf64_half_t e_ehsize;
  iree_elf64_half_t e_phentsize;
  iree_elf64_half_t e_phnum;
  iree_elf64_half_t e_shentsize;
  iree_elf64_half_t e_shnum;
  iree_elf64_half_t e_shstrndx;
} iree_elf64_ehdr_t;

// Program header p_type values.
enum {
  IREE_ELF_PT_NULL = 0,
  IREE_ELF_PT_LOAD = 1,
  IREE_ELF_PT_DYNAMIC = 2,
  IREE_ELF_PT_INTERP = 3,
  IREE_ELF_PT_NOTE = 4,
  IREE_ELF_PT_SHLIB = 5,
  IREE_ELF_PT_PHDR = 6,
  IREE_ELF_PT_GNU_RELRO = 0x6474e552,
};

// Program header p_flags segment permission bits.
enum {
  IREE_ELF_PF_X = 0x1,  // Execute
  IREE_ELF_PF_W = 0x2,  // Write
  IREE_ELF_PF_R = 0x4,  // Read
};

// Program header (Elf32_Phdr).
typedef struct {
  iree_elf32_word_t p_type;  // IREE_ELF_PT_*
  iree_elf32_off_t p_offset;
  iree_elf32_addr_t p_vaddr;
  iree_elf32_addr_t p_paddr;
  iree_elf32_word_t p_filesz;
  iree_elf32_word_t p_memsz;
  iree_elf32_word_t p_flags;  // IREE_ELF_PF_*
  iree_elf32_word_t p_align;
} iree_elf32_phdr_t;

// Program header (Elf64_Phdr).
// NOTE: field order differs from the 32-bit form (p_flags moves up) per spec.
typedef struct {
  iree_elf64_word_t p_type;   // IREE_ELF_PT_*
  iree_elf64_word_t p_flags;  // IREE_ELF_PF_*
  iree_elf64_off_t p_offset;
  iree_elf64_addr_t p_vaddr;
  iree_elf64_addr_t p_paddr;
  iree_elf64_xword_t p_filesz;
  iree_elf64_xword_t p_memsz;
  iree_elf64_xword_t p_align;
} iree_elf64_phdr_t;

// An undefined, missing, irrelevant, or otherwise meaningless section ref.
#define IREE_ELF_SHN_UNDEF 0

// Section header sh_type values.
enum {
  IREE_ELF_SHT_NULL = 0,
  IREE_ELF_SHT_PROGBITS = 1,
  IREE_ELF_SHT_SYMTAB = 2,
  IREE_ELF_SHT_STRTAB = 3,
  IREE_ELF_SHT_RELA = 4,
  IREE_ELF_SHT_HASH = 5,
  IREE_ELF_SHT_DYNAMIC = 6,
  IREE_ELF_SHT_NOTE = 7,
  IREE_ELF_SHT_NOBITS = 8,
  IREE_ELF_SHT_REL = 9,
  IREE_ELF_SHT_SHLIB = 10,
  IREE_ELF_SHT_DYNSYM = 11,
};

// Section header sh_flags bits.
enum {
  IREE_ELF_SHF_WRITE = 0x1,
  IREE_ELF_SHF_ALLOC = 0x2,
  IREE_ELF_SHF_EXECINSTR = 0x4,
  IREE_ELF_SHF_MERGE = 0x10,
  IREE_ELF_SHF_STRINGS = 0x20,
  IREE_ELF_SHF_INFO_LINK = 0x40,
  IREE_ELF_SHF_LINK_ORDER = 0x80,
  IREE_ELF_SHF_OS_NONCONFORMING = 0x100,
  IREE_ELF_SHF_GROUP = 0x200
};

// Section header (Elf32_Shdr).
typedef struct {
  iree_elf32_word_t sh_name;
  iree_elf32_word_t sh_type;   // IREE_ELF_SHT_*
  iree_elf32_word_t sh_flags;  // IREE_ELF_SHF_*
  iree_elf32_addr_t sh_addr;
  iree_elf32_off_t sh_offset;
  iree_elf32_word_t sh_size;
  iree_elf32_word_t sh_link;
  iree_elf32_word_t sh_info;
  iree_elf32_word_t sh_addralign;
  iree_elf32_word_t sh_entsize;
} iree_elf32_shdr_t;

// Section header (Elf64_Shdr).
typedef struct {
  iree_elf64_word_t sh_name;
  iree_elf64_word_t sh_type;    // IREE_ELF_SHT_*
  iree_elf64_xword_t sh_flags;  // IREE_ELF_SHF_*
  iree_elf64_addr_t sh_addr;
  iree_elf64_off_t sh_offset;
  iree_elf64_xword_t sh_size;
  iree_elf64_word_t sh_link;
  iree_elf64_word_t sh_info;
  iree_elf64_xword_t sh_addralign;
  iree_elf64_xword_t sh_entsize;
} iree_elf64_shdr_t;

// Note header (Elf32_Nhdr).
typedef struct {
  iree_elf32_word_t n_namesz;
  iree_elf32_word_t n_descsz;
  iree_elf32_word_t n_type;
} iree_elf32_nhdr_t;

// Note header (Elf64_Nhdr).
typedef struct {
  iree_elf64_word_t n_namesz;
  iree_elf64_word_t n_descsz;
  iree_elf64_word_t n_type;
} iree_elf64_nhdr_t;
+
// Packs a symbol binding and type into an st_info byte (ELFNN_ST_INFO).
#define IREE_ELF_ST_INFO(bind, type) (((bind) << 4) + ((type)&0xF))

// Extracts the symbol type from an st_info byte (ELFNN_ST_TYPE).
#define IREE_ELF_ST_TYPE(info) ((info)&0xF)
enum {
  IREE_ELF_STT_NOTYPE = 0,
  IREE_ELF_STT_OBJECT = 1,
  IREE_ELF_STT_FUNC = 2,
  IREE_ELF_STT_SECTION = 3,
  IREE_ELF_STT_FILE = 4,
  IREE_ELF_STT_COMMON = 5,
};

// Extracts the symbol binding from an st_info byte (ELFNN_ST_BIND).
#define IREE_ELF_ST_BIND(info) ((info) >> 4)
enum {
  IREE_ELF_STB_LOCAL = 0,   // Local symbol.
  IREE_ELF_STB_GLOBAL = 1,  // Global symbol (export).
  IREE_ELF_STB_WEAK = 2,    // Weak symbol (somewhat like global).
};

// Extracts the visibility from an st_other byte (ELFNN_ST_VISIBILITY).
#define IREE_ELF_ST_VISIBILITY(o) ((o)&0x3)
enum {
  IREE_ELF_STV_DEFAULT = 0,
  IREE_ELF_STV_INTERNAL = 1,
  IREE_ELF_STV_HIDDEN = 2,
  IREE_ELF_STV_PROTECTED = 3,
};

// Symbol table entry (Elf32_Sym).
typedef struct {
  iree_elf32_word_t st_name;
  iree_elf32_addr_t st_value;
  iree_elf32_word_t st_size;
  iree_elf32_byte_t st_info;
  iree_elf32_byte_t st_other;
  iree_elf32_half_t st_shndx;
} iree_elf32_sym_t;

// Symbol table entry (Elf64_Sym).
// NOTE: field order differs from the 32-bit form per spec.
typedef struct {
  iree_elf64_word_t st_name;
  iree_elf64_byte_t st_info;
  iree_elf64_byte_t st_other;
  iree_elf64_half_t st_shndx;
  iree_elf64_addr_t st_value;
  iree_elf64_xword_t st_size;
} iree_elf64_sym_t;

// Dynamic section d_tag values; the trailing comment notes which d_un member
// each tag uses.
enum {
  IREE_ELF_DT_NULL = 0,                   // (no data)
  IREE_ELF_DT_NEEDED = 1,                 // d_val
  IREE_ELF_DT_PLTRELSZ = 2,               // d_val
  IREE_ELF_DT_PLTGOT = 3,                 // d_ptr
  IREE_ELF_DT_HASH = 4,                   // d_ptr
  IREE_ELF_DT_STRTAB = 5,                 // d_ptr
  IREE_ELF_DT_SYMTAB = 6,                 // d_ptr
  IREE_ELF_DT_RELA = 7,                   // d_ptr
  IREE_ELF_DT_RELASZ = 8,                 // d_val
  IREE_ELF_DT_RELAENT = 9,                // d_val
  IREE_ELF_DT_STRSZ = 10,                 // d_val
  IREE_ELF_DT_SYMENT = 11,                // d_val
  IREE_ELF_DT_INIT = 12,                  // d_ptr
  IREE_ELF_DT_FINI = 13,                  // d_ptr
  IREE_ELF_DT_SONAME = 14,                // d_val
  IREE_ELF_DT_RPATH = 15,                 // d_val
  IREE_ELF_DT_SYMBOLIC = 16,              // (no data)
  IREE_ELF_DT_REL = 17,                   // d_ptr
  IREE_ELF_DT_RELSZ = 18,                 // d_val
  IREE_ELF_DT_RELENT = 19,                // d_val
  IREE_ELF_DT_PLTREL = 20,                // d_val
  IREE_ELF_DT_TEXTREL = 22,               // (no data)
  IREE_ELF_DT_JMPREL = 23,                // d_ptr
  IREE_ELF_DT_BIND_NOW = 24,              // (no data)
  IREE_ELF_DT_INIT_ARRAY = 25,            // d_ptr
  IREE_ELF_DT_FINI_ARRAY = 26,            // d_ptr
  IREE_ELF_DT_INIT_ARRAYSZ = 27,          // d_val
  IREE_ELF_DT_FINI_ARRAYSZ = 28,          // d_val
  IREE_ELF_DT_RUNPATH = 29,               // d_val
  IREE_ELF_DT_FLAGS = 30,                 // d_val
  IREE_ELF_DT_SUNW_RTLDINF = 0x6000000e,  // d_ptr
  IREE_ELF_DT_CHECKSUM = 0x6ffffdf8,      // d_val
  IREE_ELF_DT_PLTPADSZ = 0x6ffffdf9,      // d_val
  IREE_ELF_DT_MOVEENT = 0x6ffffdfa,       // d_val
  IREE_ELF_DT_MOVESZ = 0x6ffffdfb,        // d_val
  IREE_ELF_DT_FEATURE_1 = 0x6ffffdfc,     // d_val
  IREE_ELF_DT_POSFLAG_1 = 0x6ffffdfd,     // d_val
  IREE_ELF_DT_SYMINSZ = 0x6ffffdfe,       // d_val
  IREE_ELF_DT_SYMINENT = 0x6ffffdff,      // d_val
  IREE_ELF_DT_CONFIG = 0x6ffffefa,        // d_ptr
  IREE_ELF_DT_DEPAUDIT = 0x6ffffefb,      // d_ptr
  IREE_ELF_DT_AUDIT = 0x6ffffefc,         // d_ptr
  IREE_ELF_DT_PLTPAD = 0x6ffffefd,        // d_ptr
  IREE_ELF_DT_MOVETAB = 0x6ffffefe,       // d_ptr
  IREE_ELF_DT_SYMINFO = 0x6ffffeff,       // d_ptr
  IREE_ELF_DT_RELACOUNT = 0x6ffffff9,     // d_val
  IREE_ELF_DT_RELCOUNT = 0x6ffffffa,      // d_val
  IREE_ELF_DT_FLAGS_1 = 0x6ffffffb,       // d_val
  IREE_ELF_DT_VERDEF = 0x6ffffffc,        // d_ptr
  IREE_ELF_DT_VERDEFNUM = 0x6ffffffd,     // d_val
  IREE_ELF_DT_VERNEED = 0x6ffffffe,       // d_ptr
  IREE_ELF_DT_VERNEEDNUM = 0x6fffffff,    // d_val
  IREE_ELF_DT_AUXILIARY = 0x7ffffffd,     // d_val
  IREE_ELF_DT_USED = 0x7ffffffe,          // d_val
};

// Dynamic section entry (Elf32_Dyn).
typedef struct {
  iree_elf32_sword_t d_tag;  // IREE_ELF_DT_*
  union {
    iree_elf32_sword_t d_val;
    iree_elf32_addr_t d_ptr;
  } d_un;
} iree_elf32_dyn_t;

// Dynamic section entry (Elf64_Dyn).
typedef struct {
  iree_elf64_sxword_t d_tag;  // IREE_ELF_DT_*
  union {
    iree_elf64_xword_t d_val;
    iree_elf64_addr_t d_ptr;
  } d_un;
} iree_elf64_dyn_t;

// Relocation without addend (Elf32_Rel).
typedef struct {
  iree_elf32_addr_t r_offset;
  iree_elf32_word_t r_info;
} iree_elf32_rel_t;

// Relocation without addend (Elf64_Rel).
typedef struct {
  iree_elf64_addr_t r_offset;
  iree_elf64_xword_t r_info;
} iree_elf64_rel_t;

// Relocation with explicit addend (Elf32_Rela).
typedef struct {
  iree_elf32_addr_t r_offset;
  iree_elf32_word_t r_info;
  iree_elf32_sword_t r_addend;
} iree_elf32_rela_t;

// Relocation with explicit addend (Elf64_Rela).
typedef struct {
  iree_elf64_addr_t r_offset;
  iree_elf64_xword_t r_info;
  iree_elf64_sxword_t r_addend;
} iree_elf64_rela_t;

// Select the pointer-width-native iree_elf_* aliases so loader code can be
// written once against the generic names.
#if defined(IREE_PTR_SIZE_32)

#define IREE_ELF_ADDR_MIN 0u
#define IREE_ELF_ADDR_MAX UINT32_MAX

typedef iree_elf32_byte_t iree_elf_byte_t;
typedef iree_elf32_addr_t iree_elf_addr_t;
typedef iree_elf32_half_t iree_elf_half_t;
typedef iree_elf32_off_t iree_elf_off_t;
typedef iree_elf32_sword_t iree_elf_sword_t;
typedef iree_elf32_word_t iree_elf_word_t;

typedef iree_elf32_dyn_t iree_elf_dyn_t;
typedef iree_elf32_rel_t iree_elf_rel_t;
typedef iree_elf32_rela_t iree_elf_rela_t;
typedef iree_elf32_sym_t iree_elf_sym_t;
typedef iree_elf32_ehdr_t iree_elf_ehdr_t;
typedef iree_elf32_phdr_t iree_elf_phdr_t;
typedef iree_elf32_shdr_t iree_elf_shdr_t;
typedef iree_elf32_nhdr_t iree_elf_nhdr_t;

// r_info decomposition (ELF32_R_SYM / ELF32_R_TYPE).
#define IREE_ELF_R_SYM(x) ((x) >> 8)
#define IREE_ELF_R_TYPE(x) ((x)&0xFF)

#elif defined(IREE_PTR_SIZE_64)

#define IREE_ELF_ADDR_MIN 0ull
#define IREE_ELF_ADDR_MAX UINT64_MAX

typedef iree_elf64_byte_t iree_elf_byte_t;
typedef iree_elf64_addr_t iree_elf_addr_t;
typedef iree_elf64_half_t iree_elf_half_t;
typedef iree_elf64_off_t iree_elf_off_t;
typedef iree_elf64_sword_t iree_elf_sword_t;
typedef iree_elf64_word_t iree_elf_word_t;

typedef iree_elf64_dyn_t iree_elf_dyn_t;
typedef iree_elf64_rel_t iree_elf_rel_t;
typedef iree_elf64_rela_t iree_elf_rela_t;
typedef iree_elf64_sym_t iree_elf_sym_t;
typedef iree_elf64_ehdr_t iree_elf_ehdr_t;
typedef iree_elf64_phdr_t iree_elf_phdr_t;
typedef iree_elf64_shdr_t iree_elf_shdr_t;
typedef iree_elf64_nhdr_t iree_elf_nhdr_t;

// r_info decomposition (ELF64_R_SYM / ELF64_R_TYPE).
#define IREE_ELF_R_SYM(i) ((i) >> 32)
#define IREE_ELF_R_TYPE(i) ((i)&0xFFFFFFFF)

#else
#error "unsupported ELF N size (only 32/64-bits are defined)"
#endif  // IREE_PTR_SIZE_*

#endif  // IREE_HAL_LOCAL_ELF_ELF_TYPES_H_
diff --git a/runtime/src/iree/hal/local/elf/platform.h b/runtime/src/iree/hal/local/elf/platform.h
new file mode 100644
index 0000000..03af89b
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/platform.h
@@ -0,0 +1,177 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_ELF_PLATFORM_H_
+#define IREE_HAL_LOCAL_ELF_PLATFORM_H_
+
+#include "iree/base/api.h"
+
+// TODO(benvanik): move some of this to iree/base/internal/. A lot of this code
+// comes from an old partial implementation of memory objects that should be
+// finished. When done it will replace the need for all of these platform files.
+
+//==============================================================================
+// Alignment utilities
+//==============================================================================
+
// Defines a range of bytes with any arbitrary alignment.
// Most operations will adjust this range by the allocation granularity, meaning
// that a range that straddles a page boundary will be specifying multiple pages
// (such as offset=1, length=4096 with a page size of 4096 indicating 2 pages).
typedef struct iree_byte_range_t {
  iree_host_size_t offset;  // byte offset relative to some base address
  iree_host_size_t length;  // length of the range in bytes
} iree_byte_range_t;
+
+static inline uintptr_t iree_page_align_start(uintptr_t addr,
+                                              iree_host_size_t page_alignment) {
+  return addr & (~(page_alignment - 1));
+}
+
+static inline uintptr_t iree_page_align_end(uintptr_t addr,
+                                            iree_host_size_t page_alignment) {
+  return iree_page_align_start(addr + (page_alignment - 1), page_alignment);
+}
+
+// Computes a page-aligned range base and total length from a range.
+// This will produce a starting address <= the range offset and a length >=
+// the range length.
+static inline void iree_page_align_range(void* base_address,
+                                         iree_byte_range_t range,
+                                         iree_host_size_t page_alignment,
+                                         void** out_start_address,
+                                         iree_host_size_t* out_aligned_length) {
+  void* range_start = (void*)iree_page_align_start(
+      (uintptr_t)base_address + range.offset, page_alignment);
+  void* range_end = (void*)iree_page_align_end(
+      (uintptr_t)base_address + range.offset + range.length, page_alignment);
+  *out_start_address = range_start;
+  *out_aligned_length =
+      (iree_host_size_t)range_end - (iree_host_size_t)range_start;
+}
+
+//==============================================================================
+// Memory subsystem information and control
+//==============================================================================
+
+// System platform/environment information defining memory parameters.
+// These can be used to control application behavior (such as whether to enable
+// a JIT if executable pages can be allocated) and allow callers to compute
+// memory ranges based on the variable page size of the platform.
+typedef struct iree_memory_info_t {
+  // The page size and the granularity of page protection and commitment. This
+  // is the page size used by the iree_memory_view_t functions.
+  iree_host_size_t normal_page_size;
+
+  // The granularity for the starting address at which virtual memory can be
+  // allocated.
+  iree_host_size_t normal_page_granularity;
+
+  // The minimum page size and granularity for large pages or 0 if unavailable.
+  // To use large pages the size and alignment must be a multiple of this value
+  // and the IREE_MEMORY_VIEW_FLAG_LARGE_PAGES must be set.
+  iree_host_size_t large_page_granularity;
+
+  // Indicates whether executable pages may be allocated within the process.
+  // Some platforms or release environments have restrictions on whether
+  // executable pages may be allocated from user code (such as iOS).
+  bool can_allocate_executable_pages;
+} iree_memory_info_t;
+
+// Queries the system platform/environment memory information.
+// Callers should cache the results to avoid repeated queries, such as storing
+// the used fields in an allocator upon initialization to reuse during
+// allocations made via the allocator.
+void iree_memory_query_info(iree_memory_info_t* out_info);
+
+// Enters a W^X region where pages will be changed RW->RX or RX->RW and write
+// protection should be suspended. Only affects the calling thread and must be
+// paired with iree_memory_jit_context_end.
+void iree_memory_jit_context_begin(void);
+
+// Exits a W^X region previously entered with iree_memory_jit_context_begin.
+void iree_memory_jit_context_end(void);
+
+//==============================================================================
+// Virtual address space manipulation
+//==============================================================================
+
+// Defines which access operations are allowed on a view of memory.
+// Attempts to perform an access not originally allowed when the view was
+// defined may result in process termination/exceptions/sadness on platforms
+// with real MMUs and are generally not detectable: treat limited access as a
+// fail-safe mechanism only.
+enum iree_memory_access_bits_t {
+  // Pages in the view may be read by the process.
+  // Some platforms may not respect this value being unset meaning that reads
+  // will still succeed.
+  IREE_MEMORY_ACCESS_READ = 1u << 0,
+  // Pages in the view may be written by the process.
+  // If unset then writes will result in process termination.
+  IREE_MEMORY_ACCESS_WRITE = 1u << 1,
+  // Pages in the view can be executed as native machine code.
+  // Callers must ensure iree_memory_info_t::can_allocate_executable_pages is
+  // true prior to requesting executable memory as certain platforms or release
+  // environments may not support allocating/using executable pages.
+  IREE_MEMORY_ACCESS_EXECUTE = 1u << 2,
+};
+typedef uint32_t iree_memory_access_t;
+
+// Flags used to control the behavior of allocated memory views.
+enum iree_memory_view_flag_bits_t {
+  // TODO(benvanik): pull from memory_object.h.
+  IREE_MEMORY_VIEW_FLAG_NONE = 0u,
+
+  // Indicates that the memory may be used to execute code.
+  // May be used to ask for special privileges (like MAP_JIT on MacOS).
+  IREE_MEMORY_VIEW_FLAG_MAY_EXECUTE = 1u << 10,
+};
+typedef uint32_t iree_memory_view_flags_t;
+
+// Reserves a range of virtual address space in the host process.
+// The base alignment will be that of the page granularity as specified
+// (normal or large) in |flags| and |total_length| will be adjusted to match.
+//
+// The resulting range at |out_base_address| will be uncommitted and
+// inaccessible on systems with memory protection. Pages within the range must
+// first be committed with iree_memory_view_commit_ranges and then may have
+// their access permissions changed with iree_memory_view_protect_ranges.
+//
+// Implemented by VirtualAlloc+MEM_RESERVE/mmap+PROT_NONE.
+iree_status_t iree_memory_view_reserve(iree_memory_view_flags_t flags,
+                                       iree_host_size_t total_length,
+                                       iree_allocator_t allocator,
+                                       void** out_base_address);
+
+// Releases a range of virtual address space reserved by iree_memory_view_reserve.
+void iree_memory_view_release(void* base_address, iree_host_size_t total_length,
+                              iree_allocator_t allocator);
+
+// Commits pages overlapping the byte ranges defined by |byte_ranges|.
+// Ranges will be adjusted to the page granularity of the view.
+//
+// Implemented by VirtualAlloc+MEM_COMMIT/mmap+!PROT_NONE.
+iree_status_t iree_memory_view_commit_ranges(
+    void* base_address, iree_host_size_t range_count,
+    const iree_byte_range_t* ranges, iree_memory_access_t initial_access);
+
+// Changes the access protection of view byte ranges defined by |byte_ranges|.
+// Ranges will be adjusted to the page granularity of the view.
+//
+// Implemented by VirtualProtect/mprotect:
+//  https://docs.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-virtualprotect
+//  https://man7.org/linux/man-pages/man2/mprotect.2.html
+iree_status_t iree_memory_view_protect_ranges(void* base_address,
+                                              iree_host_size_t range_count,
+                                              const iree_byte_range_t* ranges,
+                                              iree_memory_access_t new_access);
+
+// Flushes the CPU instruction cache for a given range of bytes.
+// May be a no-op depending on architecture, but must be called prior to
+// executing code from any pages that have been written during load.
+void iree_memory_view_flush_icache(void* base_address, iree_host_size_t length);
+
+#endif  // IREE_HAL_LOCAL_ELF_PLATFORM_H_
diff --git a/runtime/src/iree/hal/local/elf/platform/apple.c b/runtime/src/iree/hal/local/elf/platform/apple.c
new file mode 100644
index 0000000..c6c8129
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/platform/apple.c
@@ -0,0 +1,179 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/local/elf/platform.h"
+
+#if defined(IREE_PLATFORM_APPLE)
+
+// NOTE: because Apple there's some hoop-jumping to get executable code.
+// https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
+// https://keith.github.io/xcode-man-pages/pthread_jit_write_protect_np.3.html
+
+#include <errno.h>
+#include <libkern/OSCacheControl.h>
+#include <mach/vm_statistics.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+// MAP_JIT and related utilities are only available on MacOS 11.0+.
+#if defined(MAC_OS_VERSION_11_0) && \
+    MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0
+#define IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0(expr) \
+  if (__builtin_available(macOS 11.0, *)) {      \
+    expr                                         \
+  }
+#else
+#define IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0(expr)
+#endif  // MAC_OS_VERSION_11_0
+
+//==============================================================================
+// Memory subsystem information and control
+//==============================================================================
+
+void iree_memory_query_info(iree_memory_info_t* out_info) {
+  memset(out_info, 0, sizeof(*out_info));
+
+  int page_size = sysconf(_SC_PAGESIZE);
+  out_info->normal_page_size = page_size;
+  out_info->normal_page_granularity = page_size;
+  out_info->large_page_granularity = (2 * 1024 * 1024);  // What V8 uses.
+
+  out_info->can_allocate_executable_pages = true;
+}
+
+void iree_memory_jit_context_begin(void) {
+  IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0({
+    if (pthread_jit_write_protect_supported_np()) {
+      pthread_jit_write_protect_np(0);
+    }
+  });
+}
+
+void iree_memory_jit_context_end(void) {
+  IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0({
+    if (pthread_jit_write_protect_supported_np()) {
+      pthread_jit_write_protect_np(1);
+    }
+  });
+}
+
+//==============================================================================
+// Virtual address space manipulation
+//==============================================================================
+
+// This user tag makes it easier to find our pages in vmmap dumps.
+#define IREE_MEMORY_MMAP_FD VM_MAKE_TAG(255)
+
+static int iree_memory_access_to_prot(iree_memory_access_t access) {
+  int prot = 0;
+  if (access & IREE_MEMORY_ACCESS_READ) prot |= PROT_READ;
+  if (access & IREE_MEMORY_ACCESS_WRITE) prot |= PROT_WRITE;
+  if (access & IREE_MEMORY_ACCESS_EXECUTE) prot |= PROT_EXEC;
+  return prot;
+}
+
+iree_status_t iree_memory_view_reserve(iree_memory_view_flags_t flags,
+                                       iree_host_size_t total_length,
+                                       iree_allocator_t allocator,
+                                       void** out_base_address) {
+  *out_base_address = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  int mmap_prot = PROT_NONE;
+  int mmap_flags = MAP_PRIVATE | MAP_ANON | MAP_NORESERVE;
+  IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0({
+    if (flags & IREE_MEMORY_VIEW_FLAG_MAY_EXECUTE) {
+      mmap_flags |= MAP_JIT;
+    }
+  });
+
+  iree_status_t status = iree_ok_status();
+  void* base_address =
+      mmap(NULL, total_length, mmap_prot, mmap_flags, IREE_MEMORY_MMAP_FD, 0);
+  if (base_address == MAP_FAILED) {
+    status = iree_make_status(iree_status_code_from_errno(errno),
+                              "mmap reservation failed");
+  }
+
+  *out_base_address = base_address;
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_memory_view_release(void* base_address, iree_host_size_t total_length,
+                              iree_allocator_t allocator) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // NOTE: return value ignored as this is a shutdown path.
+  munmap(base_address, total_length);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_memory_view_commit_ranges(
+    void* base_address, iree_host_size_t range_count,
+    const iree_byte_range_t* ranges, iree_memory_access_t initial_access) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  int mmap_prot = iree_memory_access_to_prot(initial_access);
+  int mmap_flags = MAP_PRIVATE | MAP_ANON | MAP_FIXED;
+
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < range_count; ++i) {
+    void* range_start = NULL;
+    iree_host_size_t aligned_length = 0;
+    iree_page_align_range(base_address, ranges[i], getpagesize(), &range_start,
+                          &aligned_length);
+    void* result = mmap(range_start, aligned_length, mmap_prot, mmap_flags,
+                        IREE_MEMORY_MMAP_FD, 0);
+    if (result == MAP_FAILED) {
+      status = iree_make_status(iree_status_code_from_errno(errno),
+                                "mmap commit failed");
+      break;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+iree_status_t iree_memory_view_protect_ranges(void* base_address,
+                                              iree_host_size_t range_count,
+                                              const iree_byte_range_t* ranges,
+                                              iree_memory_access_t new_access) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  int mmap_prot = iree_memory_access_to_prot(new_access);
+
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < range_count; ++i) {
+    void* range_start = NULL;
+    iree_host_size_t aligned_length = 0;
+    iree_page_align_range(base_address, ranges[i], getpagesize(), &range_start,
+                          &aligned_length);
+    int ret = mprotect(range_start, aligned_length, mmap_prot);
+    if (ret != 0) {
+      status = iree_make_status(iree_status_code_from_errno(errno),
+                                "mprotect failed");
+      break;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void sys_icache_invalidate(void* start, size_t len);
+
+void iree_memory_view_flush_icache(void* base_address,
+                                   iree_host_size_t length) {
+  sys_icache_invalidate(base_address, length);
+}
+
+#endif  // IREE_PLATFORM_APPLE
diff --git a/runtime/src/iree/hal/local/elf/platform/generic.c b/runtime/src/iree/hal/local/elf/platform/generic.c
new file mode 100644
index 0000000..0f68592
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/platform/generic.c
@@ -0,0 +1,99 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/local/elf/platform.h"
+
+#if defined(IREE_PLATFORM_GENERIC)
+
+#include <malloc.h>
+#include <stdlib.h>
+
+//==============================================================================
+// Memory subsystem information and control
+//==============================================================================
+
+// TODO(benvanik): control with a config.h.
+#define IREE_MEMORY_PAGE_SIZE_NORMAL 4096
+#define IREE_MEMORY_PAGE_SIZE_LARGE 4096
+
+void iree_memory_query_info(iree_memory_info_t* out_info) {
+  memset(out_info, 0, sizeof(*out_info));
+
+  out_info->normal_page_size = IREE_MEMORY_PAGE_SIZE_NORMAL;
+  out_info->normal_page_granularity = IREE_MEMORY_PAGE_SIZE_NORMAL;
+  out_info->large_page_granularity = IREE_MEMORY_PAGE_SIZE_LARGE;
+
+  out_info->can_allocate_executable_pages = true;
+}
+
+void iree_memory_jit_context_begin(void) {}
+
+void iree_memory_jit_context_end(void) {}
+
+//==============================================================================
+// Virtual address space manipulation
+//==============================================================================
+
+iree_status_t iree_memory_view_reserve(iree_memory_view_flags_t flags,
+                                       iree_host_size_t total_length,
+                                       iree_allocator_t allocator,
+                                       void** out_base_address) {
+  *out_base_address = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status =
+      iree_allocator_malloc(allocator, total_length, out_base_address);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_memory_view_release(void* base_address, iree_host_size_t total_length,
+                              iree_allocator_t allocator) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_allocator_free(allocator, base_address);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_memory_view_commit_ranges(
+    void* base_address, iree_host_size_t range_count,
+    const iree_byte_range_t* ranges, iree_memory_access_t initial_access) {
+  // No-op.
+  return iree_ok_status();
+}
+
+iree_status_t iree_memory_view_protect_ranges(void* base_address,
+                                              iree_host_size_t range_count,
+                                              const iree_byte_range_t* ranges,
+                                              iree_memory_access_t new_access) {
+  // No-op.
+  return iree_ok_status();
+}
+
+// IREE_ELF_CLEAR_CACHE can be defined externally to override this default
+// behavior.
+#if !defined(IREE_ELF_CLEAR_CACHE)
+// __has_builtin was added in GCC 10, so just hard-code the availability
+// for < 10, special cased here so it can be dropped once no longer needed.
+#if defined __GNUC__ && __GNUC__ < 10
+#define IREE_ELF_CLEAR_CACHE(start, end) __builtin___clear_cache(start, end)
+#elif defined __has_builtin
+#if __has_builtin(__builtin___clear_cache)
+#define IREE_ELF_CLEAR_CACHE(start, end) __builtin___clear_cache(start, end)
+#endif  // __builtin___clear_cache
+#endif  // __has_builtin
+#endif  // !defined(IREE_ELF_CLEAR_CACHE)
+
+#if !defined(IREE_ELF_CLEAR_CACHE)
+#error "no instruction cache clear implementation"
+#endif  // !defined(IREE_ELF_CLEAR_CACHE)
+
+void iree_memory_view_flush_icache(void* base_address,
+                                   iree_host_size_t length) {
+  IREE_ELF_CLEAR_CACHE(base_address, base_address + length);
+}
+
+#endif  // IREE_PLATFORM_GENERIC
diff --git a/runtime/src/iree/hal/local/elf/platform/linux.c b/runtime/src/iree/hal/local/elf/platform/linux.c
new file mode 100644
index 0000000..4dfc1ff
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/platform/linux.c
@@ -0,0 +1,164 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/local/elf/platform.h"
+
+#if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_LINUX)
+
+#include <errno.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+//==============================================================================
+// Memory subsystem information and control
+//==============================================================================
+
+void iree_memory_query_info(iree_memory_info_t* out_info) {
+  memset(out_info, 0, sizeof(*out_info));
+
+  int page_size = sysconf(_SC_PAGESIZE);
+  out_info->normal_page_size = page_size;
+  out_info->normal_page_granularity = page_size;
+
+  // Large pages aren't currently used so we aren't introducing the build goo
+  // to detect and use them yet.
+  // https://linux.die.net/man/3/gethugepagesizes
+  // http://manpages.ubuntu.com/manpages/bionic/man3/gethugepagesize.3.html
+  // Would be:
+  //   #include <hugetlbfs.h>
+  //   out_info->large_page_granularity = gethugepagesize();
+  out_info->large_page_granularity = page_size;
+
+  out_info->can_allocate_executable_pages = true;
+}
+
+void iree_memory_jit_context_begin(void) {}
+
+void iree_memory_jit_context_end(void) {}
+
+//==============================================================================
+// Virtual address space manipulation
+//==============================================================================
+
+static int iree_memory_access_to_prot(iree_memory_access_t access) {
+  int prot = 0;
+  if (access & IREE_MEMORY_ACCESS_READ) prot |= PROT_READ;
+  if (access & IREE_MEMORY_ACCESS_WRITE) prot |= PROT_WRITE;
+  if (access & IREE_MEMORY_ACCESS_EXECUTE) prot |= PROT_EXEC;
+  return prot;
+}
+
+iree_status_t iree_memory_view_reserve(iree_memory_view_flags_t flags,
+                                       iree_host_size_t total_length,
+                                       iree_allocator_t allocator,
+                                       void** out_base_address) {
+  *out_base_address = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  int mmap_prot = PROT_NONE;
+  int mmap_flags = MAP_PRIVATE | MAP_ANON | MAP_NORESERVE;
+
+  iree_status_t status = iree_ok_status();
+  void* base_address = mmap(NULL, total_length, mmap_prot, mmap_flags, -1, 0);
+  if (base_address == MAP_FAILED) {
+    status = iree_make_status(iree_status_code_from_errno(errno),
+                              "mmap reservation failed");
+  }
+
+  *out_base_address = base_address;
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_memory_view_release(void* base_address, iree_host_size_t total_length,
+                              iree_allocator_t allocator) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // NOTE: return value ignored as this is a shutdown path.
+  munmap(base_address, total_length);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_memory_view_commit_ranges(
+    void* base_address, iree_host_size_t range_count,
+    const iree_byte_range_t* ranges, iree_memory_access_t initial_access) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  int mmap_prot = iree_memory_access_to_prot(initial_access);
+  int mmap_flags = MAP_PRIVATE | MAP_ANON | MAP_FIXED;
+
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < range_count; ++i) {
+    void* range_start = NULL;
+    iree_host_size_t aligned_length = 0;
+    iree_page_align_range(base_address, ranges[i], getpagesize(), &range_start,
+                          &aligned_length);
+    void* result =
+        mmap(range_start, aligned_length, mmap_prot, mmap_flags, -1, 0);
+    if (result == MAP_FAILED) {
+      status = iree_make_status(iree_status_code_from_errno(errno),
+                                "mmap commit failed");
+      break;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+iree_status_t iree_memory_view_protect_ranges(void* base_address,
+                                              iree_host_size_t range_count,
+                                              const iree_byte_range_t* ranges,
+                                              iree_memory_access_t new_access) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  int mmap_prot = iree_memory_access_to_prot(new_access);
+
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < range_count; ++i) {
+    void* range_start = NULL;
+    iree_host_size_t aligned_length = 0;
+    iree_page_align_range(base_address, ranges[i], getpagesize(), &range_start,
+                          &aligned_length);
+    int ret = mprotect(range_start, aligned_length, mmap_prot);
+    if (ret != 0) {
+      status = iree_make_status(iree_status_code_from_errno(errno),
+                                "mprotect failed");
+      break;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// IREE_ELF_CLEAR_CACHE can be defined externally to override this default
+// behavior.
+#if !defined(IREE_ELF_CLEAR_CACHE)
+// __has_builtin was added in GCC 10, so just hard-code the availability
+// for < 10, special cased here so it can be dropped once no longer needed.
+#if defined __GNUC__ && __GNUC__ < 10
+#define IREE_ELF_CLEAR_CACHE(start, end) __builtin___clear_cache(start, end)
+#elif defined __has_builtin
+#if __has_builtin(__builtin___clear_cache)
+#define IREE_ELF_CLEAR_CACHE(start, end) __builtin___clear_cache(start, end)
+#endif  // __builtin___clear_cache
+#endif  // __has_builtin
+#endif  // !defined(IREE_ELF_CLEAR_CACHE)
+
+#if !defined(IREE_ELF_CLEAR_CACHE)
+#error "no instruction cache clear implementation"
+#endif  // !defined(IREE_ELF_CLEAR_CACHE)
+
+void iree_memory_view_flush_icache(void* base_address,
+                                   iree_host_size_t length) {
+  IREE_ELF_CLEAR_CACHE(base_address, base_address + length);
+}
+
+#endif  // IREE_PLATFORM_*
diff --git a/runtime/src/iree/hal/local/elf/platform/windows.c b/runtime/src/iree/hal/local/elf/platform/windows.c
new file mode 100644
index 0000000..7d3b313
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/platform/windows.c
@@ -0,0 +1,152 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/local/elf/platform.h"
+
+#if defined(IREE_PLATFORM_WINDOWS)
+
+//==============================================================================
+// Memory subsystem information and control
+//==============================================================================
+
+void iree_memory_query_info(iree_memory_info_t* out_info) {
+  memset(out_info, 0, sizeof(*out_info));
+
+  SYSTEM_INFO system_info;
+  GetSystemInfo(&system_info);
+  out_info->normal_page_size = system_info.dwPageSize;
+  out_info->normal_page_granularity = system_info.dwAllocationGranularity;
+
+  out_info->large_page_granularity = GetLargePageMinimum();
+
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+  out_info->can_allocate_executable_pages = true;
+#else
+  // The application can define the `codeGeneration` property to enable use of
+  // PAGE_EXECUTE but cannot use PAGE_EXECUTE_READWRITE - it's still possible to
+  // make that work but it requires aliasing views (one with READWRITE and one
+  // with EXECUTE) and I'm not sure if anyone will ever care.
+  out_info->can_allocate_executable_pages = false;
+#endif  // WINAPI_PARTITION_DESKTOP
+}
+
+void iree_memory_jit_context_begin(void) {}
+
+void iree_memory_jit_context_end(void) {}
+
+//==============================================================================
+// Virtual address space manipulation
+//==============================================================================
+
+// https://docs.microsoft.com/en-us/windows/win32/memory/memory-protection-constants
+static DWORD iree_memory_access_to_win32_page_flags(
+    iree_memory_access_t access) {
+  DWORD protect = 0;
+  if (access & IREE_MEMORY_ACCESS_EXECUTE) {
+    if (access & IREE_MEMORY_ACCESS_WRITE) {
+      protect |= PAGE_EXECUTE_READWRITE;
+    } else if (access & IREE_MEMORY_ACCESS_READ) {
+      protect |= PAGE_EXECUTE_READ;
+    } else {
+      protect |= PAGE_EXECUTE;
+    }
+  } else if (access & IREE_MEMORY_ACCESS_WRITE) {
+    protect |= PAGE_READWRITE;
+  } else if (access & IREE_MEMORY_ACCESS_READ) {
+    protect |= PAGE_READONLY;
+  } else {
+    protect |= PAGE_NOACCESS;
+  }
+  return protect;
+}
+
+iree_status_t iree_memory_view_reserve(iree_memory_view_flags_t flags,
+                                       iree_host_size_t total_length,
+                                       iree_allocator_t allocator,
+                                       void** out_base_address) {
+  *out_base_address = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_ok_status();
+
+  void* base_address =
+      VirtualAlloc(NULL, total_length, MEM_RESERVE, PAGE_NOACCESS);
+  if (base_address == NULL) {
+    status = iree_make_status(iree_status_code_from_win32_error(GetLastError()),
+                              "VirtualAlloc failed to reserve");
+  }
+
+  *out_base_address = base_address;
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_memory_view_release(void* base_address, iree_host_size_t total_length,
+                              iree_allocator_t allocator) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // NOTE: return value ignored as this is a shutdown path.
+  VirtualFree(base_address, 0, MEM_RELEASE);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_memory_view_commit_ranges(
+    void* base_address, iree_host_size_t range_count,
+    const iree_byte_range_t* ranges, iree_memory_access_t initial_access) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  DWORD initial_protect =
+      iree_memory_access_to_win32_page_flags(initial_access);
+
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < range_count; ++i) {
+    if (!VirtualAlloc((uint8_t*)base_address + ranges[i].offset,
+                      ranges[i].length, MEM_COMMIT, initial_protect)) {
+      status =
+          iree_make_status(iree_status_code_from_win32_error(GetLastError()),
+                           "VirtualAlloc failed to commit");
+      break;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+iree_status_t iree_memory_view_protect_ranges(void* base_address,
+                                              iree_host_size_t range_count,
+                                              const iree_byte_range_t* ranges,
+                                              iree_memory_access_t new_access) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  DWORD new_protect = iree_memory_access_to_win32_page_flags(new_access);
+
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < range_count; ++i) {
+    uint8_t* range_address = (uint8_t*)base_address + ranges[i].offset;
+    DWORD old_protect = 0;
+    BOOL ret = VirtualProtect(range_address, ranges[i].length, new_protect,
+                              &old_protect);
+    if (!ret) {
+      status =
+          iree_make_status(iree_status_code_from_win32_error(GetLastError()),
+                           "VirtualProtect failed");
+      break;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_memory_view_flush_icache(void* base_address,
+                                   iree_host_size_t length) {
+  FlushInstructionCache(GetCurrentProcess(), base_address, length);
+}
+
+#endif  // IREE_PLATFORM_WINDOWS
diff --git a/runtime/src/iree/hal/local/elf/testdata/BUILD b/runtime/src/iree/hal/local/elf/testdata/BUILD
new file mode 100644
index 0000000..40f0124
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/BUILD
@@ -0,0 +1,21 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+c_embed_data(
+    name = "elementwise_mul",
+    srcs = glob(["elementwise_mul_*.so"]),
+    c_file_output = "elementwise_mul.c",
+    flatten = True,
+    h_file_output = "elementwise_mul.h",
+)
diff --git a/runtime/src/iree/hal/local/elf/testdata/CMakeLists.txt b/runtime/src/iree/hal/local/elf/testdata/CMakeLists.txt
new file mode 100644
index 0000000..4e53175
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/CMakeLists.txt
@@ -0,0 +1,27 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/local/elf/testdata/BUILD                                #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+file(GLOB _GLOB_ELEMENTWISE_MUL_X_SO LIST_DIRECTORIES false RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} CONFIGURE_DEPENDS elementwise_mul_*.so)
+iree_c_embed_data(
+  NAME
+    elementwise_mul
+  SRCS
+    "${_GLOB_ELEMENTWISE_MUL_X_SO}"
+  C_FILE_OUTPUT
+    "elementwise_mul.c"
+  H_FILE_OUTPUT
+    "elementwise_mul.h"
+  FLATTEN
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul.mlir b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul.mlir
new file mode 100644
index 0000000..65bfa0f
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul.mlir
@@ -0,0 +1,74 @@
+// An elementwise multiply of two 4xf32 values:
+//   %dst = arith.mulf %lhs, %rhs : tensor<4xf32>
+// This program could be that simple however this example demonstrates how to
+// perform workgroup-level tiling.
+//
+// Can be run with:
+// iree/hal/local/executable_library_benchmark \
+//    --executable_format=EX_ELF \
+//    --executable_file=iree/hal/local/elf/testdata/elementwise_mul_x86_64.so \
+//    --entry_point=0 \
+//    --workgroup_count_x=1 \
+//    --workgroup_count_y=1 \
+//    --workgroup_count_z=1 \
+//    --workgroup_size_x=1 \
+//    --workgroup_size_y=1 \
+//    --workgroup_size_z=1 \
+//    --binding=4xf32=1,2,3,4 \
+//    --binding=4xf32=100,200,300,400 \
+//    --binding=4xf32=0,0,0,0
+
+// lhs * rhs => dst / s0b0 * s0b1 => s0b2
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+
+// A single executable source definition is allowed per translation in this mode
+// as linking and multi-executable embedding support requires our host-side IR.
+hal.executable.source public @ex {
+  // Exported functions are declared with the layout they use and may optionally
+  // contain other information - though when hand-authoring that's usually
+  // omitted.
+  //
+  // The ordinal is used to specify the entry point on command line tools and
+  // must be unique across all entry points within the same executable.
+  hal.executable.entry_point public @elementwise_mul ordinal(0) layout(#executable_layout)
+
+  // The inner module defining the executable. This may have any number of
+  // private functions and only those with declared entry points will be
+  // exported.
+  builtin.module {
+    func.func @elementwise_mul() {
+      %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:4xf32>
+      %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:4xf32>
+      %dst = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<writeonly:4xf32>
+      %workgroup_size_x = hal.interface.workgroup.size[0] : index
+      %workgroup_id_x = hal.interface.workgroup.id[0] : index
+      %workgroup_count_x = hal.interface.workgroup.count[0] : index
+      %base_i = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+      %step_i = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+      %end_i = arith.constant 4 : index
+      scf.for %i = %base_i to %end_i step %step_i {
+        %remaining = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%i)[%workgroup_size_x]
+        %lhs_tile = flow.dispatch.tensor.load %lhs, offsets = [%i], sizes = [%remaining], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<?xf32>
+        %rhs_tile = flow.dispatch.tensor.load %rhs, offsets = [%i], sizes = [%remaining], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<?xf32>
+        %dst_init = linalg.init_tensor [%remaining] : tensor<?xf32>
+        %dst_tile = linalg.generic {
+          indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+          iterator_types = ["parallel"]
+        } ins(%lhs_tile, %rhs_tile : tensor<?xf32>, tensor<?xf32>)
+          outs(%dst_init : tensor<?xf32>) {
+          ^bb0(%lhs_value: f32, %rhs_value: f32, %init_value: f32):
+            %dst_value = arith.mulf %lhs_value, %rhs_value : f32
+            linalg.yield %dst_value : f32
+          } -> tensor<?xf32>
+        flow.dispatch.tensor.store %dst_tile, %dst, offsets = [%i], sizes = [%remaining], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
+      }
+      return
+    }
+  }
+}
diff --git a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_arm_32.so b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_arm_32.so
new file mode 100644
index 0000000..e10b64b
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_arm_32.so
Binary files differ
diff --git a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_arm_64.so b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_arm_64.so
new file mode 100644
index 0000000..50e6fb6
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_arm_64.so
Binary files differ
diff --git a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_benchmark.txt b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_benchmark.txt
new file mode 100644
index 0000000..a8f1a46
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_benchmark.txt
@@ -0,0 +1,13 @@
+--executable_format=EX_ELF
+--executable_file=iree/hal/local/elf/testdata/elementwise_mul_x86_64.so
+--entry_point=0
+--workgroup_count_x=1
+--workgroup_count_y=1
+--workgroup_count_z=1
+--workgroup_size_x=1
+--workgroup_size_y=1
+--workgroup_size_z=1
+--max_concurrency=1
+--binding=4xf32=1,2,3,4
+--binding=4xf32=100,200,300,400
+--binding=4xf32=0,0,0,0
diff --git a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_riscv_32.so b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_riscv_32.so
new file mode 100644
index 0000000..602206c
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_riscv_32.so
Binary files differ
diff --git a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_riscv_64.so b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_riscv_64.so
new file mode 100644
index 0000000..99631b4
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_riscv_64.so
Binary files differ
diff --git a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_x86_32.so b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_x86_32.so
new file mode 100644
index 0000000..7f8d18c
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_x86_32.so
Binary files differ
diff --git a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_x86_64.so b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_x86_64.so
new file mode 100644
index 0000000..e534a22
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul_x86_64.so
Binary files differ
diff --git a/runtime/src/iree/hal/local/elf/testdata/generate.sh b/runtime/src/iree/hal/local/elf/testdata/generate.sh
new file mode 100755
index 0000000..7c8df03
--- /dev/null
+++ b/runtime/src/iree/hal/local/elf/testdata/generate.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Updates the checked-in ELF files used for testing the ELF loader.
+# In general we try not to check in binary files however these files act as a
+# test of binary compatibility for artifacts users may have produced. If a
+# build break occurs here we know that we have broken compatibility. Today this
+# happens every few months as we are not yet binary-stable but in the future
+# will be a bigger issue.
+#
+# To use, ensure iree-translate and your compiled ld.lld are on your PATH and
+# run the script:
+#   $ ./runtime/src/iree/hal/local/elf/testdata/generate.sh
+
+# Uncomment to see the iree-translate commands issued:
+# set -x
+set -e
+
+ROOT_DIR=$(git rev-parse --show-toplevel)
+TESTDATA="${ROOT_DIR}/runtime/src/iree/hal/local/elf/testdata"
+
+# $1: file name ("foo_arm_32.so")
+# $2: list of iree-translate arguments for targeting
+function compile_and_extract_library() {
+  local so_name=$1
+  shift
+  local translate_args=("$@")
+
+  echo "Updating ${TESTDATA}/${so_name}"
+
+  CMD=(
+    iree-translate
+      -iree-mlir-to-hal-executable
+      "${TESTDATA}/elementwise_mul.mlir"
+      -o="${TESTDATA}/${so_name}"
+
+      -iree-hal-target-backends=dylib-llvm-aot
+      -iree-llvm-debug-symbols=false
+
+      "${translate_args[@]}"
+  )
+  "${CMD[@]}"
+}
+
+ARM_32=(
+  -iree-llvm-target-triple=armv7a-pc-linux-elf
+  -iree-llvm-target-float-abi=hard
+)
+compile_and_extract_library "elementwise_mul_arm_32.so" "${ARM_32[@]}"
+
+ARM_64=(
+  -iree-llvm-target-triple=aarch64-pc-linux-elf
+)
+compile_and_extract_library "elementwise_mul_arm_64.so" "${ARM_64[@]}"
+
+RISCV_32=(
+  -iree-llvm-target-triple=riscv32-pc-linux-elf
+  -iree-llvm-target-cpu=generic-rv32
+  -iree-llvm-target-cpu-features=+m,+f
+  -iree-llvm-target-abi=ilp32
+)
+compile_and_extract_library "elementwise_mul_riscv_32.so" "${RISCV_32[@]}"
+
+RISCV_64=(
+  -iree-llvm-target-triple=riscv64-pc-linux-elf
+  -iree-llvm-target-cpu=generic-rv64
+  -iree-llvm-target-cpu-features=+m,+a,+f,+d,+c
+  -iree-llvm-target-abi=lp64d
+)
+compile_and_extract_library "elementwise_mul_riscv_64.so" "${RISCV_64[@]}"
+
+X86_32=(
+  -iree-llvm-target-triple=i686-pc-linux-elf
+)
+compile_and_extract_library "elementwise_mul_x86_32.so" "${X86_32[@]}"
+
+X86_64=(
+  -iree-llvm-target-triple=x86_64-pc-linux-elf
+)
+compile_and_extract_library "elementwise_mul_x86_64.so" "${X86_64[@]}"
diff --git a/runtime/src/iree/hal/local/executable_environment.c b/runtime/src/iree/hal/local/executable_environment.c
new file mode 100644
index 0000000..cebe4e2
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_environment.c
@@ -0,0 +1,40 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/executable_environment.h"
+
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_processor_*_t
+//===----------------------------------------------------------------------===//
+
+void iree_hal_processor_query(iree_allocator_t temp_allocator,
+                              iree_hal_processor_v0_t* out_processor) {
+  IREE_ASSERT_ARGUMENT(out_processor);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  memset(out_processor, 0, sizeof(*out_processor));
+
+  // TODO(benvanik): define processor features we want to query for each arch.
+  // This needs to be baked into the executable library API and made consistent
+  // with the compiler side producing the executables that access it.
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_environment_*_t
+//===----------------------------------------------------------------------===//
+
+void iree_hal_executable_environment_initialize(
+    iree_allocator_t temp_allocator,
+    iree_hal_executable_environment_v0_t* out_environment) {
+  IREE_ASSERT_ARGUMENT(out_environment);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  memset(out_environment, 0, sizeof(*out_environment));
+  iree_hal_processor_query(temp_allocator, &out_environment->processor);
+  IREE_TRACE_ZONE_END(z0);
+}
diff --git a/runtime/src/iree/hal/local/executable_environment.h b/runtime/src/iree/hal/local/executable_environment.h
new file mode 100644
index 0000000..b4d23ca
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_environment.h
@@ -0,0 +1,47 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_EXECUTABLE_ENVIRONMENT_H_
+#define IREE_HAL_LOCAL_EXECUTABLE_ENVIRONMENT_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/cpu.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_library.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_hal_processor_*_t
+//===----------------------------------------------------------------------===//
+
+// Queries the current processor information and writes it to |out_processor|.
+// |temp_allocator| may be used for temporary allocations required while
+// querying. If the processor cannot be queried then |out_processor| will be
+// zeroed.
+void iree_hal_processor_query(iree_allocator_t temp_allocator,
+                              iree_hal_processor_v0_t* out_processor);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_environment_*_t
+//===----------------------------------------------------------------------===//
+
+// Initializes |out_environment| to the default empty environment.
+// No imports will be available unless overridden during loading.
+// |temp_allocator| may be used for temporary allocations during initialization.
+void iree_hal_executable_environment_initialize(
+    iree_allocator_t temp_allocator,
+    iree_hal_executable_environment_v0_t* out_environment);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_EXECUTABLE_ENVIRONMENT_H_
diff --git a/runtime/src/iree/hal/local/executable_library.h b/runtime/src/iree/hal/local/executable_library.h
new file mode 100644
index 0000000..a579a6d
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_library.h
@@ -0,0 +1,446 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_EXECUTABLE_LIBRARY_H_
+#define IREE_HAL_LOCAL_EXECUTABLE_LIBRARY_H_
+
+// NOTE: this file is designed to be a standalone header: it is embedded in the
+// compiler and must not take any dependencies on the runtime HAL code.
+// Changes here will require changes to the compiler and must be versioned as if
+// this was a schema: backwards-incompatible changes require version bumps or
+// the ability to feature-detect at runtime.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+//===----------------------------------------------------------------------===//
+// Common utilities included to reduce dependencies
+//===----------------------------------------------------------------------===//
+
+// `restrict` keyword, not supported by some older compilers.
+// We define our own macro in case dependencies use `restrict` differently.
+#if defined(_MSC_VER) && _MSC_VER >= 1900
+#define IREE_RESTRICT __restrict
+#elif defined(_MSC_VER)
+#define IREE_RESTRICT
+#elif defined(__cplusplus)
+#define IREE_RESTRICT __restrict__
+#else
+#define IREE_RESTRICT restrict
+#endif  // _MSC_VER
+
+//===----------------------------------------------------------------------===//
+// Runtime feature support metadata
+//===----------------------------------------------------------------------===//
+
+// Defines a bitfield of features that the library requires or supports.
+enum iree_hal_executable_library_feature_bits_t {
+  IREE_HAL_EXECUTABLE_LIBRARY_FEATURE_NONE = 0u,
+  // TODO(benvanik): declare features for debugging/coverage/printf/etc.
+  // These will control which symbols are injected into the library at runtime.
+};
+typedef uint32_t iree_hal_executable_library_features_t;
+
+// Defines a set of supported sanitizers that libraries may be compiled with.
+// Loaders can use this declaration to check as to whether the library is
+// compatible with the hosting environment for cases where the sanitizer
+// requires host support.
+typedef enum iree_hal_executable_library_sanitizer_kind_e {
+  IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_NONE = 0,
+  // Indicates the library is compiled to use AddressSanitizer:
+  // https://clang.llvm.org/docs/AddressSanitizer.html
+  // Equivalent compiler flag: -fsanitize=address
+  IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_ADDRESS = 1,
+  // Indicates the library is compiled to use MemorySanitizer:
+  // https://clang.llvm.org/docs/MemorySanitizer.html
+  // Equivalent compiler flag: -fsanitize=memory
+  IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_MEMORY = 2,
+  // Indicates the library is compiled to use ThreadSanitizer:
+  // https://clang.llvm.org/docs/ThreadSanitizer.html
+  // Equivalent compiler flag: -fsanitize=thread
+  IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_THREAD = 3,
+  // Indicates the library is compiled to use UndefinedBehaviorSanitizer:
+  // https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html
+  // Equivalent compiler flag: -fsanitize=undefined
+  IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_UNDEFINED = 4,
+
+  IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_MAX_ENUM = INT32_MAX,
+} iree_hal_executable_library_sanitizer_kind_t;
+
+//===----------------------------------------------------------------------===//
+// Versioning and interface querying
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_executable_environment_v0_t
+    iree_hal_executable_environment_v0_t;
+
+// Version code indicating the minimum required runtime structures.
+// Runtimes cannot load executables with newer versions but may be able to load
+// older versions if backward compatibility is enabled.
+//
+// NOTE: until we hit v1 the versioning scheme here is not set in stone.
+// We may want to make this major release number, date codes (0x20220307),
+// or some semantic versioning we track in whatever spec we end up having.
+typedef uint32_t iree_hal_executable_library_version_t;
+
+#define IREE_HAL_EXECUTABLE_LIBRARY_VERSION_0_2 0x00000002u
+
+// The latest version of the library API; can be used to populate the
+// iree_hal_executable_library_header_t::version when building libraries.
+#define IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST \
+  IREE_HAL_EXECUTABLE_LIBRARY_VERSION_0_2
+
+// A header present at the top of all versions of the library API used by the
+// runtime to ensure version compatibility.
+typedef struct iree_hal_executable_library_header_t {
+  // Version of the API this library was built with, which was likely the value
+  // of IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST.
+  iree_hal_executable_library_version_t version;
+
+  // Name used for logging/diagnostics.
+  const char* name;
+
+  // Bitfield of features required/supported by this executable.
+  iree_hal_executable_library_features_t features;
+
+  // Which sanitizer the library is compiled to use, if any.
+// Libraries meant for use with a particular sanitizer are only usable
+  // with hosting code that is using the same sanitizer.
+  iree_hal_executable_library_sanitizer_kind_t sanitizer;
+} iree_hal_executable_library_header_t;
+
+// Exported function from dynamic libraries for querying library information.
+//
+// The provided |max_version| is the maximum version the caller supports;
+// callees must return NULL if their lowest available version is greater
+// than the max version supported by the caller.
+//
+// The provided |environment| field contains information about the hosting
+// execution environment that the executable may use to specialize its
+// implementation, such as using specific imports or exporting
+// architecture-specific dispatch routines. Some environmental properties may
+// change per-invocation such as the CPU info when performing dispatches on
+// heterogeneous processors that may change over the lifetime of the program.
+typedef const iree_hal_executable_library_header_t** (
+    *iree_hal_executable_library_query_fn_t)(
+    iree_hal_executable_library_version_t max_version,
+    const iree_hal_executable_environment_v0_t* environment);
+
+// Function name exported from dynamic libraries (pass to dlsym).
+#define IREE_HAL_EXECUTABLE_LIBRARY_EXPORT_NAME \
+  "iree_hal_executable_library_query"
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_EXECUTABLE_LIBRARY_VERSION_0_*
+//===----------------------------------------------------------------------===//
+
+// Function signature of imported functions for use in the executable.
+// Each call takes opaque parameters as defined by the imported function.
+// Both the compiler and the runtime must agree on the parameter format
+// (including struct alignment and packing) and doing so is outside the scope
+// of this API. In general one should only pass precisely what they need
+// (pointers directly into buffers being manipulated, arguments, etc) and not
+// try to replicate the dispatch structure (workgroup information and bindings)
+// so that the imported functions can be versioned independently from this
+// specification.
+//
+// Returns 0 on success and non-zero on failure. Failures will cause device loss
+// and should only be used to communicate serious issues that should abort all
+// execution within the current device. Buffer overflows are a good example of
+// a useful failure though the HAL does not mandate that all overflows are
+// caught and only that they are not harmful - clamping byte ranges and never
+// returning a failure is sufficient.
+typedef int (*iree_hal_executable_import_v0_t)(void* import_params);
+
+// A thunk function used to call an import.
+// All imports must be called through this function by passing the import
+// function pointer as the first argument followed by the arguments of the
+// import function itself.
+typedef int (*iree_hal_executable_import_thunk_v0_t)(
+    iree_hal_executable_import_v0_t fn_ptr, void* import_params);
+
+// Declares imports available to the executable library at runtime.
+// To enable linker isolation, ABI shimming, and import multi-versioning we use
+// this import table exclusively and do not allow platform-level linking. If it
+// were allowed the deployment situation gets significantly more complex as the
+// libraries containing the imported symbols will differ on all platforms, will
+// have the platform-dependent ABI (Windows, MacOS, etc), and may not be
+// available at all (bare-metal).
+//
+// Static libraries may choose to still dynamically link against external
+// symbols without using this table as in that scenario much of the above
+// concerns do not apply: all code is being linked together into the same binary
+// and symbol availability is known during build-time linking. Static linking
+// also enables LTO to strip any import not used by any executables in contrast
+// to the dynamic style elsewhere.
+//
+// Represented as a struct-of-arrays for more efficient packing and more
+// locality during lookup. Each subarray - when not omitted and NULL - is
+// indexed by import ordinal and has up to |count| entries.
+typedef struct iree_hal_executable_import_table_v0_t {
+  // Total number of imports in the table.
+  uint32_t count;
+
+  // Import symbol name encoding the name and whether it is weak.
+  // Example: `mylib_some_fn_v2?`
+  //   `mylib_...`:
+  //     Prefix indicating the owner of the function; symbols have a global
+  //     namespace and this is used to reduce collisions.
+  //   `some_fn...`:
+  //     Name of the function used to link to the imports available in the
+  //     hosting executable.
+  //   `..._v2`:
+  //     Function-specified version number used to allow multiple versions to
+  //     be imported. For backward compatibility one could import both
+  //     `some_fn_v1?` and `some_fn_v2?` and use whichever is available.
+  //     Note that this is just a convention for the suffix and can be anything.
+  //   `?`:
+  //     Indicates when an import is optional. If the import of the specified
+  //     version is not found the table entry will be NULL. When omitted if the
+  //     import is unavailable loading will fail.
+  //
+  // The symbol table is sorted ascending alphabetical (by strcmp).
+  const char* const* symbols;
+} iree_hal_executable_import_table_v0_t;
+
+// Maximum number of data fields in iree_hal_processor_v0_t.
+#define IREE_HAL_PROCESSOR_DATA_CAPACITY_V0 8
+
+// Architecture-specific CPU information available to executables.
+// This encodes zero or more fields of opaque processor data.
+// The intent is that this structure can be put in .rodata when there are no
+// runtime features that need to be queried.
+//
+// The format of the data is architecture-specific as by construction no value
+// will ever be used in a compiled binary from another architecture. This
+// allows us to simplify this interface as we can't for example load the same
+// executable library on both aarch64 and riscv32 and don't need to normalize
+// any of the fields across them both.
+typedef struct iree_hal_processor_v0_t {
+  // Opaque architecture-specific encoding in 64-bit words.
+  // This may represent a fixed-length data structure, a series of hardware
+  // registers, or key-value pairs.
+  //
+  // The contents are opaque here as to support out-of-tree architectures. The
+// runtime code deriving the identifier/flags and providing it here is loosely
+  // coupled with the compiler code emitting checks based on the identifier and
+  // only those two places ever need to change.
+  uint64_t data[IREE_HAL_PROCESSOR_DATA_CAPACITY_V0];
+} iree_hal_processor_v0_t;
+static_assert(sizeof(iree_hal_processor_v0_t) % sizeof(uint64_t) == 0,
+              "8-byte alignment required");
+
+// Defines the environment in which the executable is being used.
+// Executables only have access to the information in this structure and must
+// make all decisions based on it; this ensures executables are portable across
+// operating environments (Linux, Mac, bare-metal, web, etc) by not having
+// platform-specific syscalls and register query emulation.
+typedef struct iree_hal_executable_environment_v0_t {
+  // Specialization constants available to the executable, if any.
+  // Contains as many as declared in the library header.
+  const uint32_t* constants;
+
+  // Thunk function for calling imports. All calls must be made through this.
+  iree_hal_executable_import_thunk_v0_t import_thunk;
+  // Optional imported functions available for use within the executable.
+  // Contains one entry per imported function. If an import was marked as weak
+  // then the corresponding entry may be NULL.
+  const iree_hal_executable_import_v0_t* imports;
+
+  // Optional architecture-specific CPU information.
+  // In heterogeneous processors this may represent any of the subarchitecture
+  // types as it is derived from the core the calling thread is scheduled on.
+  // Will be all zeros if unavailable.
+  iree_hal_processor_v0_t processor;
+} iree_hal_executable_environment_v0_t;
+
+// Read-only per-dispatch state passed to each workgroup in a dispatch.
+//
+// We layout to try to fit everything commonly used into the first cache line
+// (on archs with 64-bit pointers; 32-bit fits in a single line).
+//
+// For workgroup dimensions we allow the full 32-bit range on X and Y as those
+// are the primary distribution dimensions. Z is the coarsest control and is
+// usually in the 1-16 range; any higher and it can pessimize scheduling. Almost
+// all GPUs also have this limitation (max Z of 65K) for the same reason.
+typedef struct iree_hal_executable_dispatch_state_v0_t {
+  // Workgroup size chosen for the dispatch. For compilation modes where the
+  // workgroup size is constant this may be ignored.
+  uint32_t workgroup_size_x;
+  uint32_t workgroup_size_y;
+  uint16_t workgroup_size_z;
+
+  // Total number of available 4 byte push constant values in |push_constants|.
+  uint16_t push_constant_count;
+
+  // Total workgroup count for the dispatch. This is sourced from either the
+  // original dispatch call (for iree_hal_command_buffer_dispatch) or the
+  // indirection buffer (for iree_hal_command_buffer_dispatch_indirect).
+  uint32_t workgroup_count_x;
+  uint32_t workgroup_count_y;
+  uint16_t workgroup_count_z;
+
+  // Estimated maximum concurrent workgroups; loosely maps to the number of
+  // processors allowed to execute the dispatch. The actual number will vary
+  // based on competing dispatches and dynamic executor configuration.
+  uint8_t max_concurrency;
+
+  // Total number of binding base pointers in |binding_ptrs| and
+  // |binding_lengths|. The set is packed densely based on which bindings are
+  // used (known at compile-time).
+  uint8_t binding_count;
+
+  // |push_constant_count| values.
+  const uint32_t* push_constants;
+  // Base pointers to each binding buffer.
+  void* const* binding_ptrs;
+  // The length of each binding in bytes, 1:1 with |binding_ptrs|.
+  const size_t* binding_lengths;
+
+  // NOTE: the above fields are frequently accessed and should be kept together
+  // to ensure cache-friendly behavior. The first instructions every dispatch
+  // executes are loads from the fields and we want to avoid a cascade of
+  // cache misses. Less-frequently used fields can follow.
+} iree_hal_executable_dispatch_state_v0_t;
+static_assert(sizeof(iree_hal_executable_dispatch_state_v0_t) <= 64,
+              "try keeping dispatch state small enough to fit in a cache line");
+
+// Read-only per-workgroup state passed to each workgroup in a dispatch.
+//
+// We layout to try to fit everything commonly used into the first cache line
+// (on archs with 64-bit pointers; 32-bit fits in a single line).
+typedef struct iree_hal_executable_workgroup_state_v0_t {
+  // Workgroup ID of the currently executing workgroup.
+  // This is in the range of 0-workgroup_count and each unique workgroup is to
+  // perform workgroup_size invocations.
+  uint32_t workgroup_id_x;
+  uint32_t workgroup_id_y;
+  uint16_t workgroup_id_z;
+
+  // Reserved for future use.
+  uint16_t reserved;
+
+  // Logical processor identifier used to index into processor info fields.
+  // Depending on the implementation this may be an ordinal, a bitfield, or an
+  // opaque unique identifier.
+  //
+  // NOTE: we could steal bits from the |processor_id| if needed; today the ID
+  // is the global ID but it really only needs to be within the current node
+  // (8-bits, or 16-bit for single-node thousand-core future proofing).
+  uint32_t processor_id;
+
+  // Scratch memory available for use by the workgroup.
+  // Requires a non-zero value to be specified for |local_memory_pages|; at
+  // least the size specified will be available. This memory is transient and
+  // exclusive to the workgroup. The provided pointer may be NULL if no
+  // workgroup local memory was requested.
+  void* local_memory;
+  // Total number of bytes available in |local_memory|. This may be larger than
+  // the requested amount.
+  uint32_t local_memory_size;
+
+  // +4 trailing bytes of free space
+} iree_hal_executable_workgroup_state_v0_t;
+static_assert(
+    sizeof(iree_hal_executable_workgroup_state_v0_t) <= 64,
+    "try keeping workgroup state small enough to fit in a cache line");
+
+// Function signature of exported executable entry points.
+// The same |environment| is passed to all dispatches.
+// The same |dispatch_state| is passed to all workgroups within a dispatch.
+// A unique |workgroup_state| is passed to every workgroup within a dispatch.
+//
+// Returns 0 on success and non-zero on failure. Failures will cause device loss
+// and should only be used to communicate serious issues that should abort all
+// execution within the current device. Buffer overflows are a good example of
+// a useful failure though the HAL does not mandate that all overflows are
+// caught and only that they are not harmful - clamping byte ranges and never
+// returning a failure is sufficient.
+typedef int (*iree_hal_executable_dispatch_v0_t)(
+    const iree_hal_executable_environment_v0_t* environment,
+    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state);
+
+// Bytes per page of workgroup local memory.
+// This is chosen to match the common page size of devices.
+#define IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE 4096
+
+// Attributes for exported dispatch functions defining how they are to be
+// executed. 0 defaults are well-specified and the entire attributes table may
+// be omitted if no dispatch functions require these fields.
+typedef struct iree_hal_executable_dispatch_attrs_v0_t {
+  // Number of IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE byte pages (or 0)
+  // indicating how much workgroup local memory is required for the dispatch.
+  // This is the size of the buffer referenced by the `local_memory` argument.
+  uint16_t local_memory_pages;
+  // Must be 0. May be used in the future for flags controlling the dispatch
+  // behavior/synchronization requirements.
+  uint16_t reserved;
+} iree_hal_executable_dispatch_attrs_v0_t;
+static_assert(sizeof(iree_hal_executable_dispatch_attrs_v0_t) == 4, "uint32_t");
+
+// A table of exported functions arranged as a struct-of-arrays for more
+// efficient packing and faster lookup. Each subarray - when not omitted and
+// NULL - is indexed by export ordinal and has up to |count| entries.
+typedef struct iree_hal_executable_export_table_v0_t {
+  // Total number of exports in the table.
+  uint32_t count;
+
+  // Function pointers for each exported entry point.
+  const iree_hal_executable_dispatch_v0_t* ptrs;
+
+  // Optional table of attributes 1:1 with ptrs.
+  // Omitting the table entirely means that no exports need workgroup local
+  // memory (or whatever else we pack into the attributes).
+  const iree_hal_executable_dispatch_attrs_v0_t* attrs;
+
+  // Optional table of export function entry point names 1:1 with ptrs.
+  // These names are only used for tracing/debugging and can be omitted to save
+  // binary size.
+  const char* const* names;
+
+  // Optional table of entry point tags 1:1 with ptrs.
+  // Used to describe the entry point in a human-readable format useful for
+  // verbose logging. The string values, when present, may be attached to
+  // tracing/debugging events related to the entry point.
+  const char* const* tags;
+} iree_hal_executable_export_table_v0_t;
+
+// A table declaring the executable-level constants that can be used to
+// specialize the executable behavior.
+typedef struct iree_hal_executable_constant_table_v0_t {
+  // Total number of constants in the table.
+  uint32_t count;
+  // We could add more metadata here if we wanted to enable reflection.
+} iree_hal_executable_constant_table_v0_t;
+
+// Structure used for v0 library interfaces.
+// The entire structure is designed to be read-only and able to live embedded in
+// the binary .rdata section.
+//
+// The information held within the structure is not cached by the runtime.
+// Implementations may choose to heap allocate this structure and modify its
+// members at runtime so long as they observe the thread-safety guarantees.
+// For example, a JIT may default all exports to JIT thunk functions and then
+// atomically swap them out for the translated function pointers as they are
+// available.
+typedef struct iree_hal_executable_library_v0_t {
+  // Version/metadata header.
+  // Will have a version of IREE_HAL_EXECUTABLE_LIBRARY_VERSION_*.
+  const iree_hal_executable_library_header_t* header;
+
+  // Table of imported functions available to functions in the executable.
+  iree_hal_executable_import_table_v0_t imports;
+
+  // Table of exported functions from the executable.
+  iree_hal_executable_export_table_v0_t exports;
+
+  // Table of executable-level constants.
+  iree_hal_executable_constant_table_v0_t constants;
+} iree_hal_executable_library_v0_t;
+
+#endif  // IREE_HAL_LOCAL_EXECUTABLE_LIBRARY_H_
diff --git a/runtime/src/iree/hal/local/executable_library_benchmark.c b/runtime/src/iree/hal/local/executable_library_benchmark.c
new file mode 100644
index 0000000..b20aa2b
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_library_benchmark.c
@@ -0,0 +1,335 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/file_io.h"
+#include "iree/base/internal/flags.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_library.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/local_descriptor_set_layout.h"
+#include "iree/hal/local/local_executable.h"
+#include "iree/hal/local/local_executable_layout.h"
+#include "iree/testing/benchmark.h"
+
// Flags selecting the executable under benchmark, the entry point within it,
// and the dispatch grid/workgroup dimensions used for each iteration.
IREE_FLAG(string, executable_format, "",
          "Format of the executable file being loaded.");
IREE_FLAG(string, executable_file, "",
          "Path to the executable library file to load.");

IREE_FLAG(int32_t, entry_point, 0, "Entry point ordinal to run.");

IREE_FLAG(int32_t, workgroup_count_x, 1,
          "X dimension of the workgroup count defining the number of\n"
          "workgroup invocations that will be run per benchmark iteration.\n"
          "This is the fastest-changing dimension.");
IREE_FLAG(int32_t, workgroup_count_y, 1,
          "Y dimension of the workgroup count defining the number of\n"
          "workgroup invocations that will be run per benchmark iteration.");
IREE_FLAG(int32_t, workgroup_count_z, 1,
          "Z dimension of the workgroup count defining the number of\n"
          "workgroup invocations that will be run per benchmark iteration.\n"
          "This is the slowest-changing dimension.");
IREE_FLAG(int32_t, workgroup_size_x, 1,
          "X dimension of the workgroup size passed to the executable.");
IREE_FLAG(int32_t, workgroup_size_y, 1,
          "Y dimension of the workgroup size passed to the executable.");
IREE_FLAG(int32_t, workgroup_size_z, 1,
          "Z dimension of the workgroup size passed to the executable.");

IREE_FLAG(int32_t, max_concurrency, 1,
          "Maximum available concurrency exposed to the dispatch.");
+
// Total number of bindings we (currently) allow any executable to have.
#define IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT \
  (IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT *   \
   IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT)

// Parsed parameters from flags.
// Used to construct the dispatch parameters for the benchmark invocation.
// Populated incrementally by the --push_constant=/--binding= flag callbacks
// below; consumed once in iree_hal_executable_library_run.
struct {
  // Number of valid entries in |push_constants|.
  int32_t push_constant_count;
  union {
    uint32_t ui32;
  } push_constants[IREE_HAL_LOCAL_MAX_PUSH_CONSTANT_COUNT];

  // Number of valid entries in |bindings|.
  int32_t binding_count;
  // Unparsed "shapextype[=values]" specs; parsed into buffer views during
  // benchmark setup.
  iree_string_view_t bindings[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT];
} dispatch_params = {
    .push_constant_count = 0,
    .binding_count = 0,
};
+
+static iree_status_t parse_push_constant(iree_string_view_t flag_name,
+                                         void* storage,
+                                         iree_string_view_t value) {
+  IREE_ASSERT_LE(dispatch_params.push_constant_count + 1,
+                 IREE_ARRAYSIZE(dispatch_params.push_constants),
+                 "too many push constants");
+  dispatch_params.push_constants[dispatch_params.push_constant_count++].ui32 =
+      atoi(value.data);
+  return iree_ok_status();
+}
+static void print_push_constant(iree_string_view_t flag_name, void* storage,
+                                FILE* file) {
+  if (dispatch_params.push_constant_count == 0) {
+    fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size,
+            flag_name.data);
+    return;
+  }
+  for (int32_t i = 0; i < dispatch_params.push_constant_count; ++i) {
+    fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data,
+            dispatch_params.push_constants[i].ui32);
+    if (i < dispatch_params.push_constant_count - 1) {
+      fprintf(file, "\n");
+    }
+  }
+}
+IREE_FLAG_CALLBACK(parse_push_constant, print_push_constant, &dispatch_params,
+                   push_constant_callback,
+                   "Appends a uint32_t push constant value.\n");
+
+static iree_status_t parse_binding(iree_string_view_t flag_name, void* storage,
+                                   iree_string_view_t value) {
+  IREE_ASSERT_LE(dispatch_params.binding_count + 1,
+                 IREE_ARRAYSIZE(dispatch_params.bindings), "too many bindings");
+  dispatch_params.bindings[dispatch_params.binding_count++] = value;
+  return iree_ok_status();
+}
+static void print_binding(iree_string_view_t flag_name, void* storage,
+                          FILE* file) {
+  if (dispatch_params.binding_count == 0) {
+    fprintf(file, "# --%.*s=\"shapextype[=values]\"\n", (int)flag_name.size,
+            flag_name.data);
+    return;
+  }
+  for (int32_t i = 0; i < dispatch_params.binding_count; ++i) {
+    const iree_string_view_t binding_str = dispatch_params.bindings[i];
+    fprintf(file, "--%.*s=\"%.*s\"\n", (int)flag_name.size, flag_name.data,
+            (int)binding_str.size, binding_str.data);
+  }
+}
+IREE_FLAG_CALLBACK(
+    parse_binding, print_binding, &dispatch_params, binding,
+    "Appends a binding to the dispatch parameters.\n"
+    "Bindings are defined by their shape, element type, and their data.\n"
+    "Examples:\n"
+    "  # 16 4-byte elements zero-initialized:\n"
+    "  --binding=2x8xi32\n"
+    "  # 10000 bytes all initialized to 123:\n"
+    "  --binding=10000xi8=123\n"
+    "  # 2 4-byte floating-point values with contents [[1.4], [2.1]]:\n"
+    "  --binding=2x1xf32=1.4,2.1");
+
+#if defined(IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER)
+#include "iree/hal/local/loaders/embedded_library_loader.h"
+#endif  // IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER
+
+// Creates an executable loader based on the given format flag.
+static iree_status_t iree_hal_executable_library_create_loader(
+    iree_allocator_t host_allocator,
+    iree_hal_executable_loader_t** out_executable_loader) {
+#if defined(IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER)
+  if (strcmp(FLAG_executable_format, "EX_ELF") == 0) {
+    return iree_hal_embedded_library_loader_create(
+        iree_hal_executable_import_provider_null(), host_allocator,
+        out_executable_loader);
+  }
+#endif  // IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER
+  return iree_make_status(
+      IREE_STATUS_UNAVAILABLE,
+      "no loader available that can handle --executable_format=%s",
+      FLAG_executable_format);
+}
+
// NOTE: error handling is here just for better diagnostics: it is not tracking
// allocations correctly and will leak. Don't use this as an example for how to
// write robust code.
//
// Benchmark body: loads the executable named by the flags, materializes the
// --binding= specs as mapped host buffers, then repeatedly issues the dispatch
// inline on the calling thread until the benchmark runner says to stop.
// |benchmark_def| is unused; all configuration comes from the global flags.
static iree_status_t iree_hal_executable_library_run(
    const iree_benchmark_def_t* benchmark_def,
    iree_benchmark_state_t* benchmark_state) {
  iree_allocator_t host_allocator = benchmark_state->host_allocator;

  // Register the loader used to load (or find) the executable.
  iree_hal_executable_loader_t* executable_loader = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_executable_library_create_loader(
      host_allocator, &executable_loader));

  // Setup the specification used to perform the executable load.
  // This information is normally used to select the appropriate loader but in
  // this benchmark we only have a single one.
  iree_hal_executable_params_t executable_params;
  iree_hal_executable_params_initialize(&executable_params);
  executable_params.caching_mode =
      IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION |
      IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA |
      IREE_HAL_EXECUTABLE_CACHING_MODE_DISABLE_VERIFICATION;
  executable_params.executable_format =
      iree_make_cstring_view(FLAG_executable_format);

  // Load the executable data.
  // NOTE: ALIAS_PROVIDED_DATA above suggests the loader may reference this
  // memory directly, so |file_contents| is kept alive until after the
  // executable is released at the bottom of this function.
  iree_file_contents_t* file_contents = NULL;
  IREE_RETURN_IF_ERROR(iree_file_read_contents(FLAG_executable_file,
                                               host_allocator, &file_contents));
  executable_params.executable_data = file_contents->const_buffer;

  // Setup the layouts defining how each entry point is interpreted.
  // NOTE: we know for the embedded library loader that this is not required.
  // Other loaders may need it in which case it'll have to be provided.
  executable_params.executable_layout_count = 0;
  executable_params.executable_layouts = NULL;

  // Perform the load, which will fail if the executable cannot be loaded or
  // there was an issue with the layouts.
  iree_hal_executable_t* executable = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_executable_loader_try_load(
      executable_loader, &executable_params, &executable));
  iree_hal_local_executable_t* local_executable =
      iree_hal_local_executable_cast(executable);

  // Allocate workgroup-local memory that each invocation can use.
  // Size comes from the selected entry point's dispatch attributes (in pages)
  // when the executable provides an attribute table at all; otherwise zero.
  iree_byte_span_t local_memory = iree_make_byte_span(NULL, 0);
  iree_host_size_t local_memory_size =
      local_executable->dispatch_attrs
          ? local_executable->dispatch_attrs[FLAG_entry_point]
                    .local_memory_pages *
                IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE
          : 0;
  if (local_memory_size > 0) {
    // NOTE: never freed (see the leak note at the top of this function).
    IREE_RETURN_IF_ERROR(iree_allocator_malloc(
        host_allocator, local_memory_size, (void**)&local_memory.data));
    local_memory.data_length = local_memory_size;
  }

  // Allocate storage for buffers and populate them.
  // They only need to remain valid for the duration of the invocation and all
  // memory accessed by the invocation will come from here.
  iree_hal_allocator_t* heap_allocator = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_allocator_create_heap(
      iree_make_cstring_view("benchmark"), host_allocator, host_allocator,
      &heap_allocator));
  iree_hal_buffer_view_t* buffer_views[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT];
  void* binding_ptrs[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT];
  size_t binding_lengths[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT];
  for (iree_host_size_t i = 0; i < dispatch_params.binding_count; ++i) {
    // Parse the "shapextype[=values]" spec into a heap-backed buffer view.
    IREE_RETURN_IF_ERROR(iree_hal_buffer_view_parse(
        dispatch_params.bindings[i], heap_allocator, &buffer_views[i]));
    iree_hal_buffer_t* buffer = iree_hal_buffer_view_buffer(buffer_views[i]);
    iree_device_size_t buffer_length =
        iree_hal_buffer_view_byte_length(buffer_views[i]);
    // Persistently map the whole buffer; the mapping is never explicitly
    // unmapped (leak note above applies).
    iree_hal_buffer_mapping_t buffer_mapping = {{0}};
    IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
        buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
        IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE, 0,
        buffer_length, &buffer_mapping));
    binding_ptrs[i] = buffer_mapping.contents.data;
    binding_lengths[i] = (size_t)buffer_mapping.contents.data_length;
  }

  // Setup dispatch state.
  const iree_hal_executable_dispatch_state_v0_t dispatch_state = {
      .workgroup_count_x = FLAG_workgroup_count_x,
      .workgroup_count_y = FLAG_workgroup_count_y,
      .workgroup_count_z = FLAG_workgroup_count_z,
      .workgroup_size_x = FLAG_workgroup_size_x,
      .workgroup_size_y = FLAG_workgroup_size_y,
      .workgroup_size_z = FLAG_workgroup_size_z,
      .max_concurrency = FLAG_max_concurrency,
      .push_constant_count = dispatch_params.push_constant_count,
      .push_constants = &dispatch_params.push_constants[0].ui32,
      .binding_count = dispatch_params.binding_count,
      .binding_ptrs = binding_ptrs,
      .binding_lengths = binding_lengths,
  };

  // Execute benchmark the workgroup invocation.
  // Note that each iteration runs through the whole grid as it's important that
  // we are testing the memory access patterns: if we just ran the same single
  // tile processing the same exact region of memory over and over we are not
  // testing cache effects.
  int64_t dispatch_count = 0;
  while (iree_benchmark_keep_running(benchmark_state, /*batch_count=*/1)) {
    IREE_RETURN_IF_ERROR(iree_hal_local_executable_issue_dispatch_inline(
        local_executable, FLAG_entry_point, &dispatch_state, 0, local_memory));
    ++dispatch_count;
  }

  // To get a total time per invocation we set the item count to the total
  // invocations dispatched. That gives us both total dispatch and single
  // invocation times in the reporter output.
  int64_t total_invocations =
      dispatch_count * dispatch_state.workgroup_count_x *
      dispatch_state.workgroup_count_y * dispatch_state.workgroup_count_z;
  iree_benchmark_set_items_processed(benchmark_state, total_invocations);

  // Deallocate buffers.
  for (iree_host_size_t i = 0; i < dispatch_params.binding_count; ++i) {
    iree_hal_buffer_view_release(buffer_views[i]);
  }
  iree_hal_allocator_release(heap_allocator);

  // Unload.
  iree_hal_executable_release(executable);
  iree_hal_executable_loader_release(executable_loader);
  iree_file_contents_free(file_contents);

  return iree_ok_status();
}
+
+int main(int argc, char** argv) {
+  iree_flags_set_usage(
+      "executable_library_benchmark",
+      "Benchmarks a single entry point within an executable library.\n"
+      "Executable libraries can be found in your temp path when compiling\n"
+      "with `-iree-llvm-keep-linker-artifacts`. The parameters used can be\n"
+      "inferred from the entry point `hal.interface` and dispatches to it.\n"
+      "\n"
+      "Note that this tool is intentionally low level: you must specify all\n"
+      "of the push constant/binding parameters precisely as they are expected\n"
+      "by the executable. `iree-benchmark-module` is the user-friendly\n"
+      "benchmarking tool while this one favors direct access to the\n"
+      "executables (bypassing all of the IREE VM, HAL APIs, task system,\n"
+      "etc).\n"
+      "\n"
+      "Example --flagfile:\n"
+      "  --executable_format=EX_ELF\n"
+      "  --executable_file=iree/hal/local/elf/testdata/"
+      "elementwise_mul_x86_64.so\n"
+      "  --entry_point=0\n"
+      "  --workgroup_count_x=1\n"
+      "  --workgroup_count_y=1\n"
+      "  --workgroup_count_z=1\n"
+      "  --workgroup_size_x=1\n"
+      "  --workgroup_size_y=1\n"
+      "  --workgroup_size_z=1\n"
+      "  --binding=4xf32=1,2,3,4\n"
+      "  --binding=4xf32=100,200,300,400\n"
+      "  --binding=4xf32=0,0,0,0);\n"
+      "\n");
+
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv);
+  iree_benchmark_initialize(&argc, argv);
+
+  // TODO(benvanik): override these with our own flags.
+  iree_benchmark_def_t benchmark_def = {
+      .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
+               IREE_BENCHMARK_FLAG_USE_REAL_TIME,
+      .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
+      .minimum_duration_ns = 0,
+      .iteration_count = 0,
+      .run = iree_hal_executable_library_run,
+  };
+  iree_benchmark_register(iree_make_cstring_view("dispatch"), &benchmark_def);
+
+  iree_benchmark_run_specified();
+  return 0;
+}
diff --git a/runtime/src/iree/hal/local/executable_library_benchmark.md b/runtime/src/iree/hal/local/executable_library_benchmark.md
new file mode 100644
index 0000000..e988fbd
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_library_benchmark.md
@@ -0,0 +1,223 @@
# executable_library_benchmark
+
+Use `iree/hal/local/executable_library_benchmark --help` for more information.
+This tool is intended for CPU codegen developers only and cuts into the system
+at the lowest level possible: if you wish this was automated or easier to use
+then you should be looking elsewhere in the stack.
+
+The best inputs for this are those that result in a single dispatch function
+so that you don't have to look hard to figure out what all the flags are. As
+the fusion is compiler-driven this can be tricky to ensure.
+
+Keep in mind that in IREE the generated HAL executables and the functions they
+contain are an internal implementation detail of the compiler. Using this tool
+is effectively the same as taking some random assembly dump of a C program and
+trying to call one of the private functions inside of it: it's opaque,
+ever-changing, and unfriendly for a reason!
+
+---
+
+### Full example using the files checked in to the repo
+
+Start here to ensure you have a working build and see the expected output:
+
+```
+iree/hal/local/executable_library_benchmark \
+    --executable_format=EX_ELF \
+    --executable_file=iree/hal/local/elf/testdata/elementwise_mul_x86_64.so \
+    --entry_point=0 \
+    --workgroup_count_x=1 \
+    --workgroup_count_y=1 \
+    --workgroup_count_z=1 \
+    --workgroup_size_x=1 \
+    --workgroup_size_y=1 \
+    --workgroup_size_z=1 \
+    --binding=4xf32=1,2,3,4 \
+    --binding=4xf32=100,200,300,400 \
+    --binding=4xf32=0,0,0,0
+```
+
+```
+---------------------------------------------------------------------------------------------
+Benchmark                                   Time             CPU   Iterations UserCounters...
+---------------------------------------------------------------------------------------------
+BM_dispatch/process_time/real_time       90.7 ns         90.9 ns      7739262 items_per_second=11.0312M/s
+```
+
+---
+
+It can be helpful to put the flags in flagfiles (newline separated):
+
+```
+iree/hal/local/executable_library_benchmark --flagfile=my_flags.txt
+```
+
+For an example, the flags for an x86-64 run of a simple element-wise multiply:
+
+```
+iree/hal/local/executable_library_benchmark --flagfile=iree/hal/local/testdata/elementwise_mul_benchmark.txt
+```
+
+---
+
+### Running standalone HAL executables
+
+This approach uses an explicitly specified HAL executable without any associated
+host code. When doing this the executable layout specifying the bindings and
+push constants is chosen by the user instead of being automatically derived by
+the compiler. The design of the layout can have performance implications and
+it's important to try to match the kind of layout the compiler would produce or
+ensure that what's being tested is relatively immune to the potential effects
+(having enough work per workgroup, etc).
+
+1. Hand-author a `hal.executable.source` op or extract a `hal.executable`
+
See [testdata/elementwise_mul.mlir](testdata/elementwise_mul.mlir)
+for an example of the former that allows for the same source to be retargeted
+to many different formats/architectures.
+
+2. Translate the executable into the binary form consumed by the IREE loaders:
+
+```
+iree-translate \
+    -iree-mlir-to-hal-executable \
+    iree/hal/local/testdata/elementwise_mul.mlir \
+    -o=elementwise_mul.so \
+    -iree-hal-target-backends=dylib-llvm-aot \
+    -iree-llvm-debug-symbols=false \
+    -iree-llvm-target-triple=x86_64-pc-linux-elf
+```
+
+Note that the architecture and other related LLVM flags must be specified by the
user. Some examples can be seen in [testdata/generate.sh](testdata/generate.sh).
+
+3. Setup flags
+
+Use the above example flagfile as a template or read below for details on how
+to map the parameters. You'll need to specify the executable file and entry
+point, the workgroup parameters, and any bindings and push constants used for
+I/O.
+
+---
+
+### Running executables from full user modules
+
+This approach extracts the embedded executable files contained within a full
+IREE module and allows for benchmarking of any of them by using the
+`--entry_point=` flag to select the executable. It's important to remember that
+the exact set of bindings and parameters are implementation details of the
+compiler and subject to change at any time - when using this approach one must
+inspect the IR to find the proper way to call their kernels.
+
+1. Build your module with the flags you want for your target architecture:
+
+```
+iree-compile \
+    -iree-input-type=mhlo \
+    iree/samples/simple_embedding/simple_embedding_test.mlir \
+    -o=module.vmfb \
+    -iree-hal-target-backends=dylib-llvm-aot \
+    -iree-llvm-debug-symbols=false \
+    -iree-llvm-target-triple=x86_64-pc-linux-elf \
+    -mlir-print-ir-after-all \
+    >module_dump.mlir 2>&1
+```
+
+This produces `module_dump.mlir` containing the IR at various stages.
+You'll need this to determine the flags used to invoke the dispatch.
+
+2. Extract the executable shared object from the module:
+
+```
+7z e -aoa -bb0 -y module.vmfb
+```
+
+This (today) results in a single extracted file you pass to the tool:
+
+```
+--executable_format=EX_ELF
+--executable_file=_simple_mul_dispatch_0_llvm_binary_ex_elf.so
+```
+
+3. Find `ResolveEntryPointOrdinalsPass` and look for the dispatch:
+
+```mlir
+  hal.command_buffer.dispatch<%cmd : !hal.command_buffer>
+      target(%3 : !hal.executable)[1]
+      workgroups([%c5, %c6, %c7])
+```
+
+This maps to the following flags defining the executable entry point and counts:
+
+```
+--entry_point=1
+--workgroup_count_x=5
+--workgroup_count_y=6
+--workgroup_count_z=7
+```
+
+4. Look up in the IR from that for where bindings are specified:
+
+```mlir
+  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer>
+      layout(%0 : !hal.executable_layout)[%c0]
+      bindings([
+        %c0 = (%buffer : !hal.buffer)[%c0, %c16],
+        %c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
+        %c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
+      ])
+```
+
+This is 3 buffers of 16 bytes each, which is enough to call most things:
+
+```
+--binding=16xi8
+--binding=16xi8
+--binding=16xi8
+```
+
+If you want to provide real data then you can look for the `flow.executable`
+with the `!flow.dispatch.tensor` operands:
+
+```mlir
+  func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>,
+                              %arg1: !flow.dispatch.tensor<readonly:4xf32>,
+                              %arg2: !flow.dispatch.tensor<writeonly:4xf32>) {
+```
+
+Now we know each binding is 4 floats and can get more realistic test data:
+
+```
+--binding=4xf32=1,2,3,4
+--binding=4xf32=100,200,300,400
+--binding=4xf32=0,0,0,0
+```
+
+**Note that multiple tensors may alias to a single binding** - including
+tensors of differing data types. It's best to use the generic
+`[byte length]xi8` form above instead of trying to match the types in all but
+the most simple scenarios. You don't want to be using this tool to verify
+results and the only time it should matter what the value of the inputs are is
+if there is branching behavior inside the generated code itself. These are not
+good candidates for this tool.
+
+5. Look up in the IR to see the values of push constants, if required:
+
+```mlir
+  hal.command_buffer.push_constants<%cmd : !hal.command_buffer>
+      layout(%0 : !hal.executable_layout)
+      offset(0)
+      values(%c1, %c2, %c3, %c4) : i32, i32, i32, i32
+```
+
+These are often shape dimensions but by this point they are hard to guess if
+non-constant. This microbenchmarking approach is not generally suited for
+things like this but in cases where you know the meaning you can provide values:
+
+```
+--push_constant=1
+--push_constant=2
+--push_constant=3
+--push_constant=4
+```
diff --git a/runtime/src/iree/hal/local/executable_library_demo.c b/runtime/src/iree/hal/local/executable_library_demo.c
new file mode 100644
index 0000000..af18875
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_library_demo.c
@@ -0,0 +1,120 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/executable_library_demo.h"
+
+#include <stddef.h>
+
+// An executable entry point, called one or more times based on the 3D XYZ
+// workgroup count specified during the dispatch. Each invocation gets access to
+// the dispatch state via |dispatch_state| such as workgroup parameters, push
+// constants providing small arguments, and buffer bindings.
+//
+// See the iree_hal_executable_dispatch_state_v0_t struct for more
+// information on the fields here and how they can be used.
+//
+// WARNING: these functions must not access mutable global state: read-only data
+// may be used but as each invocation may be running concurrently with any
+// number of other invocations (from any number of user sessions!) all
+// communication between invocations must use the buffer bindings for I/O.
+//
+// This is a simple scalar addition:
+//    binding[1] = binding[0] + push_constant[0]
+static int dispatch_tile_a(
+    const iree_hal_executable_environment_v0_t* environment,
+    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+  const dispatch_tile_a_push_constants_t* push_constants =
+      (const dispatch_tile_a_push_constants_t*)dispatch_state->push_constants;
+  const float* src = ((const float*)dispatch_state->binding_ptrs[0]);
+  float* dst = ((float*)dispatch_state->binding_ptrs[1]);
+  const uint32_t x = workgroup_state->workgroup_id_x;
+  dst[x] = src[x] + push_constants->f0;
+  return 0;
+}
+
// Just another entry point.
// A no-op body kept so the library exports two entry points; returns 0
// (success) without touching any bindings.
static int dispatch_tile_b(
    const iree_hal_executable_environment_v0_t* environment,
    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
  return 0;
}
+
// Version/metadata header.
static const iree_hal_executable_library_header_t header = {
    // Declares what library version is present: newer runtimes may support
    // loading older executables but newer executables cannot load on older
    // runtimes.
    .version = IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST,
    // Name used for logging/diagnostics and rendezvous.
    .name = "demo_library",
    .features = IREE_HAL_EXECUTABLE_LIBRARY_FEATURE_NONE,
    .sanitizer = IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_NONE,
};
// Table of export function entry points.
// NOTE: all of the per-export tables below (ptrs/attrs/names/tags) are
// parallel arrays and must stay the same length (2) and in the same order.
static const iree_hal_executable_dispatch_v0_t entry_points[2] = {
    dispatch_tile_a,
    dispatch_tile_b,
};
// Optional attributes for each dispatch function used by the runtime.
// The table can be omitted if no attributes are non-zero. We don't use
// local_memory in our dispatches here and don't need to specify the sizes.
static const iree_hal_executable_dispatch_attrs_v0_t entry_attrs[2] = {
    {
        .local_memory_pages = 0,
    },
    {
        .local_memory_pages = 0,
    },
};
// Names for each entry point.
static const char* entry_point_names[2] = {
    "dispatch_tile_a",
    "dispatch_tile_b",
};
// User tags for debugging/logging; not used for anything but presentation.
static const char* entry_point_tags[2] = {
    "matmul+div",
    "conv2d[512x512]",
};
// The complete v0 library description returned by the query function below.
// Lives in read-only static storage; no imports and no constants are used.
static const iree_hal_executable_library_v0_t library = {
    .header = &header,
    .imports =
        {
            .count = 0,
            .symbols = NULL,
        },
    .exports =
        {
            .count = 2,
            .ptrs = entry_points,
            .attrs = entry_attrs,
            .names = entry_point_names,
            .tags = entry_point_tags,
        },
    .constants =
        {
            .count = 0,
        },
};
+
+// The primary access point to the executable: in a static library this is
+// just like any other C symbol that can be called from other code (like
+// executable_library_test.c does), and in dynamic libraries this is the symbol
+// that you would be dlsym'ing.
+//
+// This is just code: if the executable wants to return different headers based
+// on the currently executing architecture or the requested version it can. For
+// example, an executable may want to swap out a few entry points to an
+// architecture-specific version.
+const iree_hal_executable_library_header_t** demo_executable_library_query(
+    iree_hal_executable_library_version_t max_version,
+    const iree_hal_executable_environment_v0_t* environment) {
+  return max_version <= IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST
+             ? (const iree_hal_executable_library_header_t**)&library
+             : NULL;
+}
diff --git a/runtime/src/iree/hal/local/executable_library_demo.h b/runtime/src/iree/hal/local/executable_library_demo.h
new file mode 100644
index 0000000..f458768
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_library_demo.h
@@ -0,0 +1,53 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_EXECUTABLE_LIBRARY_DEMO_H_
+#define IREE_HAL_LOCAL_EXECUTABLE_LIBRARY_DEMO_H_
+
+#include <stdint.h>
+
+#include "iree/hal/local/executable_library.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Ideally we would have the IREE compiler generate a header like this so that
+// it's possible to manually call into executables. For now this is just an
+// example for the demo: the real HAL does not require this header as it
+// dlsym's the function pointer and packs the push constants itself.
+
// Push constants used in the 'dispatch_tile_a' entry point.
// Push constants are transported as raw uint32 words; this union overlays a
// typed view (f0) on values[0] so the float bit pattern can be read directly
// without an explicit cast at each use site.
typedef union {
  uint32_t values[1];
  struct {
    float f0;
  };
} dispatch_tile_a_push_constants_t;

// Returns a simple demo library with the following structure:
//
// Name: 'demo_library'
//
// [0] 'dispatch_tile_a': matmul+div
//       push constants: 1 (dispatch_tile_a_push_constants_t)
//       bindings: 2
//         [0] = R
//         [1] = W
//
// [1] 'dispatch_tile_b': conv2d[512x512]
//       push constants: 0
//       bindings: 0
//
// Returns NULL when the caller's |max_version| is older than the library's
// own version; |environment| allows architecture-specific specialization.
const iree_hal_executable_library_header_t** demo_executable_library_query(
    iree_hal_executable_library_version_t max_version,
    const iree_hal_executable_environment_v0_t* environment);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_EXECUTABLE_LIBRARY_DEMO_H_
diff --git a/runtime/src/iree/hal/local/executable_library_test.c b/runtime/src/iree/hal/local/executable_library_test.c
new file mode 100644
index 0000000..f925117
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_library_test.c
@@ -0,0 +1,124 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/executable_library.h"
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/local/executable_environment.h"
+#include "iree/hal/local/executable_library_demo.h"
+
+// Demonstration of the HAL-side of the iree_hal_executable_library_t ABI.
+// This is the lowest level of the system right before calling into generated
+// code.
+//
+// This shows what the various execution systems are doing (through a lot
+// of fancy means): all `inline_command_buffer.c` and `task_command_buffer.c`
+// lead up to just calling into the iree_hal_executable_dispatch_v0_t entry
+// point functions with a state structure and a workgroup XYZ.
+//
+// Below walks through acquiring the library pointer (which in this case is a
+// hand-coded example to show the codegen-side), setting up the I/O buffers and
+// state, and calling the function to do some math.
+//
+// See iree/hal/local/executable_library.h for more information.
+int main(int argc, char** argv) {
+  // Default environment.
+  iree_hal_executable_environment_v0_t environment;
+  iree_hal_executable_environment_initialize(iree_allocator_system(),
+                                             &environment);
+
+  // Query the library header at the requested version.
+  // The query call in this example is going into the handwritten demo code
+  // but could be targeted at generated files or runtime-loaded shared objects.
+  union {
+    const iree_hal_executable_library_header_t** header;
+    const iree_hal_executable_library_v0_t* v0;
+  } library;
+  library.header = demo_executable_library_query(
+      IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST, &environment);
+  IREE_ASSERT_NE(library.header, NULL, "version may not have matched");
+  const iree_hal_executable_library_header_t* header = *library.header;
+  IREE_ASSERT_NE(header, NULL, "version may not have matched");
+  IREE_ASSERT_LE(
+      header->version, IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST,
+      "expecting the library to have the same or older version as us");
+  IREE_ASSERT(strcmp(header->name, "demo_library") == 0,
+              "library name can be used to rendezvous in a registry");
+  IREE_ASSERT_GT(library.v0->exports.count, 0,
+                 "expected at least one entry point");
+
+  // Push constants are an array of 4-byte values that are much more efficient
+  // to specify (no buffer pointer indirection) and more efficient to access
+  // (static struct offset address calculation, all fit in a few cache lines,
+  // etc). They are limited in capacity, though, so only <=64(ish) are usable.
+  dispatch_tile_a_push_constants_t push_constants;
+  memset(&push_constants, 0, sizeof(push_constants));
+  push_constants.f0 = 5.0f;
+
+  // Setup the two buffer bindings the entry point is expecting.
+  // They only need to remain valid for the duration of the invocation and all
+  // memory accessed by the invocation will come from here.
+  float arg0[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+  float ret0[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+  const float ret0_expected[4] = {6.0f, 7.0f, 8.0f, 9.0f};
+  size_t binding_lengths[2] = {
+      sizeof(arg0),
+      sizeof(ret0),
+  };
+  void* binding_ptrs[2] = {
+      arg0,
+      ret0,
+  };
+
+  // Resolve the entry point by ordinal.
+  const iree_hal_executable_dispatch_v0_t entry_fn_ptr =
+      library.v0->exports.ptrs[0];
+
+  // Dispatch each workgroup with the same state.
+  const iree_hal_executable_dispatch_state_v0_t dispatch_state = {
+      .workgroup_count_x = 4,
+      .workgroup_count_y = 1,
+      .workgroup_count_z = 1,
+      .workgroup_size_x = 1,
+      .workgroup_size_y = 1,
+      .workgroup_size_z = 1,
+      .max_concurrency = 1,
+      .push_constant_count = IREE_ARRAYSIZE(push_constants.values),
+      .push_constants = push_constants.values,
+      .binding_count = IREE_ARRAYSIZE(binding_ptrs),
+      .binding_ptrs = binding_ptrs,
+      .binding_lengths = binding_lengths,
+  };
+  iree_hal_executable_workgroup_state_v0_t workgroup_state = {
+      .processor_id = iree_cpu_query_processor_id(),
+  };
+  // Walk the workgroup grid in z -> y -> x order, invoking each (x, y, z)
+  // exactly once with the shared dispatch state.
+  for (uint32_t z = 0; z < dispatch_state.workgroup_count_z; ++z) {
+    workgroup_state.workgroup_id_z = z;
+    for (uint32_t y = 0; y < dispatch_state.workgroup_count_y; ++y) {
+      workgroup_state.workgroup_id_y = y;
+      for (uint32_t x = 0; x < dispatch_state.workgroup_count_x; ++x) {
+        workgroup_state.workgroup_id_x = x;
+        // Invoke the workgroup (x, y, z).
+        int ret = entry_fn_ptr(&environment, &dispatch_state, &workgroup_state);
+        IREE_ASSERT_EQ(
+            ret, 0,
+            "if we have bounds checking enabled the executable will signal "
+            "us of badness");
+      }
+    }
+  }
+
+  // Ensure it worked.
+  // Process exit code: 0 when all outputs match, 1 otherwise.
+  bool all_match = true;
+  for (size_t i = 0; i < IREE_ARRAYSIZE(ret0_expected); ++i) {
+    IREE_ASSERT_EQ(ret0[i], ret0_expected[i], "math is hard");
+    all_match = all_match && ret0[i] == ret0_expected[i];
+  }
+  return all_match ? 0 : 1;
+}
diff --git a/runtime/src/iree/hal/local/executable_loader.c b/runtime/src/iree/hal/local/executable_loader.c
new file mode 100644
index 0000000..0703a9d
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_loader.c
@@ -0,0 +1,100 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/executable_loader.h"
+
+// Resolves |symbol_name| via |import_provider| into |out_fn_ptr|.
+// A trailing `?` marks the symbol as weakly linked: resolution failures are
+// ignored and *out_fn_ptr is left NULL, returning OK.
+iree_status_t iree_hal_executable_import_provider_resolve(
+    const iree_hal_executable_import_provider_t import_provider,
+    iree_string_view_t symbol_name, void** out_fn_ptr) {
+  IREE_ASSERT_ARGUMENT(out_fn_ptr);
+  *out_fn_ptr = NULL;
+
+  // A `?` suffix indicates the symbol is weakly linked and can be NULL.
+  bool is_weak = false;
+  if (iree_string_view_ends_with(symbol_name, iree_make_cstring_view("?"))) {
+    is_weak = true;
+    symbol_name = iree_string_view_substr(symbol_name, 0, symbol_name.size - 1);
+  }
+
+  // Note that it's fine for there to be no registered provider if all symbols
+  // are weak.
+  if (import_provider.resolve == NULL) {
+    if (is_weak) return iree_ok_status();
+    // Fixed grammar in the user-facing message ("while try" -> "while
+    // trying"); %.*s prints the (non-NUL-terminated) string view.
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "no import provider registered for resolving "
+                            "executable imports (while trying to resolve %.*s)",
+                            (int)symbol_name.size, symbol_name.data);
+  }
+
+  iree_status_t status =
+      import_provider.resolve(import_provider.self, symbol_name, out_fn_ptr);
+  if (!iree_status_is_ok(status) && is_weak) {
+    status = iree_status_ignore(status);  // ok to fail on weak symbols
+  }
+
+  return status;
+}
+
+// Base-type initializer called by loader subclasses after allocation.
+// Sets the vtable and import provider and seeds the atomic ref count
+// (initial value defined by iree_atomic_ref_count_init in atomics.h).
+void iree_hal_executable_loader_initialize(
+    const void* vtable, iree_hal_executable_import_provider_t import_provider,
+    iree_hal_executable_loader_t* out_base_loader) {
+  iree_atomic_ref_count_init(&out_base_loader->ref_count);
+  out_base_loader->vtable = vtable;
+  out_base_loader->import_provider = import_provider;
+}
+
+// Atomically increments the ref count; a NULL loader is tolerated (no-op).
+void iree_hal_executable_loader_retain(
+    iree_hal_executable_loader_t* executable_loader) {
+  if (IREE_LIKELY(executable_loader)) {
+    iree_atomic_ref_count_inc(&executable_loader->ref_count);
+  }
+}
+
+// Atomically decrements the ref count and destroys the loader via its vtable
+// when the last reference is dropped (dec returns the prior value, so == 1
+// means the count has now reached zero). NULL is tolerated (no-op).
+void iree_hal_executable_loader_release(
+    iree_hal_executable_loader_t* executable_loader) {
+  if (IREE_LIKELY(executable_loader) &&
+      iree_atomic_ref_count_dec(&executable_loader->ref_count) == 1) {
+    executable_loader->vtable->destroy(executable_loader);
+  }
+}
+
+// Thunks to the loader implementation's query_support vtable entry after
+// validating the loader pointer.
+bool iree_hal_executable_loader_query_support(
+    iree_hal_executable_loader_t* executable_loader,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  IREE_ASSERT_ARGUMENT(executable_loader);
+  return executable_loader->vtable->query_support(
+      executable_loader, caching_mode, executable_format);
+}
+
+// Linear scan over |loaders|, returning true on the first loader that claims
+// support for |executable_format|; false when none do (or loader_count == 0).
+bool iree_hal_query_any_executable_loader_support(
+    iree_host_size_t loader_count, iree_hal_executable_loader_t** loaders,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  IREE_ASSERT_ARGUMENT(loaders);
+  for (iree_host_size_t i = 0; i < loader_count; ++i) {
+    if (iree_hal_executable_loader_query_support(loaders[i], caching_mode,
+                                                 executable_format)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Validates |executable_params| internal consistency (layout/data pointers
+// must be non-NULL whenever their counts/lengths are non-zero) and then
+// thunks to the loader implementation's try_load vtable entry.
+iree_status_t iree_hal_executable_loader_try_load(
+    iree_hal_executable_loader_t* executable_loader,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  IREE_ASSERT_ARGUMENT(executable_loader);
+  IREE_ASSERT_ARGUMENT(executable_params);
+  IREE_ASSERT_ARGUMENT(!executable_params->executable_layout_count ||
+                       executable_params->executable_layouts);
+  IREE_ASSERT_ARGUMENT(!executable_params->executable_data.data_length ||
+                       executable_params->executable_data.data);
+  IREE_ASSERT_ARGUMENT(out_executable);
+  return executable_loader->vtable->try_load(executable_loader,
+                                             executable_params, out_executable);
+}
diff --git a/runtime/src/iree/hal/local/executable_loader.h b/runtime/src/iree/hal/local/executable_loader.h
new file mode 100644
index 0000000..ae8f6dc
--- /dev/null
+++ b/runtime/src/iree/hal/local/executable_loader.h
@@ -0,0 +1,149 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_EXECUTABLE_LOADER_H_
+#define IREE_HAL_LOCAL_EXECUTABLE_LOADER_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_import_provider_t
+//===----------------------------------------------------------------------===//
+
+// Interface used to resolve executable imports at load-time.
+// This virtualizes some external provider and does not take ownership of the
+// instance: callers must ensure that the provider remains valid for the
+// lifetime of the executable loader that it is providing for.
+typedef struct iree_hal_executable_import_provider_t {
+  // TODO(benvanik): version field.
+  IREE_API_UNSTABLE
+
+  // User-defined pointer passed to all functions.
+  void* self;
+
+  // Resolves an import symbol with the given |symbol_name| and stores a pointer
+  // to the function (or its context) in |out_fn_ptr|.
+  iree_status_t(IREE_API_PTR* resolve)(void* self,
+                                       iree_string_view_t symbol_name,
+                                       void** out_fn_ptr);
+} iree_hal_executable_import_provider_t;
+
+// Returns a provider that resolves no symbols (resolve == NULL); only weak
+// (`?`-suffixed) imports will succeed against it.
+static inline iree_hal_executable_import_provider_t
+iree_hal_executable_import_provider_null() {
+  iree_hal_executable_import_provider_t provider = {NULL, NULL};
+  return provider;
+}
+
+// Resolves an import symbol with the given |symbol_name| and stores a pointer
+// to the function (or its context) in |out_fn_ptr|.
+//
+// A |symbol_name| ending in `?` indicates that the symbol is weak and is
+// allowed to be resolved to NULL. Such cases will always return OK.
+iree_status_t iree_hal_executable_import_provider_resolve(
+    const iree_hal_executable_import_provider_t import_provider,
+    iree_string_view_t symbol_name, void** out_fn_ptr);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_loader_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_executable_loader_vtable_t
+    iree_hal_executable_loader_vtable_t;
+
+// Interface for compiled executable loader implementations.
+// A loader may be as simple as something that resolves function pointers in the
+// local executable for statically linked executables or as complex as a custom
+// relocatable ELF loader. Loaders are registered and persist for each device
+// they are attached to and may keep internal caches or memoize resources shared
+// by multiple loaded executables.
+//
+// Thread-safe - multiple threads may load executables (including the *same*
+// executable) simultaneously.
+typedef struct iree_hal_executable_loader_t {
+  iree_atomic_ref_count_t ref_count;  // managed via retain/release.
+  const iree_hal_executable_loader_vtable_t* vtable;  // implementation hooks.
+  iree_hal_executable_import_provider_t import_provider;  // resolve may be NULL.
+} iree_hal_executable_loader_t;
+
+// Initializes the base iree_hal_executable_loader_t type.
+// Called by subclasses upon allocating their loader.
+void iree_hal_executable_loader_initialize(
+    const void* vtable, iree_hal_executable_import_provider_t import_provider,
+    iree_hal_executable_loader_t* out_base_loader);
+
+// Retains the given |executable_loader| for the caller.
+void iree_hal_executable_loader_retain(
+    iree_hal_executable_loader_t* executable_loader);
+
+// Releases the given |executable_loader| from the caller.
+void iree_hal_executable_loader_release(
+    iree_hal_executable_loader_t* executable_loader);
+
+// Returns true if the loader can load executables of the given
+// |executable_format|. Note that loading may still fail if the executable uses
+// features not available on the current host or runtime.
+bool iree_hal_executable_loader_query_support(
+    iree_hal_executable_loader_t* executable_loader,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format);
+
+// Returns true if any loader in the list can load executables of the given
+// |executable_format|. Note that loading may still fail if the executable uses
+// features not available on the current host or runtime.
+bool iree_hal_query_any_executable_loader_support(
+    iree_host_size_t loader_count, iree_hal_executable_loader_t** loaders,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format);
+
+// Tries loading the executable data provided in the given format.
+// May fail even if the executable is valid if it requires features not
+// supported by the current host or runtime (such as available architectures,
+// imports, etc).
+//
+// Depending on loader ability the caching_mode is used to enable certain
+// features such as instrumented profiling. Not all formats support these
+// features and cooperation of both the compiler producing the executables and
+// the runtime loader and system are required.
+//
+// Returns IREE_STATUS_CANCELLED when the loader cannot load the file in the
+// given format; callers with multiple registered loaders can treat that as a
+// signal to try the next one.
+iree_status_t iree_hal_executable_loader_try_load(
+    iree_hal_executable_loader_t* executable_loader,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_loader_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_executable_loader_vtable_t {
+  // Tears down the loader; invoked by release when the ref count hits zero.
+  void(IREE_API_PTR* destroy)(iree_hal_executable_loader_t* executable_loader);
+
+  // Backs iree_hal_executable_loader_query_support.
+  bool(IREE_API_PTR* query_support)(
+      iree_hal_executable_loader_t* executable_loader,
+      iree_hal_executable_caching_mode_t caching_mode,
+      iree_string_view_t executable_format);
+
+  // Backs iree_hal_executable_loader_try_load.
+  iree_status_t(IREE_API_PTR* try_load)(
+      iree_hal_executable_loader_t* executable_loader,
+      const iree_hal_executable_params_t* executable_params,
+      iree_hal_executable_t** out_executable);
+} iree_hal_executable_loader_vtable_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_EXECUTABLE_LOADER_H_
diff --git a/runtime/src/iree/hal/local/inline_command_buffer.c b/runtime/src/iree/hal/local/inline_command_buffer.c
new file mode 100644
index 0000000..5b585d0
--- /dev/null
+++ b/runtime/src/iree/hal/local/inline_command_buffer.c
@@ -0,0 +1,553 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/inline_command_buffer.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/fpu_state.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/local/executable_environment.h"
+#include "iree/hal/local/executable_library.h"
+#include "iree/hal/local/local_descriptor_set_layout.h"
+#include "iree/hal/local/local_executable.h"
+#include "iree/hal/local/local_executable_layout.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_inline_command_buffer_t
+//===----------------------------------------------------------------------===//
+
+// Inline synchronous one-shot command "buffer".
+typedef struct iree_hal_inline_command_buffer_t {
+  iree_hal_command_buffer_t base;
+  iree_allocator_t host_allocator;  // heap this struct was allocated from.
+
+  // Mutable recording state; cleared wholesale by _reset.
+  struct {
+    // A flattened list of all available descriptor set bindings.
+    // As descriptor sets are pushed/bound the bindings will be updated to
+    // represent the fully-translated binding data pointer.
+    //
+    // TODO(benvanik): support proper mapping semantics and track the
+    // iree_hal_buffer_mapping_t and map/unmap where appropriate.
+    void* full_bindings[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT *
+                        IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT];
+    size_t full_binding_lengths[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT *
+                                IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT];
+
+    // Packed bindings scratch space used during dispatch. Executable bindings
+    // are packed into a dense list with unused bindings removed.
+    void* packed_bindings[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT *
+                          IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT];
+    size_t packed_binding_lengths[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT *
+                                  IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT];
+
+    // All available push constants updated each time push_constants is called.
+    // Reset only with the command buffer and otherwise will maintain its values
+    // during recording to allow for partial push_constants updates.
+    uint32_t push_constants[IREE_HAL_LOCAL_MAX_PUSH_CONSTANT_COUNT];
+
+    // Cached and initialized dispatch state reused for all dispatches.
+    // Individual dispatches must populate the dynamically changing fields like
+    // push_constant_count and binding_count.
+    iree_alignas(64) iree_hal_executable_dispatch_state_v0_t dispatch_state;
+
+    // An opaque tag used to reduce the cost of processor ID queries.
+    iree_cpu_processor_tag_t processor_tag;
+    // Guess at the current processor ID.
+    iree_cpu_processor_id_t processor_id;
+  } state;
+} iree_hal_inline_command_buffer_t;
+
+// Forward declaration so _cast can type-check; presumably defined near the
+// end of this file (outside this excerpt) — confirm.
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_inline_command_buffer_vtable;
+
+// Downcasts |base_value| to the inline implementation after asserting that
+// its vtable matches (debug-only type check).
+static iree_hal_inline_command_buffer_t* iree_hal_inline_command_buffer_cast(
+    iree_hal_command_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_inline_command_buffer_vtable);
+  return (iree_hal_inline_command_buffer_t*)base_value;
+}
+
+// Clears all recorded state (bindings, push constants, processor cache) and
+// re-links the cached dispatch_state pointers at the command buffer's own
+// storage, since memset wipes them too.
+static void iree_hal_inline_command_buffer_reset(
+    iree_hal_inline_command_buffer_t* command_buffer) {
+  memset(&command_buffer->state, 0, sizeof(command_buffer->state));
+
+  // Setup the cached dispatch state pointers that don't change.
+  iree_hal_executable_dispatch_state_v0_t* dispatch_state =
+      &command_buffer->state.dispatch_state;
+  dispatch_state->push_constants = command_buffer->state.push_constants;
+  dispatch_state->binding_ptrs = command_buffer->state.packed_bindings;
+  dispatch_state->binding_lengths =
+      command_buffer->state.packed_binding_lengths;
+}
+
+// Allocates and initializes an inline command buffer from |host_allocator|.
+// Requires |mode| to contain both ONE_SHOT and ALLOW_INLINE_EXECUTION since
+// commands execute synchronously during recording.
+iree_status_t iree_hal_inline_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_allocator_t host_allocator,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  *out_command_buffer = NULL;
+  if (!iree_all_bits_set(
+          mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
+                    IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) {
+    // This implementation only supports command buffers that are allowed to
+    // execute inline. This mode is a contract with the caller that it is ok if
+    // we begin executing prior to submission.
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "inline command buffers must have a mode with ALLOW_INLINE_EXECUTION");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_inline_command_buffer_t* command_buffer = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(*command_buffer), (void**)&command_buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_command_buffer_initialize(
+        device, mode, command_categories, queue_affinity,
+        &iree_hal_inline_command_buffer_vtable, &command_buffer->base);
+    // Allocator is captured so _destroy can free from the same heap.
+    command_buffer->host_allocator = host_allocator;
+    iree_hal_inline_command_buffer_reset(command_buffer);
+
+    *out_command_buffer = &command_buffer->base;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Resets recorded state and frees the command buffer using the allocator it
+// was created with (snapshotted before the struct is freed).
+static void iree_hal_inline_command_buffer_destroy(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_inline_command_buffer_t* command_buffer =
+      iree_hal_inline_command_buffer_cast(base_command_buffer);
+  iree_allocator_t host_allocator = command_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_inline_command_buffer_reset(command_buffer);
+  iree_allocator_free(host_allocator, command_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns true if |command_buffer| is an inline command buffer (dyn_cast
+// returns non-NULL on a vtable match).
+bool iree_hal_inline_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer) {
+  return iree_hal_command_buffer_dyn_cast(
+      command_buffer, &iree_hal_inline_command_buffer_vtable);
+}
+
+// dyn_cast vtable hook: returns |command_buffer| when |vtable| identifies
+// this implementation, otherwise NULL so the cast fails gracefully.
+static void* iree_hal_inline_command_buffer_dyn_cast(
+    iree_hal_command_buffer_t* command_buffer, const void* vtable) {
+  if (vtable == &iree_hal_inline_command_buffer_vtable) {
+    IREE_HAL_ASSERT_TYPE(command_buffer, vtable);
+    return command_buffer;
+  }
+  return NULL;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_inline_command_buffer_t recording
+//===----------------------------------------------------------------------===//
+
+// Forward declaration; definition is below/outside this excerpt — confirm.
+static iree_status_t iree_hal_inline_command_buffer_flush_tasks(
+    iree_hal_inline_command_buffer_t* command_buffer);
+
+// Updates the cached processor ID field in the command buffer.
+// The tag lets iree_cpu_requery_processor_id skip redundant queries.
+static void iree_hal_inline_command_buffer_update_processor_id(
+    iree_hal_inline_command_buffer_t* command_buffer) {
+  iree_cpu_requery_processor_id(&command_buffer->state.processor_tag,
+                                &command_buffer->state.processor_id);
+}
+
+// Begins recording: drops any stale state and seeds the processor ID cache.
+static iree_status_t iree_hal_inline_command_buffer_begin(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_inline_command_buffer_t* command_buffer =
+      iree_hal_inline_command_buffer_cast(base_command_buffer);
+  iree_hal_inline_command_buffer_reset(command_buffer);
+
+  // Query the processor ID we start out on. We may update it during execution.
+  iree_hal_inline_command_buffer_update_processor_id(command_buffer);
+
+  return iree_ok_status();
+}
+
+// Ends recording; since everything already executed inline there is nothing
+// to submit — just drop the recorded state.
+static iree_status_t iree_hal_inline_command_buffer_end(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_inline_command_buffer_t* command_buffer =
+      iree_hal_inline_command_buffer_cast(base_command_buffer);
+  iree_hal_inline_command_buffer_reset(command_buffer);
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_inline_command_buffer_t debug utilities
+//===----------------------------------------------------------------------===//
+
+// Opens a labeled debug group; currently a no-op placeholder.
+static void iree_hal_inline_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  // TODO(benvanik): tracy event stack.
+}
+
+// Closes the most recent debug group; currently a no-op placeholder.
+static void iree_hal_inline_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  // TODO(benvanik): tracy event stack.
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_execution_barrier
+//===----------------------------------------------------------------------===//
+
+// Barriers are trivially satisfied: each command completes before the next
+// is recorded, so there is nothing to order.
+static iree_status_t iree_hal_inline_command_buffer_execution_barrier(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  // No-op; we execute synchronously.
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_signal_event
+//===----------------------------------------------------------------------===//
+
+// Event signals need no bookkeeping in synchronous execution.
+static iree_status_t iree_hal_inline_command_buffer_signal_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // No-op; we execute synchronously.
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_reset_event
+//===----------------------------------------------------------------------===//
+
+// Event resets need no bookkeeping in synchronous execution.
+static iree_status_t iree_hal_inline_command_buffer_reset_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // No-op; we execute synchronously.
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_wait_events
+//===----------------------------------------------------------------------===//
+
+// Waits are always satisfied: any signaling command has already completed by
+// the time the wait is recorded.
+static iree_status_t iree_hal_inline_command_buffer_wait_events(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_host_size_t event_count, const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  // No-op; we execute synchronously.
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_discard_buffer
+//===----------------------------------------------------------------------===//
+
+static iree_status_t iree_hal_inline_command_buffer_discard_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+  // Could be treated as a cache invalidation as it indicates we won't be using
+  // the existing buffer contents again. No per-buffer state is tracked here,
+  // so there is nothing to do today.
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_fill_buffer
+//===----------------------------------------------------------------------===//
+
+// Executes the fill immediately as a synchronous mapped write of |pattern|
+// over the target range.
+static iree_status_t iree_hal_inline_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length) {
+  return iree_hal_buffer_map_fill(target_buffer, target_offset, length, pattern,
+                                  pattern_length);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_update_buffer
+//===----------------------------------------------------------------------===//
+
+// Executes the update immediately as a synchronous mapped write from host
+// memory (|source_buffer| + |source_offset|) into the target range.
+static iree_status_t iree_hal_inline_command_buffer_update_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  return iree_hal_buffer_map_write(
+      target_buffer, target_offset,
+      (const uint8_t*)source_buffer + source_offset, length);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_copy_buffer
+//===----------------------------------------------------------------------===//
+
+// Executes the copy immediately as a synchronous mapped buffer-to-buffer
+// transfer.
+static iree_status_t iree_hal_inline_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length) {
+  return iree_hal_buffer_map_copy(source_buffer, source_offset, target_buffer,
+                                  target_offset, length);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_push_constants
+//===----------------------------------------------------------------------===//
+// NOTE: command buffer state change only; enqueues no tasks.
+
+// Copies |values| into the push constant storage at byte |offset|.
+// State change only: bytes outside the written range keep their prior values
+// (the storage is reset only with the command buffer), enabling partial
+// updates across multiple calls.
+static iree_status_t iree_hal_inline_command_buffer_push_constants(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
+  iree_hal_inline_command_buffer_t* command_buffer =
+      iree_hal_inline_command_buffer_cast(base_command_buffer);
+
+  // Bounds check: writing [offset, offset + values_length) is valid whenever
+  // it ends at or before sizeof(storage); use `>` so a write that exactly
+  // fills the storage is accepted (`>=` would reject that legal case).
+  if (IREE_UNLIKELY(offset + values_length >
+                    sizeof(command_buffer->state.push_constants))) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "push constant range %zu (length=%zu) out of range",
+                            offset, values_length);
+  }
+
+  memcpy((uint8_t*)&command_buffer->state.push_constants + offset, values,
+         values_length);
+
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_push_descriptor_set
+//===----------------------------------------------------------------------===//
+// NOTE: command buffer state change only; enqueues no tasks.
+
+// Validates |set| and each binding index, maps each bound buffer range, and
+// records the resulting host pointers/lengths into the flattened
+// full_bindings table at ordinal set * MAX_BINDING_COUNT + binding.
+// State change only: nothing executes until a dispatch consumes the table.
+static iree_status_t iree_hal_inline_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  iree_hal_inline_command_buffer_t* command_buffer =
+      iree_hal_inline_command_buffer_cast(base_command_buffer);
+
+  if (IREE_UNLIKELY(set >= IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "set %u out of bounds", set);
+  }
+
+  iree_host_size_t binding_base =
+      set * IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT;
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    if (IREE_UNLIKELY(bindings[i].binding >=
+                      IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT)) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "buffer binding index out of bounds");
+    }
+    iree_host_size_t binding_ordinal = binding_base + bindings[i].binding;
+
+    // TODO(benvanik): track mapping so we can properly map/unmap/flush/etc.
+    // NOTE(review): the mapping is never explicitly unmapped here; persistent
+    // mapping mode appears to make that safe — confirm against buffer docs.
+    iree_hal_buffer_mapping_t buffer_mapping = {{0}};
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+        bindings[i].buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
+        IREE_HAL_MEMORY_ACCESS_ANY, bindings[i].offset, bindings[i].length,
+        &buffer_mapping));
+    command_buffer->state.full_bindings[binding_ordinal] =
+        buffer_mapping.contents.data;
+    command_buffer->state.full_binding_lengths[binding_ordinal] =
+        buffer_mapping.contents.data_length;
+  }
+
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_bind_descriptor_set
+//===----------------------------------------------------------------------===//
+// NOTE: command buffer state change only; enqueues no tasks.
+
+// Stub: only push_descriptor_set is supported by the inline command buffer
+// today; retained descriptor sets with dynamic offsets are not implemented.
+static iree_status_t iree_hal_inline_command_buffer_bind_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "descriptor set binding not yet implemented");
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_dispatch
+//===----------------------------------------------------------------------===//
+
+// Synchronously executes every workgroup of the dispatch on the calling
+// thread before returning; there is no queue, deferral, or concurrency.
+static iree_status_t iree_hal_inline_command_buffer_dispatch(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+  iree_hal_inline_command_buffer_t* command_buffer =
+      iree_hal_inline_command_buffer_cast(base_command_buffer);
+
+  // NOTE(review): entry_point indexes executable_layouts/dispatch_attrs with
+  // no bounds check here - assumed validated by the caller; confirm.
+  iree_hal_local_executable_t* local_executable =
+      iree_hal_local_executable_cast(executable);
+  iree_hal_local_executable_layout_t* local_layout =
+      local_executable->executable_layouts[entry_point];
+  iree_host_size_t local_memory_size =
+      local_executable->dispatch_attrs
+          ? local_executable->dispatch_attrs[entry_point].local_memory_pages *
+                IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE
+          : 0;
+
+  // Update the ID of the processor we are running on.
+  // We don't know how much time has passed since we last updated as we are
+  // running inline with the user program; if we knew we were going to be
+  // handling a batch of dispatches we could reduce the amount of times we call
+  // this - but that's what the task system is for.
+  iree_hal_inline_command_buffer_update_processor_id(command_buffer);
+
+  iree_hal_executable_dispatch_state_v0_t* dispatch_state =
+      &command_buffer->state.dispatch_state;
+
+  // TODO(benvanik): expose on API or keep fixed on executable.
+  dispatch_state->workgroup_size_x = 1;
+  dispatch_state->workgroup_size_y = 1;
+  dispatch_state->workgroup_size_z = 1;
+  dispatch_state->workgroup_count_x = workgroup_x;
+  dispatch_state->workgroup_count_y = workgroup_y;
+  dispatch_state->workgroup_count_z = workgroup_z;
+
+  // Single-threaded.
+  dispatch_state->max_concurrency = 1;
+
+  // Push constants are pulled directly from the command buffer state, but we
+  // only allow the dispatch to read what we know is initialized based on the
+  // layout.
+  dispatch_state->push_constant_count = local_layout->push_constants;
+
+  // Produce the dense binding list based on the declared bindings used.
+  // This allows us to change the descriptor sets and bindings counts supported
+  // in the HAL independent of any executable as each executable just gets the
+  // flat dense list and doesn't care about our descriptor set stuff.
+  //
+  // Note that we are just directly setting the binding data pointers here with
+  // no ownership/retaining/etc - it's part of the HAL contract that buffers are
+  // kept valid for the duration they may be in use.
+  iree_hal_local_binding_mask_t used_binding_mask = local_layout->used_bindings;
+  iree_host_size_t used_binding_count =
+      iree_math_count_ones_u64(used_binding_mask);
+  dispatch_state->binding_count = used_binding_count;
+  void** binding_ptrs = (void**)dispatch_state->binding_ptrs;
+  size_t* binding_lengths = (size_t*)dispatch_state->binding_lengths;
+  iree_host_size_t binding_base = 0;
+  // Walk set bits of the mask low-to-high; each set bit is one used binding
+  // ordinal in the flattened table populated by push_descriptor_set.
+  for (iree_host_size_t i = 0; i < used_binding_count; ++i) {
+    int mask_offset = iree_math_count_trailing_zeros_u64(used_binding_mask);
+    int binding_ordinal = binding_base + mask_offset;
+    binding_base += mask_offset + 1;
+    used_binding_mask = iree_shr(used_binding_mask, mask_offset + 1);
+    binding_ptrs[i] = command_buffer->state.full_bindings[binding_ordinal];
+    if (!binding_ptrs[i]) {
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "(flat) binding %d is NULL", binding_ordinal);
+    }
+    binding_lengths[i] =
+        command_buffer->state.full_binding_lengths[binding_ordinal];
+  }
+
+  // TODO(benvanik): plumb through an arena or fixed-size reservation to use.
+  // For now when deploying to devices where you want something like the
+  // inline command buffer you probably don't want 256KB of transient memory
+  // getting allocated and retained implicitly - this should be a compiler
+  // option. For now we just malloc here to make things work and strongly
+  // encourage the kind of user who wants synchronous inline execution to not
+  // also want tons of scratch memory.
+  iree_byte_span_t local_memory = iree_make_byte_span(NULL, local_memory_size);
+  if (local_memory_size > 0) {
+    IREE_RETURN_IF_ERROR(iree_allocator_malloc(command_buffer->host_allocator,
+                                               local_memory_size,
+                                               (void**)&local_memory.data));
+  }
+
+  // Since we are running on a borrowed thread, we know nothing about the
+  // floating point state. Reset it.
+  iree_fpu_state_t fpu_state =
+      iree_fpu_state_push(IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO);
+  iree_status_t status = iree_hal_local_executable_issue_dispatch_inline(
+      local_executable, entry_point, dispatch_state,
+      command_buffer->state.processor_id, local_memory);
+  iree_fpu_state_pop(fpu_state);
+
+  // Scratch memory is freed even when the dispatch failed.
+  if (local_memory.data) {
+    iree_allocator_free(command_buffer->host_allocator, local_memory.data);
+  }
+  return status;
+}
+
+// Overlay for reading a packed xyz workgroup count out of mapped memory.
+typedef union iree_hal_vec3_t {
+  struct {
+    uint32_t x;
+    uint32_t y;
+    uint32_t z;
+  };
+  uint32_t value[3];
+} iree_hal_vec3_t;
+
+// Reads the 3 x uint32 workgroup count from |workgroups_buffer| and forwards
+// to the direct dispatch path. Reading at record time is valid here because
+// inline command buffers execute immediately.
+static iree_status_t iree_hal_inline_command_buffer_dispatch_indirect(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer,
+    iree_device_size_t workgroups_offset) {
+  // TODO(benvanik): track mapping so we can properly map/unmap/flush/etc.
+  iree_hal_buffer_mapping_t buffer_mapping = {{0}};
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+      workgroups_buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
+      IREE_HAL_MEMORY_ACCESS_READ, workgroups_offset, 3 * sizeof(uint32_t),
+      &buffer_mapping));
+  iree_hal_vec3_t workgroup_count =
+      *(const iree_hal_vec3_t*)buffer_mapping.contents.data;
+  return iree_hal_inline_command_buffer_dispatch(
+      base_command_buffer, executable, entry_point, workgroup_count.x,
+      workgroup_count.y, workgroup_count.z);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_vtable_t
+//===----------------------------------------------------------------------===//
+
+// Function table registered for all inline command buffer instances.
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_inline_command_buffer_vtable = {
+        .destroy = iree_hal_inline_command_buffer_destroy,
+        .dyn_cast = iree_hal_inline_command_buffer_dyn_cast,
+        .begin = iree_hal_inline_command_buffer_begin,
+        .end = iree_hal_inline_command_buffer_end,
+        .begin_debug_group = iree_hal_inline_command_buffer_begin_debug_group,
+        .end_debug_group = iree_hal_inline_command_buffer_end_debug_group,
+        .execution_barrier = iree_hal_inline_command_buffer_execution_barrier,
+        .signal_event = iree_hal_inline_command_buffer_signal_event,
+        .reset_event = iree_hal_inline_command_buffer_reset_event,
+        .wait_events = iree_hal_inline_command_buffer_wait_events,
+        .discard_buffer = iree_hal_inline_command_buffer_discard_buffer,
+        .fill_buffer = iree_hal_inline_command_buffer_fill_buffer,
+        .update_buffer = iree_hal_inline_command_buffer_update_buffer,
+        .copy_buffer = iree_hal_inline_command_buffer_copy_buffer,
+        .push_constants = iree_hal_inline_command_buffer_push_constants,
+        .push_descriptor_set =
+            iree_hal_inline_command_buffer_push_descriptor_set,
+        .bind_descriptor_set =
+            iree_hal_inline_command_buffer_bind_descriptor_set,
+        .dispatch = iree_hal_inline_command_buffer_dispatch,
+        .dispatch_indirect = iree_hal_inline_command_buffer_dispatch_indirect,
+};
diff --git a/runtime/src/iree/hal/local/inline_command_buffer.h b/runtime/src/iree/hal/local/inline_command_buffer.h
new file mode 100644
index 0000000..b98f5e2
--- /dev/null
+++ b/runtime/src/iree/hal/local/inline_command_buffer.h
@@ -0,0 +1,40 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_INLINE_COMMAND_BUFFER_H_
+#define IREE_HAL_LOCAL_INLINE_COMMAND_BUFFER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates an inline synchronous one-shot single-threaded command "buffer".
+// This is designed for ultra-low latency situations where we know the command
+// buffer is going to be submitted with no wait semaphores indicating that it
+// can begin execution immediately. No inter-command-buffer scheduling will be
+// performed and all barriers and events are ignored.
+//
+// Executes all work on the calling thread synchronously (today).
+//
+// Must have IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION set.
+iree_status_t iree_hal_inline_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_allocator_t host_allocator,
+    iree_hal_command_buffer_t** out_command_buffer);
+
+// Returns true if |command_buffer| is an inline command buffer created with
+// iree_hal_inline_command_buffer_create; false for any other implementation.
+bool iree_hal_inline_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_INLINE_COMMAND_BUFFER_H_
diff --git a/runtime/src/iree/hal/local/loaders/BUILD b/runtime/src/iree/hal/local/loaders/BUILD
new file mode 100644
index 0000000..ac48f01
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/BUILD
@@ -0,0 +1,102 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Default implementations for HAL types that use the host resources.
+# These are generally just wrappers around host heap memory and host threads.
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# Each loader advertises its availability to clients via an
+# IREE_HAL_HAVE_*_LOADER=1 define on the library target.
+
+iree_runtime_cc_library(
+    name = "embedded_library_loader",
+    srcs = ["embedded_library_loader.c"],
+    hdrs = ["embedded_library_loader.h"],
+    defines = [
+        "IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER=1",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:executable_library",
+        "//runtime/src/iree/hal/local/elf:elf_module",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "static_library_loader",
+    srcs = ["static_library_loader.c"],
+    hdrs = ["static_library_loader.h"],
+    defines = [
+        "IREE_HAL_HAVE_STATIC_LIBRARY_LOADER=1",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:executable_environment",
+        "//runtime/src/iree/hal/local:executable_library",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "system_library_loader",
+    srcs = ["system_library_loader.c"],
+    hdrs = ["system_library_loader.h"],
+    defines = [
+        "IREE_HAL_HAVE_SYSTEM_LIBRARY_LOADER=1",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal:dynamic_library",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:executable_library",
+    ],
+)
+
+# CMake-only: the VMVX loader is compiled only when a VMVX driver is enabled.
+iree_cmake_extra_content(
+    content = """
+if(${IREE_HAL_DRIVER_VMVX} OR ${IREE_HAL_DRIVER_VMVX_SYNC})
+""",
+    inline = True,
+)
+
+iree_runtime_cc_library(
+    name = "vmvx_module_loader",
+    srcs = ["vmvx_module_loader.c"],
+    hdrs = ["vmvx_module_loader.h"],
+    defines = [
+        "IREE_HAL_HAVE_VMVX_MODULE_LOADER=1",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:executable_library",
+        "//runtime/src/iree/modules/vmvx",
+        "//runtime/src/iree/vm",
+        "//runtime/src/iree/vm:bytecode_module",
+    ],
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+""",
+    inline = True,
+)
diff --git a/runtime/src/iree/hal/local/loaders/CMakeLists.txt b/runtime/src/iree/hal/local/loaders/CMakeLists.txt
new file mode 100644
index 0000000..71c8b03
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/CMakeLists.txt
@@ -0,0 +1,97 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/local/loaders/BUILD                                     #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    embedded_library_loader
+  HDRS
+    "embedded_library_loader.h"
+  SRCS
+    "embedded_library_loader.c"
+  DEPS
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+    iree::hal
+    iree::hal::local
+    iree::hal::local::elf::elf_module
+    iree::hal::local::executable_library
+  DEFINES
+    "IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER=1"
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    static_library_loader
+  HDRS
+    "static_library_loader.h"
+  SRCS
+    "static_library_loader.c"
+  DEPS
+    iree::base
+    iree::base::tracing
+    iree::hal
+    iree::hal::local
+    iree::hal::local::executable_environment
+    iree::hal::local::executable_library
+  DEFINES
+    "IREE_HAL_HAVE_STATIC_LIBRARY_LOADER=1"
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    system_library_loader
+  HDRS
+    "system_library_loader.h"
+  SRCS
+    "system_library_loader.c"
+  DEPS
+    iree::base
+    iree::base::core_headers
+    iree::base::internal::dynamic_library
+    iree::base::tracing
+    iree::hal
+    iree::hal::local
+    iree::hal::local::executable_library
+  DEFINES
+    "IREE_HAL_HAVE_SYSTEM_LIBRARY_LOADER=1"
+  PUBLIC
+)
+
+# NOTE(review): this guard is emitted by the iree_cmake_extra_content block in
+# the sibling BUILD file; hand edits here are lost when bazel_to_cmake runs.
+if(${IREE_HAL_DRIVER_VMVX} OR ${IREE_HAL_DRIVER_VMVX_SYNC})
+
+iree_cc_library(
+  NAME
+    vmvx_module_loader
+  HDRS
+    "vmvx_module_loader.h"
+  SRCS
+    "vmvx_module_loader.c"
+  DEPS
+    iree::base
+    iree::base::tracing
+    iree::hal
+    iree::hal::local
+    iree::hal::local::executable_library
+    iree::modules::vmvx
+    iree::vm
+    iree::vm::bytecode_module
+  DEFINES
+    "IREE_HAL_HAVE_VMVX_MODULE_LOADER=1"
+  PUBLIC
+)
+
+endif()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/local/loaders/embedded_library_loader.c b/runtime/src/iree/hal/local/loaders/embedded_library_loader.c
new file mode 100644
index 0000000..017579e
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/embedded_library_loader.c
@@ -0,0 +1,377 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/loaders/embedded_library_loader.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/elf/elf_module.h"
+#include "iree/hal/local/executable_library.h"
+#include "iree/hal/local/local_executable.h"
+#include "iree/hal/local/local_executable_layout.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_elf_executable_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_elf_executable_t {
+  iree_hal_local_executable_t base;
+
+  // Loaded ELF module.
+  iree_elf_module_t module;
+
+  // Name used for the file field in tracy and debuggers.
+  iree_string_view_t identifier;
+
+  // Queried metadata from the library.
+  union {
+    const iree_hal_executable_library_header_t** header;
+    const iree_hal_executable_library_v0_t* v0;
+  } library;
+
+  // One layout per entry point (flexible array member); storage handed to
+  // iree_hal_local_executable_initialize during creation.
+  iree_hal_local_executable_layout_t* layouts[];
+} iree_hal_elf_executable_t;
+
+static const iree_hal_local_executable_vtable_t iree_hal_elf_executable_vtable;
+
+// Queries library metadata from the loaded ELF module and verifies that it is
+// usable (sanitizer compatibility); populates identifier and dispatch attrs.
+static iree_status_t iree_hal_elf_executable_query_library(
+    iree_hal_elf_executable_t* executable) {
+  // Get the exported symbol used to get the library metadata.
+  iree_hal_executable_library_query_fn_t query_fn = NULL;
+  IREE_RETURN_IF_ERROR(iree_elf_module_lookup_export(
+      &executable->module, IREE_HAL_EXECUTABLE_LIBRARY_EXPORT_NAME,
+      (void**)&query_fn));
+
+  // Query for a compatible version of the library.
+  // NOTE(review): a non-NULL header is treated as v0-compatible below -
+  // presumably the query fn only returns headers it can satisfy; confirm.
+  executable->library.header =
+      (const iree_hal_executable_library_header_t**)iree_elf_call_p_ip(
+          query_fn, IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST,
+          &executable->base.environment);
+  if (!executable->library.header) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "executable does not support this version of the runtime (%08X)",
+        IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST);
+  }
+  const iree_hal_executable_library_header_t* header =
+      *executable->library.header;
+
+  // Ensure that if the library is built for a particular sanitizer that we also
+  // were compiled with that sanitizer enabled.
+  switch (header->sanitizer) {
+    case IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_NONE:
+      // Always safe even if the host has a sanitizer enabled; it just means
+      // that we won't be able to catch anything from within the executable,
+      // however checks outside will (often) still trigger when guard pages are
+      // dirtied/etc.
+      break;
+    default:
+      return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                              "executable requires sanitizer but they are not "
+                              "yet supported with embedded libraries: %u",
+                              (uint32_t)header->sanitizer);
+  }
+
+  executable->identifier = iree_make_cstring_view(header->name);
+
+  executable->base.dispatch_attrs = executable->library.v0->exports.attrs;
+
+  return iree_ok_status();
+}
+
+// Resolves all of the imports declared by the executable using the given
+// |import_provider|. All imports are treated as required: any resolution
+// failure aborts the load with the provider's error.
+static iree_status_t iree_hal_elf_executable_resolve_imports(
+    iree_hal_elf_executable_t* executable,
+    const iree_hal_executable_import_provider_t import_provider) {
+  const iree_hal_executable_import_table_v0_t* import_table =
+      &executable->library.v0->imports;
+  if (!import_table->count) return iree_ok_status();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // All calls from the loaded ELF route through our thunk function so that we
+  // can adapt to ABI differences.
+  executable->base.environment.import_thunk =
+      (iree_hal_executable_import_thunk_v0_t)iree_elf_thunk_i_p;
+
+  // Allocate storage for the imports.
+  // Freed in iree_hal_elf_executable_destroy.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0,
+      iree_allocator_malloc(
+          executable->base.host_allocator,
+          import_table->count * sizeof(*executable->base.environment.imports),
+          (void**)&executable->base.environment.imports));
+
+  // Try to resolve each import.
+  // NOTE: imports are sorted alphabetically and if we cared we could use this
+  // information to more efficiently resolve the symbols from providers (O(n)
+  // walk vs potential O(nlogn)/O(n^2)).
+  for (uint32_t i = 0; i < import_table->count; ++i) {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
+        iree_hal_executable_import_provider_resolve(
+            import_provider, iree_make_cstring_view(import_table->symbols[i]),
+            (void**)&executable->base.environment.imports[i]));
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Allocates and initializes an ELF executable from |executable_params|:
+// loads the ELF, queries library metadata, resolves imports, and (unless
+// verification is disabled) checks that entry point and constant counts match
+// what the caller supplied. On failure the partial executable is released.
+static iree_status_t iree_hal_elf_executable_create(
+    const iree_hal_executable_params_t* executable_params,
+    const iree_hal_executable_import_provider_t import_provider,
+    iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) {
+  IREE_ASSERT_ARGUMENT(executable_params);
+  IREE_ASSERT_ARGUMENT(executable_params->executable_data.data &&
+                       executable_params->executable_data.data_length);
+  IREE_ASSERT_ARGUMENT(!executable_params->executable_layout_count ||
+                       executable_params->executable_layouts);
+  IREE_ASSERT_ARGUMENT(!executable_params->constant_count ||
+                       executable_params->constants);
+  IREE_ASSERT_ARGUMENT(out_executable);
+  *out_executable = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(benvanik): rework this so that we load and query the library before
+  // allocating so that we know the import count. Today since we allocate first
+  // we need an additional allocation once we've seen the import table.
+  // Single allocation layout: [struct][layouts array][copied constants].
+  iree_hal_elf_executable_t* executable = NULL;
+  iree_host_size_t total_size =
+      sizeof(*executable) +
+      executable_params->executable_layout_count *
+          sizeof(*executable->layouts) +
+      executable_params->constant_count * sizeof(*executable_params->constants);
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&executable);
+  if (iree_status_is_ok(status)) {
+    iree_hal_local_executable_initialize(
+        &iree_hal_elf_executable_vtable,
+        executable_params->executable_layout_count,
+        executable_params->executable_layouts, &executable->layouts[0],
+        host_allocator, &executable->base);
+
+    // Copy executable constants so we own them.
+    if (executable_params->constant_count > 0) {
+      uint32_t* target_constants =
+          (uint32_t*)((uint8_t*)executable + sizeof(*executable) +
+                      executable_params->executable_layout_count *
+                          sizeof(*executable->layouts));
+      memcpy(target_constants, executable_params->constants,
+             executable_params->constant_count *
+                 sizeof(*executable_params->constants));
+      executable->base.environment.constants = target_constants;
+    }
+  }
+  if (iree_status_is_ok(status)) {
+    // Attempt to load the ELF module.
+    status = iree_elf_module_initialize_from_memory(
+        executable_params->executable_data, /*import_table=*/NULL,
+        host_allocator, &executable->module);
+  }
+  if (iree_status_is_ok(status)) {
+    // Query metadata and get the entry point function pointers.
+    status = iree_hal_elf_executable_query_library(executable);
+  }
+  if (iree_status_is_ok(status)) {
+    // Resolve imports, if any.
+    status =
+        iree_hal_elf_executable_resolve_imports(executable, import_provider);
+  }
+
+  const bool disable_verification =
+      iree_all_bits_set(executable_params->caching_mode,
+                        IREE_HAL_EXECUTABLE_CACHING_MODE_DISABLE_VERIFICATION);
+  if (iree_status_is_ok(status) && !disable_verification) {
+    // Check to make sure that the entry point count matches the layout count.
+    if (executable->library.v0->exports.count !=
+        executable_params->executable_layout_count) {
+      status =
+          iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                           "executable provides %u entry points but caller "
+                           "provided %zu; must match",
+                           executable->library.v0->exports.count,
+                           executable_params->executable_layout_count);
+    }
+  }
+  if (iree_status_is_ok(status) && !disable_verification) {
+    // Check to make sure that the constant table has values for all constants.
+    if (executable->library.v0->constants.count !=
+        executable_params->constant_count) {
+      status = iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                "executable requires %u constants but caller "
+                                "provided %zu; must match",
+                                executable->library.v0->constants.count,
+                                executable_params->constant_count);
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_executable = (iree_hal_executable_t*)executable;
+  } else {
+    iree_hal_executable_release((iree_hal_executable_t*)executable);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Tears down an ELF executable: unloads the module, releases the resolved
+// import table (if any), and frees the combined allocation.
+static void iree_hal_elf_executable_destroy(
+    iree_hal_executable_t* base_executable) {
+  iree_hal_elf_executable_t* elf_executable =
+      (iree_hal_elf_executable_t*)base_executable;
+  iree_allocator_t allocator = elf_executable->base.host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Unload the module first; nothing may call into it afterward.
+  iree_elf_module_deinitialize(&elf_executable->module);
+
+  // The import table is a secondary allocation made during import resolution.
+  if (elf_executable->base.environment.imports != NULL) {
+    iree_allocator_free(allocator,
+                        (void*)elf_executable->base.environment.imports);
+  }
+
+  iree_hal_local_executable_deinitialize(
+      (iree_hal_local_executable_t*)base_executable);
+  iree_allocator_free(allocator, elf_executable);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Invokes entry point |ordinal| through the ELF thunk and maps the int return
+// code (0 = success) onto an iree_status_t.
+static iree_status_t iree_hal_elf_executable_issue_call(
+    iree_hal_local_executable_t* base_executable, iree_host_size_t ordinal,
+    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+  iree_hal_elf_executable_t* executable =
+      (iree_hal_elf_executable_t*)base_executable;
+  const iree_hal_executable_library_v0_t* library = executable->library.v0;
+
+  if (IREE_UNLIKELY(ordinal >= library->exports.count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "entry point ordinal out of bounds");
+  }
+
+// Tracing-only: attribute the zone to the entry point name/tag when present.
+// The macros below compile to no-ops when tracing is disabled.
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  iree_string_view_t entry_point_name = iree_string_view_empty();
+  if (library->exports.names != NULL) {
+    entry_point_name = iree_make_cstring_view(library->exports.names[ordinal]);
+  }
+  if (iree_string_view_is_empty(entry_point_name)) {
+    entry_point_name = iree_make_cstring_view("unknown_elf_call");
+  }
+  IREE_TRACE_ZONE_BEGIN_EXTERNAL(
+      z0, executable->identifier.data, executable->identifier.size, ordinal,
+      entry_point_name.data, entry_point_name.size, NULL, 0);
+  if (library->exports.tags != NULL) {
+    const char* tag = library->exports.tags[ordinal];
+    if (tag) {
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, tag);
+    }
+  }
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+  int ret = iree_elf_call_i_ppp(library->exports.ptrs[ordinal],
+                                (void*)&base_executable->environment,
+                                (void*)dispatch_state, (void*)workgroup_state);
+
+  IREE_TRACE_ZONE_END(z0);
+
+  return ret == 0 ? iree_ok_status()
+                  : iree_make_status(
+                        IREE_STATUS_INTERNAL,
+                        "executable entry point returned catastrophic error %d",
+                        ret);
+}
+
+// Definition of the vtable forward-declared above query_library.
+static const iree_hal_local_executable_vtable_t iree_hal_elf_executable_vtable =
+    {
+        .base =
+            {
+                .destroy = iree_hal_elf_executable_destroy,
+            },
+        .issue_call = iree_hal_elf_executable_issue_call,
+};
+
+//===----------------------------------------------------------------------===//
+// iree_hal_embedded_library_loader_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_embedded_library_loader_t {
+  iree_hal_executable_loader_t base;
+  // Allocator used both for the loader itself and executables it creates.
+  iree_allocator_t host_allocator;
+} iree_hal_embedded_library_loader_t;
+
+static const iree_hal_executable_loader_vtable_t
+    iree_hal_embedded_library_loader_vtable;
+
+// See embedded_library_loader.h: creates a loader for the platform-agnostic
+// embedded ELF library format.
+iree_status_t iree_hal_embedded_library_loader_create(
+    iree_hal_executable_import_provider_t import_provider,
+    iree_allocator_t host_allocator,
+    iree_hal_executable_loader_t** out_executable_loader) {
+  IREE_ASSERT_ARGUMENT(out_executable_loader);
+  *out_executable_loader = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_embedded_library_loader_t* loader = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*loader), (void**)&loader);
+  if (iree_status_is_ok(status)) {
+    iree_hal_executable_loader_initialize(
+        &iree_hal_embedded_library_loader_vtable, import_provider,
+        &loader->base);
+    loader->host_allocator = host_allocator;
+    *out_executable_loader = (iree_hal_executable_loader_t*)loader;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the loader using the allocator it was created with.
+static void iree_hal_embedded_library_loader_destroy(
+    iree_hal_executable_loader_t* base_executable_loader) {
+  iree_hal_embedded_library_loader_t* loader =
+      (iree_hal_embedded_library_loader_t*)base_executable_loader;
+  iree_allocator_t allocator = loader->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(allocator, loader);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Only the embedded ELF format for the host architecture is supported; the
+// caching mode does not affect support.
+static bool iree_hal_embedded_library_loader_query_support(
+    iree_hal_executable_loader_t* base_executable_loader,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  iree_string_view_t expected_format =
+      iree_make_cstring_view("embedded-elf-" IREE_ARCH);
+  return iree_string_view_equal(executable_format, expected_format);
+}
+
+// Loads the ELF image in |executable_params| and wraps it in an executable
+// handle; resolution uses the import provider captured at loader creation.
+static iree_status_t iree_hal_embedded_library_loader_try_load(
+    iree_hal_executable_loader_t* base_executable_loader,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  iree_hal_embedded_library_loader_t* loader =
+      (iree_hal_embedded_library_loader_t*)base_executable_loader;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_hal_elf_executable_create(
+      executable_params, base_executable_loader->import_provider,
+      loader->host_allocator, out_executable);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Definition of the vtable forward-declared alongside the loader type.
+static const iree_hal_executable_loader_vtable_t
+    iree_hal_embedded_library_loader_vtable = {
+        .destroy = iree_hal_embedded_library_loader_destroy,
+        .query_support = iree_hal_embedded_library_loader_query_support,
+        .try_load = iree_hal_embedded_library_loader_try_load,
+};
diff --git a/runtime/src/iree/hal/local/loaders/embedded_library_loader.h b/runtime/src/iree/hal/local/loaders/embedded_library_loader.h
new file mode 100644
index 0000000..7d75396
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/embedded_library_loader.h
@@ -0,0 +1,33 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_LOADERS_ELF_LIBRARY_LOADER_H_
+#define IREE_HAL_LOCAL_LOADERS_ELF_LIBRARY_LOADER_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/local/executable_loader.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates an executable loader that can load minimally-featured ELF dynamic
+// libraries on any platform. This allows us to use a single file format across
+// all operating systems at the cost of some missing debugging/profiling
+// features.
+iree_status_t iree_hal_embedded_library_loader_create(
+    iree_hal_executable_import_provider_t import_provider,
+    iree_allocator_t host_allocator,
+    iree_hal_executable_loader_t** out_executable_loader);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_LOADERS_ELF_LIBRARY_LOADER_H_
diff --git a/runtime/src/iree/hal/local/loaders/static_library_loader.c b/runtime/src/iree/hal/local/loaders/static_library_loader.c
new file mode 100644
index 0000000..e123938
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/static_library_loader.c
@@ -0,0 +1,312 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/loaders/static_library_loader.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_environment.h"
+#include "iree/hal/local/local_executable.h"
+#include "iree/hal/local/local_executable_layout.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_static_executable_t
+//===----------------------------------------------------------------------===//
+
// A local executable backed by a library statically linked into the binary;
// no loading or relocation is required at runtime.
typedef struct iree_hal_static_executable_t {
  iree_hal_local_executable_t base;

  // Name used for the file field in tracy and debuggers.
  iree_string_view_t identifier;

  // Views of the compiler-generated library metadata; all members alias the
  // same header pointer and the active view is chosen by header->version.
  union {
    const iree_hal_executable_library_header_t** header;
    const iree_hal_executable_library_v0_t* v0;
  } library;

  // Storage for executable_params->executable_layout_count layout pointers
  // (flexible array member; sized at allocation time in _create).
  iree_hal_local_executable_layout_t* layouts[];
} iree_hal_static_executable_t;
+
// Defined at the bottom of the section after the functions it references.
static const iree_hal_local_executable_vtable_t
    iree_hal_static_executable_vtable;
+
+static iree_status_t iree_hal_static_executable_create(
+    const iree_hal_executable_params_t* executable_params,
+    const iree_hal_executable_library_header_t** library_header,
+    const iree_hal_executable_import_provider_t import_provider,
+    iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) {
+  IREE_ASSERT_ARGUMENT(executable_params);
+  IREE_ASSERT_ARGUMENT(!executable_params->executable_layout_count ||
+                       executable_params->executable_layouts);
+  IREE_ASSERT_ARGUMENT(!executable_params->constant_count ||
+                       executable_params->constants);
+  IREE_ASSERT_ARGUMENT(library_header);
+  IREE_ASSERT_ARGUMENT(out_executable);
+  *out_executable = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_static_executable_t* executable = NULL;
+  iree_host_size_t total_size =
+      sizeof(*executable) +
+      executable_params->executable_layout_count *
+          sizeof(*executable->layouts) +
+      executable_params->constant_count * sizeof(*executable_params->constants);
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&executable);
+  if (iree_status_is_ok(status)) {
+    iree_hal_local_executable_initialize(
+        &iree_hal_static_executable_vtable,
+        executable_params->executable_layout_count,
+        executable_params->executable_layouts, &executable->layouts[0],
+        host_allocator, &executable->base);
+    executable->library.header = library_header;
+    executable->identifier = iree_make_cstring_view((*library_header)->name);
+    executable->base.dispatch_attrs = executable->library.v0->exports.attrs;
+
+    // Copy executable constants so we own them.
+    if (executable_params->constant_count > 0) {
+      uint32_t* target_constants =
+          (uint32_t*)((uint8_t*)executable + sizeof(*executable) +
+                      executable_params->executable_layout_count *
+                          sizeof(*executable->layouts));
+      memcpy(target_constants, executable_params->constants,
+             executable_params->constant_count *
+                 sizeof(*executable_params->constants));
+      executable->base.environment.constants = target_constants;
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    if (executable->library.v0->imports.count > 0) {
+      status =
+          iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                           "static libraries do not support imports and should "
+                           "directly link against the functions they require");
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_executable = (iree_hal_executable_t*)executable;
+  } else {
+    *out_executable = NULL;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_static_executable_destroy(
+    iree_hal_executable_t* base_executable) {
+  iree_hal_static_executable_t* executable =
+      (iree_hal_static_executable_t*)base_executable;
+  iree_allocator_t host_allocator = executable->base.host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_local_executable_deinitialize(
+      (iree_hal_local_executable_t*)base_executable);
+  iree_allocator_free(host_allocator, executable);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
// Invokes entry point |ordinal| of the static library for a single workgroup.
// Returns IREE_STATUS_INTERNAL if the entry point reports a non-zero result.
static iree_status_t iree_hal_static_executable_issue_call(
    iree_hal_local_executable_t* base_executable, iree_host_size_t ordinal,
    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
  iree_hal_static_executable_t* executable =
      (iree_hal_static_executable_t*)base_executable;
  const iree_hal_executable_library_v0_t* library = executable->library.v0;

  if (IREE_UNLIKELY(ordinal >= library->exports.count)) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "entry point ordinal out of bounds");
  }

  // Tracing-only: resolve a human-readable name/tag for the zone; the export
  // name and tag tables are optional in the library metadata.
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
  iree_string_view_t entry_point_name = iree_string_view_empty();
  if (library->exports.names != NULL) {
    entry_point_name = iree_make_cstring_view(library->exports.names[ordinal]);
  }
  if (iree_string_view_is_empty(entry_point_name)) {
    entry_point_name = iree_make_cstring_view("unknown_dylib_call");
  }
  IREE_TRACE_ZONE_BEGIN_EXTERNAL(
      z0, executable->identifier.data, executable->identifier.size, ordinal,
      entry_point_name.data, entry_point_name.size, NULL, 0);
  if (library->exports.tags != NULL) {
    const char* tag = library->exports.tags[ordinal];
    if (tag) {
      IREE_TRACE_ZONE_APPEND_TEXT(z0, tag);
    }
  }
#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION

  int ret = library->exports.ptrs[ordinal](&base_executable->environment,
                                           dispatch_state, workgroup_state);

  // z0 is only declared under instrumentation above; this macro must compile
  // away when tracing is disabled.
  IREE_TRACE_ZONE_END(z0);

  // Entry points return 0 on success; any other value is treated as fatal.
  return ret == 0 ? iree_ok_status()
                  : iree_make_status(
                        IREE_STATUS_INTERNAL,
                        "executable entry point returned catastrophic error %d",
                        ret);
}
+
// Virtual function table wiring the static executable into the local
// executable interface; destroy routes through the base vtable.
static const iree_hal_local_executable_vtable_t
    iree_hal_static_executable_vtable = {
        .base =
            {
                .destroy = iree_hal_static_executable_destroy,
            },
        .issue_call = iree_hal_static_executable_issue_call,
};
+
+//===----------------------------------------------------------------------===//
+// iree_hal_static_library_loader_t
+//===----------------------------------------------------------------------===//
+
// An executable loader serving libraries that were statically linked into the
// binary and registered at creation time.
typedef struct iree_hal_static_library_loader_t {
  iree_hal_executable_loader_t base;
  // Allocator the loader itself was allocated from; used again on destroy.
  iree_allocator_t host_allocator;
  // Number of entries in |libraries|.
  iree_host_size_t library_count;
  // Queried library header pointers, one per registered library
  // (flexible array member sized at allocation time in _create).
  const iree_hal_executable_library_header_t** const libraries[];
} iree_hal_static_library_loader_t;
+
// Defined at the bottom of the file after the functions it references.
static const iree_hal_executable_loader_vtable_t
    iree_hal_static_library_loader_vtable;
+
// Creates a loader exposing |library_count| statically-linked libraries.
// Each |library_query_fns| entry is invoked immediately to fetch its library
// header; creation fails if any query returns NULL or reports a version newer
// than IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST.
iree_status_t iree_hal_static_library_loader_create(
    iree_host_size_t library_count,
    const iree_hal_executable_library_query_fn_t* library_query_fns,
    iree_hal_executable_import_provider_t import_provider,
    iree_allocator_t host_allocator,
    iree_hal_executable_loader_t** out_executable_loader) {
  IREE_ASSERT_ARGUMENT(!library_count || library_query_fns);
  IREE_ASSERT_ARGUMENT(out_executable_loader);
  *out_executable_loader = NULL;
  IREE_TRACE_ZONE_BEGIN(z0);

  // Single allocation: loader struct + trailing array of header pointers.
  iree_hal_static_library_loader_t* executable_loader = NULL;
  iree_host_size_t total_size =
      sizeof(*executable_loader) +
      sizeof(executable_loader->libraries[0]) * library_count;
  iree_status_t status = iree_allocator_malloc(host_allocator, total_size,
                                               (void**)&executable_loader);
  if (iree_status_is_ok(status)) {
    iree_hal_executable_loader_initialize(
        &iree_hal_static_library_loader_vtable, import_provider,
        &executable_loader->base);
    executable_loader->host_allocator = host_allocator;
    executable_loader->library_count = library_count;

    // Default environment to enable initialization.
    iree_hal_executable_environment_v0_t environment;
    iree_hal_executable_environment_initialize(host_allocator, &environment);

    // Query and verify the libraries provided all match our expected version.
    // It's rare they won't, however static libraries generated with a newer
    // version of the IREE compiler that are then linked with an older version
    // of the runtime are difficult to spot otherwise.
    for (iree_host_size_t i = 0; i < library_count; ++i) {
      const iree_hal_executable_library_header_t* const* header_ptr =
          library_query_fns[i](IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST,
                               &environment);
      if (!header_ptr) {
        status = iree_make_status(
            IREE_STATUS_UNAVAILABLE,
            "failed to query library header for runtime version %d",
            IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST);
        break;
      }
      const iree_hal_executable_library_header_t* header = *header_ptr;
      IREE_TRACE_ZONE_APPEND_TEXT(z0, header->name);
      if (header->version > IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST) {
        status = iree_make_status(
            IREE_STATUS_FAILED_PRECONDITION,
            "executable does not support this version of the "
            "runtime (executable: %d, runtime: %d)",
            header->version, IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST);
        break;
      }
      // memcpy rather than assignment so we can write through the
      // const-qualified flexible array member slot.
      memcpy((void*)&executable_loader->libraries[i], &header_ptr,
             sizeof(header_ptr));
    }
  }

  if (iree_status_is_ok(status)) {
    *out_executable_loader = (iree_hal_executable_loader_t*)executable_loader;
  } else {
    // Safe if allocation failed (executable_loader remains NULL).
    iree_allocator_free(host_allocator, executable_loader);
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}
+
+static void iree_hal_static_library_loader_destroy(
+    iree_hal_executable_loader_t* base_executable_loader) {
+  iree_hal_static_library_loader_t* executable_loader =
+      (iree_hal_static_library_loader_t*)base_executable_loader;
+  iree_allocator_t host_allocator = executable_loader->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, executable_loader);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static bool iree_hal_static_library_loader_query_support(
+    iree_hal_executable_loader_t* base_executable_loader,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  return iree_string_view_equal(executable_format,
+                                iree_make_cstring_view("static"));
+}
+
+static iree_status_t iree_hal_static_library_loader_try_load(
+    iree_hal_executable_loader_t* base_executable_loader,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  iree_hal_static_library_loader_t* executable_loader =
+      (iree_hal_static_library_loader_t*)base_executable_loader;
+
+  // The executable data is just the name of the library.
+  iree_string_view_t library_name = iree_make_string_view(
+      (const char*)executable_params->executable_data.data,
+      executable_params->executable_data.data_length);
+
+  // Linear scan of the registered libraries; there's usually only one per
+  // module (aka source model) and as such it's a small list and probably not
+  // worth optimizing. We could sort the libraries list by name on loader
+  // creation to perform a binary-search fairly easily, though, at the cost of
+  // the additional code size.
+  for (iree_host_size_t i = 0; i < executable_loader->library_count; ++i) {
+    const iree_hal_executable_library_header_t* header =
+        *executable_loader->libraries[i];
+    if (iree_string_view_equal(library_name,
+                               iree_make_cstring_view(header->name))) {
+      return iree_hal_static_executable_create(
+          executable_params, executable_loader->libraries[i],
+          base_executable_loader->import_provider,
+          executable_loader->host_allocator, out_executable);
+    }
+  }
+  return iree_make_status(IREE_STATUS_NOT_FOUND,
+                          "no static library with the name '%.*s' registered",
+                          (int)library_name.size, library_name.data);
+}
+
// Virtual function table wiring the static library loader into the
// iree_hal_executable_loader_t interface.
static const iree_hal_executable_loader_vtable_t
    iree_hal_static_library_loader_vtable = {
        .destroy = iree_hal_static_library_loader_destroy,
        .query_support = iree_hal_static_library_loader_query_support,
        .try_load = iree_hal_static_library_loader_try_load,
};
diff --git a/runtime/src/iree/hal/local/loaders/static_library_loader.h b/runtime/src/iree/hal/local/loaders/static_library_loader.h
new file mode 100644
index 0000000..63ed4c4
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/static_library_loader.h
@@ -0,0 +1,47 @@
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef IREE_HAL_LOCAL_LOADERS_STATIC_LIBRARY_LOADER_H_
#define IREE_HAL_LOCAL_LOADERS_STATIC_LIBRARY_LOADER_H_

#include <stdbool.h>
#include <stdint.h>

#include "iree/base/api.h"
#include "iree/hal/local/executable_library.h"
#include "iree/hal/local/executable_loader.h"

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

// Creates a library loader that exposes the provided libraries to the HAL for
// use as executables.
//
// This loader will handle executable formats of 'static'. Version checks will
// ensure that the IREE compiler-produced static library version is one that the
// runtime can support.
//
// The name defined on each library will be used to lookup the executables and
// must match with the names used during compilation exactly. The
// iree_hal_executable_params_t used to reference the executables will contain
// the library name and be used to lookup the library in the list.
//
// Multiple static library loaders can be registered in cases when several
// independent sets of libraries are linked in however duplicate names both
// within and across loaders will result in undefined behavior.
//
// |library_query_fns| are invoked during creation to fetch and version-check
// each library header; the function pointer array is not retained afterward.
iree_status_t iree_hal_static_library_loader_create(
    iree_host_size_t library_count,
    const iree_hal_executable_library_query_fn_t* library_query_fns,
    iree_hal_executable_import_provider_t import_provider,
    iree_allocator_t host_allocator,
    iree_hal_executable_loader_t** out_executable_loader);

#ifdef __cplusplus
}  // extern "C"
#endif  // __cplusplus

#endif  // IREE_HAL_LOCAL_LOADERS_STATIC_LIBRARY_LOADER_H_
diff --git a/runtime/src/iree/hal/local/loaders/system_library_loader.c b/runtime/src/iree/hal/local/loaders/system_library_loader.c
new file mode 100644
index 0000000..ebd0213
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/system_library_loader.c
@@ -0,0 +1,508 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/loaders/system_library_loader.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/internal/dynamic_library.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_library.h"
+#include "iree/hal/local/local_executable.h"
+#include "iree/hal/local/local_executable_layout.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_system_executable_footer_t
+//===----------------------------------------------------------------------===//
+
// An optional footer that may exist on the system library that is used to add
// additional debug information for use directly by IREE, such as PDB or dSYM
// files. This is only expected to be present when there is a debug database
// but we may want to extend it in the future.
typedef struct iree_hal_system_executable_footer_t {
  uint8_t magic[8];  // IREE_HAL_SYSTEM_EXECUTABLE_FOOTER_MAGIC
  uint32_t version;  // IREE_HAL_SYSTEM_EXECUTABLE_FOOTER_VERSION
  uint32_t flags;    // reserved
  // Offset of the library within the parent data stream.
  // Almost always zero but here in case we want to allow for chaining.
  uint64_t library_offset;
  // Size of the system library in bytes.
  uint64_t library_size;
  // Offset of the start of the embedded debug database within the parent data
  // stream. There may be padding between the library and this offset.
  uint64_t debug_offset;
  // Size of the debug database in bytes.
  uint64_t debug_size;
} iree_hal_system_executable_footer_t;

// EXPERIMENTAL: this is not a stable interface yet. The binary format may
// change at any time.
// NOTE: the magic string includes an explicit trailing NUL so it is exactly
// 8 bytes, matching magic[8] (see the static_assert in _try_query_footer).
#define IREE_HAL_SYSTEM_EXECUTABLE_FOOTER_MAGIC "IREEDBG\0"
#define IREE_HAL_SYSTEM_EXECUTABLE_FOOTER_VERSION 0
+
+// Tries to find an iree_hal_system_executable_footer_t at the end of the
+// given executable data stream.
+static const iree_hal_system_executable_footer_t*
+iree_hal_system_executable_try_query_footer(
+    iree_const_byte_span_t executable_data) {
+  if (executable_data.data_length <
+      sizeof(iree_hal_system_executable_footer_t)) {
+    return NULL;
+  }
+  const uint8_t* footer_ptr = executable_data.data +
+                              executable_data.data_length -
+                              sizeof(iree_hal_system_executable_footer_t);
+  const iree_hal_system_executable_footer_t* footer =
+      (const iree_hal_system_executable_footer_t*)(footer_ptr);
+  static_assert(sizeof(IREE_HAL_SYSTEM_EXECUTABLE_FOOTER_MAGIC) - /*NUL*/ 1 ==
+                    sizeof(footer->magic),
+                "magic number value must match struct size");
+  if (memcmp(footer->magic, IREE_HAL_SYSTEM_EXECUTABLE_FOOTER_MAGIC,
+             sizeof(footer->magic)) != 0) {
+    return NULL;
+  }
+  return footer;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_system_executable_t
+//===----------------------------------------------------------------------===//
+
// A local executable backed by a platform dynamic library loaded from memory.
typedef struct iree_hal_system_executable_t {
  iree_hal_local_executable_t base;

  // Loaded platform dynamic library.
  iree_dynamic_library_t* handle;

  // Name used for the file field in tracy and debuggers.
  iree_string_view_t identifier;

  // Queried metadata from the library; all members alias the same header
  // pointer and the active view is chosen by header->version.
  union {
    const iree_hal_executable_library_header_t** header;
    const iree_hal_executable_library_v0_t* v0;
  } library;

  // Storage for executable_params->executable_layout_count layout pointers
  // (flexible array member; sized at allocation time in _create).
  iree_hal_local_executable_layout_t* layouts[];
} iree_hal_system_executable_t;

// Defined later in the file after the functions it references.
static const iree_hal_local_executable_vtable_t
    iree_hal_system_executable_vtable;
+
// Loads the executable and optional debug database from the given
// |executable_data| in memory. The memory must remain live for the lifetime
// of the executable.
static iree_status_t iree_hal_system_executable_load(
    iree_hal_system_executable_t* executable,
    iree_const_byte_span_t executable_data, iree_allocator_t host_allocator) {
  // Check to see if the library has a footer indicating embedded debug data.
  iree_const_byte_span_t library_data = iree_make_const_byte_span(NULL, 0);
  iree_const_byte_span_t debug_data = iree_make_const_byte_span(NULL, 0);
  const iree_hal_system_executable_footer_t* footer =
      iree_hal_system_executable_try_query_footer(executable_data);
  if (footer) {
    // Debug file present; split the data contents.
    iree_host_size_t data_length =
        executable_data.data_length - sizeof(*footer);
    // NOTE(review): debug_offset/debug_size come from the footer in the
    // executable data; the uint64 sum below could wrap before the range check
    // if the data is untrusted — confirm trust assumptions at this boundary.
    if (footer->library_size > data_length ||
        footer->debug_offset + footer->debug_size > data_length) {
      return iree_make_status(
          IREE_STATUS_OUT_OF_RANGE,
          "system library footer references out of range bytes");
    }
    library_data =
        iree_make_const_byte_span(executable_data.data, footer->library_size);
    debug_data = iree_make_const_byte_span(
        executable_data.data + footer->debug_offset, footer->debug_size);
  } else {
    // Entire data contents are the library.
    library_data = executable_data;
  }

  // Load the platform dynamic library from the in-memory contents.
  IREE_RETURN_IF_ERROR(iree_dynamic_library_load_from_memory(
      iree_make_cstring_view("aot"), library_data,
      IREE_DYNAMIC_LIBRARY_FLAG_NONE, host_allocator, &executable->handle));

  // Attach the embedded debug database (if any) so debuggers/profilers can
  // symbolize the library.
  if (debug_data.data_length > 0) {
    IREE_RETURN_IF_ERROR(iree_dynamic_library_attach_symbols_from_memory(
        executable->handle, debug_data));
  }

  return iree_ok_status();
}
+
// Queries the loaded library for its metadata header and verifies this
// runtime can execute it (ABI version and sanitizer compatibility). On
// success populates executable->library, identifier, and dispatch_attrs.
static iree_status_t iree_hal_system_executable_query_library(
    iree_hal_system_executable_t* executable) {
  // Get the exported symbol used to get the library metadata.
  iree_hal_executable_library_query_fn_t query_fn = NULL;
  IREE_RETURN_IF_ERROR(iree_dynamic_library_lookup_symbol(
      executable->handle, IREE_HAL_EXECUTABLE_LIBRARY_EXPORT_NAME,
      (void**)&query_fn));

  // Query for a compatible version of the library.
  executable->library.header =
      query_fn(IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST,
               &executable->base.environment);
  if (!executable->library.header) {
    return iree_make_status(
        IREE_STATUS_FAILED_PRECONDITION,
        "executable does not support this version of the runtime (%08X)",
        IREE_HAL_EXECUTABLE_LIBRARY_VERSION_LATEST);
  }
  const iree_hal_executable_library_header_t* header =
      *executable->library.header;

  // Ensure that if the library is built for a particular sanitizer that we also
  // were compiled with that sanitizer enabled.
  switch (header->sanitizer) {
    case IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_NONE:
      // Always safe even if the host has a sanitizer enabled; it just means
      // that we won't be able to catch anything from within the executable,
      // however checks outside will (often) still trigger when guard pages are
      // dirtied/etc.
      break;
#if defined(IREE_SANITIZER_ADDRESS)
    case IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_ADDRESS:
      // ASAN is compiled into the host and we can load this library.
      break;
#else
    case IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_ADDRESS:
      return iree_make_status(
          IREE_STATUS_UNAVAILABLE,
          "executable library is compiled with ASAN support but the host "
          "runtime is not compiled with it enabled; add -fsanitize=address to "
          "the runtime compilation options");
#endif  // IREE_SANITIZER_ADDRESS
#if defined(IREE_SANITIZER_THREAD)
    case IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_THREAD:
      // TSAN is compiled into the host and we can load this library.
      break;
#else
    case IREE_HAL_EXECUTABLE_LIBRARY_SANITIZER_THREAD:
      return iree_make_status(
          IREE_STATUS_UNAVAILABLE,
          "executable library is compiled with TSAN support but the host "
          "runtime is not compiled with it enabled; add -fsanitize=thread to "
          "the runtime compilation options");
#endif  // IREE_SANITIZER_THREAD
    default:
      // Unknown/future sanitizer kind; be conservative and refuse to load.
      return iree_make_status(
          IREE_STATUS_UNAVAILABLE,
          "executable library requires a sanitizer the host runtime is not "
          "compiled to enable/understand: %u",
          (uint32_t)header->sanitizer);
  }

  executable->identifier = iree_make_cstring_view(header->name);

  executable->base.dispatch_attrs = executable->library.v0->exports.attrs;

  return iree_ok_status();
}
+
// v0 import thunk: invokes the import directly with no extra marshaling.
// Installed as environment.import_thunk during import resolution.
static int iree_hal_system_executable_import_thunk_v0(
    iree_hal_executable_import_v0_t fn_ptr, void* import_params) {
  return fn_ptr(import_params);
}
+
// Resolves all of the imports declared by the executable using the given
// |import_provider|. The resolved import table is allocated from the
// executable's host allocator and freed in iree_hal_system_executable_destroy.
static iree_status_t iree_hal_system_executable_resolve_imports(
    iree_hal_system_executable_t* executable,
    const iree_hal_executable_import_provider_t import_provider) {
  const iree_hal_executable_import_table_v0_t* import_table =
      &executable->library.v0->imports;
  if (!import_table->count) return iree_ok_status();
  IREE_TRACE_ZONE_BEGIN(z0);

  // Pass all imports right through.
  executable->base.environment.import_thunk =
      iree_hal_system_executable_import_thunk_v0;

  // Allocate storage for the imports.
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0,
      iree_allocator_malloc(
          executable->base.host_allocator,
          import_table->count * sizeof(*executable->base.environment.imports),
          (void**)&executable->base.environment.imports));

  // Try to resolve each import.
  // NOTE: imports are sorted alphabetically and if we cared we could use this
  // information to more efficiently resolve the symbols from providers (O(n)
  // walk vs potential O(nlogn)/O(n^2)).
  for (uint32_t i = 0; i < import_table->count; ++i) {
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0,
        iree_hal_executable_import_provider_resolve(
            import_provider, iree_make_cstring_view(import_table->symbols[i]),
            (void**)&executable->base.environment.imports[i]));
  }

  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
+
// Creates a HAL executable by loading the system dynamic library embedded in
// |executable_params->executable_data|, querying its metadata, and resolving
// any imports it declares. On failure the partially-constructed executable is
// released (which also unloads the library and frees any imports).
static iree_status_t iree_hal_system_executable_create(
    const iree_hal_executable_params_t* executable_params,
    const iree_hal_executable_import_provider_t import_provider,
    iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) {
  IREE_ASSERT_ARGUMENT(executable_params);
  IREE_ASSERT_ARGUMENT(executable_params->executable_data.data &&
                       executable_params->executable_data.data_length);
  IREE_ASSERT_ARGUMENT(!executable_params->executable_layout_count ||
                       executable_params->executable_layouts);
  IREE_ASSERT_ARGUMENT(!executable_params->constant_count ||
                       executable_params->constants);
  IREE_ASSERT_ARGUMENT(out_executable);
  *out_executable = NULL;
  IREE_TRACE_ZONE_BEGIN(z0);

  // Single allocation sized for the struct + trailing layout pointer array +
  // copied constants.
  iree_hal_system_executable_t* executable = NULL;
  iree_host_size_t total_size =
      sizeof(*executable) +
      executable_params->executable_layout_count *
          sizeof(*executable->layouts) +
      executable_params->constant_count * sizeof(*executable_params->constants);
  iree_status_t status =
      iree_allocator_malloc(host_allocator, total_size, (void**)&executable);
  if (iree_status_is_ok(status)) {
    iree_hal_local_executable_initialize(
        &iree_hal_system_executable_vtable,
        executable_params->executable_layout_count,
        executable_params->executable_layouts, &executable->layouts[0],
        host_allocator, &executable->base);

    // Copy executable constants so we own them.
    if (executable_params->constant_count > 0) {
      uint32_t* target_constants =
          (uint32_t*)((uint8_t*)executable + sizeof(*executable) +
                      executable_params->executable_layout_count *
                          sizeof(*executable->layouts));
      memcpy(target_constants, executable_params->constants,
             executable_params->constant_count *
                 sizeof(*executable_params->constants));
      executable->base.environment.constants = target_constants;
    }
  }
  if (iree_status_is_ok(status)) {
    // Attempt to extract the embedded library and load it.
    status = iree_hal_system_executable_load(
        executable, executable_params->executable_data, host_allocator);
  }
  if (iree_status_is_ok(status)) {
    // Query metadata and get the entry point function pointers.
    status = iree_hal_system_executable_query_library(executable);
  }
  if (iree_status_is_ok(status)) {
    // Resolve imports, if any.
    status =
        iree_hal_system_executable_resolve_imports(executable, import_provider);
  }

  // Metadata/params consistency checks; skipped when the caller explicitly
  // disabled verification via the caching mode flags.
  const bool disable_verification =
      iree_all_bits_set(executable_params->caching_mode,
                        IREE_HAL_EXECUTABLE_CACHING_MODE_DISABLE_VERIFICATION);
  if (iree_status_is_ok(status) && !disable_verification) {
    // Check to make sure that the entry point count matches the layout count.
    if (executable->library.v0->exports.count !=
        executable_params->executable_layout_count) {
      status =
          iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
                           "executable provides %u entry points but caller "
                           "provided %zu; must match",
                           executable->library.v0->exports.count,
                           executable_params->executable_layout_count);
    }
  }
  if (iree_status_is_ok(status) && !disable_verification) {
    // Check to make sure that the constant table has values for all constants.
    if (executable->library.v0->constants.count !=
        executable_params->constant_count) {
      status = iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
                                "executable requires %u constants but caller "
                                "provided %zu; must match",
                                executable->library.v0->constants.count,
                                executable_params->constant_count);
    }
  }

  if (iree_status_is_ok(status)) {
    *out_executable = (iree_hal_executable_t*)executable;
  } else {
    iree_hal_executable_release((iree_hal_executable_t*)executable);
  }
  IREE_TRACE_ZONE_END(z0);
  return status;
}
+
+// Tears down a system library executable: unloads the backing dynamic
+// library, frees the resolved imports table (if any), and releases the base
+// executable resources before freeing the storage itself.
+static void iree_hal_system_executable_destroy(
+    iree_hal_executable_t* base_executable) {
+  iree_hal_system_executable_t* executable =
+      (iree_hal_system_executable_t*)base_executable;
+  iree_allocator_t host_allocator = executable->base.host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Unload the dynamic library backing the executable.
+  iree_dynamic_library_release(executable->handle);
+
+  // The imports table is only allocated when imports were resolved.
+  if (executable->base.environment.imports != NULL) {
+    iree_allocator_free(host_allocator,
+                        (void*)executable->base.environment.imports);
+  }
+
+  // NOTE: deinitialize before freeing the storage holding the base struct.
+  iree_hal_local_executable_deinitialize(
+      (iree_hal_local_executable_t*)base_executable);
+  iree_allocator_free(host_allocator, executable);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Issues a single workgroup invocation by calling the exported entry point
+// at |ordinal| with the provided dispatch/workgroup state. Returns an error
+// if the ordinal is out of bounds or the entry point reports a non-zero
+// result code.
+static iree_status_t iree_hal_system_executable_issue_call(
+    iree_hal_local_executable_t* base_executable, iree_host_size_t ordinal,
+    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+  iree_hal_system_executable_t* executable =
+      (iree_hal_system_executable_t*)base_executable;
+  const iree_hal_executable_library_v0_t* library = executable->library.v0;
+
+  if (IREE_UNLIKELY(ordinal >= library->exports.count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "entry point ordinal out of bounds");
+  }
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  // Resolve a human-readable name for the trace zone; names and tags are
+  // optional metadata in the library and may be absent.
+  iree_string_view_t entry_point_name = iree_string_view_empty();
+  if (library->exports.names != NULL) {
+    entry_point_name = iree_make_cstring_view(library->exports.names[ordinal]);
+  }
+  if (iree_string_view_is_empty(entry_point_name)) {
+    entry_point_name = iree_make_cstring_view("unknown_dylib_call");
+  }
+  IREE_TRACE_ZONE_BEGIN_EXTERNAL(
+      z0, executable->identifier.data, executable->identifier.size, ordinal,
+      entry_point_name.data, entry_point_name.size, NULL, 0);
+  if (library->exports.tags != NULL) {
+    const char* tag = library->exports.tags[ordinal];
+    if (tag) {
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, tag);
+    }
+  }
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+  int ret = library->exports.ptrs[ordinal](&base_executable->environment,
+                                           dispatch_state, workgroup_state);
+
+  IREE_TRACE_ZONE_END(z0);
+
+  // Non-zero return codes indicate an unrecoverable failure in the
+  // executable; map to an internal status with the raw code for debugging.
+  return ret == 0 ? iree_ok_status()
+                  : iree_make_status(
+                        IREE_STATUS_INTERNAL,
+                        "executable entry point returned catastrophic error %d",
+                        ret);
+}
+
+// Vtable mapping the local-executable interface onto the system library
+// executable implementation above.
+static const iree_hal_local_executable_vtable_t
+    iree_hal_system_executable_vtable = {
+        .base =
+            {
+                .destroy = iree_hal_system_executable_destroy,
+            },
+        .issue_call = iree_hal_system_executable_issue_call,
+};
+
+//===----------------------------------------------------------------------===//
+// iree_hal_system_library_loader_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_system_library_loader_t {
+  iree_hal_executable_loader_t base;  // must be first: used for dispatch
+  iree_allocator_t host_allocator;    // allocator that owns the loader
+} iree_hal_system_library_loader_t;
+
+// Forward declaration; the vtable is defined after the methods below.
+static const iree_hal_executable_loader_vtable_t
+    iree_hal_system_library_loader_vtable;
+
+iree_status_t iree_hal_system_library_loader_create(
+    iree_hal_executable_import_provider_t import_provider,
+    iree_allocator_t host_allocator,
+    iree_hal_executable_loader_t** out_executable_loader) {
+  IREE_ASSERT_ARGUMENT(out_executable_loader);
+  *out_executable_loader = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate and initialize the loader; on allocation failure the NULL
+  // output remains set and the status is propagated to the caller.
+  iree_hal_system_library_loader_t* loader = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*loader), (void**)&loader);
+  if (iree_status_is_ok(status)) {
+    iree_hal_executable_loader_initialize(
+        &iree_hal_system_library_loader_vtable, import_provider, &loader->base);
+    loader->host_allocator = host_allocator;
+    *out_executable_loader = (iree_hal_executable_loader_t*)loader;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_system_library_loader_destroy(
+    iree_hal_executable_loader_t* base_executable_loader) {
+  iree_hal_system_library_loader_t* loader =
+      (iree_hal_system_library_loader_t*)base_executable_loader;
+  iree_allocator_t host_allocator = loader->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // The loader owns no resources beyond its own storage.
+  iree_allocator_free(host_allocator, loader);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Dynamic library flavor native to the current platform; used as part of the
+// executable format identifier matched in query_support below.
+#if defined(IREE_PLATFORM_APPLE)
+#define IREE_PLATFORM_DYLIB_TYPE "dylib"
+#elif defined(IREE_PLATFORM_WINDOWS)
+#define IREE_PLATFORM_DYLIB_TYPE "dll"
+#elif defined(IREE_PLATFORM_EMSCRIPTEN)
+#define IREE_PLATFORM_DYLIB_TYPE "wasm"
+#else
+#define IREE_PLATFORM_DYLIB_TYPE "elf"
+#endif  // IREE_PLATFORM_*
+
+static bool iree_hal_system_library_loader_query_support(
+    iree_hal_executable_loader_t* base_executable_loader,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  // Executables are tagged with the platform dylib flavor and target arch;
+  // only exact matches for this host are loadable.
+  iree_string_view_t expected_format = iree_make_cstring_view(
+      "system-" IREE_PLATFORM_DYLIB_TYPE "-" IREE_ARCH);
+  return iree_string_view_equal(executable_format, expected_format);
+}
+
+static iree_status_t iree_hal_system_library_loader_try_load(
+    iree_hal_executable_loader_t* base_executable_loader,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  iree_hal_system_library_loader_t* loader =
+      (iree_hal_system_library_loader_t*)base_executable_loader;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Perform the load (and requisite disgusting hackery).
+  iree_status_t status = iree_hal_system_executable_create(
+      executable_params, base_executable_loader->import_provider,
+      loader->host_allocator, out_executable);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Vtable binding the loader interface to the system library implementations
+// above; resolves the forward declaration near the top of this section.
+static const iree_hal_executable_loader_vtable_t
+    iree_hal_system_library_loader_vtable = {
+        .destroy = iree_hal_system_library_loader_destroy,
+        .query_support = iree_hal_system_library_loader_query_support,
+        .try_load = iree_hal_system_library_loader_try_load,
+};
diff --git a/runtime/src/iree/hal/local/loaders/system_library_loader.h b/runtime/src/iree/hal/local/loaders/system_library_loader.h
new file mode 100644
index 0000000..23ffdd0
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/system_library_loader.h
@@ -0,0 +1,35 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_LOADERS_SYSTEM_LIBRARY_LOADER_H_
+#define IREE_HAL_LOCAL_LOADERS_SYSTEM_LIBRARY_LOADER_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/local/executable_loader.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates an executable loader that can load files from platform-supported
+// dynamic libraries (such as .dylib on darwin, .so on linux, .dll on windows).
+//
+// This uses the legacy "dylib"-style format that will be deleted soon and is
+// only a placeholder until the compiler can be switched to output
+// iree_hal_executable_library_t-compatible files.
+//
+// |out_executable_loader| receives the newly created loader on success and
+// is set to NULL on failure.
+iree_status_t iree_hal_system_library_loader_create(
+    iree_hal_executable_import_provider_t import_provider,
+    iree_allocator_t host_allocator,
+    iree_hal_executable_loader_t** out_executable_loader);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_LOADERS_SYSTEM_LIBRARY_LOADER_H_
diff --git a/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
new file mode 100644
index 0000000..31f59c1
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
@@ -0,0 +1,571 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/loaders/vmvx_module_loader.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_library.h"
+#include "iree/hal/local/local_executable.h"
+#include "iree/hal/local/local_executable_layout.h"
+#include "iree/modules/vmvx/module.h"
+#include "iree/vm/bytecode_module.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_vmvx_executable_t
+//===----------------------------------------------------------------------===//
+
+// Expected calling convention for VMVX entry points: three ref arguments
+// (local memory, constants, bindings) followed by nine i32 arguments
+// (workgroup id/size/count xyz) and no results. This mirrors the call_args
+// struct built in iree_hal_vmvx_executable_issue_call below.
+#define IREE_VMVX_ENTRY_SIGNATURE "0rrriiiiiiiii_v"
+
+typedef struct iree_hal_vmvx_executable_t {
+  iree_hal_local_executable_t base;  // must be first: used for dispatch
+
+  // Context containing both the VMVX module and the loaded executable.
+  iree_vm_context_t* context;
+
+  // Resolved entry functions from the module.
+  iree_host_size_t entry_fn_count;
+  iree_vm_function_t entry_fns[];  // tail array sized at allocation time
+} iree_hal_vmvx_executable_t;
+
+static const iree_hal_local_executable_vtable_t iree_hal_vmvx_executable_vtable;
+
+// Verifies that an entry point function exported by the bytecode module matches
+// the calling convention we expect. This avoids the need to check it during
+// dispatch (where returning errors is hard and it'd be expensive).
+// Verifies that an entry point function exported by the bytecode module
+// matches the calling convention we expect. Validating once at load time
+// lets dispatch skip the (expensive, hard-to-report) check per call.
+static iree_status_t iree_hal_vmvx_executable_verify_entry_point(
+    iree_vm_function_t* entry_fn) {
+  iree_vm_function_signature_t signature = iree_vm_function_signature(entry_fn);
+  iree_string_view_t expected =
+      iree_make_cstring_view(IREE_VMVX_ENTRY_SIGNATURE);
+  if (iree_string_view_equal(signature.calling_convention, expected)) {
+    return iree_ok_status();
+  }
+  return iree_make_status(
+      IREE_STATUS_INVALID_ARGUMENT,
+      "executable entry point does not match the expected calling "
+      "convention; expected '" IREE_VMVX_ENTRY_SIGNATURE "' but got '%.*s'",
+      (int)signature.calling_convention.size,
+      signature.calling_convention.data);
+}
+
+// Calls the __set_constants method on |executable| with the given |constants|.
+// We wrap the data in VM buffer and require that it is not retained by the
+// module; the constant values should be extracted and stored in globals.
+// Fails if the constant table is not of the required size.
+static iree_status_t iree_hal_vmvx_executable_set_constants(
+    iree_hal_vmvx_executable_t* executable, iree_vm_module_t* bytecode_module,
+    iree_host_size_t constant_count, const uint32_t* constants) {
+  // Look for the exported function. If it's not present then no constants are
+  // required and if it is then we must have at least one constant.
+  iree_vm_function_t set_function;
+  iree_status_t status = iree_vm_module_lookup_function_by_name(
+      bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT,
+      iree_make_cstring_view("__set_constants"), &set_function);
+  if (iree_status_is_not_found(status)) {
+    // No constants required by the executable.
+    iree_status_ignore(status);
+    if (constant_count > 0) {
+      // ...but we got provided some anyway.
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "executable has no executable-level constants "
+                              "but %" PRIhsz " constants were provided",
+                              constant_count);
+    }
+    return iree_ok_status();  // nothing to do
+  } else if (!iree_status_is_ok(status)) {
+    return status;
+  } else if (!constant_count || !constants) {
+    // Constants required but none provided.
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "executable requires executable-level constants "
+                            "but none were provided");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(benvanik): maybe just take the cost of an alloc + clone here so that
+  // we can more gracefully handle the module doing weird things with the inputs
+  // and constants.
+
+  // Wrap the constant memory in an on-stack buffer. The null allocator marks
+  // the storage as unowned; the buffer must not outlive this call.
+  iree_vm_buffer_t buffer = {{0}};
+  iree_vm_buffer_initialize(
+      IREE_VM_BUFFER_ACCESS_ORIGIN_HOST,
+      iree_make_byte_span((void*)constants,
+                          constant_count * sizeof(*constants)),
+      iree_allocator_null(), &buffer);
+
+  // Setup the input list holding a single ref to the constant buffer, backed
+  // by on-stack storage (capacity 1).
+  uint8_t input_storage[64] = {0};
+  iree_vm_list_t* inputs = NULL;
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_ref_type(iree_vm_buffer_type_id());
+  status = iree_vm_list_initialize(
+      iree_make_byte_span(input_storage, sizeof(input_storage)), &element_type,
+      1, &inputs);
+  if (iree_status_is_ok(status)) {
+    iree_vm_ref_t buffer_ref = iree_vm_buffer_retain_ref(&buffer);
+    status = iree_vm_list_push_ref_move(inputs, &buffer_ref);
+  }
+
+  // Copy the executable constants into the module state.
+  if (iree_status_is_ok(status)) {
+    status =
+        iree_vm_invoke(executable->context, set_function,
+                       IREE_VM_INVOCATION_FLAG_NONE, /*policy=*/NULL, inputs,
+                       /*outputs=*/NULL, executable->base.host_allocator);
+  }
+
+  // Inputs *must* be released here as we allocated it on the stack.
+  if (inputs) {
+    iree_vm_list_deinitialize(inputs);
+  }
+
+  // Buffer *must* be released here since we don't control the constant
+  // lifetime - this will abort if it's not.
+  iree_vm_buffer_deinitialize(&buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Creates a VMVX executable wrapping |bytecode_module| within |context|.
+// On success the executable retains |context|; entry points are resolved and
+// verified once here so dispatch needs no per-call validation.
+static iree_status_t iree_hal_vmvx_executable_create(
+    iree_vm_context_t* context, iree_vm_module_t* bytecode_module,
+    const iree_hal_executable_params_t* executable_params,
+    iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(bytecode_module);
+  IREE_ASSERT_ARGUMENT(executable_params);
+  IREE_ASSERT_ARGUMENT(!executable_params->executable_layout_count ||
+                       executable_params->executable_layouts);
+  IREE_ASSERT_ARGUMENT(out_executable);
+  *out_executable = NULL;
+
+  // Validate the entry point count before opening the trace zone so that the
+  // early return below cannot leave the zone unbalanced.
+  iree_host_size_t entry_count =
+      iree_vm_module_signature(bytecode_module).export_function_count;
+  if (entry_count != executable_params->executable_layout_count) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "executable provides %" PRIhsz
+                            " entry points but caller "
+                            "provided %" PRIhsz "; must match",
+                            entry_count,
+                            executable_params->executable_layout_count);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Single allocation: executable struct + entry_fns[] tail array +
+  // per-entry dispatch attributes + executable layout pointer table.
+  // NOTE(review): the layout table reserves sizeof(struct) per entry while
+  // only pointers are stored there - over-allocates but safe; confirm intent.
+  iree_hal_vmvx_executable_t* executable = NULL;
+  iree_host_size_t total_size =
+      sizeof(*executable) + entry_count * sizeof(*executable->entry_fns) +
+      entry_count * sizeof(*executable->base.dispatch_attrs) +
+      executable_params->executable_layout_count *
+          sizeof(iree_hal_local_executable_layout_t);
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&executable);
+  iree_hal_executable_dispatch_attrs_v0_t* dispatch_attrs = NULL;
+  if (iree_status_is_ok(status)) {
+    // Carve the trailing arrays out of the single allocation.
+    uint8_t* ptr = (uint8_t*)executable + sizeof(*executable) +
+                   entry_count * sizeof(*executable->entry_fns);
+    dispatch_attrs = (iree_hal_executable_dispatch_attrs_v0_t*)ptr;
+    ptr += entry_count * sizeof(*executable->base.dispatch_attrs);
+    iree_hal_local_executable_layout_t** executable_layouts_ptr =
+        (iree_hal_local_executable_layout_t**)ptr;
+    iree_hal_local_executable_initialize(
+        &iree_hal_vmvx_executable_vtable,
+        executable_params->executable_layout_count,
+        executable_params->executable_layouts, executable_layouts_ptr,
+        host_allocator, &executable->base);
+    executable->context = context;
+    executable->base.dispatch_attrs = dispatch_attrs;
+    iree_vm_context_retain(executable->context);
+
+    // Resolve and verify each exported entry point up front.
+    executable->entry_fn_count = entry_count;
+    for (iree_host_size_t i = 0; i < executable->entry_fn_count; ++i) {
+      status = iree_vm_module_lookup_function_by_ordinal(
+          bytecode_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i,
+          &executable->entry_fns[i]);
+      if (!iree_status_is_ok(status)) break;
+      status = iree_hal_vmvx_executable_verify_entry_point(
+          &executable->entry_fns[i]);
+      if (!iree_status_is_ok(status)) break;
+    }
+  }
+
+  // Query the optional local workgroup size from each entry point.
+  if (iree_status_is_ok(status)) {
+    // TODO(benvanik): pack this more efficiently; this requires a lot of
+    // queries and instead could be a single packed table we can directly
+    // reference from the module. Module-level reflection attrs would help.
+    for (iree_host_size_t i = 0; i < executable->entry_fn_count; ++i) {
+      iree_string_view_t local_memory_str = iree_vm_function_reflection_attr(
+          &executable->entry_fns[i], iree_make_cstring_view("local_memory"));
+      uint32_t local_memory_size = 0;
+      if (!iree_string_view_is_empty(local_memory_str)) {
+        iree_string_view_atoi_uint32(local_memory_str, &local_memory_size);
+      }
+      // Stored as a page count to fit the attribute field width.
+      local_memory_size /= IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE;
+      dispatch_attrs[i].local_memory_pages = (uint16_t)local_memory_size;
+    }
+  }
+
+  // Provide executable constants to the module.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vmvx_executable_set_constants(
+        executable, bytecode_module, executable_params->constant_count,
+        executable_params->constants);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_executable = (iree_hal_executable_t*)executable;
+  } else {
+    iree_hal_executable_release((iree_hal_executable_t*)executable);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_vmvx_executable_destroy(
+    iree_hal_executable_t* base_executable) {
+  iree_hal_vmvx_executable_t* exe =
+      (iree_hal_vmvx_executable_t*)base_executable;
+  iree_allocator_t allocator = exe->base.host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Dropping the context releases the modules it holds; deinitialize the
+  // base before freeing the storage that contains it.
+  iree_vm_context_release(exe->context);
+  iree_hal_local_executable_deinitialize(
+      (iree_hal_local_executable_t*)base_executable);
+  iree_allocator_free(allocator, exe);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Issues a single workgroup invocation of entry point |ordinal| by invoking
+// the corresponding VM function. All VM objects constructed here (binding
+// list, buffers, stack) are backed by native stack storage and must be
+// deinitialized before returning.
+static iree_status_t iree_hal_vmvx_executable_issue_call(
+    iree_hal_local_executable_t* base_executable, iree_host_size_t ordinal,
+    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+  iree_hal_vmvx_executable_t* executable =
+      (iree_hal_vmvx_executable_t*)base_executable;
+
+  if (IREE_UNLIKELY(ordinal >= executable->entry_fn_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "entry point ordinal out of bounds");
+  }
+  iree_vm_function_t entry_fn = executable->entry_fns[ordinal];
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  iree_string_view_t entry_point_name = iree_vm_function_name(&entry_fn);
+  if (iree_string_view_is_empty(entry_point_name)) {
+    entry_point_name = iree_make_cstring_view("unknown_vmvx_call");
+  }
+  IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(z0, entry_point_name.data,
+                                      entry_point_name.size);
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+  // On-stack interface local to this invocation.
+  // Note that we _could_ share this across all invocations in a dispatch, but
+  // it's tricky to find a good place when threading is happening and it's
+  // intentionally fairly cheap to construct by matching the dispatch_state.
+  // The list would only need to be constructed once and we could avoid the
+  // extraneous retain/releases and mappings.
+  iree_vm_type_def_t buffer_type =
+      iree_vm_type_def_make_ref_type(iree_vm_buffer_type_id());
+  iree_host_size_t binding_list_size =
+      iree_vm_list_storage_size(&buffer_type, dispatch_state->binding_count);
+  void* binding_list_storage = iree_alloca(binding_list_size);
+  iree_vm_list_t* binding_list = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_vm_list_initialize(
+              iree_make_byte_span(binding_list_storage, binding_list_size),
+              &buffer_type, dispatch_state->binding_count, &binding_list));
+  iree_vm_list_retain(binding_list);  // for call
+
+  // Map bindings into on-stack VMVX buffers.
+  iree_vm_buffer_t* binding_buffers = (iree_vm_buffer_t*)iree_alloca(
+      dispatch_state->binding_count * sizeof(iree_vm_buffer_t));
+  for (iree_host_size_t i = 0; i < dispatch_state->binding_count; ++i) {
+    iree_vm_buffer_t* binding_buffer = &binding_buffers[i];
+    // TODO(benvanik): executable layout contains the required access
+    // information. We will likely want to encode a bitmap of mutable bindings
+    // such that we can quickly set the access bit, though.
+    iree_vm_buffer_access_t access =
+        IREE_VM_BUFFER_ACCESS_MUTABLE | IREE_VM_BUFFER_ACCESS_ORIGIN_HOST;
+    iree_vm_buffer_initialize(
+        access,
+        iree_make_byte_span(dispatch_state->binding_ptrs[i],
+                            dispatch_state->binding_lengths[i]),
+        iree_allocator_null(), binding_buffer);
+    iree_vm_ref_t ref = {0};
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_vm_ref_wrap_assign(binding_buffer, iree_vm_buffer_type_id(),
+                                    &ref));
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_vm_list_push_ref_retain(binding_list, &ref));
+  }
+
+  // Acquire workgroup local memory for the dispatch.
+  iree_vm_buffer_t local_memory_buffer;
+  iree_vm_buffer_initialize(
+      IREE_VM_BUFFER_ACCESS_MUTABLE | IREE_VM_BUFFER_ACCESS_ORIGIN_HOST,
+      iree_make_byte_span(workgroup_state->local_memory,
+                          workgroup_state->local_memory_size),
+      iree_allocator_null(), &local_memory_buffer);
+  iree_vm_buffer_retain(&local_memory_buffer);  // for call
+
+  // Map the push constant memory directly from the dispatch state.
+  iree_vm_buffer_t constants_buffer;
+  iree_vm_buffer_initialize(
+      IREE_VM_BUFFER_ACCESS_ORIGIN_HOST,
+      iree_make_byte_span(
+          (void*)dispatch_state->push_constants,
+          sizeof(uint32_t) * dispatch_state->push_constant_count),
+      iree_allocator_null(), &constants_buffer);
+  iree_vm_buffer_retain(&constants_buffer);  // for call
+
+  // Prepare call argument buffer. We've verified the signature on creation and
+  // know the exact format we can assume here.
+  //
+  //   func.func @entry(
+  //       %local_memory: !vmvx.buffer,
+  //       %constants: !vmvx.buffer,
+  //       %bindings: !util.list<!vmvx.buffer>,
+  //       %workgroup_id_x: index,
+  //       %workgroup_id_y: index,
+  //       %workgroup_id_z: index,
+  //       %workgroup_size_x: index,
+  //       %workgroup_size_y: index,
+  //       %workgroup_size_z: index,
+  //       %workgroup_count_x: index,
+  //       %workgroup_count_y: index,
+  //       %workgroup_count_z: index
+  //    )
+  //
+  // NOTE: this level of the VM ABI is supported - but may change in the future.
+  // Users should prefer to use the invocation API that is more stable.
+  struct {
+    iree_vm_ref_t local_memory;
+    iree_vm_ref_t constants;
+    iree_vm_ref_t bindings;
+    uint32_t workgroup_id_x;
+    uint32_t workgroup_id_y;
+    uint32_t workgroup_id_z;
+    uint32_t workgroup_size_x;
+    uint32_t workgroup_size_y;
+    uint32_t workgroup_size_z;
+    uint32_t workgroup_count_x;
+    uint32_t workgroup_count_y;
+    uint32_t workgroup_count_z;
+  } call_args = {
+      .local_memory =
+          {
+              .type = iree_vm_buffer_type_id(),
+              .ptr = &local_memory_buffer,
+              .offsetof_counter = 0,
+          },
+      .constants =
+          {
+              .type = iree_vm_buffer_type_id(),
+              .ptr = &constants_buffer,
+              .offsetof_counter = 0,
+          },
+      .bindings =
+          {
+              .type = iree_vm_list_type_id(),
+              .ptr = binding_list,
+              .offsetof_counter = 0,
+          },
+      .workgroup_id_x = workgroup_state->workgroup_id_x,
+      .workgroup_id_y = workgroup_state->workgroup_id_y,
+      .workgroup_id_z = workgroup_state->workgroup_id_z,
+      .workgroup_size_x = dispatch_state->workgroup_size_x,
+      .workgroup_size_y = dispatch_state->workgroup_size_y,
+      .workgroup_size_z = dispatch_state->workgroup_size_z,
+      .workgroup_count_x = dispatch_state->workgroup_count_x,
+      .workgroup_count_y = dispatch_state->workgroup_count_y,
+      .workgroup_count_z = dispatch_state->workgroup_count_z,
+  };
+
+  // On-stack stack. We really do abuse the stack too much here.
+  // TODO(benvanik): pass in an iree_arena_t that can be used for this.
+  IREE_VM_INLINE_STACK_INITIALIZE(
+      stack, IREE_VM_INVOCATION_FLAG_NONE,
+      iree_vm_context_state_resolver(executable->context),
+      executable->base.host_allocator);
+
+  // Direct call interface.
+  iree_vm_function_call_t call;
+  memset(&call, 0, sizeof(call));
+  call.function = entry_fn;
+  call.arguments = iree_make_byte_span(&call_args, sizeof(call_args));
+  call.results = iree_make_byte_span(NULL, 0);
+  iree_vm_execution_result_t result;
+  iree_status_t status =
+      entry_fn.module->begin_call(entry_fn.module->self, stack, &call, &result);
+
+  iree_vm_stack_deinitialize(stack);
+
+  // Deinitialize all stack-backed VM objects (required: their storage is
+  // about to go out of scope).
+  iree_vm_buffer_deinitialize(&local_memory_buffer);
+  iree_vm_buffer_deinitialize(&constants_buffer);
+  iree_vm_list_deinitialize(binding_list);
+  for (iree_host_size_t i = 0; i < dispatch_state->binding_count; ++i) {
+    iree_vm_buffer_deinitialize(&binding_buffers[i]);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Vtable mapping the local-executable interface onto the VMVX implementation
+// above.
+static const iree_hal_local_executable_vtable_t
+    iree_hal_vmvx_executable_vtable = {
+        .base =
+            {
+                .destroy = iree_hal_vmvx_executable_destroy,
+            },
+        .issue_call = iree_hal_vmvx_executable_issue_call,
+};
+
+//===----------------------------------------------------------------------===//
+// iree_hal_vmvx_module_loader_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_vmvx_module_loader_t {
+  iree_hal_executable_loader_t base;  // must be first: used for dispatch
+  iree_allocator_t host_allocator;    // loader storage + cloned module data
+  iree_vm_instance_t* instance;       // retained; used for all contexts
+  iree_vm_module_t* vmvx_module;      // retained; shared across executables
+} iree_hal_vmvx_module_loader_t;
+
+// Forward declaration; the vtable is defined after the methods below.
+static const iree_hal_executable_loader_vtable_t
+    iree_hal_vmvx_module_loader_vtable;
+
+// Creates a VMVX module loader sharing a single VMVX module across all
+// executables it loads. |instance| is retained for the loader lifetime and
+// used to create each per-executable context.
+iree_status_t iree_hal_vmvx_module_loader_create(
+    iree_vm_instance_t* instance, iree_allocator_t host_allocator,
+    iree_hal_executable_loader_t** out_executable_loader) {
+  IREE_ASSERT_ARGUMENT(instance);
+  IREE_ASSERT_ARGUMENT(out_executable_loader);
+  *out_executable_loader = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // A single VMVX module is shared across all loaded executables.
+  // NOTE: these early exits must end the trace zone to keep it balanced.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, iree_vmvx_module_register_types());
+  iree_vm_module_t* vmvx_module = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_vmvx_module_create(host_allocator, &vmvx_module));
+
+  iree_hal_vmvx_module_loader_t* executable_loader = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(*executable_loader), (void**)&executable_loader);
+  if (iree_status_is_ok(status)) {
+    iree_hal_executable_loader_initialize(
+        &iree_hal_vmvx_module_loader_vtable,
+        iree_hal_executable_import_provider_null(), &executable_loader->base);
+    executable_loader->host_allocator = host_allocator;
+    executable_loader->instance = instance;
+    iree_vm_instance_retain(executable_loader->instance);
+    executable_loader->vmvx_module = vmvx_module;
+    iree_vm_module_retain(executable_loader->vmvx_module);
+    *out_executable_loader = (iree_hal_executable_loader_t*)executable_loader;
+  }
+
+  // Balance the creation reference; on success the loader holds its own
+  // retained reference, and on failure this frees the module.
+  iree_vm_module_release(vmvx_module);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_vmvx_module_loader_destroy(
+    iree_hal_executable_loader_t* base_executable_loader) {
+  iree_hal_vmvx_module_loader_t* loader =
+      (iree_hal_vmvx_module_loader_t*)base_executable_loader;
+  iree_allocator_t host_allocator = loader->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Drop the references retained at creation time before freeing the loader.
+  iree_vm_module_release(loader->vmvx_module);
+  iree_vm_instance_release(loader->instance);
+  iree_allocator_free(host_allocator, loader);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static bool iree_hal_vmvx_module_loader_query_support(
+    iree_hal_executable_loader_t* base_executable_loader,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  // Only VM bytecode flatbuffers targeting VMVX are supported.
+  iree_string_view_t expected_format =
+      iree_make_cstring_view("vmvx-bytecode-fb");
+  return iree_string_view_equal(executable_format, expected_format);
+}
+
+// Loads a VM bytecode module from |executable_params| and wraps it in a VMVX
+// executable. Depending on the caching mode the flatbuffer data is either
+// aliased (caller keeps it live) or cloned with ownership transferred to the
+// bytecode module.
+static iree_status_t iree_hal_vmvx_module_loader_try_load(
+    iree_hal_executable_loader_t* base_executable_loader,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  iree_hal_vmvx_module_loader_t* executable_loader =
+      (iree_hal_vmvx_module_loader_t*)base_executable_loader;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_const_byte_span_t bytecode_module_data =
+      executable_params->executable_data;
+
+  // If the caching mode allows for aliasing the existing flatbuffer data then
+  // we avoid allocations and just pass the pointer on through. The caller
+  // ensures that the data remains valid for the duration the executable is
+  // loaded. Otherwise, we clone it and let the bytecode module take ownership.
+  iree_allocator_t bytecode_module_allocator;
+  if (iree_all_bits_set(executable_params->caching_mode,
+                        IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA)) {
+    // Zero-copy route.
+    bytecode_module_allocator = iree_allocator_null();
+  } else {
+    bytecode_module_allocator = executable_loader->host_allocator;
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_allocator_clone(executable_loader->host_allocator,
+                                 executable_params->executable_data,
+                                 (void**)&bytecode_module_data.data));
+  }
+
+  // Load the user-provided bytecode module. We pass ownership of the data (if
+  // we have it) to the module to manage.
+  // NOTE: must pass |bytecode_module_data| (not the original params span) so
+  // that a cloned copy is the one consumed and later freed by the module;
+  // passing the original would leak the clone.
+  iree_vm_module_t* bytecode_module = NULL;
+  iree_status_t status = iree_vm_bytecode_module_create(
+      bytecode_module_data, bytecode_module_allocator,
+      executable_loader->host_allocator, &bytecode_module);
+
+  // Create the context tying together the shared VMVX module and the
+  // user-provided module that references it. If we wanted to allow custom
+  // modules here for user-provided functions we'd mix them in here.
+  iree_vm_context_t* context = NULL;
+  if (iree_status_is_ok(status)) {
+    iree_vm_module_t* modules[2] = {
+        executable_loader->vmvx_module,
+        bytecode_module,
+    };
+    status = iree_vm_context_create_with_modules(
+        executable_loader->instance, IREE_VM_CONTEXT_FLAG_NONE, modules,
+        IREE_ARRAYSIZE(modules), executable_loader->host_allocator, &context);
+  }
+
+  // Executable takes ownership of the entire context (including the bytecode
+  // module, which itself may own the underlying allocation).
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vmvx_executable_create(
+        context, bytecode_module, executable_params,
+        executable_loader->host_allocator, out_executable);
+  }
+
+  iree_vm_context_release(context);
+  iree_vm_module_release(bytecode_module);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Vtable binding the loader interface to the VMVX implementations above;
+// resolves the forward declaration near the top of this section.
+static const iree_hal_executable_loader_vtable_t
+    iree_hal_vmvx_module_loader_vtable = {
+        .destroy = iree_hal_vmvx_module_loader_destroy,
+        .query_support = iree_hal_vmvx_module_loader_query_support,
+        .try_load = iree_hal_vmvx_module_loader_try_load,
+};
diff --git a/runtime/src/iree/hal/local/loaders/vmvx_module_loader.h b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.h
new file mode 100644
index 0000000..c080052
--- /dev/null
+++ b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.h
@@ -0,0 +1,31 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_LOADERS_VMVX_MODULE_LOADER_H_
+#define IREE_HAL_LOCAL_LOADERS_VMVX_MODULE_LOADER_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/vm/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates an executable loader that can load compiled IREE VM bytecode modules
+// using the VMVX module. |instance| will be used for all loaded contexts.
+//
+// |host_allocator| is used for loader allocations. On success
+// |out_executable_loader| receives the new loader; NOTE(review): presumably
+// the caller owns the returned reference and must release it — confirm
+// against executable_loader.h conventions.
+iree_status_t iree_hal_vmvx_module_loader_create(
+    iree_vm_instance_t* instance, iree_allocator_t host_allocator,
+    iree_hal_executable_loader_t** out_executable_loader);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_LOADERS_VMVX_MODULE_LOADER_H_
diff --git a/runtime/src/iree/hal/local/local_descriptor_set.c b/runtime/src/iree/hal/local/local_descriptor_set.c
new file mode 100644
index 0000000..c4d6210
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_descriptor_set.c
@@ -0,0 +1,83 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/local_descriptor_set.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+// Forward declaration; the vtable is defined at the bottom of the file.
+static const iree_hal_descriptor_set_vtable_t
+    iree_hal_local_descriptor_set_vtable;
+
+// Downcasts |base_value| to the local implementation type after checking the
+// resource's vtable via IREE_HAL_ASSERT_TYPE.
+iree_hal_local_descriptor_set_t* iree_hal_local_descriptor_set_cast(
+    iree_hal_descriptor_set_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_local_descriptor_set_vtable);
+  return (iree_hal_local_descriptor_set_t*)base_value;
+}
+
+// Allocates a descriptor set that stores |bindings| inline via the trailing
+// flexible array member: one allocation covers the struct header and the
+// binding table. The set retains the layout and every bound buffer.
+iree_status_t iree_hal_local_descriptor_set_create(
+    iree_hal_descriptor_set_layout_t* base_layout,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings,
+    iree_hal_descriptor_set_t** out_descriptor_set) {
+  IREE_ASSERT_ARGUMENT(base_layout);
+  IREE_ASSERT_ARGUMENT(!binding_count || bindings);
+  IREE_ASSERT_ARGUMENT(out_descriptor_set);
+  *out_descriptor_set = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_local_descriptor_set_layout_t* local_layout =
+      iree_hal_local_descriptor_set_layout_cast(base_layout);
+  IREE_ASSERT_ARGUMENT(local_layout);
+
+  // Single allocation sized for the header plus binding_count trailing
+  // bindings; the layout's host allocator is used so destroy can free it.
+  iree_hal_local_descriptor_set_t* descriptor_set = NULL;
+  iree_host_size_t total_size =
+      sizeof(*descriptor_set) +
+      binding_count * sizeof(*descriptor_set->bindings);
+  iree_status_t status = iree_allocator_malloc(
+      local_layout->host_allocator, total_size, (void**)&descriptor_set);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_local_descriptor_set_vtable,
+                                 &descriptor_set->resource);
+    descriptor_set->layout = local_layout;
+    iree_hal_descriptor_set_layout_retain(base_layout);
+    descriptor_set->binding_count = binding_count;
+    // Copy the caller's bindings and retain each bound buffer so the buffers
+    // remain valid for the lifetime of the set.
+    memcpy(descriptor_set->bindings, bindings,
+           binding_count * sizeof(iree_hal_descriptor_set_binding_t));
+    for (iree_host_size_t i = 0; i < descriptor_set->binding_count; ++i) {
+      iree_hal_buffer_retain(descriptor_set->bindings[i].buffer);
+    }
+    *out_descriptor_set = (iree_hal_descriptor_set_t*)descriptor_set;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases the retained buffers and layout, then frees the single inline
+// allocation. The host allocator is copied out of the layout before the
+// layout reference is released.
+static void iree_hal_local_descriptor_set_destroy(
+    iree_hal_descriptor_set_t* base_descriptor_set) {
+  iree_hal_local_descriptor_set_t* descriptor_set =
+      iree_hal_local_descriptor_set_cast(base_descriptor_set);
+  iree_allocator_t host_allocator = descriptor_set->layout->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < descriptor_set->binding_count; ++i) {
+    iree_hal_buffer_release(descriptor_set->bindings[i].buffer);
+  }
+  iree_hal_descriptor_set_layout_release(
+      (iree_hal_descriptor_set_layout_t*)descriptor_set->layout);
+  iree_allocator_free(host_allocator, descriptor_set);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static const iree_hal_descriptor_set_vtable_t
+    iree_hal_local_descriptor_set_vtable = {
+        .destroy = iree_hal_local_descriptor_set_destroy,
+};
diff --git a/runtime/src/iree/hal/local/local_descriptor_set.h b/runtime/src/iree/hal/local/local_descriptor_set.h
new file mode 100644
index 0000000..eba78a0
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_descriptor_set.h
@@ -0,0 +1,37 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_LOCAL_DESCRIPTOR_SET_H_
+#define IREE_HAL_LOCAL_LOCAL_DESCRIPTOR_SET_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/local_descriptor_set_layout.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Descriptor set implemented as a single heap allocation with the binding
+// table stored inline in the trailing flexible array member.
+typedef struct iree_hal_local_descriptor_set_t {
+  iree_hal_resource_t resource;
+  // Retained for the lifetime of the set; also provides the host allocator
+  // used to free the set.
+  iree_hal_local_descriptor_set_layout_t* layout;
+  iree_host_size_t binding_count;
+  iree_hal_descriptor_set_binding_t bindings[];
+} iree_hal_local_descriptor_set_t;
+
+// Creates a descriptor set from |bindings|; the buffers referenced by the
+// bindings are retained until the set is destroyed.
+iree_status_t iree_hal_local_descriptor_set_create(
+    iree_hal_descriptor_set_layout_t* layout, iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings,
+    iree_hal_descriptor_set_t** out_descriptor_set);
+
+// Downcasts |base_value| to the local type (vtable-checked in debug builds).
+iree_hal_local_descriptor_set_t* iree_hal_local_descriptor_set_cast(
+    iree_hal_descriptor_set_t* base_value);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_LOCAL_DESCRIPTOR_SET_H_
diff --git a/runtime/src/iree/hal/local/local_descriptor_set_layout.c b/runtime/src/iree/hal/local/local_descriptor_set_layout.c
new file mode 100644
index 0000000..b3af9c6
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_descriptor_set_layout.c
@@ -0,0 +1,77 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/local_descriptor_set_layout.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+
+// Forward declaration; the vtable is defined at the bottom of the file.
+static const iree_hal_descriptor_set_layout_vtable_t
+    iree_hal_local_descriptor_set_layout_vtable;
+
+// Downcasts |base_value| to the local layout type after checking the vtable
+// via IREE_HAL_ASSERT_TYPE.
+iree_hal_local_descriptor_set_layout_t*
+iree_hal_local_descriptor_set_layout_cast(
+    iree_hal_descriptor_set_layout_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value,
+                       &iree_hal_local_descriptor_set_layout_vtable);
+  return (iree_hal_local_descriptor_set_layout_t*)base_value;
+}
+
+// Allocates a descriptor set layout storing |bindings| inline (trailing
+// flexible array member). Binding counts above
+// IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT are rejected up front.
+iree_status_t iree_hal_local_descriptor_set_layout_create(
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_allocator_t host_allocator,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  IREE_ASSERT_ARGUMENT(!binding_count || bindings);
+  IREE_ASSERT_ARGUMENT(out_descriptor_set_layout);
+  *out_descriptor_set_layout = NULL;
+  if (binding_count > IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT) {
+    // NOTE(review): %zu assumes iree_host_size_t is size_t-compatible —
+    // confirm on 32-bit targets.
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT, "binding count %zu over the limit of %d",
+        binding_count, IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Single allocation: struct header plus binding_count trailing bindings.
+  iree_hal_local_descriptor_set_layout_t* layout = NULL;
+  iree_host_size_t total_size =
+      sizeof(*layout) + binding_count * sizeof(*layout->bindings);
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&layout);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_local_descriptor_set_layout_vtable,
+                                 &layout->resource);
+    layout->host_allocator = host_allocator;
+    layout->usage_type = usage_type;
+    layout->binding_count = binding_count;
+    memcpy(layout->bindings, bindings,
+           binding_count * sizeof(iree_hal_descriptor_set_layout_binding_t));
+    *out_descriptor_set_layout = (iree_hal_descriptor_set_layout_t*)layout;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the layout using the allocator captured at creation time.
+static void iree_hal_local_descriptor_set_layout_destroy(
+    iree_hal_descriptor_set_layout_t* base_layout) {
+  iree_hal_local_descriptor_set_layout_t* layout =
+      iree_hal_local_descriptor_set_layout_cast(base_layout);
+  iree_allocator_t host_allocator = layout->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static const iree_hal_descriptor_set_layout_vtable_t
+    iree_hal_local_descriptor_set_layout_vtable = {
+        .destroy = iree_hal_local_descriptor_set_layout_destroy,
+};
diff --git a/runtime/src/iree/hal/local/local_descriptor_set_layout.h b/runtime/src/iree/hal/local/local_descriptor_set_layout.h
new file mode 100644
index 0000000..4e11ce2
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_descriptor_set_layout.h
@@ -0,0 +1,42 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_LOCAL_DESCRIPTOR_SET_LAYOUT_H_
+#define IREE_HAL_LOCAL_LOCAL_DESCRIPTOR_SET_LAYOUT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Maximum number of bindings in a single descriptor set layout; creation
+// fails for larger counts.
+#define IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT 32
+
+// Descriptor set layout with its binding metadata stored inline in the
+// trailing flexible array member (single allocation).
+typedef struct iree_hal_local_descriptor_set_layout_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  iree_hal_descriptor_set_layout_usage_type_t usage_type;
+  iree_host_size_t binding_count;
+  iree_hal_descriptor_set_layout_binding_t bindings[];
+} iree_hal_local_descriptor_set_layout_t;
+
+// Creates a layout copying |bindings|; fails with INVALID_ARGUMENT when
+// |binding_count| exceeds IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT.
+iree_status_t iree_hal_local_descriptor_set_layout_create(
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_allocator_t host_allocator,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout);
+
+// Downcasts |base_value| to the local type (vtable-checked in debug builds).
+iree_hal_local_descriptor_set_layout_t*
+iree_hal_local_descriptor_set_layout_cast(
+    iree_hal_descriptor_set_layout_t* base_value);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_LOCAL_DESCRIPTOR_SET_LAYOUT_H_
diff --git a/runtime/src/iree/hal/local/local_executable.c b/runtime/src/iree/hal/local/local_executable.c
new file mode 100644
index 0000000..1fd92ec
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_executable.c
@@ -0,0 +1,108 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/local_executable.h"
+
+#include "iree/base/tracing.h"
+#include "iree/hal/local/executable_environment.h"
+
+// Initializes the common base stored at the head of each concrete local
+// executable. Copies |source_executable_layouts| into the caller-allocated
+// |target_executable_layouts| array, retaining each layout; the matching
+// releases happen in iree_hal_local_executable_deinitialize.
+void iree_hal_local_executable_initialize(
+    const iree_hal_local_executable_vtable_t* vtable,
+    iree_host_size_t executable_layout_count,
+    iree_hal_executable_layout_t* const* source_executable_layouts,
+    iree_hal_local_executable_layout_t** target_executable_layouts,
+    iree_allocator_t host_allocator,
+    iree_hal_local_executable_t* out_base_executable) {
+  iree_hal_resource_initialize(vtable, &out_base_executable->resource);
+  out_base_executable->host_allocator = host_allocator;
+
+  out_base_executable->executable_layout_count = executable_layout_count;
+  out_base_executable->executable_layouts = target_executable_layouts;
+  for (iree_host_size_t i = 0; i < executable_layout_count; ++i) {
+    target_executable_layouts[i] =
+        (iree_hal_local_executable_layout_t*)source_executable_layouts[i];
+    iree_hal_executable_layout_retain(source_executable_layouts[i]);
+  }
+
+  // Function attributes are optional and populated by the parent type.
+  out_base_executable->dispatch_attrs = NULL;
+
+  // Default environment with no imports assigned.
+  iree_hal_executable_environment_initialize(host_allocator,
+                                             &out_base_executable->environment);
+}
+
+// Releases the executable layouts retained by
+// iree_hal_local_executable_initialize.
+void iree_hal_local_executable_deinitialize(
+    iree_hal_local_executable_t* base_executable) {
+  for (iree_host_size_t i = 0; i < base_executable->executable_layout_count;
+       ++i) {
+    iree_hal_executable_layout_release(
+        (iree_hal_executable_layout_t*)base_executable->executable_layouts[i]);
+  }
+}
+
+// NOTE(review): unlike the other casts in this directory this one performs no
+// IREE_HAL_ASSERT_TYPE check — presumably because several concrete
+// implementations share this base and no single vtable applies; confirm
+// callers only pass local executables.
+iree_hal_local_executable_t* iree_hal_local_executable_cast(
+    iree_hal_executable_t* base_value) {
+  return (iree_hal_local_executable_t*)base_value;
+}
+
+// Invokes one workgroup of entry point |ordinal| by virtually dispatching
+// through the local executable vtable's issue_call.
+iree_status_t iree_hal_local_executable_issue_call(
+    iree_hal_local_executable_t* executable, iree_host_size_t ordinal,
+    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+  IREE_ASSERT_ARGUMENT(executable);
+  IREE_ASSERT_ARGUMENT(dispatch_state);
+  IREE_ASSERT_ARGUMENT(workgroup_state);
+  return ((const iree_hal_local_executable_vtable_t*)
+              executable->resource.vtable)
+      ->issue_call(executable, ordinal, dispatch_state, workgroup_state);
+}
+
+// Serially executes every workgroup of the dispatch grid (z-major, then y,
+// then x) on the calling thread, reusing a single workgroup_state and
+// stopping at the first failing call.
+iree_status_t iree_hal_local_executable_issue_dispatch_inline(
+    iree_hal_local_executable_t* executable, iree_host_size_t ordinal,
+    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+    uint32_t processor_id, iree_byte_span_t local_memory) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // TODO(benvanik): annotate with executable name to calculate total time.
+
+  const uint32_t workgroup_count_x = dispatch_state->workgroup_count_x;
+  const uint32_t workgroup_count_y = dispatch_state->workgroup_count_y;
+  const uint32_t workgroup_count_z = dispatch_state->workgroup_count_z;
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  // NOTE(review): snprintf is used without a direct <stdio.h> include in this
+  // file — presumably pulled in transitively when instrumentation is enabled;
+  // confirm.
+  char xyz_string[32];
+  int xyz_string_length =
+      snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
+               workgroup_count_x, workgroup_count_y, workgroup_count_z);
+  IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+  iree_status_t status = iree_ok_status();
+
+  // One state struct mutated per iteration; only the workgroup id changes.
+  iree_alignas(64) iree_hal_executable_workgroup_state_v0_t workgroup_state = {
+      .workgroup_id_x = 0,
+      .workgroup_id_y = 0,
+      .workgroup_id_z = 0,
+      .processor_id = processor_id,
+      .local_memory = local_memory.data,
+      .local_memory_size = (size_t)local_memory.data_length,
+  };
+  for (uint32_t z = 0; z < workgroup_count_z; ++z) {
+    workgroup_state.workgroup_id_z = z;
+    for (uint32_t y = 0; y < workgroup_count_y; ++y) {
+      workgroup_state.workgroup_id_y = y;
+      for (uint32_t x = 0; x < workgroup_count_x; ++x) {
+        workgroup_state.workgroup_id_x = x;
+        status = iree_hal_local_executable_issue_call(
+            executable, ordinal, dispatch_state, &workgroup_state);
+        if (!iree_status_is_ok(status)) break;
+      }
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/local/local_executable.h b/runtime/src/iree/hal/local/local_executable.h
new file mode 100644
index 0000000..d9a42e4
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_executable.h
@@ -0,0 +1,76 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_LOCAL_EXECUTABLE_H_
+#define IREE_HAL_LOCAL_LOCAL_EXECUTABLE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_library.h"
+#include "iree/hal/local/local_executable_layout.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Common base stored at the head of every local executable implementation.
+typedef struct iree_hal_local_executable_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  // Layouts retained on initialize and released on deinitialize; storage is
+  // provided by the concrete type.
+  iree_host_size_t executable_layout_count;
+  iree_hal_local_executable_layout_t** executable_layouts;
+
+  // Defines per-entry point how much workgroup local memory is required.
+  // Contains entries with 0 to indicate no local memory is required or >0 in
+  // units of IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE for the minimum amount
+  // of memory required by the function.
+  const iree_hal_executable_dispatch_attrs_v0_t* dispatch_attrs;
+
+  // Execution environment.
+  iree_hal_executable_environment_v0_t environment;
+} iree_hal_local_executable_t;
+
+// Vtable extending the base HAL executable vtable with local dispatch.
+typedef struct iree_hal_local_executable_vtable_t {
+  iree_hal_executable_vtable_t base;
+
+  // Executes a single workgroup of entry point |ordinal|.
+  iree_status_t(IREE_API_PTR* issue_call)(
+      iree_hal_local_executable_t* executable, iree_host_size_t ordinal,
+      const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+      const iree_hal_executable_workgroup_state_v0_t* workgroup_state);
+} iree_hal_local_executable_vtable_t;
+
+// Initializes the local executable base type.
+//
+// Callers must allocate memory for |target_executable_layouts| with at least
+// `executable_layout_count * sizeof(*target_executable_layouts)` bytes.
+void iree_hal_local_executable_initialize(
+    const iree_hal_local_executable_vtable_t* vtable,
+    iree_host_size_t executable_layout_count,
+    iree_hal_executable_layout_t* const* source_executable_layouts,
+    iree_hal_local_executable_layout_t** target_executable_layouts,
+    iree_allocator_t host_allocator,
+    iree_hal_local_executable_t* out_base_executable);
+
+// Releases resources retained by iree_hal_local_executable_initialize.
+void iree_hal_local_executable_deinitialize(
+    iree_hal_local_executable_t* base_executable);
+
+// Downcasts |base_value| to the local executable base type (unchecked).
+iree_hal_local_executable_t* iree_hal_local_executable_cast(
+    iree_hal_executable_t* base_value);
+
+// Executes one workgroup via the executable's vtable.
+iree_status_t iree_hal_local_executable_issue_call(
+    iree_hal_local_executable_t* executable, iree_host_size_t ordinal,
+    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state);
+
+// Executes all workgroups of the dispatch serially on the calling thread.
+iree_status_t iree_hal_local_executable_issue_dispatch_inline(
+    iree_hal_local_executable_t* executable, iree_host_size_t ordinal,
+    const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
+    uint32_t processor_id, iree_byte_span_t local_memory);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_LOCAL_EXECUTABLE_H_
diff --git a/runtime/src/iree/hal/local/local_executable_cache.c b/runtime/src/iree/hal/local/local_executable_cache.c
new file mode 100644
index 0000000..c446c28
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_executable_cache.c
@@ -0,0 +1,139 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/local_executable_cache.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+
+// Executable cache backed by an ordered list of executable loaders; the
+// loader pointer array and the identifier characters are packed into the
+// same single allocation as the struct (see create).
+typedef struct iree_hal_local_executable_cache_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  // Points into the tail of this allocation.
+  iree_string_view_t identifier;
+  iree_host_size_t loader_count;
+  iree_hal_executable_loader_t* loaders[];
+} iree_hal_local_executable_cache_t;
+
+// Forward declaration; defined at the bottom of the file.
+static const iree_hal_executable_cache_vtable_t
+    iree_hal_local_executable_cache_vtable;
+
+// Downcasts |base_value| after checking the vtable via IREE_HAL_ASSERT_TYPE.
+static iree_hal_local_executable_cache_t* iree_hal_local_executable_cache_cast(
+    iree_hal_executable_cache_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_local_executable_cache_vtable);
+  return (iree_hal_local_executable_cache_t*)base_value;
+}
+
+// Allocates the cache as one block laid out as
+// [struct | loaders[loader_count] | identifier chars] and retains each
+// loader. |identifier| is copied into the allocation tail.
+iree_status_t iree_hal_local_executable_cache_create(
+    iree_string_view_t identifier, iree_host_size_t loader_count,
+    iree_hal_executable_loader_t** loaders, iree_allocator_t host_allocator,
+    iree_hal_executable_cache_t** out_executable_cache) {
+  IREE_ASSERT_ARGUMENT(!loader_count || loaders);
+  IREE_ASSERT_ARGUMENT(out_executable_cache);
+  *out_executable_cache = NULL;
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_local_executable_cache_t* executable_cache = NULL;
+  iree_host_size_t total_size =
+      sizeof(*executable_cache) +
+      loader_count * sizeof(*executable_cache->loaders) + identifier.size;
+  iree_status_t status = iree_allocator_malloc(host_allocator, total_size,
+                                               (void**)&executable_cache);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_local_executable_cache_vtable,
+                                 &executable_cache->resource);
+    executable_cache->host_allocator = host_allocator;
+    // The identifier characters occupy the last identifier.size bytes of the
+    // allocation.
+    iree_string_view_append_to_buffer(
+        identifier, &executable_cache->identifier,
+        (char*)executable_cache + total_size - identifier.size);
+
+    executable_cache->loader_count = loader_count;
+    for (iree_host_size_t i = 0; i < executable_cache->loader_count; ++i) {
+      executable_cache->loaders[i] = loaders[i];
+      iree_hal_executable_loader_retain(executable_cache->loaders[i]);
+    }
+
+    *out_executable_cache = (iree_hal_executable_cache_t*)executable_cache;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases the retained loaders, then frees the single inline allocation.
+static void iree_hal_local_executable_cache_destroy(
+    iree_hal_executable_cache_t* base_executable_cache) {
+  iree_hal_local_executable_cache_t* executable_cache =
+      iree_hal_local_executable_cache_cast(base_executable_cache);
+  iree_allocator_t host_allocator = executable_cache->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < executable_cache->loader_count; ++i) {
+    iree_hal_executable_loader_release(executable_cache->loaders[i]);
+  }
+  iree_allocator_free(host_allocator, executable_cache);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns true if any registered loader reports support for
+// |executable_format| under |caching_mode|.
+static bool iree_hal_local_executable_cache_can_prepare_format(
+    iree_hal_executable_cache_t* base_executable_cache,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  iree_hal_local_executable_cache_t* executable_cache =
+      iree_hal_local_executable_cache_cast(base_executable_cache);
+  for (iree_host_size_t i = 0; i < executable_cache->loader_count; ++i) {
+    if (iree_hal_executable_loader_query_support(
+            executable_cache->loaders[i], caching_mode, executable_format)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Tries each loader in registration order: loaders that don't claim the
+// format are skipped, a CANCELLED try falls through to the next loader, any
+// other error aborts, and NOT_FOUND is returned if no loader succeeds.
+static iree_status_t iree_hal_local_executable_cache_prepare_executable(
+    iree_hal_executable_cache_t* base_executable_cache,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  iree_hal_local_executable_cache_t* executable_cache =
+      iree_hal_local_executable_cache_cast(base_executable_cache);
+  for (iree_host_size_t i = 0; i < executable_cache->loader_count; ++i) {
+    if (!iree_hal_executable_loader_query_support(
+            executable_cache->loaders[i], executable_params->caching_mode,
+            executable_params->executable_format)) {
+      // Loader definitely can't handle the executable; no use trying so skip.
+      continue;
+    }
+    // The loader _may_ handle the executable; if the specific executable is not
+    // supported then the try will fail with IREE_STATUS_CANCELLED and we should
+    // continue trying other loaders.
+    iree_status_t status = iree_hal_executable_loader_try_load(
+        executable_cache->loaders[i], executable_params, out_executable);
+    if (iree_status_is_ok(status)) {
+      // Executable was successfully loaded.
+      return status;
+    } else if (!iree_status_is_cancelled(status)) {
+      // Error beyond just the try failing due to unsupported formats.
+      return status;
+    }
+    iree_status_ignore(status);
+  }
+  return iree_make_status(
+      IREE_STATUS_NOT_FOUND,
+      "no executable loader registered for the given executable format '%.*s'",
+      (int)executable_params->executable_format.size,
+      executable_params->executable_format.data);
+}
+
+// Vtable wiring the local cache into the generic executable cache interface.
+static const iree_hal_executable_cache_vtable_t
+    iree_hal_local_executable_cache_vtable = {
+        .destroy = iree_hal_local_executable_cache_destroy,
+        .can_prepare_format =
+            iree_hal_local_executable_cache_can_prepare_format,
+        .prepare_executable =
+            iree_hal_local_executable_cache_prepare_executable,
+};
diff --git a/runtime/src/iree/hal/local/local_executable_cache.h b/runtime/src/iree/hal/local/local_executable_cache.h
new file mode 100644
index 0000000..0bec265
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_executable_cache.h
@@ -0,0 +1,35 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_LOCAL_EXECUTABLE_CACHE_H_
+#define IREE_HAL_LOCAL_LOCAL_EXECUTABLE_CACHE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_loader.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// TODO(benvanik): when we refactor executable caches this can become something
+// more specialized; like nop_executable_cache (does nothing but pass through)
+// or inproc_lru_executable_cache (simple in-memory LRU of recent executables).
+//
+// We can also set this up so they share storage. Ideally a JIT'ed executable in
+// one device is the same JIT'ed executable in another, and in multi-tenant
+// situations we're likely to want that isolation _and_ sharing.
+
+// Creates a cache that tries |loaders| in order when preparing executables.
+// |identifier| is copied into the cache's own storage and the loaders are
+// retained for the cache's lifetime.
+iree_status_t iree_hal_local_executable_cache_create(
+    iree_string_view_t identifier, iree_host_size_t loader_count,
+    iree_hal_executable_loader_t** loaders, iree_allocator_t host_allocator,
+    iree_hal_executable_cache_t** out_executable_cache);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_LOCAL_EXECUTABLE_CACHE_H_
diff --git a/runtime/src/iree/hal/local/local_executable_layout.c b/runtime/src/iree/hal/local/local_executable_layout.c
new file mode 100644
index 0000000..360a1b0
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_executable_layout.c
@@ -0,0 +1,107 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/local_executable_layout.h"
+
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/local/local_descriptor_set_layout.h"
+
+// Forward declaration; defined at the bottom of the file.
+static const iree_hal_executable_layout_vtable_t
+    iree_hal_local_executable_layout_vtable;
+
+// Downcasts |base_value| after checking the vtable via IREE_HAL_ASSERT_TYPE.
+iree_hal_local_executable_layout_t* iree_hal_local_executable_layout_cast(
+    iree_hal_executable_layout_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_local_executable_layout_vtable);
+  return (iree_hal_local_executable_layout_t*)base_value;
+}
+
+// Allocates an executable layout that retains each set layout inline and
+// precomputes the used-binding bitmask and dynamic binding count from the
+// set layouts' binding metadata.
+iree_status_t iree_hal_local_executable_layout_create(
+    iree_host_size_t push_constants, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_allocator_t host_allocator,
+    iree_hal_executable_layout_t** out_executable_layout) {
+  IREE_ASSERT_ARGUMENT(!set_layout_count || set_layouts);
+  IREE_ASSERT_ARGUMENT(out_executable_layout);
+  *out_executable_layout = NULL;
+  if (set_layout_count > IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "set layout count %zu over the limit of %d",
+                            set_layout_count,
+                            IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT);
+  }
+  if (push_constants > IREE_HAL_LOCAL_MAX_PUSH_CONSTANT_COUNT) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "push constant count %zu over the limit of %d",
+                            push_constants,
+                            IREE_HAL_LOCAL_MAX_PUSH_CONSTANT_COUNT);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Single allocation: struct header plus set_layout_count trailing pointers.
+  iree_host_size_t total_size =
+      sizeof(iree_hal_local_executable_layout_t) +
+      set_layout_count * sizeof(iree_hal_descriptor_set_layout_t*);
+
+  iree_hal_local_executable_layout_t* layout = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&layout);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_local_executable_layout_vtable,
+                                 &layout->resource);
+    layout->host_allocator = host_allocator;
+    layout->push_constants = push_constants;
+    layout->dynamic_binding_count = 0;
+    layout->used_bindings = 0;
+    layout->set_layout_count = set_layout_count;
+    for (iree_host_size_t i = 0; i < set_layout_count; ++i) {
+      layout->set_layouts[i] = set_layouts[i];
+      iree_hal_descriptor_set_layout_retain(layout->set_layouts[i]);
+
+      iree_hal_local_descriptor_set_layout_t* local_set_layout =
+          iree_hal_local_descriptor_set_layout_cast(set_layouts[i]);
+      for (iree_host_size_t j = 0; j < local_set_layout->binding_count; ++j) {
+        const iree_hal_descriptor_set_layout_binding_t* binding =
+            &local_set_layout->bindings[j];
+        // Bit index = set * 32 + binding; with at most 2 sets of at most 32
+        // bindings this fits the 64-bit iree_hal_local_binding_mask_t.
+        layout->used_bindings |=
+            1ull << (i * IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT + j);
+        switch (binding->type) {
+          case IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+          case IREE_HAL_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+            ++layout->dynamic_binding_count;
+            break;
+          default:
+            continue;
+        }
+      }
+    }
+    *out_executable_layout = (iree_hal_executable_layout_t*)layout;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases the retained set layouts and frees the inline allocation.
+static void iree_hal_local_executable_layout_destroy(
+    iree_hal_executable_layout_t* base_layout) {
+  iree_hal_local_executable_layout_t* layout =
+      iree_hal_local_executable_layout_cast(base_layout);
+  iree_allocator_t host_allocator = layout->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < layout->set_layout_count; ++i) {
+    iree_hal_descriptor_set_layout_release(layout->set_layouts[i]);
+  }
+  iree_allocator_free(host_allocator, layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static const iree_hal_executable_layout_vtable_t
+    iree_hal_local_executable_layout_vtable = {
+        .destroy = iree_hal_local_executable_layout_destroy,
+};
diff --git a/runtime/src/iree/hal/local/local_executable_layout.h b/runtime/src/iree/hal/local/local_executable_layout.h
new file mode 100644
index 0000000..3732b9b
--- /dev/null
+++ b/runtime/src/iree/hal/local/local_executable_layout.h
@@ -0,0 +1,50 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_LOCAL_EXECUTABLE_LAYOUT_H_
+#define IREE_HAL_LOCAL_LOCAL_EXECUTABLE_LAYOUT_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Maximum number of descriptor sets supported in a local executable layout.
+#define IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT 2
+// Maximum number of push constants supported in a local executable layout.
+#define IREE_HAL_LOCAL_MAX_PUSH_CONSTANT_COUNT 64
+
+// Bitmask with one bit per binding slot across all descriptor sets.
+typedef uint64_t iree_hal_local_binding_mask_t;
+
+// Total number of bits available in iree_hal_local_binding_mask_t.
+#define IREE_HAL_LOCAL_BINDING_MASK_BITS \
+  (sizeof(iree_hal_local_binding_mask_t) * 8)
+
+// Executable layout for local (CPU) execution aggregating the descriptor set
+// layouts used by an executable.
+typedef struct iree_hal_local_executable_layout_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  // Number of push constants used by the executable.
+  iree_host_size_t push_constants;
+  // Total count of dynamic-offset buffer bindings across all sets.
+  iree_host_size_t dynamic_binding_count;
+  // One bit set per binding slot that is declared by any set layout.
+  iree_hal_local_binding_mask_t used_bindings;
+  // Retained set layouts stored inline after the struct (flexible array).
+  iree_host_size_t set_layout_count;
+  iree_hal_descriptor_set_layout_t* set_layouts[];
+} iree_hal_local_executable_layout_t;
+
+// Creates an executable layout referencing |set_layouts| (which are retained)
+// allocated from |host_allocator| and returned in |out_executable_layout|.
+iree_status_t iree_hal_local_executable_layout_create(
+    iree_host_size_t push_constants, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_allocator_t host_allocator,
+    iree_hal_executable_layout_t** out_executable_layout);
+
+// Casts |base_value| to the local executable layout type.
+iree_hal_local_executable_layout_t* iree_hal_local_executable_layout_cast(
+    iree_hal_executable_layout_t* base_value);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_LOCAL_EXECUTABLE_LAYOUT_H_
diff --git a/runtime/src/iree/hal/local/sync_device.c b/runtime/src/iree/hal/local/sync_device.c
new file mode 100644
index 0000000..ffb217a
--- /dev/null
+++ b/runtime/src/iree/hal/local/sync_device.c
@@ -0,0 +1,324 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/sync_device.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/local/inline_command_buffer.h"
+#include "iree/hal/local/local_descriptor_set.h"
+#include "iree/hal/local/local_descriptor_set_layout.h"
+#include "iree/hal/local/local_executable_cache.h"
+#include "iree/hal/local/local_executable_layout.h"
+#include "iree/hal/local/sync_event.h"
+#include "iree/hal/local/sync_semaphore.h"
+#include "iree/hal/utils/buffer_transfer.h"
+
+// Synchronous local CPU device that performs execution inline on the threads
+// issuing submissions.
+typedef struct iree_hal_sync_device_t {
+  iree_hal_resource_t resource;
+  // Device identifier; points into the trailing storage of this struct.
+  iree_string_view_t identifier;
+
+  iree_allocator_t host_allocator;
+  iree_hal_allocator_t* device_allocator;
+
+  // State shared by all semaphores created on this device.
+  iree_hal_sync_semaphore_state_t semaphore_state;
+
+  // Retained executable loaders available for executable cache creation,
+  // stored inline after the struct (flexible array).
+  iree_host_size_t loader_count;
+  iree_hal_executable_loader_t* loaders[];
+} iree_hal_sync_device_t;
+
+static const iree_hal_device_vtable_t iree_hal_sync_device_vtable;
+
+// Casts |base_value| to the sync device type, asserting the vtable matches.
+static iree_hal_sync_device_t* iree_hal_sync_device_cast(
+    iree_hal_device_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_sync_device_vtable);
+  return (iree_hal_sync_device_t*)base_value;
+}
+
+// Initializes |out_params| to all-zero defaults.
+void iree_hal_sync_device_params_initialize(
+    iree_hal_sync_device_params_t* out_params) {
+  memset(out_params, 0, sizeof(*out_params));
+}
+
+// Validates |params|; currently there is nothing to check.
+static iree_status_t iree_hal_sync_device_check_params(
+    const iree_hal_sync_device_params_t* params) {
+  return iree_ok_status();
+}
+
+// Creates a synchronous device. The device struct, its trailing loaders
+// array, and the identifier string are packed into a single allocation;
+// |loaders| and |device_allocator| are retained for the device lifetime.
+iree_status_t iree_hal_sync_device_create(
+    iree_string_view_t identifier, const iree_hal_sync_device_params_t* params,
+    iree_host_size_t loader_count, iree_hal_executable_loader_t** loaders,
+    iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
+    iree_hal_device_t** out_device) {
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(!loader_count || loaders);
+  IREE_ASSERT_ARGUMENT(device_allocator);
+  IREE_ASSERT_ARGUMENT(out_device);
+  *out_device = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0,
+                                    iree_hal_sync_device_check_params(params));
+
+  iree_hal_sync_device_t* device = NULL;
+  // Struct (including the flexible loaders array) followed by the
+  // identifier string storage.
+  iree_host_size_t struct_size =
+      sizeof(*device) + loader_count * sizeof(*device->loaders);
+  iree_host_size_t total_size = struct_size + identifier.size;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&device);
+  if (iree_status_is_ok(status)) {
+    memset(device, 0, total_size);
+    iree_hal_resource_initialize(&iree_hal_sync_device_vtable,
+                                 &device->resource);
+    // Copy the identifier into the storage just past the loaders array.
+    iree_string_view_append_to_buffer(identifier, &device->identifier,
+                                      (char*)device + struct_size);
+    device->host_allocator = host_allocator;
+    device->device_allocator = device_allocator;
+    iree_hal_allocator_retain(device_allocator);
+
+    // Retain all loaders for the lifetime of the device.
+    device->loader_count = loader_count;
+    for (iree_host_size_t i = 0; i < device->loader_count; ++i) {
+      device->loaders[i] = loaders[i];
+      iree_hal_executable_loader_retain(device->loaders[i]);
+    }
+
+    iree_hal_sync_semaphore_state_initialize(&device->semaphore_state);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_device = (iree_hal_device_t*)device;
+  } else {
+    // NULL-safe on the failure path (device may still be NULL).
+    iree_hal_device_release((iree_hal_device_t*)device);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases all resources retained at creation time (semaphore state,
+// loaders, device allocator) and frees the device storage.
+static void iree_hal_sync_device_destroy(iree_hal_device_t* base_device) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_sync_semaphore_state_deinitialize(&device->semaphore_state);
+
+  for (iree_host_size_t i = 0; i < device->loader_count; ++i) {
+    iree_hal_executable_loader_release(device->loaders[i]);
+  }
+  iree_hal_allocator_release(device->device_allocator);
+  iree_allocator_free(host_allocator, device);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the identifier assigned at creation time.
+static iree_string_view_t iree_hal_sync_device_id(
+    iree_hal_device_t* base_device) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+  return device->identifier;
+}
+
+// Returns the allocator used for host-side device structures.
+static iree_allocator_t iree_hal_sync_device_host_allocator(
+    iree_hal_device_t* base_device) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+  return device->host_allocator;
+}
+
+// Returns the allocator used for device buffers.
+static iree_hal_allocator_t* iree_hal_sync_device_allocator(
+    iree_hal_device_t* base_device) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+  return device->device_allocator;
+}
+
+// Forwards trim requests to the device allocator.
+static iree_status_t iree_hal_sync_device_trim(iree_hal_device_t* base_device) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+  return iree_hal_allocator_trim(device->device_allocator);
+}
+
+// Answers device capability queries:
+//   'hal.executable.format': 1 if any loader supports executables of |key|.
+//   'hal.device'/'hal.dispatch' 'concurrency': always 1 (fully synchronous).
+// Unknown category/key pairs return NOT_FOUND.
+static iree_status_t iree_hal_sync_device_query_i32(
+    iree_hal_device_t* base_device, iree_string_view_t category,
+    iree_string_view_t key, int32_t* out_value) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+  *out_value = 0;
+
+  if (iree_string_view_equal(category,
+                             iree_make_cstring_view("hal.executable.format"))) {
+    *out_value =
+        iree_hal_query_any_executable_loader_support(
+            device->loader_count, device->loaders, /*caching_mode=*/0, key)
+            ? 1
+            : 0;
+    return iree_ok_status();
+  } else if (iree_string_view_equal(category,
+                                    iree_make_cstring_view("hal.device"))) {
+    if (iree_string_view_equal(key, iree_make_cstring_view("concurrency"))) {
+      *out_value = 1;
+      return iree_ok_status();
+    }
+  } else if (iree_string_view_equal(category,
+                                    iree_make_cstring_view("hal.dispatch"))) {
+    if (iree_string_view_equal(key, iree_make_cstring_view("concurrency"))) {
+      *out_value = 1;
+      return iree_ok_status();
+    }
+  }
+
+  return iree_make_status(
+      IREE_STATUS_NOT_FOUND,
+      "unknown device configuration key value '%.*s :: %.*s'",
+      (int)category.size, category.data, (int)key.size, key.data);
+}
+
+// Creates a command buffer that executes commands inline as they are
+// recorded.
+static iree_status_t iree_hal_sync_device_create_command_buffer(
+    iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  // TODO(#4680): implement a non-inline command buffer that stores its commands
+  // and can be submitted later on/multiple-times.
+  return iree_hal_inline_command_buffer_create(
+      base_device, mode, command_categories, queue_affinity,
+      iree_hal_device_host_allocator(base_device), out_command_buffer);
+}
+
+// Creates a descriptor set using the local (CPU) implementation.
+static iree_status_t iree_hal_sync_device_create_descriptor_set(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_t* set_layout,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings,
+    iree_hal_descriptor_set_t** out_descriptor_set) {
+  return iree_hal_local_descriptor_set_create(set_layout, binding_count,
+                                              bindings, out_descriptor_set);
+}
+
+// Creates a descriptor set layout using the local (CPU) implementation.
+static iree_status_t iree_hal_sync_device_create_descriptor_set_layout(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  return iree_hal_local_descriptor_set_layout_create(
+      usage_type, binding_count, bindings,
+      iree_hal_device_host_allocator(base_device), out_descriptor_set_layout);
+}
+
+// Creates a (stateless) event for the synchronous backend.
+static iree_status_t iree_hal_sync_device_create_event(
+    iree_hal_device_t* base_device, iree_hal_event_t** out_event) {
+  return iree_hal_sync_event_create(iree_hal_device_host_allocator(base_device),
+                                    out_event);
+}
+
+// Creates an executable cache backed by the device's executable loaders.
+static iree_status_t iree_hal_sync_device_create_executable_cache(
+    iree_hal_device_t* base_device, iree_string_view_t identifier,
+    iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+  return iree_hal_local_executable_cache_create(
+      identifier, device->loader_count, device->loaders,
+      iree_hal_device_host_allocator(base_device), out_executable_cache);
+}
+
+// Creates an executable layout using the local (CPU) implementation.
+static iree_status_t iree_hal_sync_device_create_executable_layout(
+    iree_hal_device_t* base_device, iree_host_size_t push_constants,
+    iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_hal_executable_layout_t** out_executable_layout) {
+  return iree_hal_local_executable_layout_create(
+      push_constants, set_layout_count, set_layouts,
+      iree_hal_device_host_allocator(base_device), out_executable_layout);
+}
+
+// Creates a semaphore sharing the device-wide semaphore state.
+static iree_status_t iree_hal_sync_device_create_semaphore(
+    iree_hal_device_t* base_device, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+  return iree_hal_sync_semaphore_create(&device->semaphore_state, initial_value,
+                                        device->host_allocator, out_semaphore);
+}
+
+// Processes submission batches: waits on each batch's wait semaphores and
+// then signals its signal semaphores. All actual command buffer work has
+// already executed inline at record time.
+static iree_status_t iree_hal_sync_device_queue_submit(
+    iree_hal_device_t* base_device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+
+  // TODO(#4680): there is some better error handling here needed; we should
+  // propagate failures to all signal semaphores. Today we aren't as there
+  // shouldn't be any failures or if there are there's not much we'd be able to
+  // do - we already executed everything inline!
+
+  for (iree_host_size_t i = 0; i < batch_count; ++i) {
+    const iree_hal_submission_batch_t* batch = &batches[i];
+
+    // Wait for semaphores to be signaled before performing any work.
+    IREE_RETURN_IF_ERROR(iree_hal_sync_semaphore_multi_wait(
+        &device->semaphore_state, IREE_HAL_WAIT_MODE_ALL,
+        &batch->wait_semaphores, iree_infinite_timeout()));
+
+    // TODO(#4680): if we were doing deferred submissions we would issue them
+    // here. With only inline command buffers we have nothing to do here.
+
+    // Signal all semaphores now that batch work has completed.
+    IREE_RETURN_IF_ERROR(iree_hal_sync_semaphore_multi_signal(
+        &device->semaphore_state, &batch->signal_semaphores));
+  }
+
+  return iree_ok_status();
+}
+
+// Submits |batches| and then blocks until |wait_semaphore| reaches
+// |wait_value| or |timeout| elapses.
+static iree_status_t iree_hal_sync_device_submit_and_wait(
+    iree_hal_device_t* base_device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches,
+    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
+    iree_timeout_t timeout) {
+  // Submit...
+  IREE_RETURN_IF_ERROR(iree_hal_sync_device_queue_submit(
+      base_device, command_categories, queue_affinity, batch_count, batches));
+
+  // ...and wait.
+  return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
+}
+
+// Waits on multiple semaphores via the shared semaphore state.
+static iree_status_t iree_hal_sync_device_wait_semaphores(
+    iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
+  iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
+  return iree_hal_sync_semaphore_multi_wait(&device->semaphore_state, wait_mode,
+                                            semaphore_list, timeout);
+}
+
+// All work executes inline so the device is always idle by the time any
+// caller can observe it.
+static iree_status_t iree_hal_sync_device_wait_idle(
+    iree_hal_device_t* base_device, iree_timeout_t timeout) {
+  // No-op (in intended usages). If we allowed multiple threads to call into
+  // the same device then we may want to change this to an atomic flag as to
+  // whether any thread is actively performing work.
+  return iree_ok_status();
+}
+
+// Vtable wiring the sync device implementation into the HAL device API.
+static const iree_hal_device_vtable_t iree_hal_sync_device_vtable = {
+    .destroy = iree_hal_sync_device_destroy,
+    .id = iree_hal_sync_device_id,
+    .host_allocator = iree_hal_sync_device_host_allocator,
+    .device_allocator = iree_hal_sync_device_allocator,
+    .trim = iree_hal_sync_device_trim,
+    .query_i32 = iree_hal_sync_device_query_i32,
+    .create_command_buffer = iree_hal_sync_device_create_command_buffer,
+    .create_descriptor_set = iree_hal_sync_device_create_descriptor_set,
+    .create_descriptor_set_layout =
+        iree_hal_sync_device_create_descriptor_set_layout,
+    .create_event = iree_hal_sync_device_create_event,
+    .create_executable_cache = iree_hal_sync_device_create_executable_cache,
+    .create_executable_layout = iree_hal_sync_device_create_executable_layout,
+    .create_semaphore = iree_hal_sync_device_create_semaphore,
+    .transfer_range = iree_hal_device_transfer_mappable_range,
+    .queue_submit = iree_hal_sync_device_queue_submit,
+    .submit_and_wait = iree_hal_sync_device_submit_and_wait,
+    .wait_semaphores = iree_hal_sync_device_wait_semaphores,
+    .wait_idle = iree_hal_sync_device_wait_idle,
+};
diff --git a/runtime/src/iree/hal/local/sync_device.h b/runtime/src/iree/hal/local/sync_device.h
new file mode 100644
index 0000000..de990b7
--- /dev/null
+++ b/runtime/src/iree/hal/local/sync_device.h
@@ -0,0 +1,41 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_SYNC_DEVICE_H_
+#define IREE_HAL_LOCAL_SYNC_DEVICE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_loader.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Parameters configuring an iree_hal_sync_device_t.
+// Must be initialized with iree_hal_sync_device_params_initialize prior to use.
+typedef struct iree_hal_sync_device_params_t {
+  int reserved;  // placeholder; no configurable parameters yet
+} iree_hal_sync_device_params_t;
+
+// Initializes |out_params| to default values.
+void iree_hal_sync_device_params_initialize(
+    iree_hal_sync_device_params_t* out_params);
+
+// Creates a new synchronous local CPU device that performs execution inline
+// on threads issuing submissions. |loaders| is the set of executable
+// loaders that are available for loading in the device context.
+// |loaders| and |device_allocator| are retained for the device lifetime.
+iree_status_t iree_hal_sync_device_create(
+    iree_string_view_t identifier, const iree_hal_sync_device_params_t* params,
+    iree_host_size_t loader_count, iree_hal_executable_loader_t** loaders,
+    iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
+    iree_hal_device_t** out_device);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_SYNC_DEVICE_H_
diff --git a/runtime/src/iree/hal/local/sync_driver.c b/runtime/src/iree/hal/local/sync_driver.c
new file mode 100644
index 0000000..c8291f2
--- /dev/null
+++ b/runtime/src/iree/hal/local/sync_driver.c
@@ -0,0 +1,127 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/sync_driver.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+
+// Device ID reported for the single device exposed by this driver.
+#define IREE_HAL_SYNC_DEVICE_ID_DEFAULT 0
+
+// Driver that creates synchronous local CPU devices.
+typedef struct iree_hal_sync_driver_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  iree_hal_allocator_t* device_allocator;
+
+  // Driver identifier; points into the trailing storage of this struct.
+  iree_string_view_t identifier;
+  // Parameters applied to every device this driver creates.
+  iree_hal_sync_device_params_t default_params;
+
+  // Retained executable loaders passed to each created device,
+  // stored inline after the struct (flexible array).
+  iree_host_size_t loader_count;
+  iree_hal_executable_loader_t* loaders[];
+} iree_hal_sync_driver_t;
+
+static const iree_hal_driver_vtable_t iree_hal_sync_driver_vtable;
+
+// Casts |base_value| to the sync driver type, asserting the vtable matches.
+static iree_hal_sync_driver_t* iree_hal_sync_driver_cast(
+    iree_hal_driver_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_sync_driver_vtable);
+  return (iree_hal_sync_driver_t*)base_value;
+}
+
+// Creates a sync driver. The driver struct, its trailing loaders array, and
+// the identifier string are packed into a single allocation; |loaders| and
+// |device_allocator| are retained and |default_params| is copied by value.
+iree_status_t iree_hal_sync_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_sync_device_params_t* default_params,
+    iree_host_size_t loader_count, iree_hal_executable_loader_t** loaders,
+    iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver) {
+  IREE_ASSERT_ARGUMENT(default_params);
+  IREE_ASSERT_ARGUMENT(!loader_count || loaders);
+  IREE_ASSERT_ARGUMENT(device_allocator);
+  IREE_ASSERT_ARGUMENT(out_driver);
+  *out_driver = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_sync_driver_t* driver = NULL;
+  // Struct (including the flexible loaders array) followed by the
+  // identifier string storage.
+  iree_host_size_t total_size = sizeof(*driver) +
+                                loader_count * sizeof(*driver->loaders) +
+                                identifier.size;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&driver);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_sync_driver_vtable,
+                                 &driver->resource);
+    driver->host_allocator = host_allocator;
+    driver->device_allocator = device_allocator;
+    iree_hal_allocator_retain(device_allocator);
+
+    // Copy the identifier into the tail of the allocation.
+    iree_string_view_append_to_buffer(
+        identifier, &driver->identifier,
+        (char*)driver + total_size - identifier.size);
+    memcpy(&driver->default_params, default_params,
+           sizeof(driver->default_params));
+
+    // Retain all loaders for the lifetime of the driver.
+    driver->loader_count = loader_count;
+    for (iree_host_size_t i = 0; i < driver->loader_count; ++i) {
+      driver->loaders[i] = loaders[i];
+      iree_hal_executable_loader_retain(driver->loaders[i]);
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_driver = (iree_hal_driver_t*)driver;
+  } else {
+    // NULL-safe on the failure path (driver may still be NULL).
+    iree_hal_driver_release((iree_hal_driver_t*)driver);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases the retained device allocator and loaders and frees the driver.
+static void iree_hal_sync_driver_destroy(iree_hal_driver_t* base_driver) {
+  iree_hal_sync_driver_t* driver = iree_hal_sync_driver_cast(base_driver);
+  // Copy the allocator out before freeing the struct that stores it.
+  iree_allocator_t host_allocator = driver->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_allocator_release(driver->device_allocator);
+  for (iree_host_size_t i = 0; i < driver->loader_count; ++i) {
+    iree_hal_executable_loader_release(driver->loaders[i]);
+  }
+  iree_allocator_free(host_allocator, driver);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Reports the single default device exposed by this driver. The info list is
+// cloned into |allocator| and ownership passes to the caller.
+static iree_status_t iree_hal_sync_driver_query_available_devices(
+    iree_hal_driver_t* base_driver, iree_allocator_t allocator,
+    iree_hal_device_info_t** out_device_infos,
+    iree_host_size_t* out_device_info_count) {
+  static const iree_hal_device_info_t device_infos[1] = {
+      {
+          .device_id = IREE_HAL_SYNC_DEVICE_ID_DEFAULT,
+          .name = iree_string_view_literal("default"),
+      },
+  };
+  *out_device_info_count = IREE_ARRAYSIZE(device_infos);
+  return iree_allocator_clone(
+      allocator, iree_make_const_byte_span(device_infos, sizeof(device_infos)),
+      (void**)out_device_infos);
+}
+
+// Creates a device from the driver defaults. |device_id| is unused as the
+// driver only exposes a single device.
+static iree_status_t iree_hal_sync_driver_create_device(
+    iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  iree_hal_sync_driver_t* driver = iree_hal_sync_driver_cast(base_driver);
+  return iree_hal_sync_device_create(
+      driver->identifier, &driver->default_params, driver->loader_count,
+      driver->loaders, driver->device_allocator, host_allocator, out_device);
+}
+
+// Vtable wiring the sync driver implementation into the HAL driver API.
+static const iree_hal_driver_vtable_t iree_hal_sync_driver_vtable = {
+    .destroy = iree_hal_sync_driver_destroy,
+    .query_available_devices = iree_hal_sync_driver_query_available_devices,
+    .create_device = iree_hal_sync_driver_create_device,
+};
diff --git a/runtime/src/iree/hal/local/sync_driver.h b/runtime/src/iree/hal/local/sync_driver.h
new file mode 100644
index 0000000..f4ff241
--- /dev/null
+++ b/runtime/src/iree/hal/local/sync_driver.h
@@ -0,0 +1,33 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_SYNC_DRIVER_H_
+#define IREE_HAL_LOCAL_SYNC_DRIVER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/sync_device.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a new synchronous local CPU driver that creates devices that perform
+// execution inline on threads issuing submissions. |loaders| is the set of
+// executable loaders that are available for loading in each device context.
+// |loaders| and |device_allocator| are retained for the driver lifetime.
+iree_status_t iree_hal_sync_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_sync_device_params_t* default_params,
+    iree_host_size_t loader_count, iree_hal_executable_loader_t** loaders,
+    iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_SYNC_DRIVER_H_
diff --git a/runtime/src/iree/hal/local/sync_event.c b/runtime/src/iree/hal/local/sync_event.c
new file mode 100644
index 0000000..47a32c4
--- /dev/null
+++ b/runtime/src/iree/hal/local/sync_event.c
@@ -0,0 +1,57 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/sync_event.h"
+
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+
+// Event for the synchronous backend; carries no state beyond the allocator
+// used to free it.
+typedef struct iree_hal_sync_event_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+} iree_hal_sync_event_t;
+
+static const iree_hal_event_vtable_t iree_hal_sync_event_vtable;
+
+// Casts |base_value| to the sync event type, asserting the vtable matches.
+static iree_hal_sync_event_t* iree_hal_sync_event_cast(
+    iree_hal_event_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_sync_event_vtable);
+  return (iree_hal_sync_event_t*)base_value;
+}
+
+// Allocates a new event from |host_allocator| and returns it in |out_event|.
+iree_status_t iree_hal_sync_event_create(iree_allocator_t host_allocator,
+                                         iree_hal_event_t** out_event) {
+  IREE_ASSERT_ARGUMENT(out_event);
+  *out_event = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_sync_event_t* event = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*event), (void**)&event);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_sync_event_vtable, &event->resource);
+    event->host_allocator = host_allocator;
+    *out_event = (iree_hal_event_t*)event;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the event using the allocator captured at creation time.
+static void iree_hal_sync_event_destroy(iree_hal_event_t* base_event) {
+  iree_hal_sync_event_t* event = iree_hal_sync_event_cast(base_event);
+  // Copy the allocator out before freeing the struct that stores it.
+  iree_allocator_t host_allocator = event->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, event);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Vtable wiring the sync event implementation into the HAL event API.
+static const iree_hal_event_vtable_t iree_hal_sync_event_vtable = {
+    .destroy = iree_hal_sync_event_destroy,
+};
diff --git a/runtime/src/iree/hal/local/sync_event.h b/runtime/src/iree/hal/local/sync_event.h
new file mode 100644
index 0000000..38fb354
--- /dev/null
+++ b/runtime/src/iree/hal/local/sync_event.h
@@ -0,0 +1,24 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_SYNC_EVENT_H_
+#define IREE_HAL_LOCAL_SYNC_EVENT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates an event for use with the synchronous device, allocated from
+// |host_allocator| and returned in |out_event|.
+iree_status_t iree_hal_sync_event_create(iree_allocator_t host_allocator,
+                                         iree_hal_event_t** out_event);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_SYNC_EVENT_H_
diff --git a/runtime/src/iree/hal/local/sync_semaphore.c b/runtime/src/iree/hal/local/sync_semaphore.c
new file mode 100644
index 0000000..ceb5319
--- /dev/null
+++ b/runtime/src/iree/hal/local/sync_semaphore.c
@@ -0,0 +1,409 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/sync_semaphore.h"
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+
+// Sentinel value indicating that the semaphore has failed and an error status
+// is set.
+#define IREE_HAL_SYNC_SEMAPHORE_FAILURE_VALUE UINT64_MAX
+
+//===----------------------------------------------------------------------===//
+// iree_hal_sync_semaphore_state_t
+//===----------------------------------------------------------------------===//
+
+// Initializes |out_shared_state| for use: zeroes the storage and sets up the
+// shared notification used to wake waiters whenever any semaphore changes.
+void iree_hal_sync_semaphore_state_initialize(
+    iree_hal_sync_semaphore_state_t* out_shared_state) {
+  memset(out_shared_state, 0, sizeof(*out_shared_state));
+  iree_notification_initialize(&out_shared_state->notification);
+}
+
+// Deinitializes |shared_state| previously set up with
+// iree_hal_sync_semaphore_state_initialize; all semaphores created from it
+// must have been released first. The memory is scrubbed after teardown.
+void iree_hal_sync_semaphore_state_deinitialize(
+    iree_hal_sync_semaphore_state_t* shared_state) {
+  iree_notification_deinitialize(&shared_state->notification);
+  memset(shared_state, 0, sizeof(*shared_state));
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_sync_semaphore_t
+//===----------------------------------------------------------------------===//
+
+// A host-local semaphore: a mutex-guarded monotonically increasing value plus
+// a notification shared with all sibling semaphores for waiter wakeup.
+typedef struct iree_hal_sync_semaphore_t {
+  // Base HAL resource (refcount + vtable).
+  iree_hal_resource_t resource;
+  // Allocator the semaphore storage was allocated from and is freed back to.
+  iree_allocator_t host_allocator;
+
+  // Shared across all semaphores.
+  iree_hal_sync_semaphore_state_t* shared_state;
+
+  // Guards all mutable fields. We expect low contention on semaphores and since
+  // iree_slim_mutex_t is (effectively) just a CAS this keeps things simpler
+  // than trying to make the entire structure lock-free.
+  iree_slim_mutex_t mutex;
+
+  // Current signaled value. May be IREE_HAL_SYNC_SEMAPHORE_FAILURE_VALUE to
+  // indicate that the semaphore has been signaled for failure and
+  // |failure_status| contains the error.
+  uint64_t current_value;
+
+  // OK or the status passed to iree_hal_semaphore_fail. Owned by the semaphore.
+  iree_status_t failure_status;
+} iree_hal_sync_semaphore_t;
+
+// Forward declared so the cast below can reference it; defined at file end.
+static const iree_hal_semaphore_vtable_t iree_hal_sync_semaphore_vtable;
+
+// Casts the base HAL semaphore to the concrete sync type, checking the vtable.
+static iree_hal_sync_semaphore_t* iree_hal_sync_semaphore_cast(
+    iree_hal_semaphore_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_sync_semaphore_vtable);
+  return (iree_hal_sync_semaphore_t*)base_value;
+}
+
+// Creates a sync semaphore starting at |initial_value|.
+// |shared_state| is captured by pointer and must remain valid for the lifetime
+// of the semaphore (it is owned by the device per sync_semaphore.h).
+iree_status_t iree_hal_sync_semaphore_create(
+    iree_hal_sync_semaphore_state_t* shared_state, uint64_t initial_value,
+    iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore) {
+  IREE_ASSERT_ARGUMENT(shared_state);
+  IREE_ASSERT_ARGUMENT(out_semaphore);
+  *out_semaphore = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_sync_semaphore_t* semaphore = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(*semaphore), (void**)&semaphore);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_sync_semaphore_vtable,
+                                 &semaphore->resource);
+    semaphore->host_allocator = host_allocator;
+    semaphore->shared_state = shared_state;
+
+    // Mutable state starts out in the non-failed condition.
+    iree_slim_mutex_initialize(&semaphore->mutex);
+    semaphore->current_value = initial_value;
+    semaphore->failure_status = iree_ok_status();
+
+    *out_semaphore = (iree_hal_semaphore_t*)semaphore;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys |base_semaphore|: releases the retained failure status (if any),
+// tears down the mutex, and returns the storage to its allocator.
+static void iree_hal_sync_semaphore_destroy(
+    iree_hal_semaphore_t* base_semaphore) {
+  iree_hal_sync_semaphore_t* semaphore =
+      iree_hal_sync_semaphore_cast(base_semaphore);
+  // Copy out the allocator before the semaphore memory is freed.
+  iree_allocator_t host_allocator = semaphore->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_free(semaphore->failure_status);
+  iree_slim_mutex_deinitialize(&semaphore->mutex);
+  iree_allocator_free(host_allocator, semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Queries the current payload value of |base_semaphore|.
+// When the semaphore has been failed the sentinel value is returned along with
+// a clone of the stored failure status.
+static iree_status_t iree_hal_sync_semaphore_query(
+    iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) {
+  iree_hal_sync_semaphore_t* semaphore =
+      iree_hal_sync_semaphore_cast(base_semaphore);
+
+  // Snapshot the value and (when failed) the status under the lock so the two
+  // observations are consistent with each other.
+  iree_status_t status = iree_ok_status();
+  iree_slim_mutex_lock(&semaphore->mutex);
+  uint64_t current_value = semaphore->current_value;
+  if (current_value >= IREE_HAL_SYNC_SEMAPHORE_FAILURE_VALUE) {
+    status = iree_status_clone(semaphore->failure_status);
+  }
+  iree_slim_mutex_unlock(&semaphore->mutex);
+
+  *out_value = current_value;
+  return status;
+}
+
+// Signals |semaphore| to |new_value| or returns an error if doing so would be
+// invalid (non-monotonic). The semaphore mutex must be held by the caller for
+// the duration of the call and is never released here: both callers
+// (iree_hal_sync_semaphore_signal and iree_hal_sync_semaphore_multi_signal)
+// unlock unconditionally after this returns, so unlocking on the error path
+// as well would double-unlock the mutex.
+static iree_status_t iree_hal_sync_semaphore_signal_unsafe(
+    iree_hal_sync_semaphore_t* semaphore, uint64_t new_value) {
+  if (new_value <= semaphore->current_value) {
+    // Copy for the status message; IREE_ATTRIBUTE_UNUSED covers builds where
+    // status annotations are compiled out and the local goes unreferenced.
+    uint64_t current_value IREE_ATTRIBUTE_UNUSED = semaphore->current_value;
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "semaphore values must be monotonically "
+                            "increasing; current_value=%" PRIu64
+                            ", new_value=%" PRIu64,
+                            current_value, new_value);
+  }
+
+  // Update to the new value.
+  semaphore->current_value = new_value;
+
+  return iree_ok_status();
+}
+
+// Signals |base_semaphore| to |new_value|. On success all waiters are woken to
+// re-check their wait conditions; on failure (non-monotonic value) the value
+// is left unchanged and no notification is posted.
+static iree_status_t iree_hal_sync_semaphore_signal(
+    iree_hal_semaphore_t* base_semaphore, uint64_t new_value) {
+  iree_hal_sync_semaphore_t* semaphore =
+      iree_hal_sync_semaphore_cast(base_semaphore);
+
+  iree_slim_mutex_lock(&semaphore->mutex);
+  iree_status_t status =
+      iree_hal_sync_semaphore_signal_unsafe(semaphore, new_value);
+  iree_slim_mutex_unlock(&semaphore->mutex);
+
+  if (iree_status_is_ok(status)) {
+    // Post a global notification so that any waiter will wake.
+    // TODO(#4680): make notifications per-semaphore; would make multi-wait
+    // impossible with iree_notification_t and we'd have to use wait handles.
+    iree_notification_post(&semaphore->shared_state->notification,
+                           IREE_ALL_WAITERS);
+  }
+
+  return status;
+}
+
+// Moves |base_semaphore| into the failure state, taking ownership of |status|.
+// Only the first failure is preserved; later statuses are dropped via
+// IREE_IGNORE_ERROR. All waiters are woken so they can observe the failure.
+static void iree_hal_sync_semaphore_fail(iree_hal_semaphore_t* base_semaphore,
+                                         iree_status_t status) {
+  iree_hal_sync_semaphore_t* semaphore =
+      iree_hal_sync_semaphore_cast(base_semaphore);
+
+  iree_slim_mutex_lock(&semaphore->mutex);
+
+  // Try to set our local status - we only preserve the first failure so only
+  // do this if we are going from a valid semaphore to a failed one.
+  if (!iree_status_is_ok(semaphore->failure_status)) {
+    // Previous status was not OK; drop our new status.
+    IREE_IGNORE_ERROR(status);
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    return;
+  }
+
+  // Signal to our failure sentinel value.
+  semaphore->current_value = IREE_HAL_SYNC_SEMAPHORE_FAILURE_VALUE;
+  semaphore->failure_status = status;
+
+  iree_slim_mutex_unlock(&semaphore->mutex);
+
+  // Wake everyone so they re-evaluate their conditions and see the failure.
+  iree_notification_post(&semaphore->shared_state->notification,
+                         IREE_ALL_WAITERS);
+}
+
+// Signals each semaphore in |semaphore_list| to its paired payload value,
+// stopping at the first semaphore that rejects its new value. Waiters are
+// notified exactly once regardless of how many semaphores were updated.
+iree_status_t iree_hal_sync_semaphore_multi_signal(
+    iree_hal_sync_semaphore_state_t* shared_state,
+    const iree_hal_semaphore_list_t* semaphore_list) {
+  // Try to signal all semaphores, stopping if we encounter any issues.
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) {
+    iree_hal_sync_semaphore_t* semaphore =
+        iree_hal_sync_semaphore_cast(semaphore_list->semaphores[i]);
+    iree_slim_mutex_lock(&semaphore->mutex);
+    status = iree_hal_sync_semaphore_signal_unsafe(
+        semaphore, semaphore_list->payload_values[i]);
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    if (!iree_status_is_ok(status)) break;
+  }
+
+  // Notify all waiters that we've updated semaphores. They'll wake and check
+  // to see if they are satisfied.
+  // NOTE: we do this even if there was a failure as we may have signaled some
+  // of the list.
+  iree_notification_post(&shared_state->notification, IREE_ALL_WAITERS);
+
+  return status;
+}
+
+// Captured arguments for the single-semaphore wait condition: the semaphore
+// being waited on and the payload value it must reach.
+typedef struct iree_hal_sync_semaphore_notify_state_t {
+  iree_hal_sync_semaphore_t* semaphore;
+  uint64_t value;
+} iree_hal_sync_semaphore_notify_state_t;
+
+// Condition function for iree_notification_await: true once the semaphore in
+// |state| has either reached the target payload value or entered a failure
+// state (failure also wakes the waiter so it can report ABORTED).
+static bool iree_hal_sync_semaphore_is_signaled(
+    iree_hal_sync_semaphore_notify_state_t* state) {
+  iree_hal_sync_semaphore_t* semaphore = state->semaphore;
+  iree_slim_mutex_lock(&semaphore->mutex);
+  bool reached_value = semaphore->current_value >= state->value;
+  bool has_failed = !iree_status_is_ok(semaphore->failure_status);
+  iree_slim_mutex_unlock(&semaphore->mutex);
+  return reached_value || has_failed;
+}
+
+// Waits until |base_semaphore| reaches |value| or fails.
+// Returns ABORTED if the semaphore failed (callers query for the real status),
+// DEADLINE_EXCEEDED for an unsatisfied immediate poll or expired wait, and OK
+// once the target value has been reached.
+static iree_status_t iree_hal_sync_semaphore_wait(
+    iree_hal_semaphore_t* base_semaphore, uint64_t value,
+    iree_timeout_t timeout) {
+  iree_hal_sync_semaphore_t* semaphore =
+      iree_hal_sync_semaphore_cast(base_semaphore);
+
+  // Try to see if we can return immediately.
+  iree_slim_mutex_lock(&semaphore->mutex);
+  if (!iree_status_is_ok(semaphore->failure_status)) {
+    // Fastest path: failed; return an error to tell callers to query for it.
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    return iree_status_from_code(IREE_STATUS_ABORTED);
+  } else if (semaphore->current_value >= value) {
+    // Fast path: already satisfied.
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    return iree_ok_status();
+  } else if (iree_timeout_is_immediate(timeout)) {
+    // Not satisfied but a poll, so can avoid the expensive wait handle work.
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  }
+  iree_slim_mutex_unlock(&semaphore->mutex);
+
+  // TODO(#4680): we should be checking for DEADLINE_EXCEEDED here. This is
+  // easy when it's iree_timeout_is_infinite (we can just use the notification
+  // as below) but if it's an actual deadline we'll need to probably switch to
+  // iree_wait_handle_t.
+
+  // Perform wait on the global notification, passing |timeout| through.
+  // See the TODO above for the caveats on finite-deadline handling.
+  iree_hal_sync_semaphore_state_t* shared_state = semaphore->shared_state;
+  iree_hal_sync_semaphore_notify_state_t notify_state = {
+      .semaphore = semaphore,
+      .value = value,
+  };
+  iree_notification_await(
+      &shared_state->notification,
+      (iree_condition_fn_t)iree_hal_sync_semaphore_is_signaled,
+      (void*)&notify_state, timeout);
+
+  // Re-check state under the lock to derive the final result: the await may
+  // have returned due to satisfaction, failure, or deadline expiry.
+  iree_status_t status = iree_ok_status();
+  iree_slim_mutex_lock(&semaphore->mutex);
+  if (!iree_status_is_ok(semaphore->failure_status)) {
+    // Semaphore has failed.
+    status = iree_status_from_code(IREE_STATUS_ABORTED);
+  } else if (semaphore->current_value < value) {
+    // Deadline expired before the semaphore was signaled.
+    status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  }
+  iree_slim_mutex_unlock(&semaphore->mutex);
+  return status;
+}
+
+// Returns true once at least one semaphore in |semaphore_list| has reached its
+// paired payload value or entered a failure state.
+// Signature-compatible with iree_condition_fn_t (cast at the call site).
+static bool iree_hal_sync_semaphore_any_signaled(
+    const iree_hal_semaphore_list_t* semaphore_list) {
+  for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) {
+    iree_hal_sync_semaphore_t* semaphore =
+        iree_hal_sync_semaphore_cast(semaphore_list->semaphores[i]);
+    iree_slim_mutex_lock(&semaphore->mutex);
+    bool reached_value =
+        semaphore->current_value >= semaphore_list->payload_values[i];
+    bool has_failed = !iree_status_is_ok(semaphore->failure_status);
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    if (reached_value || has_failed) return true;
+  }
+  return false;
+}
+
+// Returns true once every semaphore in |semaphore_list| has reached its paired
+// payload value (a failed semaphore counts as signaled so waiters wake and can
+// report the failure).
+// Signature-compatible with iree_condition_fn_t (cast at the call site).
+static bool iree_hal_sync_semaphore_all_signaled(
+    const iree_hal_semaphore_list_t* semaphore_list) {
+  for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) {
+    iree_hal_sync_semaphore_t* semaphore =
+        iree_hal_sync_semaphore_cast(semaphore_list->semaphores[i]);
+    iree_slim_mutex_lock(&semaphore->mutex);
+    bool reached_value =
+        semaphore->current_value >= semaphore_list->payload_values[i];
+    bool has_failed = !iree_status_is_ok(semaphore->failure_status);
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    if (!(reached_value || has_failed)) return false;
+  }
+  return true;
+}
+
+// Returns a status derived from the |semaphore_list| at the current time:
+// - IREE_STATUS_OK: any or all semaphores signaled (based on |wait_mode|).
+// - IREE_STATUS_ABORTED: one or more semaphores failed.
+// - IREE_STATUS_DEADLINE_EXCEEDED: any or all semaphores unsignaled.
+static iree_status_t iree_hal_sync_semaphore_result_from_state(
+    iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list) {
+  bool any_signaled = false;
+  bool all_signaled = true;
+  bool any_failed = false;
+  // Classify each semaphore under its own lock; the aggregate is therefore a
+  // per-semaphore-consistent (not globally atomic) snapshot.
+  for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) {
+    iree_hal_sync_semaphore_t* semaphore =
+        iree_hal_sync_semaphore_cast(semaphore_list->semaphores[i]);
+    iree_slim_mutex_lock(&semaphore->mutex);
+    if (!iree_status_is_ok(semaphore->failure_status)) {
+      // Semaphore has failed.
+      any_failed = true;
+    } else if (semaphore->current_value < semaphore_list->payload_values[i]) {
+      // Deadline expired before the semaphore was signaled.
+      all_signaled = false;
+    } else {
+      // Signaled!
+      any_signaled = true;
+    }
+    iree_slim_mutex_unlock(&semaphore->mutex);
+  }
+  if (any_failed) {
+    // Always prioritize failure state.
+    return iree_status_from_code(IREE_STATUS_ABORTED);
+  }
+  switch (wait_mode) {
+    // Unknown modes conservatively take the wait-all semantics.
+    default:
+    case IREE_HAL_WAIT_MODE_ALL:
+      return all_signaled
+                 ? iree_ok_status()
+                 : iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+    case IREE_HAL_WAIT_MODE_ANY:
+      return any_signaled
+                 ? iree_ok_status()
+                 : iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  }
+}
+
+// Waits on one or more semaphores per |wait_mode| (any vs. all).
+// NOTE(review): the blocking path below awaits with iree_infinite_timeout();
+// a finite |timeout| is only honored by the immediate-poll fast path and the
+// single-semaphore fast path - confirm against TODO(#4680) before relying on
+// deadlines here.
+iree_status_t iree_hal_sync_semaphore_multi_wait(
+    iree_hal_sync_semaphore_state_t* shared_state,
+    iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
+  IREE_ASSERT_ARGUMENT(semaphore_list);
+  if (semaphore_list->count == 0) {
+    return iree_ok_status();
+  } else if (semaphore_list->count == 1) {
+    // Fast-path for a single semaphore.
+    return iree_hal_semaphore_wait(semaphore_list->semaphores[0],
+                                   semaphore_list->payload_values[0], timeout);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Fast-path for polling; we'll never wait and can just do a quick query.
+  if (iree_timeout_is_immediate(timeout)) {
+    iree_status_t status =
+        iree_hal_sync_semaphore_result_from_state(wait_mode, semaphore_list);
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  // Perform wait on the global notification.
+  iree_notification_await(
+      &shared_state->notification,
+      wait_mode == IREE_HAL_WAIT_MODE_ALL
+          ? (iree_condition_fn_t)iree_hal_sync_semaphore_all_signaled
+          : (iree_condition_fn_t)iree_hal_sync_semaphore_any_signaled,
+      (void*)semaphore_list, iree_infinite_timeout());
+
+  // We may have been successful - or may have a partial failure.
+  iree_status_t status =
+      iree_hal_sync_semaphore_result_from_state(wait_mode, semaphore_list);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// VTable mapping the public HAL semaphore interface onto this implementation.
+static const iree_hal_semaphore_vtable_t iree_hal_sync_semaphore_vtable = {
+    .destroy = iree_hal_sync_semaphore_destroy,
+    .query = iree_hal_sync_semaphore_query,
+    .signal = iree_hal_sync_semaphore_signal,
+    .fail = iree_hal_sync_semaphore_fail,
+    .wait = iree_hal_sync_semaphore_wait,
+};
diff --git a/runtime/src/iree/hal/local/sync_semaphore.h b/runtime/src/iree/hal/local/sync_semaphore.h
new file mode 100644
index 0000000..ecc6be6
--- /dev/null
+++ b/runtime/src/iree/hal/local/sync_semaphore.h
@@ -0,0 +1,74 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_SYNC_SEMAPHORE_H_
+#define IREE_HAL_LOCAL_SYNC_SEMAPHORE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_hal_sync_semaphore_state_t
+//===----------------------------------------------------------------------===//
+
+// State shared between all sync semaphores.
+// Owned by the device and guaranteed to remain valid for the lifetime of any
+// semaphore created from it.
+typedef struct iree_hal_sync_semaphore_state_t {
+  // In-process notification signaled when any semaphore value changes.
+  iree_notification_t notification;
+} iree_hal_sync_semaphore_state_t;
+
+// Initializes state used to perform semaphore synchronization.
+void iree_hal_sync_semaphore_state_initialize(
+    iree_hal_sync_semaphore_state_t* out_shared_state);
+
+// Deinitializes state used to perform semaphore synchronization; no semaphores
+// must be live with references.
+void iree_hal_sync_semaphore_state_deinitialize(
+    iree_hal_sync_semaphore_state_t* shared_state);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_sync_semaphore_t
+//===----------------------------------------------------------------------===//
+
+// Creates a semaphore that allows for ordering of operations on the local host.
+// Backed by a shared iree_notification_t in |shared_state|. Not efficient under
+// high contention or many simultaneous users but that's not what the
+// synchronous backend is intended for - if you want something efficient in the
+// face of hundreds or thousands of active asynchronous operations then use the
+// task system.
+iree_status_t iree_hal_sync_semaphore_create(
+    iree_hal_sync_semaphore_state_t* shared_state, uint64_t initial_value,
+    iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore);
+
+// Performs a signal of a list of semaphores.
+// The semaphores will transition to their new values (nearly) atomically and
+// batching up signals will reduce synchronization overhead.
+// Signaling stops at the first semaphore that rejects its new value; values
+// already signaled are not rolled back (see sync_semaphore.c).
+iree_status_t iree_hal_sync_semaphore_multi_signal(
+    iree_hal_sync_semaphore_state_t* shared_state,
+    const iree_hal_semaphore_list_t* semaphore_list);
+
+// Performs a multi-wait on one or more semaphores.
+// |wait_mode| selects whether all semaphores must reach their payload values
+// or any single one suffices.
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if the wait does not complete before
+// |timeout| elapses.
+iree_status_t iree_hal_sync_semaphore_multi_wait(
+    iree_hal_sync_semaphore_state_t* shared_state,
+    iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_SYNC_SEMAPHORE_H_
diff --git a/runtime/src/iree/hal/local/task_command_buffer.c b/runtime/src/iree/hal/local/task_command_buffer.c
new file mode 100644
index 0000000..0ed6533
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_command_buffer.c
@@ -0,0 +1,1023 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/task_command_buffer.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/local/executable_environment.h"
+#include "iree/hal/local/executable_library.h"
+#include "iree/hal/local/local_descriptor_set_layout.h"
+#include "iree/hal/local/local_executable.h"
+#include "iree/hal/local/local_executable_layout.h"
+#include "iree/hal/utils/resource_set.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/list.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_command_buffer_t
+//===----------------------------------------------------------------------===//
+
+// iree/task/-based command buffer.
+// We track a minimal amount of state here and incrementally build out the task
+// DAG that we can submit to the task system directly. There's no intermediate
+// data structures and we produce the iree_task_ts directly. In the steady state
+// all allocations are served from a shared per-device block pool with no
+// additional allocations required during recording or execution. That means our
+// command buffer here is essentially just a builder for the task system types
+// and manager of the lifetime of the tasks.
+typedef struct iree_hal_task_command_buffer_t {
+  // Base HAL command buffer; must remain the first member so the cast below
+  // can reinterpret the base pointer as this concrete type.
+  iree_hal_command_buffer_t base;
+  // Allocator the command buffer storage itself was allocated from.
+  iree_allocator_t host_allocator;
+
+  // Task scope (provided at creation) used when initializing recorded tasks
+  // such as barriers.
+  iree_task_scope_t* scope;
+
+  // Arena used for all allocations; references the shared device block pool.
+  iree_arena_allocator_t arena;
+
+  // Maintains a reference to all resources used within the command buffer.
+  // Reset on each begin.
+  iree_hal_resource_set_t* resource_set;
+
+  // One or more tasks at the root of the command buffer task DAG.
+  // These tasks are all able to execute concurrently and will be the initial
+  // ready task set in the submission.
+  iree_task_list_t root_tasks;
+
+  // One or more tasks at the leaves of the DAG.
+  // Only once all these tasks have completed execution will the command buffer
+  // be considered completed as a whole.
+  //
+  // An empty list indicates that root_tasks are also the leaves.
+  iree_task_list_t leaf_tasks;
+
+  // TODO(benvanik): move this out of the struct and allocate from the arena -
+  // we only need this during recording and it's ~4KB of waste otherwise.
+  // State tracked within the command buffer during recording only.
+  struct {
+    // The last global barrier that was inserted, if any.
+    // The barrier is allocated and inserted into the DAG when requested but the
+    // actual barrier dependency list is only allocated and set on flushes.
+    // This lets us allocate the appropriately sized barrier task list from the
+    // arena even though when the barrier is recorded we don't yet know what
+    // other tasks we'll be emitting as we walk the command stream.
+    iree_task_barrier_t* open_barrier;
+
+    // The number of tasks in the open barrier (|open_tasks|), used to quickly
+    // allocate storage for the task list without needing to walk the list.
+    iree_host_size_t open_task_count;
+
+    // All execution tasks emitted that must execute after |open_barrier|.
+    iree_task_list_t open_tasks;
+
+    // A flattened list of all available descriptor set bindings.
+    // As descriptor sets are pushed/bound the bindings will be updated to
+    // represent the fully-translated binding data pointer.
+    // TODO(benvanik): support proper mapping semantics and track the
+    // iree_hal_buffer_mapping_t and map/unmap where appropriate.
+    void* bindings[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT *
+                   IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT];
+    iree_device_size_t
+        binding_lengths[IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT *
+                        IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT];
+
+    // All available push constants updated each time push_constants is called.
+    // Reset only with the command buffer and otherwise will maintain its values
+    // during recording to allow for partial push_constants updates.
+    uint32_t push_constants[IREE_HAL_LOCAL_MAX_PUSH_CONSTANT_COUNT];
+  } state;
+} iree_hal_task_command_buffer_t;
+
+// Forward declared so the cast below can reference it; defined at file end.
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_task_command_buffer_vtable;
+
+// Casts the base HAL command buffer to the concrete task type, checking the
+// vtable.
+static iree_hal_task_command_buffer_t* iree_hal_task_command_buffer_cast(
+    iree_hal_command_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_task_command_buffer_vtable);
+  return (iree_hal_task_command_buffer_t*)base_value;
+}
+
+// Creates a one-shot task command buffer that records tasks into |scope|.
+// |block_pool| backs both the task arena and the resource set and must outlive
+// the command buffer. Only IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT is supported.
+iree_status_t iree_hal_task_command_buffer_create(
+    iree_hal_device_t* device, iree_task_scope_t* scope,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  *out_command_buffer = NULL;
+  if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) {
+    // If we want reuse we'd need to support duplicating the task DAG after
+    // recording or have some kind of copy-on-submit behavior that does so if
+    // a command buffer is submitted for execution twice. Allowing for the same
+    // command buffer to be enqueued multiple times would be fine so long as
+    // execution doesn't overlap (`cmdbuf|cmdbuf` vs
+    // `cmdbuf -> semaphore -> cmdbuf`) though we'd still need to be careful
+    // that we did the enqueuing and reset of the task structures at the right
+    // times. Definitely something that'll be useful in the future... but not
+    // today :)
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "only one-shot command buffer usage is supported");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_task_command_buffer_t* command_buffer = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(*command_buffer), (void**)&command_buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_command_buffer_initialize(
+        device, mode, command_categories, queue_affinity,
+        &iree_hal_task_command_buffer_vtable, &command_buffer->base);
+    command_buffer->host_allocator = host_allocator;
+    command_buffer->scope = scope;
+    iree_arena_initialize(block_pool, &command_buffer->arena);
+    iree_task_list_initialize(&command_buffer->root_tasks);
+    iree_task_list_initialize(&command_buffer->leaf_tasks);
+    memset(&command_buffer->state, 0, sizeof(command_buffer->state));
+    status = iree_hal_resource_set_allocate(block_pool,
+                                            &command_buffer->resource_set);
+  }
+  if (iree_status_is_ok(status)) {
+    *out_command_buffer = &command_buffer->base;
+  } else {
+    // NOTE(review): on malloc failure command_buffer is NULL here and (since
+    // base is the first member) &command_buffer->base is also NULL; this
+    // relies on release/destroy tolerating the partially-initialized or NULL
+    // case - confirm.
+    iree_hal_command_buffer_release(&command_buffer->base);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Discards all recorded tasks and clears recording state so the command
+// buffer can be re-recorded (begin) or torn down (destroy).
+static void iree_hal_task_command_buffer_reset(
+    iree_hal_task_command_buffer_t* command_buffer) {
+  memset(&command_buffer->state, 0, sizeof(command_buffer->state));
+  iree_task_list_discard(&command_buffer->leaf_tasks);
+  iree_task_list_discard(&command_buffer->root_tasks);
+  iree_hal_resource_set_reset(command_buffer->resource_set);
+  iree_arena_reset(&command_buffer->arena);
+}
+
+// Destroys |base_command_buffer|: discards recorded tasks, returns arena
+// blocks to the shared pool, and frees the command buffer storage.
+static void iree_hal_task_command_buffer_destroy(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+  // Copy out the allocator before the command buffer memory is freed.
+  iree_allocator_t host_allocator = command_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_task_command_buffer_reset(command_buffer);
+  iree_arena_deinitialize(&command_buffer->arena);
+  iree_hal_resource_set_free(command_buffer->resource_set);
+  iree_allocator_free(host_allocator, command_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns true if |command_buffer| is an iree/task/-backed command buffer.
+bool iree_hal_task_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer) {
+  return iree_hal_command_buffer_dyn_cast(command_buffer,
+                                          &iree_hal_task_command_buffer_vtable);
+}
+
+// dyn_cast hook: returns |command_buffer| when queried with our own vtable and
+// NULL for any other vtable so other implementations can be probed.
+static void* iree_hal_task_command_buffer_dyn_cast(
+    iree_hal_command_buffer_t* command_buffer, const void* vtable) {
+  if (vtable != &iree_hal_task_command_buffer_vtable) return NULL;
+  IREE_HAL_ASSERT_TYPE(command_buffer, vtable);
+  return command_buffer;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_command_buffer_t recording
+//===----------------------------------------------------------------------===//
+
+static iree_status_t iree_hal_task_command_buffer_flush_tasks(
+    iree_hal_task_command_buffer_t* command_buffer);
+
+// Begins (or restarts) recording by discarding any previously recorded tasks
+// and clearing all recording state.
+static iree_status_t iree_hal_task_command_buffer_begin(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+  iree_hal_task_command_buffer_reset(command_buffer);
+  return iree_ok_status();
+}
+
+// Ends recording: flushes tasks pending behind the most recent barrier and,
+// when nothing was recorded before them, promotes those tasks to DAG roots.
+static iree_status_t iree_hal_task_command_buffer_end(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+
+  // Flush any open barriers.
+  IREE_RETURN_IF_ERROR(
+      iree_hal_task_command_buffer_flush_tasks(command_buffer));
+
+  // Move the tasks from the leaf list (tail) to the root list (head) if this
+  // was the first set of tasks recorded.
+  if (iree_task_list_is_empty(&command_buffer->root_tasks) &&
+      !iree_task_list_is_empty(&command_buffer->leaf_tasks)) {
+    iree_task_list_move(&command_buffer->leaf_tasks,
+                        &command_buffer->root_tasks);
+  }
+
+  return iree_ok_status();
+}
+
+// Flushes all open tasks to the previous barrier and prepares for more
+// recording. The root tasks are also populated here when required as this is
+// the one place where we can see both halves of the most recent synchronization
+// event: those tasks recorded prior (if any) and the task that marks the set of
+// tasks that will be recorded after (if any).
+static iree_status_t iree_hal_task_command_buffer_flush_tasks(
+    iree_hal_task_command_buffer_t* command_buffer) {
+  iree_task_barrier_t* open_barrier = command_buffer->state.open_barrier;
+  if (open_barrier != NULL) {
+    // There is an open barrier we need to fixup the fork out to all of the open
+    // tasks that were recorded after it.
+    iree_task_t* task_head =
+        iree_task_list_front(&command_buffer->state.open_tasks);
+    iree_host_size_t dependent_task_count =
+        command_buffer->state.open_task_count;
+    if (dependent_task_count == 1) {
+      // Special-case: only one open task so we can avoid the additional barrier
+      // overhead by reusing the completion task.
+      iree_task_set_completion_task(&open_barrier->header, task_head);
+    } else if (dependent_task_count > 1) {
+      // Allocate the list of tasks we'll stash back on the previous barrier.
+      // Since we couldn't know at the time how many tasks would end up in the
+      // barrier we had to defer it until now.
+      iree_task_t** dependent_tasks = NULL;
+      IREE_RETURN_IF_ERROR(iree_arena_allocate(
+          &command_buffer->arena, dependent_task_count * sizeof(iree_task_t*),
+          (void**)&dependent_tasks));
+      // Tasks are chained via next_task; flatten them into the array the
+      // barrier expects.
+      iree_task_t* task = task_head;
+      for (iree_host_size_t i = 0; i < dependent_task_count; ++i) {
+        dependent_tasks[i] = task;
+        task = task->next_task;
+      }
+      iree_task_barrier_set_dependent_tasks(open_barrier, dependent_task_count,
+                                            dependent_tasks);
+    }
+  }
+  command_buffer->state.open_barrier = NULL;
+
+  // Move the open tasks to the tail as they represent the first half of the
+  // *next* barrier that will be inserted.
+  if (command_buffer->state.open_task_count > 0) {
+    iree_task_list_move(&command_buffer->state.open_tasks,
+                        &command_buffer->leaf_tasks);
+    command_buffer->state.open_task_count = 0;
+  }
+
+  return iree_ok_status();
+}
+
+// Emits a global barrier, splitting execution into all prior recorded tasks
+// and all subsequent recorded tasks. This is currently the critical piece that
+// limits our concurrency: changing to fine-grained barriers (via barrier
+// buffers or events) will allow more work to overlap at the cost of more brain
+// to build out the proper task graph.
+static iree_status_t iree_hal_task_command_buffer_emit_global_barrier(
+    iree_hal_task_command_buffer_t* command_buffer) {
+  // Flush open tasks to the previous barrier. This resets our state such that
+  // we can assign the new open barrier and start recording tasks for it.
+  // Previous tasks will be moved into the leaf_tasks list.
+  IREE_RETURN_IF_ERROR(
+      iree_hal_task_command_buffer_flush_tasks(command_buffer));
+
+  // Allocate the new open barrier.
+  // As we are recording forward we can't yet assign the dependent tasks (the
+  // second half of the synchronization domain) and instead are just inserting
+  // it so we can setup the join from previous tasks (the first half of the
+  // synchronization domain).
+  iree_task_barrier_t* barrier = NULL;
+  IREE_RETURN_IF_ERROR(iree_arena_allocate(&command_buffer->arena,
+                                           sizeof(*barrier), (void**)&barrier));
+  iree_task_barrier_initialize_empty(command_buffer->scope, barrier);
+
+  // If there were previous tasks then join them to the barrier.
+  // Each existing leaf task gets the barrier as its completion task so the
+  // barrier only fires once all of them have retired.
+  for (iree_task_t* task = iree_task_list_front(&command_buffer->leaf_tasks);
+       task != NULL; task = task->next_task) {
+    iree_task_set_completion_task(task, &barrier->header);
+  }
+
+  // Move the tasks from the leaf list (tail) to the root list (head) if this
+  // was the first set of tasks recorded.
+  if (iree_task_list_is_empty(&command_buffer->root_tasks) &&
+      !iree_task_list_is_empty(&command_buffer->leaf_tasks)) {
+    iree_task_list_move(&command_buffer->leaf_tasks,
+                        &command_buffer->root_tasks);
+  }
+
+  // Reset the tail of the command buffer to the barrier. This leaves us in a
+  // consistent state if the recording ends immediate after this (the barrier
+  // will be the last task).
+  iree_task_list_initialize(&command_buffer->leaf_tasks);
+  iree_task_list_push_back(&command_buffer->leaf_tasks, &barrier->header);
+
+  // NOTE: all new tasks emitted will be executed after this barrier.
+  command_buffer->state.open_barrier = barrier;
+  command_buffer->state.open_task_count = 0;
+
+  return iree_ok_status();
+}
+
+// Emits a the given execution |task| into the current open synchronization
+// scope (after state.open_barrier and before the next barrier).
+static iree_status_t iree_hal_task_command_buffer_emit_execution_task(
+    iree_hal_task_command_buffer_t* command_buffer, iree_task_t* task) {
+  if (command_buffer->state.open_barrier == NULL) {
+    // If there is no open barrier then we are at the head and going right into
+    // the task DAG.
+    iree_task_list_push_back(&command_buffer->leaf_tasks, task);
+  } else {
+    // Append to the open task list that will be flushed to the open barrier.
+    // The count is tracked separately so flush_tasks can size the barrier's
+    // dependent-task array without re-walking the list.
+    iree_task_list_push_back(&command_buffer->state.open_tasks, task);
+    ++command_buffer->state.open_task_count;
+  }
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_command_buffer_t execution
+//===----------------------------------------------------------------------===//
+
+// Issues the recorded DAG: chains |retire_task| after the final tasks and
+// hands the root tasks off to |pending_submission| for execution.
+iree_status_t iree_hal_task_command_buffer_issue(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_task_queue_state_t* queue_state, iree_task_t* retire_task,
+    iree_arena_allocator_t* arena, iree_task_submission_t* pending_submission) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_command_buffer_dyn_cast(base_command_buffer,
+                                       &iree_hal_task_command_buffer_vtable);
+  IREE_ASSERT_TRUE(command_buffer);
+
+  // If the command buffer is empty (valid!) then we are a no-op.
+  bool has_root_tasks = !iree_task_list_is_empty(&command_buffer->root_tasks);
+  if (!has_root_tasks) {
+    return iree_ok_status();
+  }
+
+  bool has_leaf_tasks = !iree_task_list_is_empty(&command_buffer->leaf_tasks);
+  if (has_leaf_tasks) {
+    // Chain the retire task onto the leaf tasks as their completion indicates
+    // that all commands have completed.
+    for (iree_task_t* task = command_buffer->leaf_tasks.head; task != NULL;
+         task = task->next_task) {
+      iree_task_set_completion_task(task, retire_task);
+    }
+  } else {
+    // If we have no leaf tasks it means that this is a single layer DAG and
+    // after the root tasks complete the entire command buffer has completed.
+    for (iree_task_t* task = command_buffer->root_tasks.head; task != NULL;
+         task = task->next_task) {
+      iree_task_set_completion_task(task, retire_task);
+    }
+  }
+
+  // Enqueue all root tasks that are ready to run immediately.
+  // After this all of the command buffer tasks are owned by the submission and
+  // we need to ensure the command buffer doesn't try to discard them.
+  iree_task_submission_enqueue_list(pending_submission,
+                                    &command_buffer->root_tasks);
+  // Reset the leaf list so a later teardown of the command buffer does not
+  // touch tasks now owned by the submission.
+  iree_task_list_initialize(&command_buffer->leaf_tasks);
+
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_command_buffer_t debug utilities
+//===----------------------------------------------------------------------===//
+
+// No-op today: debug groups are accepted but not recorded.
+static void iree_hal_task_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  // TODO(benvanik): tracy event stack.
+}
+
+// No-op today: matching end for begin_debug_group above.
+static void iree_hal_task_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  // TODO(benvanik): tracy event stack.
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_execution_barrier
+//===----------------------------------------------------------------------===//
+
+// All stage masks and memory/buffer barrier lists are currently ignored; the
+// barrier is conservatively implemented as a full global barrier.
+static iree_status_t iree_hal_task_command_buffer_execution_barrier(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+
+  // TODO(benvanik): actual DAG construction. Right now we are just doing simple
+  // global barriers each time and forcing a join-fork point.
+  return iree_hal_task_command_buffer_emit_global_barrier(command_buffer);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_signal_event
+//===----------------------------------------------------------------------===//
+
+// No-op placeholder: events are unimplemented and wait_events emits a global
+// barrier instead, so signaling has nothing to record.
+static iree_status_t iree_hal_task_command_buffer_signal_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // TODO(#4518): implement events. For now we just insert global barriers.
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_reset_event
+//===----------------------------------------------------------------------===//
+
+// No-op placeholder; see signal_event above.
+static iree_status_t iree_hal_task_command_buffer_reset_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // TODO(#4518): implement events. For now we just insert global barriers.
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_wait_events
+//===----------------------------------------------------------------------===//
+
+// Events and all barrier parameters are ignored; a conservative global barrier
+// stands in for the requested fine-grained wait.
+static iree_status_t iree_hal_task_command_buffer_wait_events(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_host_size_t event_count, const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+  // TODO(#4518): implement events. For now we just insert global barriers.
+  return iree_hal_task_command_buffer_emit_global_barrier(command_buffer);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_discard_buffer
+//===----------------------------------------------------------------------===//
+
+// Discard is a hint only; nothing to do for host-local task execution.
+static iree_status_t iree_hal_task_command_buffer_discard_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_fill_buffer
+//===----------------------------------------------------------------------===//
+// NOTE: for large copies we dispatch this as tiles for parallelism.
+// We'd want to do some measurement for when it's worth it; filling a 200KB
+// buffer: maybe not, filling a 200MB buffer: yeah. For now we just do
+// arbitrarily sized chunks.
+
+// TODO(benvanik): make this a configurable setting. Must be aligned to pattern
+// length so pick a power of two.
+#define IREE_HAL_CMD_FILL_SLICE_LENGTH (128 * 1024)
+
+// Arena-allocated command record for a fill; referenced by each tile closure.
+typedef struct iree_hal_cmd_fill_buffer_t {
+  iree_task_dispatch_t task;
+  iree_hal_buffer_t* target_buffer;
+  iree_device_size_t target_offset;
+  iree_device_size_t length;
+  uint32_t pattern_length;
+  // Inline copy of the fill pattern (pattern_length <= 8 bytes used).
+  uint8_t pattern[8];
+} iree_hal_cmd_fill_buffer_t;
+
+// Tile worker: fills one slice of the target buffer. The per-slice length is
+// carried in workgroup_size[0] and the slice index in workgroup_xyz[0].
+static iree_status_t iree_hal_cmd_fill_tile(
+    void* user_context, const iree_task_tile_context_t* tile_context,
+    iree_task_submission_t* pending_submission) {
+  const iree_hal_cmd_fill_buffer_t* cmd =
+      (const iree_hal_cmd_fill_buffer_t*)user_context;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  uint32_t length_per_slice = tile_context->workgroup_size[0];
+  iree_device_size_t slice_offset =
+      tile_context->workgroup_xyz[0] * length_per_slice;
+  // Clamp the final slice so it never runs past the requested fill length.
+  iree_device_size_t remaining_length = cmd->length - slice_offset;
+  iree_device_size_t slice_length =
+      iree_min(length_per_slice, remaining_length);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)slice_length);
+
+  iree_status_t status = iree_hal_buffer_map_fill(
+      cmd->target_buffer, cmd->target_offset + slice_offset, slice_length,
+      cmd->pattern, cmd->pattern_length);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a parallel fill as a tiled dispatch of iree_hal_cmd_fill_tile.
+static iree_status_t iree_hal_task_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+
+  // Retain the target buffer for the lifetime of the command buffer.
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &target_buffer));
+
+  iree_hal_cmd_fill_buffer_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_arena_allocate(&command_buffer->arena, sizeof(*cmd), (void**)&cmd));
+
+  const uint32_t workgroup_size[3] = {
+      /*x=*/IREE_HAL_CMD_FILL_SLICE_LENGTH,
+      /*y=*/1,
+      /*z=*/1,
+  };
+  // NOTE(review): `length / size + 1` yields one extra (empty) tile whenever
+  // length is an exact multiple of the slice length; the tile-side clamp makes
+  // that harmless but a ceil-div would avoid the wasted task — confirm intent.
+  const uint32_t workgroup_count[3] = {
+      /*x=*/length / workgroup_size[0] + 1,
+      /*y=*/1,
+      /*z=*/1,
+  };
+  iree_task_dispatch_initialize(
+      command_buffer->scope,
+      iree_task_make_dispatch_closure(iree_hal_cmd_fill_tile, (void*)cmd),
+      workgroup_size, workgroup_count, &cmd->task);
+  cmd->target_buffer = target_buffer;
+  cmd->target_offset = target_offset;
+  cmd->length = length;
+  memcpy(cmd->pattern, pattern, pattern_length);
+  cmd->pattern_length = pattern_length;
+
+  return iree_hal_task_command_buffer_emit_execution_task(command_buffer,
+                                                          &cmd->task.header);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_update_buffer
+//===----------------------------------------------------------------------===//
+
+// Arena-allocated command record for an update; the source bytes are copied
+// inline into the trailing flexible array member at record time.
+typedef struct iree_hal_cmd_update_buffer_t {
+  iree_task_call_t task;
+  iree_hal_buffer_t* target_buffer;
+  iree_device_size_t target_offset;
+  iree_device_size_t length;
+  // Flexible array member holding |length| bytes of source data.
+  uint8_t source_buffer[];
+} iree_hal_cmd_update_buffer_t;
+
+// Call worker: writes the inlined source bytes into the target buffer.
+static iree_status_t iree_hal_cmd_update_buffer(
+    void* user_context, iree_task_t* task,
+    iree_task_submission_t* pending_submission) {
+  const iree_hal_cmd_update_buffer_t* cmd =
+      (const iree_hal_cmd_update_buffer_t*)user_context;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = iree_hal_buffer_map_write(
+      cmd->target_buffer, cmd->target_offset, cmd->source_buffer, cmd->length);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records an update by snapshotting the source bytes into the arena so the
+// caller's memory need not remain valid after recording.
+static iree_status_t iree_hal_task_command_buffer_update_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &target_buffer));
+
+  // Struct plus trailing flexible array of |length| bytes.
+  iree_host_size_t total_cmd_size =
+      sizeof(iree_hal_cmd_update_buffer_t) + length;
+
+  iree_hal_cmd_update_buffer_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_arena_allocate(&command_buffer->arena,
+                                           total_cmd_size, (void**)&cmd));
+
+  iree_task_call_initialize(
+      command_buffer->scope,
+      iree_task_make_call_closure(iree_hal_cmd_update_buffer, (void*)cmd),
+      &cmd->task);
+  cmd->target_buffer = target_buffer;
+  cmd->target_offset = target_offset;
+  cmd->length = length;
+
+  memcpy(cmd->source_buffer, (const uint8_t*)source_buffer + source_offset,
+         cmd->length);
+
+  return iree_hal_task_command_buffer_emit_execution_task(command_buffer,
+                                                          &cmd->task.header);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_copy_buffer
+//===----------------------------------------------------------------------===//
+// NOTE: for large copies we dispatch this as tiles for parallelism.
+// We'd want to do some measurement for when it's worth it; copying a 200KB
+// buffer: maybe not, copying a 200MB buffer: yeah. For now we just do
+// arbitrarily sized chunks.
+
+// TODO(benvanik): make this a configurable setting. Must be aligned to pattern
+// length so pick a power of two.
+#define IREE_HAL_CMD_COPY_SLICE_LENGTH (128 * 1024)
+
+// Arena-allocated command record for a copy; referenced by each tile closure.
+typedef struct iree_hal_cmd_copy_buffer_t {
+  iree_task_dispatch_t task;
+  iree_hal_buffer_t* source_buffer;
+  iree_device_size_t source_offset;
+  iree_hal_buffer_t* target_buffer;
+  iree_device_size_t target_offset;
+  iree_device_size_t length;
+} iree_hal_cmd_copy_buffer_t;
+
+// Tile worker: copies one slice. Per-slice length is carried in
+// workgroup_size[0] and the slice index in workgroup_xyz[0].
+static iree_status_t iree_hal_cmd_copy_tile(
+    void* user_context, const iree_task_tile_context_t* tile_context,
+    iree_task_submission_t* pending_submission) {
+  const iree_hal_cmd_copy_buffer_t* cmd =
+      (const iree_hal_cmd_copy_buffer_t*)user_context;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  uint32_t length_per_slice = tile_context->workgroup_size[0];
+  iree_device_size_t slice_offset =
+      tile_context->workgroup_xyz[0] * length_per_slice;
+  // Clamp the final slice so it never runs past the requested copy length.
+  iree_device_size_t remaining_length = cmd->length - slice_offset;
+  iree_device_size_t slice_length =
+      iree_min(length_per_slice, remaining_length);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)slice_length);
+
+  iree_status_t status = iree_hal_buffer_map_copy(
+      cmd->source_buffer, cmd->source_offset + slice_offset, cmd->target_buffer,
+      cmd->target_offset + slice_offset, slice_length);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Records a parallel copy as a tiled dispatch of iree_hal_cmd_copy_tile.
+static iree_status_t iree_hal_task_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+
+  // Retain both buffers for the lifetime of the command buffer.
+  const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
+
+  iree_hal_cmd_copy_buffer_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_arena_allocate(&command_buffer->arena, sizeof(*cmd), (void**)&cmd));
+
+  const uint32_t workgroup_size[3] = {
+      /*x=*/IREE_HAL_CMD_COPY_SLICE_LENGTH,
+      /*y=*/1,
+      /*z=*/1,
+  };
+  // NOTE(review): as with fill_buffer, `length / size + 1` produces an extra
+  // empty tile when length is an exact multiple; tile-side clamping makes it
+  // harmless but a ceil-div would avoid the wasted task — confirm intent.
+  const uint32_t workgroup_count[3] = {
+      /*x=*/length / workgroup_size[0] + 1,
+      /*y=*/1,
+      /*z=*/1,
+  };
+  iree_task_dispatch_initialize(
+      command_buffer->scope,
+      iree_task_make_dispatch_closure(iree_hal_cmd_copy_tile, (void*)cmd),
+      workgroup_size, workgroup_count, &cmd->task);
+  cmd->source_buffer = source_buffer;
+  cmd->source_offset = source_offset;
+  cmd->target_buffer = target_buffer;
+  cmd->target_offset = target_offset;
+  cmd->length = length;
+
+  return iree_hal_task_command_buffer_emit_execution_task(command_buffer,
+                                                          &cmd->task.header);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_push_constants
+//===----------------------------------------------------------------------===//
+// NOTE: command buffer state change only; enqueues no tasks.
+
+// Copies |values| into the recording-time push constant shadow; consumed later
+// by dispatch when it snapshots the constants into the command record.
+static iree_status_t iree_hal_task_command_buffer_push_constants(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+
+  // NOTE(review): `>=` rejects a write that ends exactly at capacity
+  // (offset + values_length == sizeof(push_constants)); confirm whether a
+  // full-range update should be allowed (`>` would permit it).
+  if (IREE_UNLIKELY(offset + values_length >=
+                    sizeof(command_buffer->state.push_constants))) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "push constant range %zu (length=%zu) out of range",
+                            offset, values_length);
+  }
+
+  memcpy((uint8_t*)&command_buffer->state.push_constants + offset, values,
+         values_length);
+
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_push_descriptor_set
+//===----------------------------------------------------------------------===//
+// NOTE: command buffer state change only; enqueues no tasks.
+
+// Resolves each binding's buffer to a host pointer and stashes pointer and
+// length into the recording-time binding table at |set|'s slot range.
+static iree_status_t iree_hal_task_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+
+  if (IREE_UNLIKELY(set >= IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "set %u out of bounds", set);
+  }
+
+  // Sets occupy contiguous slot ranges in the flat binding table.
+  iree_host_size_t binding_base =
+      set * IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT;
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    if (IREE_UNLIKELY(bindings[i].binding >=
+                      IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT)) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "buffer binding index out of bounds");
+    }
+    iree_host_size_t binding_ordinal = binding_base + bindings[i].binding;
+
+    // TODO(benvanik): batch insert by getting the resources in their own list.
+    IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+        command_buffer->resource_set, 1, &bindings[i].buffer));
+
+    // TODO(benvanik): track mapping so we can properly map/unmap/flush/etc.
+    iree_hal_buffer_mapping_t buffer_mapping = {{0}};
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+        bindings[i].buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
+        IREE_HAL_MEMORY_ACCESS_ANY, bindings[i].offset, bindings[i].length,
+        &buffer_mapping));
+    command_buffer->state.bindings[binding_ordinal] =
+        buffer_mapping.contents.data;
+    command_buffer->state.binding_lengths[binding_ordinal] =
+        buffer_mapping.contents.data_length;
+  }
+
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_bind_descriptor_set
+//===----------------------------------------------------------------------===//
+// NOTE: command buffer state change only; enqueues no tasks.
+
+// Unimplemented: only push_descriptor_set is supported by this backend today.
+static iree_status_t iree_hal_task_command_buffer_bind_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "descriptor set binding not yet implemented");
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_dispatch
+//===----------------------------------------------------------------------===//
+
+// Arena-allocated command record for a dispatch; trailing tables hold the
+// snapshotted push constants and the dense binding pointers/lengths.
+typedef struct iree_hal_cmd_dispatch_t {
+  iree_task_dispatch_t task;
+  iree_hal_local_executable_t* executable;
+  int32_t ordinal;
+
+  // Total number of available 4 byte push constant values in |push_constants|.
+  uint16_t push_constant_count;
+
+  // Total number of binding base pointers in |binding_ptrs| and
+  // |binding_lengths|. The set is packed densely based on which binidngs are
+  // used (known at compile-time).
+  uint16_t binding_count;
+
+  // Following this structure in memory there are 3 tables:
+  // - const uint32_t push_constants[push_constant_count];
+  // - void* binding_ptrs[binding_count];
+  // - const size_t binding_lengths[binding_count];
+} iree_hal_cmd_dispatch_t;
+
+// Tile worker: reconstructs the dispatch/workgroup state from the command
+// record and trailing tables, then invokes the executable entry point once
+// for the workgroup identified by tile_context->workgroup_xyz.
+static iree_status_t iree_hal_cmd_dispatch_tile(
+    void* user_context, const iree_task_tile_context_t* tile_context,
+    iree_task_submission_t* pending_submission) {
+  const iree_hal_cmd_dispatch_t* cmd =
+      (const iree_hal_cmd_dispatch_t*)user_context;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // We could share this across all workgroups in a dispatch and reduce cache
+  // pressure as all cores would be hitting the same hot read-only cache line.
+  // It'd grow the size of iree_hal_cmd_dispatch_t by a few dozen bytes, though,
+  // and so we'd need some profiling to see if it's worth it (fixed command
+  // buffer cost vs potential for saving a cache miss or two).
+  iree_alignas(64) iree_hal_executable_dispatch_state_v0_t dispatch_state = {
+      .workgroup_size_x = tile_context->workgroup_size[0],
+      .workgroup_size_y = tile_context->workgroup_size[1],
+      .workgroup_size_z = tile_context->workgroup_size[2],
+      .push_constant_count = cmd->push_constant_count,
+      .workgroup_count_x = tile_context->workgroup_count[0],
+      .workgroup_count_y = tile_context->workgroup_count[1],
+      .workgroup_count_z = tile_context->workgroup_count[2],
+      .max_concurrency =
+          iree_task_affinity_set_count_ones(cmd->task.header.affinity_set),
+      .binding_count = cmd->binding_count,
+  };
+  // Walk the three tables appended after the struct in the same order they
+  // were laid out at record time (constants, ptrs, lengths).
+  uint8_t* cmd_ptr = (uint8_t*)cmd + sizeof(*cmd);
+  dispatch_state.push_constants = (uint32_t*)cmd_ptr;
+  cmd_ptr += cmd->push_constant_count * sizeof(*dispatch_state.push_constants);
+  dispatch_state.binding_ptrs = (void**)cmd_ptr;
+  cmd_ptr += cmd->binding_count * sizeof(*dispatch_state.binding_ptrs);
+  dispatch_state.binding_lengths = (size_t*)cmd_ptr;
+  cmd_ptr += cmd->binding_count * sizeof(*dispatch_state.binding_lengths);
+
+  const iree_alignas(64)
+      iree_hal_executable_workgroup_state_v0_t workgroup_state = {
+          .workgroup_id_x = tile_context->workgroup_xyz[0],
+          .workgroup_id_y = tile_context->workgroup_xyz[1],
+          .workgroup_id_z = tile_context->workgroup_xyz[2],
+          .reserved = 0,
+          .processor_id = tile_context->processor_id,
+          .local_memory = tile_context->local_memory.data,
+          .local_memory_size = (size_t)tile_context->local_memory.data_length,
+      };
+  iree_status_t status = iree_hal_local_executable_issue_call(
+      cmd->executable, cmd->ordinal, &dispatch_state, &workgroup_state);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Builds and emits a dispatch command record: snapshots the current push
+// constants and densely-packed binding pointers/lengths into arena storage so
+// later state changes do not affect this dispatch. On success |out_cmd|
+// receives the record (used by dispatch_indirect to patch the count).
+static iree_status_t iree_hal_task_command_buffer_build_dispatch(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z,
+    iree_hal_cmd_dispatch_t** out_cmd) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+
+  iree_hal_local_executable_t* local_executable =
+      iree_hal_local_executable_cast(executable);
+  iree_hal_local_executable_layout_t* local_layout =
+      local_executable->executable_layouts[entry_point];
+  iree_host_size_t push_constant_count = local_layout->push_constants;
+  iree_hal_local_binding_mask_t used_binding_mask = local_layout->used_bindings;
+  iree_host_size_t used_binding_count =
+      iree_math_count_ones_u64(used_binding_mask);
+
+  // To save a few command buffer bytes we narrow these:
+  if (IREE_UNLIKELY(push_constant_count >= UINT16_MAX) ||
+      IREE_UNLIKELY(used_binding_count >= UINT16_MAX)) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "too many bindings/push constants");
+  }
+
+  // Single allocation: struct followed by the three trailing tables that
+  // iree_hal_cmd_dispatch_tile walks in this same order.
+  // NOTE(review): the lengths table is sized with sizeof(iree_device_size_t)
+  // here but read back as size_t* in the tile; presumably these types match
+  // on supported targets — confirm.
+  iree_hal_cmd_dispatch_t* cmd = NULL;
+  iree_host_size_t total_cmd_size =
+      sizeof(*cmd) + push_constant_count * sizeof(uint32_t) +
+      used_binding_count * sizeof(void*) +
+      used_binding_count * sizeof(iree_device_size_t);
+  IREE_RETURN_IF_ERROR(iree_arena_allocate(&command_buffer->arena,
+                                           total_cmd_size, (void**)&cmd));
+
+  cmd->executable = local_executable;
+  cmd->ordinal = entry_point;
+  cmd->push_constant_count = push_constant_count;
+  cmd->binding_count = used_binding_count;
+
+  const uint32_t workgroup_count[3] = {workgroup_x, workgroup_y, workgroup_z};
+  // TODO(benvanik): expose on API or keep fixed on executable.
+  const uint32_t workgroup_size[3] = {1, 1, 1};
+  iree_task_dispatch_initialize(
+      command_buffer->scope,
+      iree_task_make_dispatch_closure(iree_hal_cmd_dispatch_tile, (void*)cmd),
+      workgroup_size, workgroup_count, &cmd->task);
+
+  // Tell the task system how much workgroup local memory is required for the
+  // dispatch; each invocation of the entry point will have at least as much
+  // scratch memory available during execution.
+  cmd->task.local_memory_size =
+      local_executable->dispatch_attrs
+          ? local_executable->dispatch_attrs[entry_point].local_memory_pages *
+                IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE
+          : 0;
+
+  // Copy only the push constant range used by the executable.
+  uint8_t* cmd_ptr = (uint8_t*)cmd + sizeof(*cmd);
+  uint32_t* push_constants = (uint32_t*)cmd_ptr;
+  memcpy(push_constants, command_buffer->state.push_constants,
+         push_constant_count * sizeof(*push_constants));
+  cmd_ptr += push_constant_count * sizeof(*push_constants);
+
+  // Produce the dense binding list based on the declared bindings used.
+  // This allows us to change the descriptor sets and bindings counts supported
+  // in the HAL independent of any executable as each executable just gets the
+  // flat dense list and doesn't care about our descriptor set stuff.
+  //
+  // Note that we are just directly setting the binding data pointers here with
+  // no ownership/retaining/etc - it's part of the HAL contract that buffers are
+  // kept valid for the duration they may be in use.
+  void** binding_ptrs = (void**)cmd_ptr;
+  cmd_ptr += used_binding_count * sizeof(*binding_ptrs);
+  size_t* binding_lengths = (size_t*)cmd_ptr;
+  cmd_ptr += used_binding_count * sizeof(*binding_lengths);
+  iree_host_size_t binding_base = 0;
+  for (iree_host_size_t i = 0; i < used_binding_count; ++i) {
+    // Find the next set bit in the mask; its position (relative to the bits
+    // already consumed) is the flat binding ordinal for dense slot i.
+    int mask_offset = iree_math_count_trailing_zeros_u64(used_binding_mask);
+    int binding_ordinal = binding_base + mask_offset;
+    binding_base += mask_offset + 1;
+    used_binding_mask = iree_shr(used_binding_mask, mask_offset + 1);
+    binding_ptrs[i] = command_buffer->state.bindings[binding_ordinal];
+    binding_lengths[i] = command_buffer->state.binding_lengths[binding_ordinal];
+    if (!binding_ptrs[i]) {
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "(flat) binding %d is NULL", binding_ordinal);
+    }
+  }
+
+  *out_cmd = cmd;
+  return iree_hal_task_command_buffer_emit_execution_task(command_buffer,
+                                                          &cmd->task.header);
+}
+
+// Direct dispatch: retains the executable then delegates record construction
+// to build_dispatch with the fixed workgroup counts.
+static iree_status_t iree_hal_task_command_buffer_dispatch(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &executable));
+  iree_hal_cmd_dispatch_t* cmd = NULL;
+  return iree_hal_task_command_buffer_build_dispatch(
+      base_command_buffer, executable, entry_point, workgroup_x, workgroup_y,
+      workgroup_z, &cmd);
+}
+
+// Indirect dispatch: builds a normal dispatch record with placeholder counts
+// (0,0,0) and then repoints the task's workgroup count at the mapped
+// [x,y,z] uint32 triple in |workgroups_buffer|, read at execution time.
+static iree_status_t iree_hal_task_command_buffer_dispatch_indirect(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer,
+    iree_device_size_t workgroups_offset) {
+  iree_hal_task_command_buffer_t* command_buffer =
+      iree_hal_task_command_buffer_cast(base_command_buffer);
+
+  const void* resources[2] = {executable, workgroups_buffer};
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 2, resources));
+
+  // TODO(benvanik): track mapping so we can properly map/unmap/flush/etc.
+  iree_hal_buffer_mapping_t buffer_mapping = {{0}};
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+      workgroups_buffer, IREE_HAL_MAPPING_MODE_PERSISTENT,
+      IREE_HAL_MEMORY_ACCESS_READ, workgroups_offset, 3 * sizeof(uint32_t),
+      &buffer_mapping));
+
+  iree_hal_cmd_dispatch_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_task_command_buffer_build_dispatch(
+      base_command_buffer, executable, entry_point, 0, 0, 0, &cmd));
+  // Flagging the task as indirect makes the executor read the count through
+  // the pointer instead of the inline values.
+  cmd->task.workgroup_count.ptr = (const uint32_t*)buffer_mapping.contents.data;
+  cmd->task.header.flags |= IREE_TASK_FLAG_DISPATCH_INDIRECT;
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_vtable_t
+//===----------------------------------------------------------------------===//
+
+// HAL command buffer vtable binding this backend's implementations; used by
+// dyn_cast in iree_hal_task_command_buffer_issue to verify the type.
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_task_command_buffer_vtable = {
+        .destroy = iree_hal_task_command_buffer_destroy,
+        .dyn_cast = iree_hal_task_command_buffer_dyn_cast,
+        .begin = iree_hal_task_command_buffer_begin,
+        .end = iree_hal_task_command_buffer_end,
+        .begin_debug_group = iree_hal_task_command_buffer_begin_debug_group,
+        .end_debug_group = iree_hal_task_command_buffer_end_debug_group,
+        .execution_barrier = iree_hal_task_command_buffer_execution_barrier,
+        .signal_event = iree_hal_task_command_buffer_signal_event,
+        .reset_event = iree_hal_task_command_buffer_reset_event,
+        .wait_events = iree_hal_task_command_buffer_wait_events,
+        .discard_buffer = iree_hal_task_command_buffer_discard_buffer,
+        .fill_buffer = iree_hal_task_command_buffer_fill_buffer,
+        .update_buffer = iree_hal_task_command_buffer_update_buffer,
+        .copy_buffer = iree_hal_task_command_buffer_copy_buffer,
+        .push_constants = iree_hal_task_command_buffer_push_constants,
+        .push_descriptor_set = iree_hal_task_command_buffer_push_descriptor_set,
+        .bind_descriptor_set = iree_hal_task_command_buffer_bind_descriptor_set,
+        .dispatch = iree_hal_task_command_buffer_dispatch,
+        .dispatch_indirect = iree_hal_task_command_buffer_dispatch_indirect,
+};
diff --git a/runtime/src/iree/hal/local/task_command_buffer.h b/runtime/src/iree/hal/local/task_command_buffer.h
new file mode 100644
index 0000000..5e18fbd
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_command_buffer.h
@@ -0,0 +1,58 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_TASK_COMMAND_BUFFER_H_
+#define IREE_HAL_LOCAL_TASK_COMMAND_BUFFER_H_
+
+#include "iree/base/api.h"
+#include "iree/base/internal/arena.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/task_queue_state.h"
+#include "iree/task/scope.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a command buffer that records commands as iree/task/ tasks scheduled
+// within |scope|. Task storage (and, presumably, inlined update data — confirm
+// against the implementation) is acquired from |block_pool|, which must
+// outlive the command buffer.
+iree_status_t iree_hal_task_command_buffer_create(
+    iree_hal_device_t* device, iree_task_scope_t* scope,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_command_buffer_t** out_command_buffer);
+
+// Returns true if |command_buffer| is a task system command buffer.
+bool iree_hal_task_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer);
+
+// Issues a recorded command buffer using the serial |queue_state|.
+// |queue_state| is used to track the synchronization scope of the queue from
+// prior commands such as signaled events and will be mutated as events are
+// reset or new events are signaled.
+//
+// |retire_task| will be scheduled once all commands issued from the command
+// buffer retire and can be used as a fence point.
+//
+// Any new tasks that are allocated as part of the issue operation (such as
+// barrier tasks to handle event synchronization) will be acquired from |arena|.
+// The lifetime of |arena| must be at least that of |retire_task| ensuring that
+// all of the allocated commands issued have completed and their memory in the
+// arena can be recycled.
+//
+// |pending_submission| will receive the ready list of commands and must be
+// submitted to the executor (or discarded on failure) by the caller.
+iree_status_t iree_hal_task_command_buffer_issue(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_task_queue_state_t* queue_state, iree_task_t* retire_task,
+    iree_arena_allocator_t* arena, iree_task_submission_t* pending_submission);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_TASK_COMMAND_BUFFER_H_
diff --git a/runtime/src/iree/hal/local/task_device.c b/runtime/src/iree/hal/local/task_device.c
new file mode 100644
index 0000000..6170367
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_device.c
@@ -0,0 +1,377 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/task_device.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/internal/arena.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/local/local_descriptor_set.h"
+#include "iree/hal/local/local_descriptor_set_layout.h"
+#include "iree/hal/local/local_executable_cache.h"
+#include "iree/hal/local/local_executable_layout.h"
+#include "iree/hal/local/task_command_buffer.h"
+#include "iree/hal/local/task_event.h"
+#include "iree/hal/local/task_queue.h"
+#include "iree/hal/local/task_semaphore.h"
+#include "iree/hal/utils/buffer_transfer.h"
+
+// A local CPU HAL device that executes work on an iree/task/ executor.
+// Allocated as a single block: [struct][queues][loader ptrs][identifier].
+typedef struct iree_hal_task_device_t {
+  // Abstract resource (vtable + refcount); first so base-pointer casts work.
+  iree_hal_resource_t resource;
+  // Device identifier; string storage lives in the trailing allocation.
+  iree_string_view_t identifier;
+
+  // Block pool used for small allocations like tasks and submissions.
+  iree_arena_block_pool_t small_block_pool;
+
+  // Block pool used for command buffers with a larger block size (as command
+  // buffers can contain inlined data uploads).
+  iree_arena_block_pool_t large_block_pool;
+
+  // Task executor (retained) used to schedule all queue work.
+  iree_task_executor_t* executor;
+
+  // Executable loaders (each retained) available for loading executables.
+  // The pointer array itself is stored in the trailing allocation.
+  iree_host_size_t loader_count;
+  iree_hal_executable_loader_t** loaders;
+
+  // Allocator used for the device struct itself and other host allocations.
+  iree_allocator_t host_allocator;
+  // Device memory allocator (retained).
+  iree_hal_allocator_t* device_allocator;
+
+  // One queue per synchronization scope; flexible array member.
+  iree_host_size_t queue_count;
+  iree_hal_task_queue_t queues[];
+} iree_hal_task_device_t;
+
+static const iree_hal_device_vtable_t iree_hal_task_device_vtable;
+
+// Downcasts |base_value| to the concrete device type, asserting (in debug
+// builds) that the vtable matches.
+static iree_hal_task_device_t* iree_hal_task_device_cast(
+    iree_hal_device_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_task_device_vtable);
+  return (iree_hal_task_device_t*)base_value;
+}
+
+// Initializes |out_params| to defaults: 32KiB arena blocks and 8 queues.
+void iree_hal_task_device_params_initialize(
+    iree_hal_task_device_params_t* out_params) {
+  out_params->arena_block_size = 32 * 1024;
+  out_params->queue_count = 8;
+}
+
+// Validates user-provided |params|, returning INVALID_ARGUMENT on failure.
+static iree_status_t iree_hal_task_device_check_params(
+    const iree_hal_task_device_params_t* params) {
+  // Blocks must be large enough to be useful as arena backing storage.
+  if (params->arena_block_size < 4096) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "arena block size too small (< 4096 bytes)");
+  }
+  if (params->queue_count == 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "at least one queue is required");
+  }
+  return iree_ok_status();
+}
+
+// Creates the task device; see task_device.h for the public contract.
+// All variable-length state is packed into one allocation:
+//   [struct][queues[queue_count]][loader ptrs[loader_count]][identifier chars]
+iree_status_t iree_hal_task_device_create(
+    iree_string_view_t identifier, const iree_hal_task_device_params_t* params,
+    iree_task_executor_t* executor, iree_host_size_t loader_count,
+    iree_hal_executable_loader_t** loaders,
+    iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
+    iree_hal_device_t** out_device) {
+  IREE_ASSERT_ARGUMENT(params);
+  IREE_ASSERT_ARGUMENT(!loader_count || loaders);
+  IREE_ASSERT_ARGUMENT(device_allocator);
+  IREE_ASSERT_ARGUMENT(out_device);
+  *out_device = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0,
+                                    iree_hal_task_device_check_params(params));
+
+  // Compute the packed allocation size; the identifier string is copied to
+  // the very end, after all fixed-size trailing arrays.
+  iree_hal_task_device_t* device = NULL;
+  iree_host_size_t struct_size = sizeof(*device) +
+                                 params->queue_count * sizeof(*device->queues) +
+                                 loader_count * sizeof(*device->loaders);
+  iree_host_size_t total_size = struct_size + identifier.size;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&device);
+  if (iree_status_is_ok(status)) {
+    memset(device, 0, total_size);
+    iree_hal_resource_initialize(&iree_hal_task_device_vtable,
+                                 &device->resource);
+    // Copy the identifier into the trailing storage so the caller's string
+    // need not outlive the device.
+    iree_string_view_append_to_buffer(identifier, &device->identifier,
+                                      (char*)device + struct_size);
+    device->host_allocator = host_allocator;
+    device->device_allocator = device_allocator;
+    iree_hal_allocator_retain(device_allocator);
+
+    iree_arena_block_pool_initialize(4096, host_allocator,
+                                     &device->small_block_pool);
+    iree_arena_block_pool_initialize(params->arena_block_size, host_allocator,
+                                     &device->large_block_pool);
+
+    device->executor = executor;
+    iree_task_executor_retain(device->executor);
+
+    // The loader pointer array lives just past the flexible queues[] array.
+    device->loader_count = loader_count;
+    device->loaders =
+        (iree_hal_executable_loader_t**)((uint8_t*)device + sizeof(*device) +
+                                         params->queue_count *
+                                             sizeof(*device->queues));
+    for (iree_host_size_t i = 0; i < device->loader_count; ++i) {
+      device->loaders[i] = loaders[i];
+      iree_hal_executable_loader_retain(device->loaders[i]);
+    }
+
+    device->queue_count = params->queue_count;
+    for (iree_host_size_t i = 0; i < device->queue_count; ++i) {
+      // TODO(benvanik): add a number to each queue ID.
+      iree_hal_task_queue_initialize(device->identifier, device->executor,
+                                     &device->small_block_pool,
+                                     &device->queues[i]);
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_device = (iree_hal_device_t*)device;
+  } else {
+    // Release via the normal path to unwind any partially-initialized state.
+    iree_hal_device_release((iree_hal_device_t*)device);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Tears down the device: deinitializes queues, releases loaders/executor/
+// allocator, and frees the single packed allocation.
+static void iree_hal_task_device_destroy(iree_hal_device_t* base_device) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < device->queue_count; ++i) {
+    iree_hal_task_queue_deinitialize(&device->queues[i]);
+  }
+  for (iree_host_size_t i = 0; i < device->loader_count; ++i) {
+    iree_hal_executable_loader_release(device->loaders[i]);
+  }
+  iree_task_executor_release(device->executor);
+  // Block pools must be torn down after the queues that allocate from them.
+  iree_arena_block_pool_deinitialize(&device->large_block_pool);
+  iree_arena_block_pool_deinitialize(&device->small_block_pool);
+  iree_hal_allocator_release(device->device_allocator);
+  iree_allocator_free(host_allocator, device);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the identifier string captured at creation time.
+static iree_string_view_t iree_hal_task_device_id(
+    iree_hal_device_t* base_device) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  return device->identifier;
+}
+
+// Returns the host allocator used for device-internal allocations.
+static iree_allocator_t iree_hal_task_device_host_allocator(
+    iree_hal_device_t* base_device) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  return device->host_allocator;
+}
+
+// Returns the device memory allocator (unowned reference for the caller).
+static iree_hal_allocator_t* iree_hal_task_device_allocator(
+    iree_hal_device_t* base_device) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  return device->device_allocator;
+}
+
+// Trims pooled/cached memory on the block pools, executor, and allocator.
+static iree_status_t iree_hal_task_device_trim(iree_hal_device_t* base_device) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  iree_arena_block_pool_trim(&device->small_block_pool);
+  iree_arena_block_pool_trim(&device->large_block_pool);
+  iree_task_executor_trim(device->executor);
+  return iree_hal_allocator_trim(device->device_allocator);
+}
+
+// Answers i32 configuration queries. Supported keys:
+//   hal.executable.format :: <format> -> 1 if any loader supports it, else 0
+//   hal.device :: concurrency         -> queue count
+//   hal.dispatch :: concurrency       -> executor worker count
+// Unknown category/key pairs return NOT_FOUND.
+static iree_status_t iree_hal_task_device_query_i32(
+    iree_hal_device_t* base_device, iree_string_view_t category,
+    iree_string_view_t key, int32_t* out_value) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  *out_value = 0;
+
+  if (iree_string_view_equal(category,
+                             iree_make_cstring_view("hal.executable.format"))) {
+    *out_value =
+        iree_hal_query_any_executable_loader_support(
+            device->loader_count, device->loaders, /*caching_mode=*/0, key)
+            ? 1
+            : 0;
+    return iree_ok_status();
+  } else if (iree_string_view_equal(category,
+                                    iree_make_cstring_view("hal.device"))) {
+    if (iree_string_view_equal(key, iree_make_cstring_view("concurrency"))) {
+      *out_value = (int32_t)device->queue_count;
+      return iree_ok_status();
+    }
+  } else if (iree_string_view_equal(category,
+                                    iree_make_cstring_view("hal.dispatch"))) {
+    if (iree_string_view_equal(key, iree_make_cstring_view("concurrency"))) {
+      *out_value = (int32_t)iree_task_executor_worker_count(device->executor);
+      return iree_ok_status();
+    }
+  }
+
+  return iree_make_status(
+      IREE_STATUS_NOT_FOUND,
+      "unknown device configuration key value '%.*s :: %.*s'",
+      (int)category.size, category.data, (int)key.size, key.data);
+}
+
+// Returns the queue index to submit work to based on the |queue_affinity|.
+//
+// If we wanted to have dedicated transfer queues we'd fork off based on
+// command_categories. For now all queues are general purpose.
+static iree_host_size_t iree_hal_task_device_select_queue(
+    iree_hal_task_device_t* device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity) {
+  // TODO(benvanik): evaluate if we want to obscure this mapping a bit so that
+  // affinity really means "equivalent affinities map to equivalent queues" and
+  // not a specific queue index.
+  // Modulo keeps any affinity value within the range of available queues.
+  return queue_affinity % device->queue_count;
+}
+
+// Creates a command buffer bound to the scope of the queue selected from
+// |queue_affinity|; storage comes from the large block pool as command
+// buffers may embed inlined data uploads.
+static iree_status_t iree_hal_task_device_create_command_buffer(
+    iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  iree_host_size_t queue_index = iree_hal_task_device_select_queue(
+      device, command_categories, queue_affinity);
+  return iree_hal_task_command_buffer_create(
+      base_device, &device->queues[queue_index].scope, mode, command_categories,
+      queue_affinity, &device->large_block_pool, device->host_allocator,
+      out_command_buffer);
+}
+
+// Delegates descriptor set creation to the shared host-local implementation.
+static iree_status_t iree_hal_task_device_create_descriptor_set(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_t* set_layout,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings,
+    iree_hal_descriptor_set_t** out_descriptor_set) {
+  return iree_hal_local_descriptor_set_create(set_layout, binding_count,
+                                              bindings, out_descriptor_set);
+}
+
+// Delegates descriptor set layout creation to the host-local implementation.
+static iree_status_t iree_hal_task_device_create_descriptor_set_layout(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  return iree_hal_local_descriptor_set_layout_create(
+      usage_type, binding_count, bindings,
+      iree_hal_device_host_allocator(base_device), out_descriptor_set_layout);
+}
+
+// Creates a task-system event (see task_event.c).
+static iree_status_t iree_hal_task_device_create_event(
+    iree_hal_device_t* base_device, iree_hal_event_t** out_event) {
+  return iree_hal_task_event_create(iree_hal_device_host_allocator(base_device),
+                                    out_event);
+}
+
+// Creates an executable cache backed by this device's executable loaders.
+static iree_status_t iree_hal_task_device_create_executable_cache(
+    iree_hal_device_t* base_device, iree_string_view_t identifier,
+    iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  return iree_hal_local_executable_cache_create(
+      identifier, device->loader_count, device->loaders,
+      iree_hal_device_host_allocator(base_device), out_executable_cache);
+}
+
+// Delegates executable layout creation to the host-local implementation.
+static iree_status_t iree_hal_task_device_create_executable_layout(
+    iree_hal_device_t* base_device, iree_host_size_t push_constants,
+    iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_hal_executable_layout_t** out_executable_layout) {
+  return iree_hal_local_executable_layout_create(
+      push_constants, set_layout_count, set_layouts,
+      iree_hal_device_host_allocator(base_device), out_executable_layout);
+}
+
+// Creates a task semaphore that draws wait handles from the executor's
+// shared event pool.
+static iree_status_t iree_hal_task_device_create_semaphore(
+    iree_hal_device_t* base_device, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  return iree_hal_task_semaphore_create(
+      iree_task_executor_event_pool(device->executor), initial_value,
+      device->host_allocator, out_semaphore);
+}
+
+// Routes |batches| to the queue selected from |queue_affinity| for issue.
+static iree_status_t iree_hal_task_device_queue_submit(
+    iree_hal_device_t* base_device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  iree_host_size_t queue_index = iree_hal_task_device_select_queue(
+      device, command_categories, queue_affinity);
+  return iree_hal_task_queue_submit(&device->queues[queue_index], batch_count,
+                                    batches);
+}
+
+// Synchronous submit: enqueues |batches| then blocks until |wait_semaphore|
+// reaches |wait_value| or |timeout| elapses.
+static iree_status_t iree_hal_task_device_submit_and_wait(
+    iree_hal_device_t* base_device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches,
+    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
+    iree_timeout_t timeout) {
+  // Submit...
+  IREE_RETURN_IF_ERROR(iree_hal_task_device_queue_submit(
+      base_device, command_categories, queue_affinity, batch_count, batches));
+
+  // ...and wait.
+  return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
+}
+
+// Waits on multiple semaphores (any/all per |wait_mode|) using the executor's
+// event pool; transient wait state is allocated from the large block pool.
+static iree_status_t iree_hal_task_device_wait_semaphores(
+    iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  return iree_hal_task_semaphore_multi_wait(
+      wait_mode, semaphore_list, timeout,
+      iree_task_executor_event_pool(device->executor),
+      &device->large_block_pool);
+}
+
+// Waits for each queue in turn to go idle, stopping at the first failure
+// (including deadline exceeded) and returning that status.
+static iree_status_t iree_hal_task_device_wait_idle(
+    iree_hal_device_t* base_device, iree_timeout_t timeout) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < device->queue_count; ++i) {
+    status = iree_hal_task_queue_wait_idle(&device->queues[i], timeout);
+    if (!iree_status_is_ok(status)) break;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Dispatch table binding the abstract iree_hal_device_t API to the task
+// device implementation above; transfers use the shared mappable-range path.
+static const iree_hal_device_vtable_t iree_hal_task_device_vtable = {
+    .destroy = iree_hal_task_device_destroy,
+    .id = iree_hal_task_device_id,
+    .host_allocator = iree_hal_task_device_host_allocator,
+    .device_allocator = iree_hal_task_device_allocator,
+    .trim = iree_hal_task_device_trim,
+    .query_i32 = iree_hal_task_device_query_i32,
+    .create_command_buffer = iree_hal_task_device_create_command_buffer,
+    .create_descriptor_set = iree_hal_task_device_create_descriptor_set,
+    .create_descriptor_set_layout =
+        iree_hal_task_device_create_descriptor_set_layout,
+    .create_event = iree_hal_task_device_create_event,
+    .create_executable_cache = iree_hal_task_device_create_executable_cache,
+    .create_executable_layout = iree_hal_task_device_create_executable_layout,
+    .create_semaphore = iree_hal_task_device_create_semaphore,
+    .transfer_range = iree_hal_device_transfer_mappable_range,
+    .queue_submit = iree_hal_task_device_queue_submit,
+    .submit_and_wait = iree_hal_task_device_submit_and_wait,
+    .wait_semaphores = iree_hal_task_device_wait_semaphores,
+    .wait_idle = iree_hal_task_device_wait_idle,
+};
diff --git a/runtime/src/iree/hal/local/task_device.h b/runtime/src/iree/hal/local/task_device.h
new file mode 100644
index 0000000..d43c1cf
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_device.h
@@ -0,0 +1,51 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_TASK_DEVICE_H_
+#define IREE_HAL_LOCAL_TASK_DEVICE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/task/executor.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Parameters configuring an iree_hal_task_device_t.
+// Must be initialized with iree_hal_task_device_params_initialize prior to use.
+typedef struct iree_hal_task_device_params_t {
+  // Number of queues exposed on the device.
+  // Each queue acts as a separate synchronization scope where all work executes
+  // concurrently unless prohibited by semaphores.
+  iree_host_size_t queue_count;
+
+  // Total size of each block in the device shared block pool.
+  // Larger sizes will lower overhead and ensure the heap isn't hit for
+  // transient allocations while also increasing memory consumption.
+  iree_host_size_t arena_block_size;
+} iree_hal_task_device_params_t;
+
+// Initializes |out_params| to default values.
+void iree_hal_task_device_params_initialize(
+    iree_hal_task_device_params_t* out_params);
+
+// Creates a new iree/task/-based local CPU device that uses |executor| for
+// scheduling tasks. |loaders| is the set of executable loaders that are
+// available for loading in the device context.
+iree_status_t iree_hal_task_device_create(
+    iree_string_view_t identifier, const iree_hal_task_device_params_t* params,
+    iree_task_executor_t* executor, iree_host_size_t loader_count,
+    iree_hal_executable_loader_t** loaders,
+    iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
+    iree_hal_device_t** out_device);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_TASK_DEVICE_H_
diff --git a/runtime/src/iree/hal/local/task_driver.c b/runtime/src/iree/hal/local/task_driver.c
new file mode 100644
index 0000000..49218c4
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_driver.c
@@ -0,0 +1,134 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/task_driver.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+
+#define IREE_HAL_TASK_DEVICE_ID_DEFAULT 0
+
+// HAL driver that mints task devices sharing one executor and loader set.
+// Allocated as a single block: [struct][loaders[loader_count]][identifier].
+typedef struct iree_hal_task_driver_t {
+  // Abstract resource (vtable + refcount); first so base-pointer casts work.
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  // Device allocator (retained) handed to each created device.
+  iree_hal_allocator_t* device_allocator;
+
+  // Driver identifier; string storage lives in the trailing allocation.
+  iree_string_view_t identifier;
+  // Copied at creation; used as the params for every created device.
+  iree_hal_task_device_params_t default_params;
+
+  // Task executor (retained) shared by all created devices.
+  iree_task_executor_t* executor;
+
+  // Executable loaders (each retained); flexible array member.
+  iree_host_size_t loader_count;
+  iree_hal_executable_loader_t* loaders[];
+} iree_hal_task_driver_t;
+
+static const iree_hal_driver_vtable_t iree_hal_task_driver_vtable;
+
+// Downcasts |base_value| to the concrete driver type, asserting (in debug
+// builds) that the vtable matches.
+static iree_hal_task_driver_t* iree_hal_task_driver_cast(
+    iree_hal_driver_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_task_driver_vtable);
+  return (iree_hal_task_driver_t*)base_value;
+}
+
+// Creates the task driver; see task_driver.h for the public contract.
+// Variable-length state is packed into one allocation:
+//   [struct][loaders[loader_count]][identifier chars]
+iree_status_t iree_hal_task_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_task_device_params_t* default_params,
+    iree_task_executor_t* executor, iree_host_size_t loader_count,
+    iree_hal_executable_loader_t** loaders,
+    iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver) {
+  IREE_ASSERT_ARGUMENT(default_params);
+  IREE_ASSERT_ARGUMENT(!loader_count || loaders);
+  IREE_ASSERT_ARGUMENT(device_allocator);
+  IREE_ASSERT_ARGUMENT(out_driver);
+  *out_driver = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_task_driver_t* driver = NULL;
+  iree_host_size_t struct_size =
+      sizeof(*driver) + loader_count * sizeof(*driver->loaders);
+  iree_host_size_t total_size = struct_size + identifier.size;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&driver);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_task_driver_vtable,
+                                 &driver->resource);
+    driver->host_allocator = host_allocator;
+    driver->device_allocator = device_allocator;
+    iree_hal_allocator_retain(device_allocator);
+
+    // Copy the identifier into the trailing storage so the caller's string
+    // need not outlive the driver; snapshot the default device params.
+    iree_string_view_append_to_buffer(identifier, &driver->identifier,
+                                      (char*)driver + struct_size);
+    memcpy(&driver->default_params, default_params,
+           sizeof(driver->default_params));
+
+    driver->executor = executor;
+    iree_task_executor_retain(driver->executor);
+
+    driver->loader_count = loader_count;
+    for (iree_host_size_t i = 0; i < driver->loader_count; ++i) {
+      driver->loaders[i] = loaders[i];
+      iree_hal_executable_loader_retain(driver->loaders[i]);
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_driver = (iree_hal_driver_t*)driver;
+  } else {
+    // Release via the normal path to unwind any partially-initialized state.
+    iree_hal_driver_release((iree_hal_driver_t*)driver);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases all retained references (allocator, loaders, executor) and frees
+// the single packed driver allocation.
+static void iree_hal_task_driver_destroy(iree_hal_driver_t* base_driver) {
+  iree_hal_task_driver_t* driver = iree_hal_task_driver_cast(base_driver);
+  iree_allocator_t host_allocator = driver->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_allocator_release(driver->device_allocator);
+  for (iree_host_size_t i = 0; i < driver->loader_count; ++i) {
+    iree_hal_executable_loader_release(driver->loaders[i]);
+  }
+  iree_task_executor_release(driver->executor);
+  iree_allocator_free(host_allocator, driver);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Reports the single always-available "default" device. The static info list
+// is cloned with |allocator| so the caller owns (and must free) the result.
+static iree_status_t iree_hal_task_driver_query_available_devices(
+    iree_hal_driver_t* base_driver, iree_allocator_t allocator,
+    iree_hal_device_info_t** out_device_infos,
+    iree_host_size_t* out_device_info_count) {
+  static const iree_hal_device_info_t device_infos[1] = {
+      {
+          .device_id = IREE_HAL_TASK_DEVICE_ID_DEFAULT,
+          .name = iree_string_view_literal("default"),
+      },
+  };
+  *out_device_info_count = IREE_ARRAYSIZE(device_infos);
+  return iree_allocator_clone(
+      allocator, iree_make_const_byte_span(device_infos, sizeof(device_infos)),
+      (void**)out_device_infos);
+}
+
+// Creates a device using the driver's stored defaults and shared executor.
+// Note: |device_id| is currently unused as only one device is exposed.
+static iree_status_t iree_hal_task_driver_create_device(
+    iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  iree_hal_task_driver_t* driver = iree_hal_task_driver_cast(base_driver);
+  return iree_hal_task_device_create(
+      driver->identifier, &driver->default_params, driver->executor,
+      driver->loader_count, driver->loaders, driver->device_allocator,
+      host_allocator, out_device);
+}
+
+// Dispatch table binding the abstract iree_hal_driver_t API to the task
+// driver implementation above.
+static const iree_hal_driver_vtable_t iree_hal_task_driver_vtable = {
+    .destroy = iree_hal_task_driver_destroy,
+    .query_available_devices = iree_hal_task_driver_query_available_devices,
+    .create_device = iree_hal_task_driver_create_device,
+};
diff --git a/runtime/src/iree/hal/local/task_driver.h b/runtime/src/iree/hal/local/task_driver.h
new file mode 100644
index 0000000..4c36d2a
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_driver.h
@@ -0,0 +1,35 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_TASK_DRIVER_H_
+#define IREE_HAL_LOCAL_TASK_DRIVER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/task_device.h"
+#include "iree/task/executor.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a new iree/task/-based local CPU driver that creates devices sharing
+// the same |executor| for scheduling tasks. |loaders| is the set of executable
+// loaders that are available for loading in each device context.
+iree_status_t iree_hal_task_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_task_device_params_t* default_params,
+    iree_task_executor_t* executor, iree_host_size_t loader_count,
+    iree_hal_executable_loader_t** loaders,
+    iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_TASK_DRIVER_H_
diff --git a/runtime/src/iree/hal/local/task_event.c b/runtime/src/iree/hal/local/task_event.c
new file mode 100644
index 0000000..ec806a6
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_event.c
@@ -0,0 +1,57 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/task_event.h"
+
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+
+// Task-system HAL event. Carries no signaling state of its own — only the
+// allocator needed to free it; signal/wait semantics are presumably handled
+// during command buffer translation (confirm against task_command_buffer.c).
+typedef struct iree_hal_task_event_t {
+  // Abstract resource (vtable + refcount); first so base-pointer casts work.
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+} iree_hal_task_event_t;
+
+static const iree_hal_event_vtable_t iree_hal_task_event_vtable;
+
+// Downcasts |base_value| to the concrete event type, asserting (in debug
+// builds) that the vtable matches.
+static iree_hal_task_event_t* iree_hal_task_event_cast(
+    iree_hal_event_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_task_event_vtable);
+  return (iree_hal_task_event_t*)base_value;
+}
+
+// Allocates and initializes a new task event from |host_allocator|.
+// On failure |out_event| remains NULL and the allocation status is returned.
+iree_status_t iree_hal_task_event_create(iree_allocator_t host_allocator,
+                                         iree_hal_event_t** out_event) {
+  IREE_ASSERT_ARGUMENT(out_event);
+  *out_event = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_task_event_t* event = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*event), (void**)&event);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_task_event_vtable, &event->resource);
+    // Keep the allocator so destroy can free the event without outside help.
+    event->host_allocator = host_allocator;
+    *out_event = (iree_hal_event_t*)event;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the event using the allocator captured at creation time.
+static void iree_hal_task_event_destroy(iree_hal_event_t* base_event) {
+  iree_hal_task_event_t* event = iree_hal_task_event_cast(base_event);
+  iree_allocator_t host_allocator = event->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, event);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Dispatch table for the task event; destruction is the only operation.
+static const iree_hal_event_vtable_t iree_hal_task_event_vtable = {
+    .destroy = iree_hal_task_event_destroy,
+};
diff --git a/runtime/src/iree/hal/local/task_event.h b/runtime/src/iree/hal/local/task_event.h
new file mode 100644
index 0000000..91bbff7
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_event.h
@@ -0,0 +1,24 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_TASK_EVENT_H_
+#define IREE_HAL_LOCAL_TASK_EVENT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a task-system event allocated from |host_allocator|.
+iree_status_t iree_hal_task_event_create(iree_allocator_t host_allocator,
+                                         iree_hal_event_t** out_event);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_TASK_EVENT_H_
diff --git a/runtime/src/iree/hal/local/task_queue.c b/runtime/src/iree/hal/local/task_queue.c
new file mode 100644
index 0000000..23fcb43
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_queue.c
@@ -0,0 +1,557 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/task_queue.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/local/task_command_buffer.h"
+#include "iree/hal/local/task_semaphore.h"
+#include "iree/task/submission.h"
+
+// Each submission is turned into a DAG for execution:
+//
+//  +--------------------+    To preserve the sequential issue order an edge is
+//  |  (previous issue)  |    added between the previous outstanding issue (if
+//  +--------------------+    it exists) such that all issues run in the order
+//    |                       they were submitted to the queue. Note that this
+//    v                       is *only* the issue; the commands issued by two
+//  +--------------------+    submissions may still overlap and are only
+//  |  sequence barrier  |    guaranteed to begin execution in order.
+//  +--------------------+
+//    |
+//    |   +--------------+
+//    +-> | +--------------+  Unsatisfied waits are scheduled as wait tasks and
+//    .   +-|  sema waits  |  block the issuing of commands until all have
+//    .     +--------------+  been satisfied. If the wait is immediately
+//    .        | | | | |      following a signal from the same queue then it
+//    +--------+-+-+-+-+      is elided - only cross-queue or external waits
+//    |                       actually go down to system wait handles.
+//    v
+//  +--------------------+    Command buffers in the batch are issued in-order
+//  |   command issue    |    as if all commands had been recorded into the same
+//  +--------------------+    command buffer (excluding recording state like
+//    |                       push constants). The dependencies between commands
+//    |   +--------------+    are determined by the events and barriers recorded
+//    +-> | +--------------+  in each command buffer.
+//    .   +-|   commands   |
+//    .     +--------------+
+//    .        | | | | |
+//    +--------+-+-+-+-+
+//    |
+//    v
+//  +--------------------+    After all commands within the batch complete the
+//  | semaphore signals  |    submission is retired and all semaphores are
+//  +--------------------+    signaled. Note that this may happen *before* other
+//    |                       earlier submissions complete if there were no
+//   ...                      dependencies between the commands in each batch.
+//
+// Could this be simplified? Probably. Improvements to the task system to allow
+// for efficient multiwaits and better stitching of independent DAGs would help.
+
+//===----------------------------------------------------------------------===//
+// Utilities
+//===----------------------------------------------------------------------===//
+
+// Clones a list of semaphores into an |arena| and initializes |out_target_list|
+// to reference the newly-cloned data.
+//
+// Each cloned semaphore is retained; callers must balance this with
+// iree_hal_semaphore_list_release. The list storage itself is arena-allocated
+// and is reclaimed when the owning arena is deinitialized (no explicit free).
+static iree_status_t iree_hal_semaphore_list_clone(
+    const iree_hal_semaphore_list_t* source_list, iree_arena_allocator_t* arena,
+    iree_hal_semaphore_list_t* out_target_list) {
+  // One allocation covering both arrays: the semaphore pointer array first,
+  // then the payload value array packed immediately after it.
+  iree_host_size_t semaphores_size =
+      source_list->count * sizeof(out_target_list->semaphores[0]);
+  iree_host_size_t payload_values_size =
+      source_list->count * sizeof(out_target_list->payload_values[0]);
+  iree_host_size_t total_size = semaphores_size + payload_values_size;
+  uint8_t* buffer = NULL;
+  IREE_RETURN_IF_ERROR(iree_arena_allocate(arena, total_size, (void**)&buffer));
+
+  out_target_list->count = source_list->count;
+  out_target_list->semaphores = (iree_hal_semaphore_t**)buffer;
+  out_target_list->payload_values = (uint64_t*)(buffer + semaphores_size);
+
+  // Copy the entries, retaining each semaphore for the clone's lifetime.
+  for (iree_host_size_t i = 0; i < source_list->count; ++i) {
+    out_target_list->semaphores[i] = source_list->semaphores[i];
+    iree_hal_semaphore_retain(out_target_list->semaphores[i]);
+    out_target_list->payload_values[i] = source_list->payload_values[i];
+  }
+
+  return iree_ok_status();
+}
+
+// Releases the semaphore references retained by iree_hal_semaphore_list_clone.
+// The list storage itself is arena-owned and is not freed here.
+static void iree_hal_semaphore_list_release(iree_hal_semaphore_list_t* list) {
+  iree_host_size_t count = list->count;
+  for (iree_host_size_t index = 0; index < count; ++index) {
+    iree_hal_semaphore_release(list->semaphores[index]);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_queue_wait_cmd_t
+//===----------------------------------------------------------------------===//
+
+// Task to fork out and wait on one or more semaphores.
+// This optimizes for same-queue semaphore chaining by ensuring that semaphores
+// used to stitch together subsequent submissions never have to go to the system
+// to wait as the implicit queue ordering ensures that the signals would have
+// happened prior to the sequence command being executed. Cross-queue semaphores
+// will still cause waits if they have not yet been signaled.
+typedef struct iree_hal_task_queue_wait_cmd_t {
+  // Call to iree_hal_task_queue_wait_cmd.
+  iree_task_call_t task;
+
+  // Arena used for the submission - additional tasks can be allocated from
+  // this.
+  iree_arena_allocator_t* arena;
+
+  // A list of semaphores to wait on prior to issuing the rest of the
+  // submission. Retained; released by the task's cleanup function.
+  iree_hal_semaphore_list_t wait_semaphores;
+} iree_hal_task_queue_wait_cmd_t;
+
+// Forks out multiple wait tasks prior to issuing the commands.
+// Each semaphore becomes its own timepoint gating this call's completion task
+// (the issue command) until the requested payload value is reached.
+static iree_status_t iree_hal_task_queue_wait_cmd(
+    void* user_context, iree_task_t* task,
+    iree_task_submission_t* pending_submission) {
+  iree_hal_task_queue_wait_cmd_t* cmd = (iree_hal_task_queue_wait_cmd_t*)task;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < cmd->wait_semaphores.count; ++i) {
+    // Register a timepoint blocking our completion task on the semaphore
+    // reaching the requested payload value.
+    status = iree_hal_task_semaphore_enqueue_timepoint(
+        cmd->wait_semaphores.semaphores[i],
+        cmd->wait_semaphores.payload_values[i],
+        cmd->task.header.completion_task, cmd->arena, pending_submission);
+    // Bail on the first failure; remaining timepoints are left unregistered.
+    if (IREE_UNLIKELY(!iree_status_is_ok(status))) break;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Cleanup for iree_hal_task_queue_wait_cmd_t: drops the semaphore references
+// that were retained when the command was allocated. |status_code| is unused
+// as there is nothing conditional about the release.
+static void iree_hal_task_queue_wait_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
+  iree_hal_task_queue_wait_cmd_t* wait_cmd =
+      (iree_hal_task_queue_wait_cmd_t*)task;
+  iree_hal_semaphore_list_release(&wait_cmd->wait_semaphores);
+}
+
+// Allocates and initializes a iree_hal_task_queue_wait_cmd_t task.
+// The command (and its cloned semaphore list) live in |arena| and are
+// reclaimed when the arena owning the submission is deinitialized.
+static iree_status_t iree_hal_task_queue_wait_cmd_allocate(
+    iree_task_scope_t* scope, const iree_hal_semaphore_list_t* wait_semaphores,
+    iree_arena_allocator_t* arena, iree_hal_task_queue_wait_cmd_t** out_cmd) {
+  iree_hal_task_queue_wait_cmd_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_arena_allocate(arena, sizeof(*cmd), (void**)&cmd));
+  iree_task_call_initialize(
+      scope, iree_task_make_call_closure(iree_hal_task_queue_wait_cmd, 0),
+      &cmd->task);
+  // The cleanup function releases the semaphore references retained below.
+  iree_task_set_cleanup_fn(&cmd->task.header,
+                           iree_hal_task_queue_wait_cmd_cleanup);
+  cmd->arena = arena;
+
+  // Clone the wait semaphores from the batch - we retain them and their
+  // payloads.
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_clone(wait_semaphores, arena,
+                                                     &cmd->wait_semaphores));
+
+  *out_cmd = cmd;
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_queue_issue_cmd_t
+//===----------------------------------------------------------------------===//
+
+// Task to issue all the command buffers in the batch.
+// After this task completes the commands have been issued but have not yet
+// completed and the issued commands may complete in any order.
+typedef struct iree_hal_task_queue_issue_cmd_t {
+  // Call to iree_hal_task_queue_issue_cmd.
+  iree_task_call_t task;
+
+  // Arena used for the submission - additional tasks can be allocated from
+  // this.
+  iree_arena_allocator_t* arena;
+
+  // Nasty back reference to the queue so that we can clear the tail_issue_task
+  // if we are the last issue pending.
+  iree_hal_task_queue_t* queue;
+
+  // Command buffers to be issued in the order they appeared in the submission.
+  // Stored inline as a flexible array member sized at allocation time.
+  iree_host_size_t command_buffer_count;
+  iree_hal_command_buffer_t* command_buffers[];
+} iree_hal_task_queue_issue_cmd_t;
+
+// Issues a set of command buffers without waiting for them to complete.
+static iree_status_t iree_hal_task_queue_issue_cmd(
+    void* user_context, iree_task_t* task,
+    iree_task_submission_t* pending_submission) {
+  iree_hal_task_queue_issue_cmd_t* cmd = (iree_hal_task_queue_issue_cmd_t*)task;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // NOTE: it's ok for there to be no command buffers - in that case the
+  // submission was purely for synchronization and the loop below is a no-op.
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < cmd->command_buffer_count; ++i) {
+    iree_hal_command_buffer_t* command_buffer = cmd->command_buffers[i];
+    if (!iree_hal_task_command_buffer_isa(command_buffer)) {
+      status = iree_make_status(
+          IREE_STATUS_UNIMPLEMENTED,
+          "unsupported command buffer type for task queue submission");
+      break;
+    }
+    status = iree_hal_task_command_buffer_issue(
+        command_buffer, &cmd->queue->state, cmd->task.header.completion_task,
+        cmd->arena, pending_submission);
+    if (IREE_UNLIKELY(!iree_status_is_ok(status))) break;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Cleanup for iree_hal_task_queue_issue_cmd_t that resets the queue state
+// tracking the last in-flight issue.
+static void iree_hal_task_queue_issue_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
+  iree_hal_task_queue_issue_cmd_t* cmd = (iree_hal_task_queue_issue_cmd_t*)task;
+
+  // Reset queue tail issue task if it was us.
+  // A newer submission may have already replaced the tail with its own issue
+  // task, in which case it must be left untouched.
+  iree_slim_mutex_lock(&cmd->queue->mutex);
+  if (cmd->queue->tail_issue_task == task) {
+    cmd->queue->tail_issue_task = NULL;
+  }
+  iree_slim_mutex_unlock(&cmd->queue->mutex);
+}
+
+// Allocates and initializes a iree_hal_task_queue_issue_cmd_t task.
+// |retire_task| is chained as the completion task and runs after the issue
+// completes. The command buffer pointers are copied into the command's
+// trailing flexible array.
+// NOTE(review): the command buffers are not retained here; presumably the
+// submission/caller guarantees their lifetime until retire - confirm.
+static iree_status_t iree_hal_task_queue_issue_cmd_allocate(
+    iree_task_scope_t* scope, iree_hal_task_queue_t* queue,
+    iree_task_t* retire_task, iree_host_size_t command_buffer_count,
+    iree_hal_command_buffer_t** const command_buffers,
+    iree_arena_allocator_t* arena, iree_hal_task_queue_issue_cmd_t** out_cmd) {
+  iree_hal_task_queue_issue_cmd_t* cmd = NULL;
+  // Size includes the inline command buffer pointer array.
+  iree_host_size_t total_cmd_size =
+      sizeof(*cmd) + command_buffer_count * sizeof(*cmd->command_buffers);
+  IREE_RETURN_IF_ERROR(
+      iree_arena_allocate(arena, total_cmd_size, (void**)&cmd));
+  iree_task_call_initialize(
+      scope, iree_task_make_call_closure(iree_hal_task_queue_issue_cmd, 0),
+      &cmd->task);
+  iree_task_set_completion_task(&cmd->task.header, retire_task);
+  iree_task_set_cleanup_fn(&cmd->task.header,
+                           iree_hal_task_queue_issue_cmd_cleanup);
+  cmd->arena = arena;
+  cmd->queue = queue;
+
+  cmd->command_buffer_count = command_buffer_count;
+  memcpy(cmd->command_buffers, command_buffers,
+         cmd->command_buffer_count * sizeof(*cmd->command_buffers));
+
+  *out_cmd = cmd;
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_queue_retire_cmd_t
+//===----------------------------------------------------------------------===//
+
+// Task to retire the submission and free the transient memory allocated for
+// it. The task is issued only once all commands from all command buffers in
+// the submission complete. Semaphores will be signaled and dependent
+// submissions may be issued.
+typedef struct iree_hal_task_queue_retire_cmd_t {
+  // Call to iree_hal_task_queue_retire_cmd.
+  iree_task_call_t task;
+
+  // Original arena used for all transient allocations required for the
+  // submission. All queue-related commands are allocated from this, **including
+  // this retire command**. Held by value: ownership of the arena is transferred
+  // into the command at allocation time and torn down in the cleanup function.
+  iree_arena_allocator_t arena;
+
+  // A list of semaphores to signal upon retiring. Retained; released by the
+  // task's cleanup function.
+  iree_hal_semaphore_list_t signal_semaphores;
+} iree_hal_task_queue_retire_cmd_t;
+
+// Retires a submission by signaling semaphores to their desired value and
+// disposing of the temporary arena memory used for the submission.
+static iree_status_t iree_hal_task_queue_retire_cmd(
+    void* user_context, iree_task_t* task,
+    iree_task_submission_t* pending_submission) {
+  iree_hal_task_queue_retire_cmd_t* cmd =
+      (iree_hal_task_queue_retire_cmd_t*)task;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Signal all semaphores to their new values.
+  // Note that if any signal fails then the whole command will fail and all
+  // semaphores will be signaled to the failure state.
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < cmd->signal_semaphores.count; ++i) {
+    status =
+        iree_hal_semaphore_signal(cmd->signal_semaphores.semaphores[i],
+                                  cmd->signal_semaphores.payload_values[i]);
+    // Stop at the first failure; the cleanup function then propagates the
+    // failure status to all semaphores in the list.
+    if (IREE_UNLIKELY(!iree_status_is_ok(status))) break;
+  }
+
+  // NOTE: arena disposal happens in the cleanup function (not here) because
+  // this command itself lives within the arena.
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Cleanup for iree_hal_task_queue_retire_cmd_t that ensures that the arena
+// holding the submission is properly disposed and that semaphores are signaled
+// (or signaled to failure if the command failed).
+static void iree_hal_task_queue_retire_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
+  iree_hal_task_queue_retire_cmd_t* cmd =
+      (iree_hal_task_queue_retire_cmd_t*)task;
+
+  // If the command failed then fail all semaphores to ensure future
+  // submissions fail as well (including those on other queues).
+  if (IREE_UNLIKELY(status_code != IREE_STATUS_OK)) {
+    for (iree_host_size_t i = 0; i < cmd->signal_semaphores.count; ++i) {
+      iree_hal_semaphore_fail(cmd->signal_semaphores.semaphores[i],
+                              iree_status_from_code(status_code));
+    }
+  }
+
+  // Release all semaphores.
+  iree_hal_semaphore_list_release(&cmd->signal_semaphores);
+
+  // Drop all memory used by the submission (**including cmd**).
+  // The arena must be copied out of the command first: deinitializing it
+  // releases the very blocks backing |cmd|, so |cmd| is dead afterwards
+  // (cleared to NULL to make any use-after-free obvious).
+  iree_arena_allocator_t arena = cmd->arena;
+  cmd = NULL;
+  iree_arena_deinitialize(&arena);
+}
+
+// Allocates and initializes a iree_hal_task_queue_retire_cmd_t task.
+// The command will own an arena that can be used for other submission-related
+// allocations. On failure the arena (and the partially-built command within
+// it) is torn down before returning.
+static iree_status_t iree_hal_task_queue_retire_cmd_allocate(
+    iree_task_scope_t* scope,
+    const iree_hal_semaphore_list_t* signal_semaphores,
+    iree_arena_block_pool_t* block_pool,
+    iree_hal_task_queue_retire_cmd_t** out_cmd) {
+  // Make an arena we'll use for allocating the command itself.
+  iree_arena_allocator_t arena;
+  iree_arena_initialize(block_pool, &arena);
+
+  // Allocate the command from the arena.
+  iree_hal_task_queue_retire_cmd_t* cmd = NULL;
+  iree_status_t status =
+      iree_arena_allocate(&arena, sizeof(*cmd), (void**)&cmd);
+  if (iree_status_is_ok(status)) {
+    iree_task_call_initialize(
+        scope, iree_task_make_call_closure(iree_hal_task_queue_retire_cmd, 0),
+        &cmd->task);
+    iree_task_set_cleanup_fn(&cmd->task.header,
+                             iree_hal_task_queue_retire_cmd_cleanup);
+  }
+
+  // Clone the signal semaphores from the batch - we retain them and their
+  // payloads.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_semaphore_list_clone(signal_semaphores, &arena,
+                                           &cmd->signal_semaphores);
+  }
+
+  if (iree_status_is_ok(status)) {
+    // Transfer ownership of the arena to command.
+    // memcpy (rather than assignment) mirrors the by-value move; from here on
+    // the local |arena| must not be deinitialized.
+    memcpy(&cmd->arena, &arena, sizeof(cmd->arena));
+    *out_cmd = cmd;
+  } else {
+    iree_arena_deinitialize(&arena);
+  }
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_queue_t
+//===----------------------------------------------------------------------===//
+
+void iree_hal_task_queue_initialize(iree_string_view_t identifier,
+                                    iree_task_executor_t* executor,
+                                    iree_arena_block_pool_t* block_pool,
+                                    iree_hal_task_queue_t* out_queue) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, identifier.data, identifier.size);
+
+  // Start from a clean slate so any field not explicitly set below is zero.
+  memset(out_queue, 0, sizeof(*out_queue));
+
+  // The executor is shared; keep it alive for the queue's lifetime.
+  out_queue->executor = executor;
+  iree_task_executor_retain(out_queue->executor);
+  out_queue->block_pool = block_pool;
+  out_queue->tail_issue_task = NULL;
+
+  iree_task_scope_initialize(identifier, &out_queue->scope);
+  iree_slim_mutex_initialize(&out_queue->mutex);
+  iree_hal_task_queue_state_initialize(&out_queue->state);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_hal_task_queue_deinitialize(iree_hal_task_queue_t* queue) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Block until all outstanding tasks in the scope retire; the wait status is
+  // ignored as teardown proceeds regardless.
+  iree_status_ignore(
+      iree_task_scope_wait_idle(&queue->scope, IREE_TIME_INFINITE_FUTURE));
+
+  // After idling no issue should remain pending. tail_issue_task is guarded
+  // by |mutex| so the check is performed under the lock.
+  iree_slim_mutex_lock(&queue->mutex);
+  IREE_ASSERT(!queue->tail_issue_task);
+  iree_slim_mutex_unlock(&queue->mutex);
+
+  // Tear down in reverse order of initialization.
+  iree_hal_task_queue_state_deinitialize(&queue->state);
+  iree_slim_mutex_deinitialize(&queue->mutex);
+  iree_task_scope_deinitialize(&queue->scope);
+  iree_task_executor_release(queue->executor);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Builds the task DAG for a single |batch| (waits -> issue -> retire) and
+// submits it to the queue's executor. See the file header for a diagram of
+// the constructed graph. Note the executor is not flushed here; the caller
+// flushes once all batches have been enqueued.
+static iree_status_t iree_hal_task_queue_submit_batch(
+    iree_hal_task_queue_t* queue, const iree_hal_submission_batch_t* batch) {
+  // Task to retire the submission and free the transient memory allocated for
+  // it (including the command itself). We allocate this first so it can get an
+  // arena which we will use to allocate all other commands.
+  iree_hal_task_queue_retire_cmd_t* retire_cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_task_queue_retire_cmd_allocate(
+      &queue->scope, &batch->signal_semaphores, queue->block_pool,
+      &retire_cmd));
+
+  // NOTE: if we fail from here on we must drop the retire_cmd arena.
+  iree_status_t status = iree_ok_status();
+
+  // A fence we'll use to detect when the entire submission has completed.
+  // TODO(benvanik): fold into the retire command.
+  iree_task_fence_t* fence = NULL;
+  status =
+      iree_task_executor_acquire_fence(queue->executor, &queue->scope, &fence);
+  // NOTE(review): if fence acquisition fails |fence| is NULL here and
+  // &fence->header is still computed; the later status check prevents
+  // submission, but confirm this hookup is safe on the failure path.
+  iree_task_set_completion_task(&retire_cmd->task.header, &fence->header);
+
+  // Task to fork and wait for unsatisfied semaphore dependencies.
+  // This is optional and only required if we have previous submissions still
+  // in-flight - if the queue is empty then we can directly schedule the waits.
+  iree_hal_task_queue_wait_cmd_t* wait_cmd = NULL;
+  if (iree_status_is_ok(status) && batch->wait_semaphores.count > 0) {
+    status = iree_hal_task_queue_wait_cmd_allocate(
+        &queue->scope, &batch->wait_semaphores, &retire_cmd->arena, &wait_cmd);
+  }
+
+  // Task to issue all the command buffers in the batch.
+  // After this task completes the commands have been issued but have not yet
+  // completed and the issued commands may complete in any order.
+  iree_hal_task_queue_issue_cmd_t* issue_cmd = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_task_queue_issue_cmd_allocate(
+        &queue->scope, queue, &retire_cmd->task.header,
+        batch->command_buffer_count, batch->command_buffers, &retire_cmd->arena,
+        &issue_cmd);
+  }
+
+  // Last chance for failure - from here on we are submitting.
+  // Deinitializing the arena also frees retire_cmd (and anything allocated
+  // above) as it all lives within the arena.
+  if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+    iree_arena_deinitialize(&retire_cmd->arena);
+    return status;
+  }
+
+  iree_task_submission_t submission;
+  iree_task_submission_initialize(&submission);
+
+  // Sequencing: wait on semaphores or go directly into the executor queue.
+  if (wait_cmd != NULL) {
+    // Ensure that we only issue command buffers after all waits have completed.
+    iree_task_set_completion_task(&wait_cmd->task.header,
+                                  &issue_cmd->task.header);
+    iree_task_submission_enqueue(&submission, &wait_cmd->task.header);
+  } else {
+    // No waits needed; directly enqueue.
+    iree_task_submission_enqueue(&submission, &issue_cmd->task.header);
+  }
+
+  iree_slim_mutex_lock(&queue->mutex);
+
+  // If there is an in-flight issue pending then we need to chain onto that
+  // so that we ensure FIFO submission order is preserved. Note that we are only
+  // waiting for the issue to complete and *not* all of the commands that are
+  // issued.
+  if (queue->tail_issue_task != NULL) {
+    iree_task_set_completion_task(queue->tail_issue_task,
+                                  &issue_cmd->task.header);
+  }
+  queue->tail_issue_task = &issue_cmd->task.header;
+
+  iree_slim_mutex_unlock(&queue->mutex);
+
+  // Submit the tasks immediately. The executor may queue them up until we
+  // force the flush after all batches have been processed.
+  iree_task_executor_submit(queue->executor, &submission);
+  return iree_ok_status();
+}
+
+// Enqueues each batch in order; stops at the first batch that fails.
+static iree_status_t iree_hal_task_queue_submit_batches(
+    iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
+  // For now we process each batch independently. To elide additional semaphore
+  // work and prevent unneeded coordinator scheduling logic we could instead
+  // build the whole DAG prior to submitting.
+  for (iree_host_size_t batch_index = 0; batch_index < batch_count;
+       ++batch_index) {
+    IREE_RETURN_IF_ERROR(
+        iree_hal_task_queue_submit_batch(queue, &batches[batch_index]));
+  }
+  return iree_ok_status();
+}
+
+iree_status_t iree_hal_task_queue_submit(
+    iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Enqueue all batches first and only flush the executor - beginning
+  // execution - once every batch has been accepted.
+  iree_status_t status =
+      iree_hal_task_queue_submit_batches(queue, batch_count, batches);
+  if (iree_status_is_ok(status)) {
+    iree_task_executor_flush(queue->executor);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+iree_status_t iree_hal_task_queue_submit_and_wait(
+    iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches,
+    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
+    iree_timeout_t timeout) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Pin the timeout to an absolute deadline so the submit time is not counted
+  // against the wait below.
+  iree_convert_timeout_to_absolute(&timeout);
+
+  // Queue all of the batches.
+  // NOTE(review): |wait_semaphore|/|wait_value| are currently unused - the
+  // implementation waits for the whole queue to go idle instead. Confirm this
+  // over-waiting is intended.
+  iree_status_t status =
+      iree_hal_task_queue_submit_batches(queue, batch_count, batches);
+  if (iree_status_is_ok(status)) {
+    // Flush the pending submissions and begin processing, then wait until idle.
+    // TODO(benvanik): get a wait_handle we can pass to
+    // iree_task_executor_donate_caller - it'll flush + do work.
+    iree_task_executor_flush(queue->executor);
+    status = iree_hal_task_queue_wait_idle(queue, timeout);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+iree_status_t iree_hal_task_queue_wait_idle(iree_hal_task_queue_t* queue,
+                                            iree_timeout_t timeout) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // Convert the (possibly relative) timeout to an absolute deadline and block
+  // until all tasks in the queue's scope have retired.
+  const iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
+  iree_status_t result = iree_task_scope_wait_idle(&queue->scope, deadline_ns);
+  IREE_TRACE_ZONE_END(z0);
+  return result;
+}
diff --git a/runtime/src/iree/hal/local/task_queue.h b/runtime/src/iree/hal/local/task_queue.h
new file mode 100644
index 0000000..7a60191
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_queue.h
@@ -0,0 +1,79 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_TASK_QUEUE_H_
+#define IREE_HAL_LOCAL_TASK_QUEUE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/arena.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/task_queue_state.h"
+#include "iree/task/executor.h"
+#include "iree/task/scope.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_task_queue_t {
+  // Shared executor that the queue submits tasks to.
+  iree_task_executor_t* executor;
+
+  // Shared block pool for allocating submission transients (tasks/events/etc).
+  iree_arena_block_pool_t* block_pool;
+
+  // Scope used for all tasks in the queue.
+  // This allows for easy waits on all outstanding queue tasks as well as
+  // differentiation of tasks within the executor.
+  iree_task_scope_t scope;
+
+  // Guards queue state. Submissions and waits may come from any user thread and
+  // we do a bit of bookkeeping during command buffer issue that will come from
+  // an executor thread.
+  iree_slim_mutex_t mutex;
+
+  // State tracking used during command buffer issue.
+  // The intra-queue synchronization (barriers/events) carries across command
+  // buffers and this is used to rendezvous the tasks in each set.
+  iree_hal_task_queue_state_t state;
+
+  // The last active iree_hal_task_queue_issue_cmd_t submitted to the queue.
+  // If this is NULL then there are no issues pending - though there may still
+  // be active work that was previously issued. This is used to chain together
+  // issues in FIFO order such that all submissions *issue* in order but not
+  // *execute* in order.
+  iree_task_t* tail_issue_task;
+} iree_hal_task_queue_t;
+
+// Initializes |out_queue| for submissions against |executor|.
+// |identifier| annotates the queue's task scope and |block_pool| provides
+// storage for per-submission transient allocations.
+void iree_hal_task_queue_initialize(iree_string_view_t identifier,
+                                    iree_task_executor_t* executor,
+                                    iree_arena_block_pool_t* block_pool,
+                                    iree_hal_task_queue_t* out_queue);
+
+// Waits for the queue to go idle and releases its resources.
+void iree_hal_task_queue_deinitialize(iree_hal_task_queue_t* queue);
+
+// Enqueues |batch_count| submission batches onto the queue and flushes the
+// executor so execution may begin.
+iree_status_t iree_hal_task_queue_submit(
+    iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches);
+
+// Enqueues |batch_count| submission batches and then blocks until the queue
+// goes idle or |timeout| elapses.
+iree_status_t iree_hal_task_queue_submit_and_wait(
+    iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches,
+    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
+    iree_timeout_t timeout);
+
+// Blocks until all outstanding tasks in the queue have completed or |timeout|
+// elapses.
+iree_status_t iree_hal_task_queue_wait_idle(iree_hal_task_queue_t* queue,
+                                            iree_timeout_t timeout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_TASK_QUEUE_H_
diff --git a/runtime/src/iree/hal/local/task_queue_state.c b/runtime/src/iree/hal/local/task_queue_state.c
new file mode 100644
index 0000000..34ce329
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_queue_state.c
@@ -0,0 +1,17 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/task_queue_state.h"
+
+#include <string.h>
+
+// Zero-initializes the per-queue issue state.
+void iree_hal_task_queue_state_initialize(
+    iree_hal_task_queue_state_t* out_queue_state) {
+  memset(out_queue_state, 0, sizeof(*out_queue_state));
+}
+
+// Currently a no-op: the state struct holds no owned resources (only a
+// reserved placeholder field).
+void iree_hal_task_queue_state_deinitialize(
+    iree_hal_task_queue_state_t* queue_state) {}
diff --git a/runtime/src/iree/hal/local/task_queue_state.h b/runtime/src/iree/hal/local/task_queue_state.h
new file mode 100644
index 0000000..40efc90
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_queue_state.h
@@ -0,0 +1,41 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_TASK_QUEUE_STATE_H_
+#define IREE_HAL_LOCAL_TASK_QUEUE_STATE_H_
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/hal/api.h"
+#include "iree/task/scope.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// State tracking for an individual queue.
+//
+// Thread-compatible: only intended to be used by a queue with the submission
+// lock held.
+typedef struct iree_hal_task_queue_state_t {
+  // TODO(#4518): track event state.
+  int reserved;
+} iree_hal_task_queue_state_t;
+
+// Initializes queue state with the given |identifier| used to annotate tasks
+// submitted to the queue.
+void iree_hal_task_queue_state_initialize(
+    iree_hal_task_queue_state_t* out_queue_state);
+
+// Deinitializes queue state and cleans up any tracking intermediates.
+void iree_hal_task_queue_state_deinitialize(
+    iree_hal_task_queue_state_t* queue_state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_TASK_QUEUE_STATE_H_
diff --git a/runtime/src/iree/hal/local/task_semaphore.c b/runtime/src/iree/hal/local/task_semaphore.c
new file mode 100644
index 0000000..a783b5c
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_semaphore.c
@@ -0,0 +1,505 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/local/task_semaphore.h"
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/wait_handle.h"
+#include "iree/base/tracing.h"
+
+// Sentinel value used when the semaphore has failed and an error status is set.
+#define IREE_HAL_TASK_SEMAPHORE_FAILURE_VALUE UINT64_MAX
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_timepoint_t
+//===----------------------------------------------------------------------===//
+
+// Represents a point in the timeline that someone is waiting to be reached.
+// When the semaphore is signaled to at least the specified value then the
+// given event will be signaled and the timepoint discarded.
+//
+// Instances are owned and retained by the caller that requested them - usually
+// in the arena associated with the submission, but could be on the stack of a
+// synchronously waiting thread.
+typedef struct iree_hal_task_timepoint_t {
+  struct iree_hal_task_timepoint_t* next;
+  struct iree_hal_task_timepoint_t* prev;
+  uint64_t payload_value;
+  iree_event_t event;
+} iree_hal_task_timepoint_t;
+
+// A doubly-linked FIFO list of timepoints.
+// The order of the timepoints does *not* match increasing payload values but
+// instead the order they were added to the list.
+//
+// Note that the timepoints are not owned by the list - this just nicely
+// stitches together timepoints for the semaphore.
+typedef struct iree_hal_task_timepoint_list_t {
+  iree_hal_task_timepoint_t* head;
+  iree_hal_task_timepoint_t* tail;
+} iree_hal_task_timepoint_list_t;
+
+// Resets |out_list| to an empty list with no head or tail.
+static void iree_hal_task_timepoint_list_initialize(
+    iree_hal_task_timepoint_list_t* out_list) {
+  out_list->head = NULL;
+  out_list->tail = NULL;
+}
+
+// Transfers ownership of the links in |source_list| to |out_target_list|.
+// Any prior contents of |out_target_list| are dropped and |source_list| is
+// left empty.
+static void iree_hal_task_timepoint_list_move(
+    iree_hal_task_timepoint_list_t* source_list,
+    iree_hal_task_timepoint_list_t* out_target_list) {
+  out_target_list->head = source_list->head;
+  out_target_list->tail = source_list->tail;
+  source_list->head = NULL;
+  source_list->tail = NULL;
+}
+
+// Links |timepoint| onto the tail end of |list| (FIFO order).
+static void iree_hal_task_timepoint_list_append(
+    iree_hal_task_timepoint_list_t* list,
+    iree_hal_task_timepoint_t* timepoint) {
+  // The new node always becomes the tail; only its predecessor link differs
+  // between the empty and non-empty cases.
+  timepoint->next = NULL;
+  timepoint->prev = list->tail;
+  if (list->tail == NULL) {
+    list->head = timepoint;
+  } else {
+    list->tail->next = timepoint;
+  }
+  list->tail = timepoint;
+}
+
+// Erases |timepoint| from |list|, unlinking it from both of its neighbors.
+// Safe for head, tail, middle, or sole elements; the timepoint's own links
+// are reset so it can be reused or appended to another list afterwards.
+static void iree_hal_task_timepoint_list_erase(
+    iree_hal_task_timepoint_list_t* list,
+    iree_hal_task_timepoint_t* timepoint) {
+  if (timepoint->prev != NULL) timepoint->prev->next = timepoint->next;
+  // Fix: the successor's back-link must also be updated; previously it was
+  // left pointing at the erased node, corrupting later erases/traversals.
+  if (timepoint->next != NULL) timepoint->next->prev = timepoint->prev;
+  if (timepoint == list->head) list->head = timepoint->next;
+  if (timepoint == list->tail) list->tail = timepoint->prev;
+  timepoint->prev = NULL;
+  timepoint->next = NULL;
+}
+
+// Moves every timepoint in |pending_list| whose payload is satisfied by the
+// timeline having reached |payload_value| into |out_ready_list|.
+// Unsatisfied timepoints remain pending in their original order.
+static void iree_hal_task_timepoint_list_take_ready(
+    iree_hal_task_timepoint_list_t* pending_list, uint64_t payload_value,
+    iree_hal_task_timepoint_list_t* out_ready_list) {
+  iree_hal_task_timepoint_list_initialize(out_ready_list);
+  for (iree_hal_task_timepoint_t* tp = pending_list->head; tp != NULL;) {
+    // Capture the successor before any relinking mutates |tp|.
+    iree_hal_task_timepoint_t* next_tp = tp->next;
+    if (tp->payload_value <= payload_value) {
+      iree_hal_task_timepoint_list_erase(pending_list, tp);
+      iree_hal_task_timepoint_list_append(out_ready_list, tp);
+    }
+    tp = next_tp;
+  }
+}
+
+// Signals the event of every timepoint in |ready_list| and resets the list.
+// The timepoints/events themselves stay owned by whoever requested them; this
+// only unlinks and fires them.
+static void iree_hal_task_timepoint_list_notify_ready(
+    iree_hal_task_timepoint_list_t* ready_list) {
+  for (iree_hal_task_timepoint_t* tp = ready_list->head; tp != NULL;) {
+    iree_hal_task_timepoint_t* next_tp = tp->next;
+    tp->next = NULL;
+    tp->prev = NULL;
+    iree_event_set(&tp->event);
+    tp = next_tp;
+  }
+  iree_hal_task_timepoint_list_initialize(ready_list);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_semaphore_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_task_semaphore_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  iree_event_pool_t* event_pool;
+
+  // Guards all mutable fields. We expect low contention on semaphores and since
+  // iree_slim_mutex_t is (effectively) just a CAS this keeps things simpler
+  // than trying to make the entire structure lock-free.
+  iree_slim_mutex_t mutex;
+
+  // Current signaled value. May be IREE_HAL_TASK_SEMAPHORE_FAILURE_VALUE to
+  // indicate that the semaphore has been signaled for failure and
+  // |failure_status| contains the error.
+  uint64_t current_value;
+
+  // OK or the status passed to iree_hal_semaphore_fail. Owned by the semaphore.
+  iree_status_t failure_status;
+
+  // In-process notification signaled when the semaphore value changes. This is
+  // used exclusively for wait-ones to avoid going to the kernel for a full wait
+  // handle operation.
+  iree_notification_t notification;
+
+  // A list of all reserved timepoints waiting for the semaphore to reach a
+  // certain payload value.
+  iree_hal_task_timepoint_list_t timepoint_list;
+} iree_hal_task_semaphore_t;
+
+static const iree_hal_semaphore_vtable_t iree_hal_task_semaphore_vtable;
+
+// Downcasts |base_value| to the task semaphore implementation type.
+// Debug builds assert the vtable matches via IREE_HAL_ASSERT_TYPE; this is
+// not a safe cast for untrusted inputs.
+static iree_hal_task_semaphore_t* iree_hal_task_semaphore_cast(
+    iree_hal_semaphore_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_task_semaphore_vtable);
+  return (iree_hal_task_semaphore_t*)base_value;
+}
+
+// Creates a task-system semaphore with |initial_value|; see task_semaphore.h
+// for the public contract. |event_pool| must remain live for the semaphore's
+// lifetime as timepoint events are acquired from and released back to it.
+iree_status_t iree_hal_task_semaphore_create(
+    iree_event_pool_t* event_pool, uint64_t initial_value,
+    iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore) {
+  IREE_ASSERT_ARGUMENT(event_pool);
+  IREE_ASSERT_ARGUMENT(out_semaphore);
+  *out_semaphore = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_task_semaphore_t* semaphore = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(*semaphore), (void**)&semaphore);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_task_semaphore_vtable,
+                                 &semaphore->resource);
+    semaphore->host_allocator = host_allocator;
+    semaphore->event_pool = event_pool;
+
+    iree_slim_mutex_initialize(&semaphore->mutex);
+    semaphore->current_value = initial_value;
+    semaphore->failure_status = iree_ok_status();
+    iree_notification_initialize(&semaphore->notification);
+    iree_hal_task_timepoint_list_initialize(&semaphore->timepoint_list);
+
+    *out_semaphore = (iree_hal_semaphore_t*)semaphore;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys |base_semaphore| after its final reference is released.
+static void iree_hal_task_semaphore_destroy(
+    iree_hal_semaphore_t* base_semaphore) {
+  iree_hal_task_semaphore_t* semaphore =
+      iree_hal_task_semaphore_cast(base_semaphore);
+  // Capture the allocator first so the final free does not read freed memory.
+  iree_allocator_t host_allocator = semaphore->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_free(semaphore->failure_status);
+  iree_notification_deinitialize(&semaphore->notification);
+  iree_slim_mutex_deinitialize(&semaphore->mutex);
+  iree_allocator_free(host_allocator, semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the current payload value in |out_value|. If the semaphore has
+// failed the sentinel value is stored and a clone of the failure status is
+// returned so the caller owns its own copy.
+static iree_status_t iree_hal_task_semaphore_query(
+    iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) {
+  iree_hal_task_semaphore_t* semaphore =
+      iree_hal_task_semaphore_cast(base_semaphore);
+
+  iree_slim_mutex_lock(&semaphore->mutex);
+
+  *out_value = semaphore->current_value;
+
+  iree_status_t status = iree_ok_status();
+  // The failure sentinel is UINT64_MAX so >= here is equivalent to ==.
+  if (*out_value >= IREE_HAL_TASK_SEMAPHORE_FAILURE_VALUE) {
+    status = iree_status_clone(semaphore->failure_status);
+  }
+
+  iree_slim_mutex_unlock(&semaphore->mutex);
+
+  return status;
+}
+
+// Signals |base_semaphore| to |new_value| and wakes satisfied timepoints.
+// Fails with OUT_OF_RANGE when |new_value| is not strictly greater than the
+// current value; this also covers an already-failed semaphore, whose current
+// value is the UINT64_MAX sentinel.
+static iree_status_t iree_hal_task_semaphore_signal(
+    iree_hal_semaphore_t* base_semaphore, uint64_t new_value) {
+  iree_hal_task_semaphore_t* semaphore =
+      iree_hal_task_semaphore_cast(base_semaphore);
+
+  iree_slim_mutex_lock(&semaphore->mutex);
+
+  if (new_value <= semaphore->current_value) {
+    // Copy the value out so it can be formatted after releasing the lock.
+    uint64_t current_value IREE_ATTRIBUTE_UNUSED = semaphore->current_value;
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "semaphore values must be monotonically "
+                            "increasing; current_value=%" PRIu64
+                            ", new_value=%" PRIu64,
+                            current_value, new_value);
+  }
+
+  semaphore->current_value = new_value;
+
+  // Scan for all timepoints that are now satisfied and move them to our local
+  // ready list. This way we can notify them without needing to continue holding
+  // the semaphore lock.
+  iree_hal_task_timepoint_list_t ready_list;
+  iree_hal_task_timepoint_list_take_ready(&semaphore->timepoint_list, new_value,
+                                          &ready_list);
+
+  iree_notification_post(&semaphore->notification, IREE_ALL_WAITERS);
+  iree_slim_mutex_unlock(&semaphore->mutex);
+
+  // Notify all waiters - note that this must happen outside the lock.
+  iree_hal_task_timepoint_list_notify_ready(&ready_list);
+
+  return iree_ok_status();
+}
+
+// Permanently fails |base_semaphore| with |status|, waking all pending
+// timepoints. Takes ownership of |status|; only the first failure is kept and
+// any later status is dropped.
+static void iree_hal_task_semaphore_fail(iree_hal_semaphore_t* base_semaphore,
+                                         iree_status_t status) {
+  iree_hal_task_semaphore_t* semaphore =
+      iree_hal_task_semaphore_cast(base_semaphore);
+
+  iree_slim_mutex_lock(&semaphore->mutex);
+
+  // Try to set our local status - we only preserve the first failure so only
+  // do this if we are going from a valid semaphore to a failed one.
+  if (!iree_status_is_ok(semaphore->failure_status)) {
+    // Previous status was not OK; drop our new status.
+    IREE_IGNORE_ERROR(status);
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    return;
+  }
+
+  // Signal to our failure sentinel value.
+  semaphore->current_value = IREE_HAL_TASK_SEMAPHORE_FAILURE_VALUE;
+  semaphore->failure_status = status;
+
+  // Take the whole timepoint list as we'll be signaling all of them. Since
+  // we hold the lock no other timepoints can be created while we are cleaning
+  // up.
+  iree_hal_task_timepoint_list_t ready_list;
+  iree_hal_task_timepoint_list_move(&semaphore->timepoint_list, &ready_list);
+
+  iree_notification_post(&semaphore->notification, IREE_ALL_WAITERS);
+  iree_slim_mutex_unlock(&semaphore->mutex);
+
+  // Notify all waiters - note that this must happen outside the lock.
+  iree_hal_task_timepoint_list_notify_ready(&ready_list);
+}
+
+// Acquires a timepoint waiting for the given value.
+// |out_timepoint| is owned by the caller and must be kept live until the
+// timepoint has been reached (or it is cancelled by the caller).
+//
+// Must be called with |semaphore->mutex| held: the timepoint is linked into
+// the semaphore's pending timepoint list.
+static iree_status_t iree_hal_task_semaphore_acquire_timepoint(
+    iree_hal_task_semaphore_t* semaphore, uint64_t minimum_value,
+    iree_hal_task_timepoint_t* out_timepoint) {
+  memset(out_timepoint, 0, sizeof(*out_timepoint));
+  out_timepoint->payload_value = minimum_value;
+  IREE_RETURN_IF_ERROR(
+      iree_event_pool_acquire(semaphore->event_pool, 1, &out_timepoint->event));
+  iree_hal_task_timepoint_list_append(&semaphore->timepoint_list,
+                                      out_timepoint);
+  return iree_ok_status();
+}
+
+typedef struct iree_hal_task_semaphore_wait_cmd_t {
+  iree_task_wait_t task;
+  iree_hal_task_semaphore_t* semaphore;
+  iree_hal_task_timepoint_t timepoint;
+} iree_hal_task_semaphore_wait_cmd_t;
+
+// Cleans up a wait task by returning the event used to the pool and - if the
+// task failed - ensuring we scrub it from the timepoint list.
+static void iree_hal_task_semaphore_wait_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
+  iree_hal_task_semaphore_wait_cmd_t* cmd =
+      (iree_hal_task_semaphore_wait_cmd_t*)task;
+  // NOTE(review): the event is released back to the pool before the timepoint
+  // is erased from the pending list below; confirm a concurrent signal cannot
+  // iree_event_set a pool-owned event in that window.
+  iree_event_pool_release(cmd->semaphore->event_pool, 1, &cmd->timepoint.event);
+  if (IREE_UNLIKELY(status_code != IREE_STATUS_OK)) {
+    // Abort the timepoint. Note that this is not designed to be fast as
+    // semaphore failure is an exceptional case.
+    iree_slim_mutex_lock(&cmd->semaphore->mutex);
+    iree_hal_task_timepoint_list_erase(&cmd->semaphore->timepoint_list,
+                                       &cmd->timepoint);
+    iree_slim_mutex_unlock(&cmd->semaphore->mutex);
+  }
+}
+
+// Enqueues a dependency for |issue_task| on |base_semaphore| reaching
+// |minimum_value|; see task_semaphore.h for the full contract. If the value
+// is already reached no wait task is created.
+iree_status_t iree_hal_task_semaphore_enqueue_timepoint(
+    iree_hal_semaphore_t* base_semaphore, uint64_t minimum_value,
+    iree_task_t* issue_task, iree_arena_allocator_t* arena,
+    iree_task_submission_t* submission) {
+  iree_hal_task_semaphore_t* semaphore =
+      iree_hal_task_semaphore_cast(base_semaphore);
+
+  iree_slim_mutex_lock(&semaphore->mutex);
+
+  iree_status_t status = iree_ok_status();
+  if (semaphore->current_value >= minimum_value) {
+    // Fast path: already satisfied.
+  } else {
+    // Slow path: acquire a system wait handle and perform a full wait.
+    // The command (and its timepoint) live in |arena| tied to the submission.
+    iree_hal_task_semaphore_wait_cmd_t* cmd = NULL;
+    status = iree_arena_allocate(arena, sizeof(*cmd), (void**)&cmd);
+    if (iree_status_is_ok(status)) {
+      status = iree_hal_task_semaphore_acquire_timepoint(
+          semaphore, minimum_value, &cmd->timepoint);
+    }
+    if (iree_status_is_ok(status)) {
+      iree_task_wait_initialize(issue_task->scope,
+                                iree_event_await(&cmd->timepoint.event),
+                                IREE_TIME_INFINITE_FUTURE, &cmd->task);
+      iree_task_set_cleanup_fn(&cmd->task.header,
+                               iree_hal_task_semaphore_wait_cmd_cleanup);
+      iree_task_set_completion_task(&cmd->task.header, issue_task);
+      cmd->semaphore = semaphore;
+      iree_task_submission_enqueue(submission, &cmd->task.header);
+    }
+  }
+
+  iree_slim_mutex_unlock(&semaphore->mutex);
+  return status;
+}
+
+// Blocks the caller until |base_semaphore| reaches |value|, the |timeout|
+// elapses (DEADLINE_EXCEEDED), or the semaphore fails (ABORTED; callers then
+// query for the exact failure status).
+static iree_status_t iree_hal_task_semaphore_wait(
+    iree_hal_semaphore_t* base_semaphore, uint64_t value,
+    iree_timeout_t timeout) {
+  iree_hal_task_semaphore_t* semaphore =
+      iree_hal_task_semaphore_cast(base_semaphore);
+
+  iree_slim_mutex_lock(&semaphore->mutex);
+
+  if (!iree_status_is_ok(semaphore->failure_status)) {
+    // Fastest path: failed; return an error to tell callers to query for it.
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    return iree_status_from_code(IREE_STATUS_ABORTED);
+  } else if (semaphore->current_value >= value) {
+    // Fast path: already satisfied.
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    return iree_ok_status();
+  } else if (iree_timeout_is_immediate(timeout)) {
+    // Not satisfied but a poll, so can avoid the expensive wait handle work.
+    iree_slim_mutex_unlock(&semaphore->mutex);
+    return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  }
+
+  // Convert to an absolute deadline so setup time counts against the timeout.
+  iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
+
+  // Slow path: acquire a timepoint while we hold the lock.
+  iree_hal_task_timepoint_t timepoint;
+  iree_status_t status =
+      iree_hal_task_semaphore_acquire_timepoint(semaphore, value, &timepoint);
+
+  iree_slim_mutex_unlock(&semaphore->mutex);
+  if (IREE_UNLIKELY(!iree_status_is_ok(status))) return status;
+
+  // Wait until the timepoint resolves.
+  // If satisfied the timepoint is automatically cleaned up and we are done. If
+  // the deadline is reached before satisfied then we have to clean it up.
+  status = iree_wait_one(&timepoint.event, deadline_ns);
+  if (!iree_status_is_ok(status)) {
+    iree_slim_mutex_lock(&semaphore->mutex);
+    iree_hal_task_timepoint_list_erase(&semaphore->timepoint_list, &timepoint);
+    iree_slim_mutex_unlock(&semaphore->mutex);
+  }
+  // The event is always returned to the pool regardless of wait outcome.
+  iree_event_pool_release(semaphore->event_pool, 1, &timepoint.event);
+  return status;
+}
+
+// Performs a multi-wait on |semaphore_list| using pooled events, with the
+// wait set and timepoint storage carved from |block_pool| to avoid heap
+// allocations. See task_semaphore.h for the public contract.
+iree_status_t iree_hal_task_semaphore_multi_wait(
+    iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
+    iree_event_pool_t* event_pool, iree_arena_block_pool_t* block_pool) {
+  IREE_ASSERT_ARGUMENT(semaphore_list);
+  if (semaphore_list->count == 0) {
+    return iree_ok_status();
+  } else if (semaphore_list->count == 1) {
+    // Fast-path for a single semaphore.
+    return iree_hal_semaphore_wait(semaphore_list->semaphores[0],
+                                   semaphore_list->payload_values[0], timeout);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Convert to an absolute deadline so setup time counts against the timeout.
+  iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
+
+  // Avoid heap allocations by using the device block pool for the wait set.
+  iree_arena_allocator_t arena;
+  iree_arena_initialize(block_pool, &arena);
+  iree_wait_set_t* wait_set = NULL;
+  iree_status_t status = iree_wait_set_allocate(
+      semaphore_list->count, iree_arena_allocator(&arena), &wait_set);
+
+  // Acquire a wait handle for each semaphore timepoint we are to wait on.
+  // TODO(benvanik): flip this API around so we can batch request events from
+  // the event pool. We should be acquiring all required time points in one
+  // call.
+  iree_host_size_t timepoint_count = 0;
+  iree_hal_task_timepoint_t* timepoints = NULL;
+  iree_host_size_t total_timepoint_size =
+      semaphore_list->count * sizeof(timepoints[0]);
+  if (iree_status_is_ok(status)) {
+    // Fix: previously this allocation unconditionally overwrote |status|,
+    // discarding a wait set allocation failure and then inserting into the
+    // NULL wait set below.
+    status =
+        iree_arena_allocate(&arena, total_timepoint_size, (void**)&timepoints);
+  }
+  if (iree_status_is_ok(status)) {
+    memset(timepoints, 0, total_timepoint_size);
+    for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) {
+      iree_hal_task_semaphore_t* semaphore =
+          iree_hal_task_semaphore_cast(semaphore_list->semaphores[i]);
+      iree_slim_mutex_lock(&semaphore->mutex);
+      if (semaphore->current_value >= semaphore_list->payload_values[i]) {
+        // Fast path: already satisfied.
+      } else {
+        // Slow path: get a native wait handle for the timepoint.
+        iree_hal_task_timepoint_t* timepoint = &timepoints[timepoint_count++];
+        status = iree_hal_task_semaphore_acquire_timepoint(
+            semaphore, semaphore_list->payload_values[i], timepoint);
+        if (iree_status_is_ok(status)) {
+          status = iree_wait_set_insert(wait_set, timepoint->event);
+        }
+      }
+      iree_slim_mutex_unlock(&semaphore->mutex);
+      if (!iree_status_is_ok(status)) break;
+    }
+  }
+
+  // Perform the wait.
+  if (iree_status_is_ok(status)) {
+    if (wait_mode == IREE_HAL_WAIT_MODE_ANY) {
+      status = iree_wait_any(wait_set, deadline_ns, /*out_wake_handle=*/NULL);
+    } else {
+      status = iree_wait_all(wait_set, deadline_ns);
+    }
+  }
+
+  if (timepoints != NULL) {
+    // TODO(benvanik): if we flip the API to multi-acquire events from the pool
+    // above then we can multi-release here too.
+    // NOTE(review): acquired timepoints are not erased from their semaphores'
+    // pending lists before the events are released back to the pool here;
+    // confirm a late signal cannot touch a released event.
+    for (iree_host_size_t i = 0; i < timepoint_count; ++i) {
+      iree_event_pool_release(event_pool, 1, &timepoints[i].event);
+    }
+  }
+  // NOTE(review): assumes iree_wait_set_free tolerates NULL when allocation
+  // failed — confirm.
+  iree_wait_set_free(wait_set);
+  iree_arena_deinitialize(&arena);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static const iree_hal_semaphore_vtable_t iree_hal_task_semaphore_vtable = {
+    .destroy = iree_hal_task_semaphore_destroy,
+    .query = iree_hal_task_semaphore_query,
+    .signal = iree_hal_task_semaphore_signal,
+    .fail = iree_hal_task_semaphore_fail,
+    .wait = iree_hal_task_semaphore_wait,
+};
diff --git a/runtime/src/iree/hal/local/task_semaphore.h b/runtime/src/iree/hal/local/task_semaphore.h
new file mode 100644
index 0000000..f3a1060
--- /dev/null
+++ b/runtime/src/iree/hal/local/task_semaphore.h
@@ -0,0 +1,51 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_LOCAL_TASK_SEMAPHORE_H_
+#define IREE_HAL_LOCAL_TASK_SEMAPHORE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/arena.h"
+#include "iree/base/internal/event_pool.h"
+#include "iree/hal/api.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a semaphore that integrates with the task system to allow for
+// pipelined wait and signal operations.
+iree_status_t iree_hal_task_semaphore_create(
+    iree_event_pool_t* event_pool, uint64_t initial_value,
+    iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore);
+
+// Reserves a new timepoint in the timeline for the given minimum payload value.
+// |issue_task| will wait until the timeline semaphore is signaled to at least
+// |minimum_value| before proceeding, with a possible wait task generated and
+// appended to the |submission|. Allocations for any intermediates will be made
+// from |arena| whose lifetime must be tied to the submission.
+iree_status_t iree_hal_task_semaphore_enqueue_timepoint(
+    iree_hal_semaphore_t* semaphore, uint64_t minimum_value,
+    iree_task_t* issue_task, iree_arena_allocator_t* arena,
+    iree_task_submission_t* submission);
+
+// Performs a multi-wait on one or more semaphores.
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if the wait does not complete before
+// the |timeout| elapses.
+iree_status_t iree_hal_task_semaphore_multi_wait(
+    iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
+    iree_event_pool_t* event_pool, iree_arena_block_pool_t* block_pool);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_LOCAL_TASK_SEMAPHORE_H_
diff --git a/runtime/src/iree/hal/resource.h b/runtime/src/iree/hal/resource.h
new file mode 100644
index 0000000..0f7abbe
--- /dev/null
+++ b/runtime/src/iree/hal/resource.h
@@ -0,0 +1,116 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_RESOURCE_H_
+#define IREE_HAL_RESOURCE_H_
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Abstract resource type whose lifetime is managed by reference counting.
+// Used mostly just to get a virtual dtor and vtable, though we could add nicer
+// logging by allowing resources to capture debug names, stack traces of
+// creation, etc.
+//
+// All resource types must have the iree_hal_resource_t at offset 0. This allows
+// the HAL code to cast any type pointer to a resource to gain access to the
+// ref count and vtable at predictable locations. Note that this allows for the
+// resource to be at >0 of the allocation but the pointers used with the HAL
+// (iree_hal_event_t*, etc) must point to the iree_hal_resource_t.
+typedef struct iree_hal_resource_t {
+  // Reference count used to manage resource lifetime. The vtable->destroy
+  // method will be called when the reference count falls to zero.
+  iree_atomic_ref_count_t ref_count;
+
+  // Opaque vtable for the resource object.
+  // Must start with iree_hal_resource_vtable_t at offset 0.
+  //
+  // NOTE: this field may be hidden in the future. Only use this for
+  // IREE_HAL_VTABLE_DISPATCH and not equality/direct dereferencing.
+  const void* vtable;
+
+  // TODO(benvanik): debug string/logging utilities.
+} iree_hal_resource_t;
+
+// Base vtable for all resources.
+// This provides the base functions required to generically manipulate resources
+// of various types.
+//
+// This must be aliased at offset 0 of all typed vtables:
+//   typedef struct iree_hal_foo_vtable_t {
+//     void(IREE_API_PTR* destroy)(...);
+//     void(IREE_API_PTR* foo_method)(...);
+//   } iree_hal_foo_vtable_t;
+typedef struct iree_hal_resource_vtable_t {
+  // Destroys the resource upon the final reference being released.
+  // The resource pointer must be assumed invalid upon return from the function
+  // (even if in some implementations its returned to a pool and still live).
+  void(IREE_API_PTR* destroy)(iree_hal_resource_t* resource);
+} iree_hal_resource_vtable_t;
+
+// Verifies that the vtable has the right resource sub-vtable.
+#define IREE_HAL_ASSERT_VTABLE_LAYOUT(vtable_type)   \
+  static_assert(offsetof(vtable_type, destroy) == 0, \
+                "iree_hal_resource_vtable_t must be at offset 0");
+
+// Initializes the base resource fields: stores |vtable| and sets the
+// reference count to its initial value.
+static inline void iree_hal_resource_initialize(
+    const void* vtable, iree_hal_resource_t* out_resource) {
+  out_resource->vtable = vtable;
+  iree_atomic_ref_count_init(&out_resource->ref_count);
+}
+
+// Retains |any_resource| for the caller; a NULL resource is a no-op.
+static inline void iree_hal_resource_retain(const void* any_resource) {
+  iree_hal_resource_t* resource = (iree_hal_resource_t*)any_resource;
+  if (IREE_UNLIKELY(!resource)) return;
+  iree_atomic_ref_count_inc(&resource->ref_count);
+}
+
+// Releases |any_resource|, destroying it via its vtable when the last
+// reference drops. Prefer the type-specific release functions (such as
+// iree_hal_buffer_release) where available for better optimization and
+// compile-time type safety; NULL is a no-op.
+static inline void iree_hal_resource_release(const void* any_resource) {
+  iree_hal_resource_t* resource = (iree_hal_resource_t*)any_resource;
+  if (IREE_UNLIKELY(!resource)) return;
+  if (iree_atomic_ref_count_dec(&resource->ref_count) != 1) return;
+  const iree_hal_resource_vtable_t* vtable =
+      (const iree_hal_resource_vtable_t*)resource->vtable;
+  vtable->destroy(resource);
+}
+
+// Returns true if |resource| carries the given |vtable| pointer.
+// This checks vtable compatibility, not strict type identity: LTO may (very
+// rarely) dedupe identical vtables so distinct spellings can compare equal.
+static inline bool iree_hal_resource_is(const void* resource,
+                                        const void* vtable) {
+  if (!resource) return false;
+  return ((const iree_hal_resource_t*)resource)->vtable == vtable;
+}
+
+// Asserts (**DEBUG ONLY**) that the |resource| has the given |vtable| type.
+// This is only useful to check for programmer error and may have false
+// positives - do not rely on it for handling untrusted user input.
+#define IREE_HAL_ASSERT_TYPE(resource, vtable)             \
+  IREE_ASSERT_TRUE(iree_hal_resource_is(resource, vtable), \
+                   "type does not match expected " #vtable)
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_RESOURCE_H_
diff --git a/runtime/src/iree/hal/semaphore.c b/runtime/src/iree/hal/semaphore.c
new file mode 100644
index 0000000..50608e9
--- /dev/null
+++ b/runtime/src/iree/hal/semaphore.c
@@ -0,0 +1,73 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/semaphore.h"
+
+#include <stddef.h>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/detail.h"
+#include "iree/hal/device.h"
+#include "iree/hal/resource.h"
+
+#define _VTABLE_DISPATCH(semaphore, method_name) \
+  IREE_HAL_VTABLE_DISPATCH(semaphore, iree_hal_semaphore, method_name)
+
+IREE_HAL_API_RETAIN_RELEASE(semaphore);
+
+// Creates a semaphore by dispatching through the device vtable.
+// |out_semaphore| is cleared first so callers never observe a stale pointer
+// when creation fails.
+IREE_API_EXPORT iree_status_t
+iree_hal_semaphore_create(iree_hal_device_t* device, uint64_t initial_value,
+                          iree_hal_semaphore_t** out_semaphore) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(out_semaphore);
+  *out_semaphore = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status =
+      IREE_HAL_VTABLE_DISPATCH(device, iree_hal_device, create_semaphore)(
+          device, initial_value, out_semaphore);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Queries the current payload value through the implementation vtable.
+// |out_value| is zeroed first so it is defined even on dispatch failure.
+IREE_API_EXPORT iree_status_t
+iree_hal_semaphore_query(iree_hal_semaphore_t* semaphore, uint64_t* out_value) {
+  IREE_ASSERT_ARGUMENT(semaphore);
+  IREE_ASSERT_ARGUMENT(out_value);
+  *out_value = 0;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status =
+      _VTABLE_DISPATCH(semaphore, query)(semaphore, out_value);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Signals |semaphore| to |new_value| through the implementation vtable.
+IREE_API_EXPORT iree_status_t
+iree_hal_semaphore_signal(iree_hal_semaphore_t* semaphore, uint64_t new_value) {
+  IREE_ASSERT_ARGUMENT(semaphore);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status =
+      _VTABLE_DISPATCH(semaphore, signal)(semaphore, new_value);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Fails |semaphore| with |status| through the implementation vtable.
+// Ownership of |status| transfers to the implementation (see for example
+// iree_hal_task_semaphore_fail).
+IREE_API_EXPORT void iree_hal_semaphore_fail(iree_hal_semaphore_t* semaphore,
+                                             iree_status_t status) {
+  IREE_ASSERT_ARGUMENT(semaphore);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  _VTABLE_DISPATCH(semaphore, fail)(semaphore, status);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Waits for |semaphore| to reach |value| through the implementation vtable,
+// bounded by |timeout|.
+IREE_API_EXPORT iree_status_t iree_hal_semaphore_wait(
+    iree_hal_semaphore_t* semaphore, uint64_t value, iree_timeout_t timeout) {
+  IREE_ASSERT_ARGUMENT(semaphore);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status =
+      _VTABLE_DISPATCH(semaphore, wait)(semaphore, value, timeout);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/semaphore.h b/runtime/src/iree/hal/semaphore.h
new file mode 100644
index 0000000..afc8959
--- /dev/null
+++ b/runtime/src/iree/hal/semaphore.h
@@ -0,0 +1,138 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_SEMAPHORE_H_
+#define IREE_HAL_SEMAPHORE_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_hal_device_t iree_hal_device_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_semaphore_t
+//===----------------------------------------------------------------------===//
+
+// Synchronization mechanism for host->device, device->host, host->host,
+// and device->device notification. Semaphores behave like Vulkan timeline
+// semaphores (or D3D12 fences) and contain a monotonically increasing
+// uint64_t payload. They may be waited on any number of times even if they
+// have already been signaled for a particular value. They may also be waited
+// on for a particular value prior to the signal for that value.
+//
+// A semaphore is updated to its new value after all prior commands have
+// completed but the delay between completion and the host being woken varies.
+// Some implementations may coalesce semaphores to avoid spurious waking while
+// others will immediately synchronize with the host.
+//
+// One use of semaphores is for resource lifetime management: all resources used
+// by a set of submission batches must be considered live until the semaphore
+// attached to the submission has signaled.
+//
+// Another use of semaphores is device->device synchronization for setting up
+// the DAG of command buffers across queue submissions. This allows devices to
+// perform non-trivial scheduling behavior without the need to wake the host.
+//
+// Semaphores may be set to a permanently failed state by implementations when
+// errors occur during asynchronous execution. Users are expected to propagate
+// the failures and possibly reset the entire device that produced the error.
+//
+// For more information on semaphores see the following docs describing how
+// timelines are generally used (specifically in the device->host case):
+// https://www.youtube.com/watch?v=SpE--Rf516Y
+// https://www.khronos.org/assets/uploads/developers/library/2018-xdc/Vulkan-Timeline-Semaphores-Part-1_Sep18.pdf
+// https://docs.microsoft.com/en-us/windows/win32/direct3d12/user-mode-heap-synchronization
+typedef struct iree_hal_semaphore_t iree_hal_semaphore_t;
+
+// Creates a semaphore that can be used with command queues owned by this
+// device. To use the semaphores with other devices or instances they must
+// first be exported.
+IREE_API_EXPORT iree_status_t
+iree_hal_semaphore_create(iree_hal_device_t* device, uint64_t initial_value,
+                          iree_hal_semaphore_t** out_semaphore);
+
+// Retains the given |semaphore| for the caller.
+IREE_API_EXPORT void iree_hal_semaphore_retain(iree_hal_semaphore_t* semaphore);
+
+// Releases the given |semaphore| from the caller.
+IREE_API_EXPORT void iree_hal_semaphore_release(
+    iree_hal_semaphore_t* semaphore);
+
+// Queries the current payload of the semaphore and stores the result in
+// |out_value|. As the payload is monotonically increasing it is guaranteed that
+// the value is at least equal to the previous result of a
+// iree_hal_semaphore_query call and coherent with any waits for a
+// specified value via iree_device_wait_all_semaphores.
+//
+// Returns the status at the time the method is called without blocking and as
+// such is only valid after a semaphore has been signaled. The same failure
+// status will be returned regardless of when in the timeline the error
+// occurred.
+IREE_API_EXPORT iree_status_t
+iree_hal_semaphore_query(iree_hal_semaphore_t* semaphore, uint64_t* out_value);
+
+// Signals the |semaphore| to the given payload value.
+// The call is ignored if the current payload value exceeds |new_value|.
+IREE_API_EXPORT iree_status_t
+iree_hal_semaphore_signal(iree_hal_semaphore_t* semaphore, uint64_t new_value);
+
+// Signals the |semaphore| with a failure. The |status| will be returned from
+// iree_hal_semaphore_query and iree_hal_semaphore_signal for the lifetime
+// of the semaphore. Ownership of the status transfers to the semaphore and
+// callers must clone it if they wish to retain it.
+IREE_API_EXPORT void iree_hal_semaphore_fail(iree_hal_semaphore_t* semaphore,
+                                             iree_status_t status);
+
+// Blocks the caller until the semaphore reaches or exceeds the specified
+// payload value or the |timeout| elapses.
+//
+// Returns success if the wait is successful and the semaphore has met or
+// exceeded the required payload value.
+//
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if the |timeout| elapses without the
+// semaphore reaching the required value. If an asynchronous failure occurred
+// this will return the failure status that was set immediately.
+//
+// Returns IREE_STATUS_ABORTED if one or more semaphores has failed. Callers can
+// use iree_hal_semaphore_query on the semaphores to find the ones that have
+// failed and get the status.
+IREE_API_EXPORT iree_status_t iree_hal_semaphore_wait(
+    iree_hal_semaphore_t* semaphore, uint64_t value, iree_timeout_t timeout);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_semaphore_t implementation details
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_semaphore_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_semaphore_t* semaphore);
+
+  iree_status_t(IREE_API_PTR* query)(iree_hal_semaphore_t* semaphore,
+                                     uint64_t* out_value);
+  iree_status_t(IREE_API_PTR* signal)(iree_hal_semaphore_t* semaphore,
+                                      uint64_t new_value);
+  void(IREE_API_PTR* fail)(iree_hal_semaphore_t* semaphore,
+                           iree_status_t status);
+
+  iree_status_t(IREE_API_PTR* wait)(iree_hal_semaphore_t* semaphore,
+                                    uint64_t value, iree_timeout_t timeout);
+} iree_hal_semaphore_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_semaphore_vtable_t);
+
+IREE_API_EXPORT void iree_hal_semaphore_destroy(
+    iree_hal_semaphore_t* semaphore);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_SEMAPHORE_H_
diff --git a/runtime/src/iree/hal/string_util.c b/runtime/src/iree/hal/string_util.c
new file mode 100644
index 0000000..0938301
--- /dev/null
+++ b/runtime/src/iree/hal/string_util.c
@@ -0,0 +1,599 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/string_util.h"
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/math.h"
+#include "iree/hal/buffer_view.h"
+
+// Parses an `AxBxC`-style shape string into |out_shape|/|out_shape_rank|.
+// |out_shape_rank| always receives the parsed rank so callers can use a
+// too-small |shape_capacity| as a capacity query (OUT_OF_RANGE is returned).
+IREE_API_EXPORT iree_status_t iree_hal_parse_shape(
+    iree_string_view_t value, iree_host_size_t shape_capacity,
+    iree_hal_dim_t* out_shape, iree_host_size_t* out_shape_rank) {
+  IREE_ASSERT_ARGUMENT(out_shape_rank);
+  *out_shape_rank = 0;
+
+  if (iree_string_view_is_empty(value)) {
+    return iree_ok_status();  // empty shape
+  }
+
+  // Count the number of dimensions to see if we have capacity.
+  iree_host_size_t shape_rank = 1;  // always at least one if we are not empty
+  for (iree_host_size_t i = 0; i < value.size; ++i) {
+    if (value.data[i] == 'x') ++shape_rank;
+  }
+  if (out_shape_rank) {
+    *out_shape_rank = shape_rank;
+  }
+  if (shape_rank > shape_capacity) {
+    // NOTE: fast return for capacity queries.
+    return iree_status_from_code(IREE_STATUS_OUT_OF_RANGE);
+  }
+
+  // Split on `x` and parse each dimension; only non-negative values that
+  // parse as an int32 are accepted.
+  iree_host_size_t dim_index = 0;
+  iree_string_view_t lhs;
+  iree_string_view_t rhs = value;
+  while (iree_string_view_split(rhs, 'x', &lhs, &rhs) &&
+         !iree_string_view_is_empty(lhs)) {
+    int32_t dim_value = 0;
+    if (!iree_string_view_atoi_int32(lhs, &dim_value)) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "shape[%zu] invalid value '%.*s' of '%.*s'",
+                              dim_index, (int)lhs.size, lhs.data,
+                              (int)value.size, value.data);
+    }
+    if (dim_value < 0) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "shape[%zu] unsupported value %d of '%.*s'",
+                              dim_index, dim_value, (int)value.size,
+                              value.data);
+    }
+    out_shape[dim_index++] = dim_value;
+  }
+  // Catches malformed strings such as `4x` or `4xx5` where fewer dimensions
+  // parse than the separator count promised.
+  if (dim_index != shape_rank) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "invalid shape specification: '%.*s'",
+                            (int)value.size, value.data);
+  }
+  return iree_ok_status();
+}
+
+// Formats |shape| as an `AxBxC`-style string. When |buffer| is NULL (or the
+// capacity is exhausted mid-write) the total required length is still
+// accumulated into |out_buffer_length| and OUT_OF_RANGE is returned.
+IREE_API_EXPORT iree_status_t
+iree_hal_format_shape(const iree_hal_dim_t* shape, iree_host_size_t shape_rank,
+                      iree_host_size_t buffer_capacity, char* buffer,
+                      iree_host_size_t* out_buffer_length) {
+  if (out_buffer_length) {
+    *out_buffer_length = 0;
+  }
+  iree_host_size_t buffer_length = 0;
+  for (iree_host_size_t i = 0; i < shape_rank; ++i) {
+    // `<dim>x` for all but the last dimension; size 0 when only measuring.
+    int n = snprintf(buffer ? buffer + buffer_length : NULL,
+                     buffer ? buffer_capacity - buffer_length : 0,
+                     (i < shape_rank - 1) ? "%dx" : "%d", shape[i]);
+    if (IREE_UNLIKELY(n < 0)) {
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "snprintf failed to write dimension %zu", i);
+    } else if (buffer && n >= buffer_capacity - buffer_length) {
+      // Out of capacity: stop writing but keep counting the required length.
+      buffer = NULL;
+    }
+    buffer_length += n;
+  }
+  if (out_buffer_length) {
+    *out_buffer_length = buffer_length;
+  }
+  return buffer ? iree_ok_status()
+                : iree_status_from_code(IREE_STATUS_OUT_OF_RANGE);
+}
+
+// Parses a type string such as `f32`, `si16`, or `x8` into an
+// iree_hal_element_type_t: a numerical-class prefix followed by a bit width.
+IREE_API_EXPORT iree_status_t iree_hal_parse_element_type(
+    iree_string_view_t value, iree_hal_element_type_t* out_element_type) {
+  IREE_ASSERT_ARGUMENT(out_element_type);
+  *out_element_type = IREE_HAL_ELEMENT_TYPE_NONE;
+
+  // Strip the numerical-class prefix; what remains must be the bit width.
+  iree_string_view_t suffix = value;
+  iree_hal_numerical_type_t numerical_type;
+  if (iree_string_view_consume_prefix(&suffix, IREE_SV("i"))) {
+    numerical_type = IREE_HAL_NUMERICAL_TYPE_INTEGER;
+  } else if (iree_string_view_consume_prefix(&suffix, IREE_SV("si"))) {
+    numerical_type = IREE_HAL_NUMERICAL_TYPE_INTEGER_SIGNED;
+  } else if (iree_string_view_consume_prefix(&suffix, IREE_SV("ui"))) {
+    numerical_type = IREE_HAL_NUMERICAL_TYPE_INTEGER_UNSIGNED;
+  } else if (iree_string_view_consume_prefix(&suffix, IREE_SV("f"))) {
+    numerical_type = IREE_HAL_NUMERICAL_TYPE_FLOAT_IEEE;
+  } else if (iree_string_view_consume_prefix(&suffix, IREE_SV("bf"))) {
+    numerical_type = IREE_HAL_NUMERICAL_TYPE_FLOAT_BRAIN;
+  } else if (iree_string_view_consume_prefix(&suffix, IREE_SV("x")) ||
+             iree_string_view_consume_prefix(&suffix, IREE_SV("*"))) {
+    numerical_type = IREE_HAL_NUMERICAL_TYPE_UNKNOWN;
+  } else {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "unhandled element type prefix in '%.*s'",
+                            (int)value.size, value.data);
+  }
+
+  // Bit widths are stored in a single byte of the packed element type.
+  uint32_t bit_count = 0;
+  if (!iree_string_view_atoi_uint32(suffix, &bit_count) || bit_count > 0xFFu) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "out of range bit count in '%.*s'", (int)value.size,
+                            value.data);
+  }
+
+  *out_element_type = iree_hal_make_element_type(numerical_type, bit_count);
+  return iree_ok_status();
+}
+
+// Formats |element_type| as its canonical string (e.g. `f32`, `si16`).
+// A NULL |buffer| acts as a length query: the required character count is
+// returned in |out_buffer_length| alongside an OUT_OF_RANGE status.
+IREE_API_EXPORT iree_status_t iree_hal_format_element_type(
+    iree_hal_element_type_t element_type, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length) {
+  if (out_buffer_length) {
+    *out_buffer_length = 0;
+  }
+  // Map the numerical class to its canonical prefix.
+  const char* prefix;
+  switch (iree_hal_element_numerical_type(element_type)) {
+    case IREE_HAL_NUMERICAL_TYPE_INTEGER:
+      prefix = "i";
+      break;
+    case IREE_HAL_NUMERICAL_TYPE_INTEGER_SIGNED:
+      prefix = "si";
+      break;
+    case IREE_HAL_NUMERICAL_TYPE_INTEGER_UNSIGNED:
+      prefix = "ui";
+      break;
+    case IREE_HAL_NUMERICAL_TYPE_FLOAT_IEEE:
+      prefix = "f";
+      break;
+    case IREE_HAL_NUMERICAL_TYPE_FLOAT_BRAIN:
+      prefix = "bf";
+      break;
+    default:
+      prefix = "*";
+      break;
+  }
+  // NOTE: pass size 0 when |buffer| is NULL so length queries are
+  // well-defined (snprintf with a NULL destination and non-zero size is UB);
+  // this matches the other formatters in this file.
+  int n = snprintf(buffer, buffer ? buffer_capacity : 0, "%s%d", prefix,
+                   (int32_t)iree_hal_element_bit_count(element_type));
+  if (n < 0) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, "snprintf failed");
+  }
+  if (out_buffer_length) {
+    *out_buffer_length = n;
+  }
+  return (buffer && n < buffer_capacity)
+             ? iree_ok_status()
+             : iree_status_from_code(IREE_STATUS_OUT_OF_RANGE);
+}
+
+// Parses a string of two character pairs representing hex numbers into bytes.
+// The table maps each ASCII character to its hex nibble value; characters
+// outside [0-9A-Fa-f] map to 0 — no validation is performed here, callers
+// only check the string length (see iree_hal_parse_element_unsafe).
+static void iree_hal_hex_string_to_bytes(const char* from, uint8_t* to,
+                                         ptrdiff_t num) {
+  /* clang-format off */
+  static const char kHexValue[256] = {
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  1,  2,  3,  4,  5,  6, 7, 8, 9, 0, 0, 0, 0, 0, 0,  // '0'..'9'
+      0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'A'..'F'
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'a'..'f'
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  };
+  /* clang-format on */
+  // High nibble from the first char of each pair, low nibble from the second.
+  for (int i = 0; i < num; i++) {
+    to[i] = (kHexValue[from[i * 2] & 0xFF] << 4) +
+            (kHexValue[from[i * 2 + 1] & 0xFF]);
+  }
+}
+
+// Parses a single element string, assuming that the caller has validated that
+// |out_data| has enough storage space for the parsed element data.
+// Unknown element types are parsed as raw hex bytes (two chars per byte).
+static iree_status_t iree_hal_parse_element_unsafe(
+    iree_string_view_t data_str, iree_hal_element_type_t element_type,
+    uint8_t* out_data) {
+  switch (element_type) {
+    case IREE_HAL_ELEMENT_TYPE_INT_8:
+    case IREE_HAL_ELEMENT_TYPE_SINT_8: {
+      int32_t temp = 0;
+      // Check both bounds: an upper-bound-only check would let values below
+      // INT8_MIN silently truncate on the narrowing cast.
+      if (!iree_string_view_atoi_int32(data_str, &temp) || temp > INT8_MAX ||
+          temp < INT8_MIN) {
+        return iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+      }
+      *(int8_t*)out_data = (int8_t)temp;
+      return iree_ok_status();
+    }
+    case IREE_HAL_ELEMENT_TYPE_UINT_8: {
+      uint32_t temp = 0;
+      if (!iree_string_view_atoi_uint32(data_str, &temp) || temp > UINT8_MAX) {
+        return iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+      }
+      *(uint8_t*)out_data = (uint8_t)temp;
+      return iree_ok_status();
+    }
+    case IREE_HAL_ELEMENT_TYPE_INT_16:
+    case IREE_HAL_ELEMENT_TYPE_SINT_16: {
+      int32_t temp = 0;
+      // As above: reject values outside [INT16_MIN, INT16_MAX].
+      if (!iree_string_view_atoi_int32(data_str, &temp) || temp > INT16_MAX ||
+          temp < INT16_MIN) {
+        return iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+      }
+      *(int16_t*)out_data = (int16_t)temp;
+      return iree_ok_status();
+    }
+    case IREE_HAL_ELEMENT_TYPE_UINT_16: {
+      uint32_t temp = 0;
+      if (!iree_string_view_atoi_uint32(data_str, &temp) || temp > UINT16_MAX) {
+        return iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+      }
+      *(uint16_t*)out_data = (uint16_t)temp;
+      return iree_ok_status();
+    }
+    case IREE_HAL_ELEMENT_TYPE_INT_32:
+    case IREE_HAL_ELEMENT_TYPE_SINT_32:
+      return iree_string_view_atoi_int32(data_str, (int32_t*)out_data)
+                 ? iree_ok_status()
+                 : iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+    case IREE_HAL_ELEMENT_TYPE_UINT_32:
+      return iree_string_view_atoi_uint32(data_str, (uint32_t*)out_data)
+                 ? iree_ok_status()
+                 : iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+    case IREE_HAL_ELEMENT_TYPE_INT_64:
+    case IREE_HAL_ELEMENT_TYPE_SINT_64:
+      return iree_string_view_atoi_int64(data_str, (int64_t*)out_data)
+                 ? iree_ok_status()
+                 : iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+    case IREE_HAL_ELEMENT_TYPE_UINT_64:
+      return iree_string_view_atoi_uint64(data_str, (uint64_t*)out_data)
+                 ? iree_ok_status()
+                 : iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_16: {
+      float temp = 0;
+      if (!iree_string_view_atof(data_str, &temp)) {
+        return iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+      }
+      // f16 values are stored as their uint16_t bit pattern.
+      *(uint16_t*)out_data = iree_math_f32_to_f16(temp);
+      return iree_ok_status();
+    }
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_32:
+      return iree_string_view_atof(data_str, (float*)out_data)
+                 ? iree_ok_status()
+                 : iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_64:
+      return iree_string_view_atod(data_str, (double*)out_data)
+                 ? iree_ok_status()
+                 : iree_status_from_code(IREE_STATUS_INVALID_ARGUMENT);
+    default: {
+      // Treat any unknown format as binary.
+      iree_host_size_t element_size =
+          iree_hal_element_dense_byte_count(element_type);
+      if (data_str.size != element_size * 2) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "binary hex element count mismatch: buffer "
+                                "length=%zu < expected=%zu",
+                                data_str.size, element_size * 2);
+      }
+      iree_hal_hex_string_to_bytes(data_str.data, out_data, element_size);
+      return iree_ok_status();
+    }
+  }
+}
+
+// Validates that |data_ptr| can hold one element of |element_type| and then
+// delegates the actual parsing to iree_hal_parse_element_unsafe.
+IREE_API_EXPORT iree_status_t iree_hal_parse_element(
+    iree_string_view_t data_str, iree_hal_element_type_t element_type,
+    iree_byte_span_t data_ptr) {
+  const iree_host_size_t element_size =
+      iree_hal_element_dense_byte_count(element_type);
+  if (data_ptr.data_length >= element_size) {
+    return iree_hal_parse_element_unsafe(data_str, element_type, data_ptr.data);
+  }
+  return iree_make_status(
+      IREE_STATUS_INVALID_ARGUMENT,
+      "output data buffer overflow: data_length=%zu < element_size=%zu",
+      data_ptr.data_length, element_size);
+}
+
+// Converts a sequence of bytes into hex number strings (uppercase, two chars
+// per byte). Does not NUL-terminate |dest|.
+static void iree_hal_bytes_to_hex_string(const uint8_t* src, char* dest,
+                                         ptrdiff_t num) {
+  static const char kHexDigits[] = "0123456789ABCDEF";
+  for (ptrdiff_t i = 0; i < num; ++i) {
+    dest[2 * i + 0] = kHexDigits[src[i] >> 4];    // high nibble
+    dest[2 * i + 1] = kHexDigits[src[i] & 0x0F];  // low nibble
+  }
+}
+
+// Formats a single element of |element_type| read from |data| into |buffer|
+// using a printf format matched to the element width; unknown types are
+// emitted as uppercase hex bytes. On overflow the required length is still
+// reported via |out_buffer_length| with an OUT_OF_RANGE status.
+IREE_API_EXPORT iree_status_t iree_hal_format_element(
+    iree_const_byte_span_t data, iree_hal_element_type_t element_type,
+    iree_host_size_t buffer_capacity, char* buffer,
+    iree_host_size_t* out_buffer_length) {
+  iree_host_size_t element_size =
+      iree_hal_element_dense_byte_count(element_type);
+  if (data.data_length < element_size) {
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "data buffer underflow: data_length=%zu < element_size=%zu",
+        data.data_length, element_size);
+  }
+  // NOTE: size 0 is passed when |buffer| is NULL so snprintf only measures.
+  int n = 0;
+  switch (element_type) {
+    case IREE_HAL_ELEMENT_TYPE_INT_8:
+    case IREE_HAL_ELEMENT_TYPE_SINT_8:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%" PRIi8,
+                   *(const int8_t*)data.data);
+      break;
+    case IREE_HAL_ELEMENT_TYPE_UINT_8:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%" PRIu8,
+                   *(const uint8_t*)data.data);
+      break;
+    case IREE_HAL_ELEMENT_TYPE_INT_16:
+    case IREE_HAL_ELEMENT_TYPE_SINT_16:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%" PRIi16,
+                   *(const int16_t*)data.data);
+      break;
+    case IREE_HAL_ELEMENT_TYPE_UINT_16:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%" PRIu16,
+                   *(const uint16_t*)data.data);
+      break;
+    case IREE_HAL_ELEMENT_TYPE_INT_32:
+    case IREE_HAL_ELEMENT_TYPE_SINT_32:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%" PRIi32,
+                   *(const int32_t*)data.data);
+      break;
+    case IREE_HAL_ELEMENT_TYPE_UINT_32:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%" PRIu32,
+                   *(const uint32_t*)data.data);
+      break;
+    case IREE_HAL_ELEMENT_TYPE_INT_64:
+    case IREE_HAL_ELEMENT_TYPE_SINT_64:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%" PRIi64,
+                   *(const int64_t*)data.data);
+      break;
+    case IREE_HAL_ELEMENT_TYPE_UINT_64:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%" PRIu64,
+                   *(const uint64_t*)data.data);
+      break;
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_16:
+      // f16 is stored as a uint16_t bit pattern; widen to f32 for printing.
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%G",
+                   iree_math_f16_to_f32(*(const uint16_t*)data.data));
+      break;
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_32:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%G",
+                   *(const float*)data.data);
+      break;
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_64:
+      n = snprintf(buffer, buffer ? buffer_capacity : 0, "%G",
+                   *(const double*)data.data);
+      break;
+    default: {
+      // Treat any unknown format as binary.
+      n = 2 * (int)element_size;
+      if (buffer && buffer_capacity > n) {
+        iree_hal_bytes_to_hex_string(data.data, buffer, element_size);
+        buffer[n] = 0;  // NUL terminate (the hex writer does not)
+      }
+    }
+  }
+  if (n < 0) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, "snprintf failed");
+  } else if (buffer && n >= buffer_capacity) {
+    // Truncated: report the required length but fail with OUT_OF_RANGE below.
+    buffer = NULL;
+  }
+  if (out_buffer_length) {
+    *out_buffer_length = n;
+  }
+  return buffer ? iree_ok_status()
+                : iree_status_from_code(IREE_STATUS_OUT_OF_RANGE);
+}
+
+// Parses a flat list of elements (accepting whitespace, `,`, `[`, and `]` as
+// separators) into |data_ptr|. An empty string zero-fills the buffer and a
+// single parsed value is splatted across all elements.
+IREE_API_EXPORT iree_status_t iree_hal_parse_buffer_elements(
+    iree_string_view_t data_str, iree_hal_element_type_t element_type,
+    iree_byte_span_t data_ptr) {
+  iree_host_size_t element_size =
+      iree_hal_element_dense_byte_count(element_type);
+  iree_host_size_t element_capacity = data_ptr.data_length / element_size;
+  if (iree_string_view_is_empty(data_str)) {
+    // No literal contents: zero-fill the entire buffer.
+    memset(data_ptr.data, 0, data_ptr.data_length);
+    return iree_ok_status();
+  }
+  // Scan for separator-delimited tokens and parse each in place.
+  size_t src_i = 0;
+  size_t dst_i = 0;
+  size_t token_start = IREE_STRING_VIEW_NPOS;
+  while (src_i < data_str.size) {
+    char c = data_str.data[src_i++];
+    // NOTE(review): isspace on a plain char may receive a negative value for
+    // non-ASCII input — casting to unsigned char would be strictly safer.
+    bool is_separator = isspace(c) || c == ',' || c == '[' || c == ']';
+    if (token_start == IREE_STRING_VIEW_NPOS) {
+      if (!is_separator) {
+        token_start = src_i - 1;  // first character of a new token
+      }
+      continue;
+    } else if (token_start != IREE_STRING_VIEW_NPOS && !is_separator) {
+      continue;  // still inside the current token
+    }
+    // Hit a separator: parse the token that just ended.
+    if (dst_i >= element_capacity) {
+      return iree_make_status(
+          IREE_STATUS_OUT_OF_RANGE,
+          "output data buffer overflow: element_capacity=%zu < dst_i=%zu+",
+          element_capacity, dst_i);
+    }
+    IREE_RETURN_IF_ERROR(iree_hal_parse_element_unsafe(
+        iree_make_string_view(data_str.data + token_start,
+                              src_i - 2 - token_start + 1),
+        element_type, data_ptr.data + dst_i * element_size));
+    ++dst_i;
+    token_start = IREE_STRING_VIEW_NPOS;
+  }
+  // Parse the trailing token, if any (string did not end with a separator).
+  if (token_start != IREE_STRING_VIEW_NPOS) {
+    if (dst_i >= element_capacity) {
+      return iree_make_status(
+          IREE_STATUS_OUT_OF_RANGE,
+          "output data overflow: element_capacity=%zu < dst_i=%zu",
+          element_capacity, dst_i);
+    }
+    IREE_RETURN_IF_ERROR(iree_hal_parse_element_unsafe(
+        iree_make_string_view(data_str.data + token_start,
+                              data_str.size - token_start),
+        element_type, data_ptr.data + dst_i * element_size));
+    ++dst_i;
+  }
+  if (dst_i == 1 && element_capacity > 1) {
+    // Splat the single value we got to the entire buffer.
+    uint8_t* p = data_ptr.data + element_size;
+    for (int i = 1; i < element_capacity; ++i, p += element_size) {
+      memcpy(p, data_ptr.data, element_size);
+    }
+  } else if (dst_i < element_capacity) {
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "input data string underflow: dst_i=%zu < element_capacity=%zu", dst_i,
+        element_capacity);
+  }
+  return iree_ok_status();
+}
+
+// Appends character |c| to the local |buffer| (keeping it NUL terminated)
+// while always counting the total required length in |buffer_length|; once
+// capacity is exhausted |buffer| is set to NULL so later appends only count.
+#define APPEND_CHAR(c)                           \
+  {                                              \
+    if (buffer) {                                \
+      if (buffer_length < buffer_capacity - 1) { \
+        buffer[buffer_length] = c;               \
+        buffer[buffer_length + 1] = '\0';        \
+      } else {                                   \
+        buffer = NULL;                           \
+      }                                          \
+    }                                            \
+    ++buffer_length;                             \
+  }
+
+// Recursively formats |data| as a nested `[...]`-bracketed list of elements.
+// |max_element_count| is decremented as leaf elements are emitted; when the
+// output overflows, |buffer| is set to NULL and only the required length is
+// accumulated, with OUT_OF_RANGE returned at the end.
+static iree_status_t iree_hal_format_buffer_elements_recursive(
+    iree_const_byte_span_t data, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_host_size_t* max_element_count, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length) {
+  iree_host_size_t buffer_length = 0;
+  if (shape_rank == 0) {
+    // Scalar value; recurse to get on to the leaf dimension path.
+    const iree_hal_dim_t one = 1;
+    return iree_hal_format_buffer_elements_recursive(
+        data, &one, 1, element_type, max_element_count, buffer_capacity, buffer,
+        out_buffer_length);
+  } else if (shape_rank > 1) {
+    // Nested dimension; recurse into the next innermost dimension.
+    iree_hal_dim_t dim_length = 1;
+    for (iree_host_size_t i = 1; i < shape_rank; ++i) {
+      dim_length *= shape[i];
+    }
+    iree_device_size_t dim_stride =
+        dim_length * iree_hal_element_dense_byte_count(element_type);
+    if (data.data_length < dim_stride * shape[0]) {
+      return iree_make_status(
+          IREE_STATUS_OUT_OF_RANGE,
+          "input data underflow: data_length=%zu < expected=%zu",
+          data.data_length, (iree_host_size_t)(dim_stride * shape[0]));
+    }
+    iree_const_byte_span_t subdata;
+    subdata.data = data.data;
+    subdata.data_length = dim_stride;
+    for (iree_hal_dim_t i = 0; i < shape[0]; ++i) {
+      APPEND_CHAR('[');
+      iree_host_size_t actual_length = 0;
+      iree_status_t status = iree_hal_format_buffer_elements_recursive(
+          subdata, shape + 1, shape_rank - 1, element_type, max_element_count,
+          buffer ? buffer_capacity - buffer_length : 0,
+          buffer ? buffer + buffer_length : NULL, &actual_length);
+      buffer_length += actual_length;
+      if (iree_status_is_out_of_range(status)) {
+        // Overflow in the subtree: keep counting the length, stop writing.
+        buffer = NULL;
+      } else if (!iree_status_is_ok(status)) {
+        return status;
+      }
+      subdata.data += dim_stride;
+      APPEND_CHAR(']');
+    }
+  } else {
+    // Leaf dimension; output data.
+    iree_host_size_t max_count =
+        iree_min(*max_element_count, (iree_host_size_t)shape[0]);
+    iree_device_size_t element_stride =
+        iree_hal_element_dense_byte_count(element_type);
+    if (data.data_length < max_count * element_stride) {
+      return iree_make_status(
+          IREE_STATUS_OUT_OF_RANGE,
+          "input data underflow; data_length=%zu < expected=%zu",
+          data.data_length, (iree_host_size_t)(max_count * element_stride));
+    }
+    // Consume this row's quota from the global element budget.
+    *max_element_count -= max_count;
+    iree_const_byte_span_t subdata;
+    subdata.data = data.data;
+    subdata.data_length = element_stride;
+    for (iree_hal_dim_t i = 0; i < max_count; ++i) {
+      if (i > 0) APPEND_CHAR(' ');
+      iree_host_size_t actual_length = 0;
+      iree_status_t status = iree_hal_format_element(
+          subdata, element_type, buffer ? buffer_capacity - buffer_length : 0,
+          buffer ? buffer + buffer_length : NULL, &actual_length);
+      subdata.data += element_stride;
+      buffer_length += actual_length;
+      if (iree_status_is_out_of_range(status)) {
+        buffer = NULL;
+      } else if (!iree_status_is_ok(status)) {
+        return status;
+      }
+    }
+    // Elide elements beyond the budget with a `...` marker.
+    if (max_count < shape[0]) {
+      APPEND_CHAR('.');
+      APPEND_CHAR('.');
+      APPEND_CHAR('.');
+    }
+  }
+  if (out_buffer_length) {
+    *out_buffer_length = buffer_length;
+  }
+  return buffer ? iree_ok_status()
+                : iree_status_from_code(IREE_STATUS_OUT_OF_RANGE);
+}
+
+// Public entry point: resets the outputs and delegates to the recursive
+// formatter with a mutable copy of the element budget.
+IREE_API_EXPORT iree_status_t iree_hal_format_buffer_elements(
+    iree_const_byte_span_t data, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_host_size_t max_element_count, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length) {
+  if (out_buffer_length) *out_buffer_length = 0;
+  if (buffer && buffer_capacity) buffer[0] = '\0';
+  iree_host_size_t remaining_elements = max_element_count;
+  return iree_hal_format_buffer_elements_recursive(
+      data, shape, shape_rank, element_type, &remaining_elements,
+      buffer_capacity, buffer, out_buffer_length);
+}
diff --git a/runtime/src/iree/hal/string_util.h b/runtime/src/iree/hal/string_util.h
new file mode 100644
index 0000000..3e8b1bf
--- /dev/null
+++ b/runtime/src/iree/hal/string_util.h
@@ -0,0 +1,104 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_STRING_UTIL_H_
+#define IREE_HAL_STRING_UTIL_H_
+
+// Utilities for round-tripping HAL types (shape dimensions, element types,
+// and buffer contents) to and from their canonical string representations.
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/buffer.h"
+#include "iree/hal/buffer_view.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Parses a serialized set of shape dimensions using the canonical shape format
+// (the same as produced by iree_hal_format_shape).
+IREE_API_EXPORT iree_status_t iree_hal_parse_shape(
+    iree_string_view_t value, iree_host_size_t shape_capacity,
+    iree_hal_dim_t* out_shape, iree_host_size_t* out_shape_rank);
+
+// Converts shape dimensions into a `4x5x6` format.
+//
+// Follows the standard API string formatting rules. See iree/base/api.h.
+IREE_API_EXPORT iree_status_t
+iree_hal_format_shape(const iree_hal_dim_t* shape, iree_host_size_t shape_rank,
+                      iree_host_size_t buffer_capacity, char* buffer,
+                      iree_host_size_t* out_buffer_length);
+
+// Parses a serialized iree_hal_element_type_t and sets |out_element_type| if
+// it is valid. The format is the same as produced by
+// iree_hal_format_element_type.
+IREE_API_EXPORT iree_status_t iree_hal_parse_element_type(
+    iree_string_view_t value, iree_hal_element_type_t* out_element_type);
+
+// Converts an iree_hal_element_type_t enum value to a canonical string
+// representation, like `IREE_HAL_ELEMENT_TYPE_FLOAT_16` to `f16`.
+// |buffer_capacity| defines the size of |buffer| in bytes and
+// |out_buffer_length| will return the string length in characters.
+//
+// Follows the standard API string formatting rules. See iree/base/api.h.
+IREE_API_EXPORT iree_status_t iree_hal_format_element_type(
+    iree_hal_element_type_t element_type, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length);
+
+// Parses a serialized element of |element_type| to its in-memory form.
+// |data_ptr| must be at least large enough to contain the bytes of the element.
+// For example, "1.2" of type IREE_HAL_ELEMENT_TYPE_FLOAT32 will write the 4
+// byte float value of 1.2 to |data_ptr|.
+IREE_API_EXPORT iree_status_t iree_hal_parse_element(
+    iree_string_view_t data_str, iree_hal_element_type_t element_type,
+    iree_byte_span_t data_ptr);
+
+// Converts a single element of |element_type| to a string.
+//
+// |buffer_capacity| defines the size of |buffer| in bytes and
+// |out_buffer_length| will return the string length in characters. Returns
+// IREE_STATUS_OUT_OF_RANGE if the buffer capacity is insufficient to hold the
+// formatted elements and |out_buffer_length| will contain the required size.
+//
+// Follows the standard API string formatting rules. See iree/base/api.h.
+IREE_API_EXPORT iree_status_t iree_hal_format_element(
+    iree_const_byte_span_t data, iree_hal_element_type_t element_type,
+    iree_host_size_t buffer_capacity, char* buffer,
+    iree_host_size_t* out_buffer_length);
+
+// Parses a serialized set of elements of the given |element_type|.
+// The resulting parsed data is written to |data_ptr|, which must be at least
+// large enough to contain the parsed elements. The format is the same as
+// produced by iree_hal_format_buffer_elements. Supports additional inputs of
+// empty to denote a 0 fill and a single element to denote a splat.
+IREE_API_EXPORT iree_status_t iree_hal_parse_buffer_elements(
+    iree_string_view_t data_str, iree_hal_element_type_t element_type,
+    iree_byte_span_t data_ptr);
+
+// Converts a shaped buffer of |element_type| elements to a string.
+// This will include []'s to denote each dimension, for example for a shape of
+// 2x3 the elements will be formatted as `[1 2 3][4 5 6]`.
+//
+// |max_element_count| can be used to limit the total number of elements printed
+// when the count may be large. Elided elements will be replaced with `...`.
+//
+// |buffer_capacity| defines the size of |buffer| in bytes and
+// |out_buffer_length| will return the string length in characters. Returns
+// IREE_STATUS_OUT_OF_RANGE if the buffer capacity is insufficient to hold the
+// formatted elements and |out_buffer_length| will contain the required size.
+//
+// Follows the standard API string formatting rules. See iree/base/api.h.
+IREE_API_EXPORT iree_status_t iree_hal_format_buffer_elements(
+    iree_const_byte_span_t data, const iree_hal_dim_t* shape,
+    iree_host_size_t shape_rank, iree_hal_element_type_t element_type,
+    iree_host_size_t max_element_count, iree_host_size_t buffer_capacity,
+    char* buffer, iree_host_size_t* out_buffer_length);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_STRING_UTIL_H_
diff --git a/runtime/src/iree/hal/string_util_test.cc b/runtime/src/iree/hal/string_util_test.cc
new file mode 100644
index 0000000..453f99c
--- /dev/null
+++ b/runtime/src/iree/hal/string_util_test.cc
@@ -0,0 +1,1049 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/span.h"
+#include "iree/base/status_cc.h"
+#include "iree/hal/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace {
+
+using ::iree::testing::status::IsOkAndHolds;
+using ::iree::testing::status::StatusIs;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+
+// TODO(benvanik): move these utils to C++ bindings.
+using Shape = std::vector<iree_hal_dim_t>;
+
+// Parses a serialized set of shape dimensions using the canonical shape format
+// (the same as produced by FormatShape).
+StatusOr<Shape> ParseShape(const std::string& value) {
+  // Start with a small guess; the loop below grows on demand.
+  Shape shape(6);
+  iree_host_size_t actual_rank = 0;
+  iree_status_t status;
+  do {
+    status =
+        iree_hal_parse_shape(iree_string_view_t{value.data(), value.size()},
+                             shape.size(), shape.data(), &actual_rank);
+    // On OUT_OF_RANGE |actual_rank| holds the required rank, so resizing and
+    // retrying converges in at most two iterations; on success this trims to
+    // the actual rank.
+    shape.resize(actual_rank);
+  } while (iree_status_is_out_of_range(status));
+  IREE_RETURN_IF_ERROR(std::move(status));
+  return std::move(shape);
+}
+
+// Converts shape dimensions into a `4x5x6` format.
+StatusOr<std::string> FormatShape(iree::span<const iree_hal_dim_t> value) {
+  std::string buffer(16, '\0');
+  iree_host_size_t actual_length = 0;
+  iree_status_t status;
+  do {
+    // +1 reserves room for the NUL terminator written by the C API.
+    status =
+        iree_hal_format_shape(value.data(), value.size(), buffer.size() + 1,
+                              &buffer[0], &actual_length);
+    // |actual_length| is the required/produced length; resize and retry on
+    // OUT_OF_RANGE, or trim to the final length on success.
+    buffer.resize(actual_length);
+  } while (iree_status_is_out_of_range(status));
+  IREE_RETURN_IF_ERROR(std::move(status));
+  return std::move(buffer);
+}
+
+// Parses a serialized iree_hal_element_type_t. The format is the same as
+// produced by FormatElementType.
+StatusOr<iree_hal_element_type_t> ParseElementType(const std::string& value) {
+  iree_hal_element_type_t element_type = IREE_HAL_ELEMENT_TYPE_NONE;
+  iree_status_t status = iree_hal_parse_element_type(
+      iree_string_view_t{value.data(), value.size()}, &element_type);
+  // Annotate failures with the offending input for easier test debugging.
+  IREE_RETURN_IF_ERROR(status, "Failed to parse element type '%.*s'",
+                       (int)value.size(), value.data());
+  return element_type;
+}
+
+// Converts an iree_hal_element_type_t enum value to a canonical string
+// representation, like `IREE_HAL_ELEMENT_TYPE_FLOAT_16` to `f16`.
+StatusOr<std::string> FormatElementType(iree_hal_element_type_t value) {
+  std::string buffer(16, '\0');
+  iree_host_size_t actual_length = 0;
+  iree_status_t status;
+  do {
+    // +1 for the NUL terminator; resize-and-retry on OUT_OF_RANGE as with
+    // the other Format* helpers above.
+    status = iree_hal_format_element_type(value, buffer.size() + 1, &buffer[0],
+                                          &actual_length);
+    buffer.resize(actual_length);
+  } while (iree_status_is_out_of_range(status));
+  IREE_RETURN_IF_ERROR(status);
+  return std::move(buffer);
+}
+
+// Parses a serialized element of |element_type| to its in-memory form.
+// |buffer| must be at least large enough to contain the bytes of the element.
+// For example, "1.2" of type IREE_HAL_ELEMENT_TYPE_FLOAT32 will write the 4
+// byte float value of 1.2 to |buffer|.
+template <typename T>
+Status ParseElement(const std::string& value,
+                    iree_hal_element_type_t element_type,
+                    iree::span<T> buffer) {
+  // Reinterpret the typed span as raw bytes for the C API.
+  return iree_hal_parse_element(
+      iree_string_view_t{value.data(), value.size()}, element_type,
+      iree_byte_span_t{reinterpret_cast<uint8_t*>(buffer.data()),
+                       buffer.size() * sizeof(T)});
+}
+
+// Converts a single element of |element_type| to a string.
+template <typename T>
+StatusOr<std::string> FormatElement(T value,
+                                    iree_hal_element_type_t element_type) {
+  std::string result(16, '\0');
+  iree_status_t status;
+  do {
+    iree_host_size_t actual_length = 0;
+    // +1 for the NUL terminator; grow and retry on OUT_OF_RANGE.
+    status = iree_hal_format_element(
+        iree_const_byte_span_t{reinterpret_cast<const uint8_t*>(&value),
+                               sizeof(T)},
+        element_type, result.size() + 1, &result[0], &actual_length);
+    result.resize(actual_length);
+  } while (iree_status_is_out_of_range(status));
+  IREE_RETURN_IF_ERROR(status, "failed to format buffer element");
+  return std::move(result);
+}
+
+// Parses a serialized set of elements of the given |element_type|.
+// The resulting parsed data is written to |buffer|, which must be at least
+// large enough to contain the parsed elements. The format is the same as
+// produced by FormatBufferElements. Supports additional inputs of
+// empty to denote a 0 fill and a single element to denote a splat.
+template <typename T>
+Status ParseBufferElements(const std::string& value,
+                           iree_hal_element_type_t element_type,
+                           iree::span<T> buffer) {
+  IREE_RETURN_IF_ERROR(
+      iree_hal_parse_buffer_elements(
+          iree_string_view_t{value.data(), value.size()}, element_type,
+          iree_byte_span_t{reinterpret_cast<uint8_t*>(buffer.data()),
+                           buffer.size() * sizeof(T)}),
+      // Cap the echoed input at 256 chars to keep failure messages readable.
+      "failed to parse buffer elements '%.*s'",
+      iree_min(256, (int)value.size()), value.data());
+  return OkStatus();
+}
+
+// Converts a shaped buffer of |element_type| elements to a string.
+// This will include []'s to denote each dimension, for example for a shape of
+// 2x3 the elements will be formatted as `[1 2 3][4 5 6]`.
+//
+// |max_element_count| can be used to limit the total number of elements printed
+// when the count may be large. Elided elements will be replaced with `...`.
+template <typename T>
+StatusOr<std::string> FormatBufferElements(iree::span<const T> data,
+                                           const Shape& shape,
+                                           iree_hal_element_type_t element_type,
+                                           size_t max_element_count) {
+  std::string result(255, '\0');
+  iree_status_t status;
+  do {
+    iree_host_size_t actual_length = 0;
+    // +1 for the NUL terminator; grow and retry on OUT_OF_RANGE.
+    status = iree_hal_format_buffer_elements(
+        iree_const_byte_span_t{reinterpret_cast<const uint8_t*>(data.data()),
+                               data.size() * sizeof(T)},
+        shape.data(), shape.size(), element_type, max_element_count,
+        result.size() + 1, &result[0], &actual_length);
+    result.resize(actual_length);
+  } while (iree_status_is_out_of_range(status));
+  IREE_RETURN_IF_ERROR(std::move(status));
+  return std::move(result);
+}
+
+// Maps a C type (eg float) to the HAL type (eg IREE_HAL_ELEMENT_TYPE_FLOAT32).
+// Only the exact-width integer types and float/double are mapped; using an
+// unmapped type is a compile error (the primary template is left undefined).
+template <typename T>
+struct ElementTypeFromCType;
+
+template <>
+struct ElementTypeFromCType<int8_t> {
+  static constexpr iree_hal_element_type_t value = IREE_HAL_ELEMENT_TYPE_SINT_8;
+};
+template <>
+struct ElementTypeFromCType<uint8_t> {
+  static constexpr iree_hal_element_type_t value = IREE_HAL_ELEMENT_TYPE_UINT_8;
+};
+template <>
+struct ElementTypeFromCType<int16_t> {
+  static constexpr iree_hal_element_type_t value =
+      IREE_HAL_ELEMENT_TYPE_SINT_16;
+};
+template <>
+struct ElementTypeFromCType<uint16_t> {
+  static constexpr iree_hal_element_type_t value =
+      IREE_HAL_ELEMENT_TYPE_UINT_16;
+};
+template <>
+struct ElementTypeFromCType<int32_t> {
+  static constexpr iree_hal_element_type_t value =
+      IREE_HAL_ELEMENT_TYPE_SINT_32;
+};
+template <>
+struct ElementTypeFromCType<uint32_t> {
+  static constexpr iree_hal_element_type_t value =
+      IREE_HAL_ELEMENT_TYPE_UINT_32;
+};
+template <>
+struct ElementTypeFromCType<int64_t> {
+  static constexpr iree_hal_element_type_t value =
+      IREE_HAL_ELEMENT_TYPE_SINT_64;
+};
+template <>
+struct ElementTypeFromCType<uint64_t> {
+  static constexpr iree_hal_element_type_t value =
+      IREE_HAL_ELEMENT_TYPE_UINT_64;
+};
+template <>
+struct ElementTypeFromCType<float> {
+  static constexpr iree_hal_element_type_t value =
+      IREE_HAL_ELEMENT_TYPE_FLOAT_32;
+};
+template <>
+struct ElementTypeFromCType<double> {
+  static constexpr iree_hal_element_type_t value =
+      IREE_HAL_ELEMENT_TYPE_FLOAT_64;
+};
+
+// Parses a serialized element of type T to its in-memory form.
+// For example, "1.2" of type float (IREE_HAL_ELEMENT_TYPE_FLOAT32) will return
+// 1.2f.
+template <typename T>
+inline StatusOr<T> ParseElement(const std::string& value) {
+  T result = T();
+  // Element type is deduced from T via the ElementTypeFromCType trait above.
+  IREE_RETURN_IF_ERROR(ParseElement(value, ElementTypeFromCType<T>::value,
+                                    iree::span<T>(&result, 1)));
+  return result;
+}
+
+// Converts a single element to a string value.
+template <typename T>
+inline StatusOr<std::string> FormatElement(T value) {
+  return FormatElement(value, ElementTypeFromCType<T>::value);
+}
+
+// Parses a serialized set of elements of type T.
+// The resulting parsed data is written to |buffer|, which must be at least
+// large enough to contain the parsed elements. The format is the same as
+// produced by FormatBufferElements. Supports additional inputs of
+// empty to denote a 0 fill and a single element to denote a splat.
+template <typename T>
+inline Status ParseBufferElements(const std::string& value,
+                                  iree::span<T> buffer) {
+  return ParseBufferElements(value, ElementTypeFromCType<T>::value, buffer);
+}
+
+// Parses a serialized set of elements of type T defined by |shape|.
+// The format is the same as produced by FormatBufferElements. Supports
+// additional inputs of empty to denote a 0 fill and a single element to denote
+// a splat.
+template <typename T>
+inline StatusOr<std::vector<T>> ParseBufferElements(const std::string& value,
+                                                    const Shape& shape) {
+  // Destination size is the product of the shape dimensions.
+  iree_host_size_t element_count = 1;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    element_count *= shape[i];
+  }
+  std::vector<T> result(element_count);
+  IREE_RETURN_IF_ERROR(ParseBufferElements(value, iree::span<T>(result)));
+  return std::move(result);
+}
+
+// Converts a shaped buffer of |element_type| elements to a string.
+// This will include []'s to denote each dimension, for example for a shape of
+// 2x3 the elements will be formatted as `[1 2 3][4 5 6]`.
+//
+// |max_element_count| can be used to limit the total number of elements printed
+// when the count may be large. Elided elements will be replaced with `...`.
+template <typename T>
+StatusOr<std::string> FormatBufferElements(
+    iree::span<const T> data, const Shape& shape,
+    size_t max_element_count = SIZE_MAX) {
+  return FormatBufferElements(data, shape, ElementTypeFromCType<T>::value,
+                              max_element_count);
+}
+
+// C API iree_*_retain/iree_*_release function pointer.
+template <typename T>
+using HandleRefFn = void(IREE_API_PTR*)(T*);
+
+// C++ RAII wrapper for an IREE C reference object.
+// Behaves the same as a thread-safe intrusive pointer.
+template <typename T, HandleRefFn<T> retain_fn, HandleRefFn<T> release_fn>
+class Handle {
+ public:
+  using handle_type = Handle<T, retain_fn, release_fn>;
+
+  // Takes ownership of |value| WITHOUT retaining it (adopts the caller's
+  // existing reference), unlike the T* constructor below which retains.
+  static Handle Wrap(T* value) noexcept { return Handle(value, false); }
+
+  Handle() noexcept = default;
+  Handle(std::nullptr_t) noexcept {}
+  Handle(T* value) noexcept : value_(value) { retain_fn(value_); }
+
+  ~Handle() noexcept {
+    if (value_) release_fn(value_);
+  }
+
+  Handle(const Handle& rhs) noexcept : value_(rhs.value_) {
+    if (value_) retain_fn(value_);
+  }
+  // Copy-assign: release the old value, retain the new one. Self-assignment
+  // is a no-op via the pointer-equality guard.
+  Handle& operator=(const Handle& rhs) noexcept {
+    if (value_ != rhs.value_) {
+      if (value_) release_fn(value_);
+      value_ = rhs.get();
+      if (value_) retain_fn(value_);
+    }
+    return *this;
+  }
+
+  // Move operations transfer the reference without touching the refcount.
+  Handle(Handle&& rhs) noexcept : value_(rhs.release()) {}
+  Handle& operator=(Handle&& rhs) noexcept {
+    if (value_ != rhs.value_) {
+      if (value_) release_fn(value_);
+      value_ = rhs.release();
+    }
+    return *this;
+  }
+
+  // Gets the pointer referenced by this instance.
+  constexpr T* get() const noexcept { return value_; }
+  constexpr operator T*() const noexcept { return value_; }
+
+  // Resets the object to nullptr and decrements the reference count, possibly
+  // deleting it.
+  void reset() noexcept {
+    if (value_) {
+      release_fn(value_);
+      value_ = nullptr;
+    }
+  }
+
+  // Returns the current pointer held by this object without having its
+  // reference count decremented and resets the handle to empty. Returns
+  // nullptr if the handle holds no value. To re-wrap in a handle use either
+  // ctor(value) or assign().
+  T* release() noexcept {
+    auto* p = value_;
+    value_ = nullptr;
+    return p;
+  }
+
+  // Assigns a pointer.
+  // The pointer will be accepted by the handle and its reference count will
+  // not be incremented.
+  void assign(T* value) noexcept {
+    reset();
+    value_ = value;
+  }
+
+  // Returns a pointer to the inner pointer storage.
+  // This allows passing a pointer to the handle as an output argument to
+  // C-style creation functions.
+  // NOTE: this overloads unary & so `&handle` does NOT return Handle*; any
+  // value written through it is adopted without an extra retain.
+  constexpr T** operator&() noexcept { return &value_; }
+
+  // Support boolean expression evaluation ala unique_ptr/shared_ptr:
+  // https://en.cppreference.com/w/cpp/memory/shared_ptr/operator_bool
+  typedef T* Handle::*unspecified_bool_type;
+  constexpr operator unspecified_bool_type() const noexcept {
+    return value_ ? &Handle::value_ : nullptr;
+  }
+
+  // Supports unary expression evaluation.
+  constexpr bool operator!() const noexcept { return !value_; }
+
+  // Swap support.
+  void swap(Handle& rhs) noexcept { std::swap(value_, rhs.value_); }
+
+ protected:
+  // Adopting constructor used by Wrap(); the bool parameter only
+  // disambiguates from the retaining T* constructor.
+  Handle(T* value, bool) noexcept : value_(value) {}
+
+ private:
+  T* value_ = nullptr;
+};
+
+// C++ wrapper for iree_hal_allocator_t.
+struct Allocator final
+    : public Handle<iree_hal_allocator_t, iree_hal_allocator_retain,
+                    iree_hal_allocator_release> {
+  using handle_type::handle_type;
+
+  // Creates a host-local heap allocator that can be used when buffers are
+  // required that will not interact with a real hardware device (such as those
+  // used in file IO or tests). Buffers allocated with this will not be
+  // compatible with real device allocators and will likely incur a copy if
+  // used.
+  static StatusOr<Allocator> CreateHostLocal() {
+    Allocator allocator;
+    // &allocator uses Handle::operator& so the created reference is adopted
+    // directly into the wrapper.
+    iree_status_t status = iree_hal_allocator_create_heap(
+        iree_make_cstring_view("host_local"), iree_allocator_system(),
+        iree_allocator_system(), &allocator);
+    IREE_RETURN_IF_ERROR(std::move(status));
+    return std::move(allocator);
+  }
+};
+
+// C++ wrapper for iree_hal_buffer_t.
+struct Buffer final : public Handle<iree_hal_buffer_t, iree_hal_buffer_retain,
+                                    iree_hal_buffer_release> {
+  using handle_type::handle_type;
+
+  // Returns the size in bytes of the buffer.
+  iree_device_size_t byte_length() const noexcept {
+    return iree_hal_buffer_byte_length(get());
+  }
+
+  // Returns a copy of the buffer contents interpreted as the given type in
+  // host-format. The entire buffer is read; its byte length is assumed to be
+  // a multiple of sizeof(T).
+  template <typename T>
+  StatusOr<std::vector<T>> CloneData() noexcept {
+    iree_device_size_t total_byte_length = byte_length();
+    std::vector<T> result(total_byte_length / sizeof(T));
+    iree_status_t status =
+        iree_hal_buffer_map_read(get(), 0, result.data(), total_byte_length);
+    IREE_RETURN_IF_ERROR(std::move(status));
+    return std::move(result);
+  }
+};
+
+// C++ wrapper for iree_hal_buffer_view_t.
+struct BufferView final
+    : public Handle<iree_hal_buffer_view_t, iree_hal_buffer_view_retain,
+                    iree_hal_buffer_view_release> {
+  using handle_type::handle_type;
+
+  // Creates a buffer view with a reference to the given |buffer|.
+  // Always uses dense row-major encoding.
+  static StatusOr<BufferView> Create(Buffer buffer,
+                                     iree::span<const iree_hal_dim_t> shape,
+                                     iree_hal_element_type_t element_type) {
+    iree_hal_encoding_type_t encoding_type =
+        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR;
+    BufferView buffer_view;
+    iree_status_t status = iree_hal_buffer_view_create(
+        buffer, shape.data(), shape.size(), element_type, encoding_type,
+        iree_allocator_system(), &buffer_view);
+    IREE_RETURN_IF_ERROR(std::move(status));
+    return std::move(buffer_view);
+  }
+
+  // TODO(benvanik): subview.
+
+  // Returns the buffer underlying the buffer view.
+  inline Buffer buffer() const noexcept {
+    return Buffer(iree_hal_buffer_view_buffer(get()));
+  }
+
+  // Returns the dimensions of the shape.
+  Shape shape() const noexcept {
+    iree_status_t status;
+    // Grow-and-retry on OUT_OF_RANGE, same pattern as ParseShape above.
+    Shape shape(6);
+    do {
+      iree_host_size_t actual_rank = 0;
+      status = iree_hal_buffer_view_shape(get(), shape.size(), shape.data(),
+                                          &actual_rank);
+      shape.resize(actual_rank);
+    } while (iree_status_is_out_of_range(status));
+    IREE_CHECK_OK(status);
+    return shape;
+  }
+
+  // Returns the total number of elements stored in the view.
+  inline iree_host_size_t element_count() const noexcept {
+    return iree_hal_buffer_view_element_count(get());
+  }
+
+  // Returns the element type of the buffer.
+  inline iree_hal_element_type_t element_type() const noexcept {
+    return iree_hal_buffer_view_element_type(get());
+  }
+
+  // Returns the total size of the specified view in bytes.
+  // Note that not all buffers are contiguous or densely packed.
+  inline iree_device_size_t byte_length() const noexcept {
+    return iree_hal_buffer_view_byte_length(get());
+  }
+
+  // TODO(benvanik): compute offset/range.
+
+  // Parses a serialized set of buffer elements in the canonical tensor format
+  // (the same as produced by Format).
+  static StatusOr<BufferView> Parse(const std::string& value,
+                                    Allocator allocator) {
+    BufferView buffer_view;
+    iree_status_t status = iree_hal_buffer_view_parse(
+        iree_string_view_t{value.data(), value.size()}, allocator,
+        &buffer_view);
+    IREE_RETURN_IF_ERROR(std::move(status));
+    return std::move(buffer_view);
+  }
+
+  // Converts buffer view elements into a fully-specified string-form format
+  // like `2x4xi16=[[1 2][3 4]]`.
+  //
+  // |max_element_count| can be used to limit the total number of elements
+  // printed when the count may be large. Elided elements will be replaced with
+  // `...`.
+  StatusOr<std::string> ToString(size_t max_element_count = SIZE_MAX) const {
+    std::string result(255, '\0');
+    iree_status_t status;
+    do {
+      iree_host_size_t actual_length = 0;
+      // +1 for the NUL terminator; grow and retry on OUT_OF_RANGE.
+      status = iree_hal_buffer_view_format(get(), max_element_count,
+                                           result.size() + 1, &result[0],
+                                           &actual_length);
+      result.resize(actual_length);
+    } while (iree_status_is_out_of_range(status));
+    IREE_RETURN_IF_ERROR(std::move(status));
+    return std::move(result);
+  }
+};
+
+// Round-trip coverage for shape parsing/formatting, including whitespace
+// tolerance, empty (rank-0) shapes, and ranks larger than the helpers'
+// initial guess to exercise the grow-and-retry path.
+TEST(ShapeStringUtilTest, ParseShape) {
+  EXPECT_THAT(ParseShape(""), IsOkAndHolds(Eq(Shape{})));
+  EXPECT_THAT(ParseShape("0"), IsOkAndHolds(Eq(Shape{0})));
+  EXPECT_THAT(ParseShape("1"), IsOkAndHolds(Eq(Shape{1})));
+  EXPECT_THAT(ParseShape("1x2"), IsOkAndHolds(Eq(Shape{1, 2})));
+  EXPECT_THAT(ParseShape(" 1 x 2 "), IsOkAndHolds(Eq(Shape{1, 2})));
+  EXPECT_THAT(ParseShape("1x2x3x4x5"), IsOkAndHolds(Eq(Shape{1, 2, 3, 4, 5})));
+  EXPECT_THAT(ParseShape("1x2x3x4x5x6x7x8x9"),
+              IsOkAndHolds(Eq(Shape{1, 2, 3, 4, 5, 6, 7, 8, 9})));
+}
+
+// Malformed inputs: non-numeric dims, dangling/duplicated separators, and
+// negative dimensions must all reject.
+TEST(ShapeStringUtilTest, ParseShapeInvalid) {
+  EXPECT_THAT(ParseShape("abc"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("1xf"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("1xff23"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("1xf32"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("x"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("x1"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("1x"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("x1x2"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("1xx2"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("1x2x"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseShape("0x-1"), StatusIs(StatusCode::kInvalidArgument));
+}
+
+TEST(ShapeStringUtilTest, FormatShape) {
+  EXPECT_THAT(FormatShape(Shape{}), IsOkAndHolds(Eq("")));
+  EXPECT_THAT(FormatShape(Shape{0}), IsOkAndHolds(Eq("0")));
+  EXPECT_THAT(FormatShape(Shape{1}), IsOkAndHolds(Eq("1")));
+  EXPECT_THAT(FormatShape(Shape{1, 2}), IsOkAndHolds(Eq("1x2")));
+  EXPECT_THAT(FormatShape(Shape{1, 2, 3, 4, 5}), IsOkAndHolds(Eq("1x2x3x4x5")));
+  // Long shape exercises the buffer grow-and-retry path in FormatShape.
+  EXPECT_THAT(
+      FormatShape(Shape{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                        17, 18, 19}),
+      IsOkAndHolds(Eq("1x2x3x4x5x6x7x8x9x10x11x12x13x14x15x16x17x18x19")));
+}
+
+// Element type string parsing: signed/unsigned/float prefixes, opaque types
+// (`x`/`*` prefixes), and non-standard bit widths like f4.
+TEST(ElementTypeStringUtilTest, ParseElementType) {
+  EXPECT_THAT(ParseElementType("i8"),
+              IsOkAndHolds(Eq(IREE_HAL_ELEMENT_TYPE_INT_8)));
+  EXPECT_THAT(ParseElementType("si8"),
+              IsOkAndHolds(Eq(IREE_HAL_ELEMENT_TYPE_SINT_8)));
+  EXPECT_THAT(ParseElementType("ui16"),
+              IsOkAndHolds(Eq(IREE_HAL_ELEMENT_TYPE_UINT_16)));
+  EXPECT_THAT(ParseElementType("f32"),
+              IsOkAndHolds(Eq(IREE_HAL_ELEMENT_TYPE_FLOAT_32)));
+  EXPECT_THAT(ParseElementType("f16"),
+              IsOkAndHolds(Eq(IREE_HAL_ELEMENT_TYPE_FLOAT_16)));
+  EXPECT_THAT(ParseElementType("bf16"),
+              IsOkAndHolds(Eq(IREE_HAL_ELEMENT_TYPE_BFLOAT_16)));
+  EXPECT_THAT(ParseElementType("x64"),
+              IsOkAndHolds(Eq(IREE_HAL_ELEMENT_TYPE_OPAQUE_64)));
+  EXPECT_THAT(ParseElementType("*64"),
+              IsOkAndHolds(Eq(IREE_HAL_ELEMENT_TYPE_OPAQUE_64)));
+  EXPECT_THAT(ParseElementType("f4"),
+              IsOkAndHolds(Eq(iree_hal_make_element_type(
+                  IREE_HAL_NUMERICAL_TYPE_FLOAT_IEEE, 4))));
+}
+
+TEST(ElementTypeStringUtilTest, ParseElementTypeInvalid) {
+  EXPECT_THAT(ParseElementType(""), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElementType("1"), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElementType("*1234"),
+              StatusIs(StatusCode::kInvalidArgument));
+}
+
+// Formatting is the inverse of ParseElementType for canonical names; note
+// opaque types always format with the `*` prefix.
+TEST(ElementTypeStringUtilTest, FormatElementType) {
+  EXPECT_THAT(FormatElementType(IREE_HAL_ELEMENT_TYPE_INT_8),
+              IsOkAndHolds(Eq("i8")));
+  EXPECT_THAT(FormatElementType(IREE_HAL_ELEMENT_TYPE_SINT_8),
+              IsOkAndHolds(Eq("si8")));
+  EXPECT_THAT(FormatElementType(IREE_HAL_ELEMENT_TYPE_UINT_16),
+              IsOkAndHolds(Eq("ui16")));
+  EXPECT_THAT(FormatElementType(IREE_HAL_ELEMENT_TYPE_FLOAT_32),
+              IsOkAndHolds(Eq("f32")));
+  EXPECT_THAT(FormatElementType(IREE_HAL_ELEMENT_TYPE_BFLOAT_16),
+              IsOkAndHolds(Eq("bf16")));
+  EXPECT_THAT(FormatElementType(IREE_HAL_ELEMENT_TYPE_OPAQUE_64),
+              IsOkAndHolds(Eq("*64")));
+  EXPECT_THAT(FormatElementType(iree_hal_make_element_type(
+                  IREE_HAL_NUMERICAL_TYPE_FLOAT_IEEE, 4)),
+              IsOkAndHolds(Eq("f4")));
+}
+
+// Single-element parsing across all mapped C types, checking the extreme
+// representable values of each integer width and basic float forms.
+TEST(ElementStringUtilTest, ParseElement) {
+  EXPECT_THAT(ParseElement<int8_t>("-128"), IsOkAndHolds(Eq(INT8_MIN)));
+  EXPECT_THAT(ParseElement<int8_t>("127"), IsOkAndHolds(Eq(INT8_MAX)));
+  EXPECT_THAT(ParseElement<uint8_t>("255"), IsOkAndHolds(Eq(UINT8_MAX)));
+  EXPECT_THAT(ParseElement<int16_t>("-32768"), IsOkAndHolds(Eq(INT16_MIN)));
+  EXPECT_THAT(ParseElement<int16_t>("32767"), IsOkAndHolds(Eq(INT16_MAX)));
+  EXPECT_THAT(ParseElement<uint16_t>("65535"), IsOkAndHolds(Eq(UINT16_MAX)));
+  EXPECT_THAT(ParseElement<int32_t>("-2147483648"),
+              IsOkAndHolds(Eq(INT32_MIN)));
+  EXPECT_THAT(ParseElement<int32_t>("2147483647"), IsOkAndHolds(Eq(INT32_MAX)));
+  EXPECT_THAT(ParseElement<uint32_t>("4294967295"),
+              IsOkAndHolds(Eq(UINT32_MAX)));
+  EXPECT_THAT(ParseElement<int64_t>("-9223372036854775808"),
+              IsOkAndHolds(Eq(INT64_MIN)));
+  EXPECT_THAT(ParseElement<int64_t>("9223372036854775807"),
+              IsOkAndHolds(Eq(INT64_MAX)));
+  EXPECT_THAT(ParseElement<uint64_t>("18446744073709551615"),
+              IsOkAndHolds(Eq(UINT64_MAX)));
+  EXPECT_THAT(ParseElement<float>("1.5"), IsOkAndHolds(Eq(1.5f)));
+  EXPECT_THAT(ParseElement<double>("1.567890123456789"),
+              IsOkAndHolds(Eq(1.567890123456789)));
+  EXPECT_THAT(ParseElement<double>("-1.5e-10"), IsOkAndHolds(Eq(-1.5e-10)));
+}
+
+// Values just past the representable range of the destination type must be
+// rejected rather than silently wrapping.
+TEST(ElementStringUtilTest, ParseElementOutOfRange) {
+  EXPECT_THAT(ParseElement<int8_t>("255"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint8_t>("-128"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<int16_t>("65535"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint16_t>("-32768"),
+              StatusIs(StatusCode::kInvalidArgument));
+  // TODO(benvanik): these don't seem to work the same across all stdlib
+  // implementations. The current implementation works with MSVC but fails under
+  // clang. The fact that these failed like they did at all may have just been
+  // an artifact of abseil and I'm not too concerned about matching that
+  // behavior exactly enough to spend any more time on it now.
+  // EXPECT_THAT(ParseElement<int32_t>("4294967295"),
+  //             StatusIs(StatusCode::kInvalidArgument));
+  // EXPECT_THAT(ParseElement<uint32_t>("4294967296"),
+  //             StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<int32_t>("18446744073709551615"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint32_t>("-9223372036854775808"),
+              StatusIs(StatusCode::kInvalidArgument));
+}
+
+// Non-numeric input (empty strings, garbage, multi-byte UTF-8) must reject
+// for every supported element type.
+TEST(ElementStringUtilTest, ParseElementInvalid) {
+  EXPECT_THAT(ParseElement<int8_t>(""), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint8_t>(""),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<int16_t>(""),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint16_t>(""),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<int32_t>(""),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint32_t>(""),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<int32_t>(""),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint32_t>(""),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<float>(""), StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<double>(""), StatusIs(StatusCode::kInvalidArgument));
+
+  EXPECT_THAT(ParseElement<int8_t>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint8_t>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<int16_t>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint16_t>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<int32_t>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint32_t>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<int32_t>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<uint32_t>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<float>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement<double>("asdfasdf"),
+              StatusIs(StatusCode::kInvalidArgument));
+
+  EXPECT_THAT(ParseElement<int8_t>("🌮"),
+              StatusIs(StatusCode::kInvalidArgument));
+}
+
+TEST(ElementStringUtilTest, ParseOpaqueElement) {
+  std::vector<uint8_t> buffer1(1);
+  IREE_EXPECT_OK(ParseElement("FF", IREE_HAL_ELEMENT_TYPE_OPAQUE_8,
+                              iree::span<uint8_t>(buffer1)));
+  EXPECT_THAT(buffer1, Eq(std::vector<uint8_t>{0xFF}));
+
+  std::vector<uint16_t> buffer2(1);
+  IREE_EXPECT_OK(ParseElement("FFCD", IREE_HAL_ELEMENT_TYPE_OPAQUE_16,
+                              iree::span<uint16_t>(buffer2)));
+  EXPECT_THAT(buffer2, Eq(std::vector<uint16_t>{0xCDFFu}));
+
+  std::vector<uint32_t> buffer4(1);
+  IREE_EXPECT_OK(ParseElement("FFCDAABB", IREE_HAL_ELEMENT_TYPE_OPAQUE_32,
+                              iree::span<uint32_t>(buffer4)));
+  EXPECT_THAT(buffer4, Eq(std::vector<uint32_t>{0xBBAACDFFu}));
+
+  std::vector<uint64_t> buffer8(1);
+  IREE_EXPECT_OK(ParseElement("FFCDAABBCCDDEEFF",
+                              IREE_HAL_ELEMENT_TYPE_OPAQUE_64,
+                              iree::span<uint64_t>(buffer8)));
+  EXPECT_THAT(buffer8, Eq(std::vector<uint64_t>{0xFFEEDDCCBBAACDFFull}));
+}
+
+TEST(ElementStringUtilTest, ParseOpaqueElementInvalid) {
+  std::vector<uint8_t> buffer0(0);
+  EXPECT_THAT(ParseElement("", IREE_HAL_ELEMENT_TYPE_OPAQUE_8,
+                           iree::span<uint8_t>(buffer0)),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement("FF", IREE_HAL_ELEMENT_TYPE_OPAQUE_8,
+                           iree::span<uint8_t>(buffer0)),
+              StatusIs(StatusCode::kInvalidArgument));
+
+  std::vector<uint8_t> buffer1(1);
+  EXPECT_THAT(ParseElement("", IREE_HAL_ELEMENT_TYPE_OPAQUE_8,
+                           iree::span<uint8_t>(buffer1)),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement("F", IREE_HAL_ELEMENT_TYPE_OPAQUE_8,
+                           iree::span<uint8_t>(buffer1)),
+              StatusIs(StatusCode::kInvalidArgument));
+  EXPECT_THAT(ParseElement("FFC", IREE_HAL_ELEMENT_TYPE_OPAQUE_8,
+                           iree::span<uint8_t>(buffer1)),
+              StatusIs(StatusCode::kInvalidArgument));
+}
+
// Formatting single typed elements: integer extremes round-trip exactly, and
// floating-point values use the default short form pinned by the expectations
// below (six significant digits, uppercase exponent marker).
TEST(ElementStringUtilTest, FormatElement) {
  EXPECT_THAT(FormatElement<int8_t>(INT8_MIN), IsOkAndHolds(Eq("-128")));
  EXPECT_THAT(FormatElement<int8_t>(INT8_MAX), IsOkAndHolds(Eq("127")));
  EXPECT_THAT(FormatElement<uint8_t>(UINT8_MAX), IsOkAndHolds(Eq("255")));
  EXPECT_THAT(FormatElement<int16_t>(INT16_MIN), IsOkAndHolds(Eq("-32768")));
  EXPECT_THAT(FormatElement<int16_t>(INT16_MAX), IsOkAndHolds(Eq("32767")));
  EXPECT_THAT(FormatElement<uint16_t>(UINT16_MAX), IsOkAndHolds(Eq("65535")));
  EXPECT_THAT(FormatElement<int32_t>(INT32_MIN),
              IsOkAndHolds(Eq("-2147483648")));
  EXPECT_THAT(FormatElement<int32_t>(INT32_MAX),
              IsOkAndHolds(Eq("2147483647")));
  EXPECT_THAT(FormatElement<uint32_t>(UINT32_MAX),
              IsOkAndHolds(Eq("4294967295")));
  EXPECT_THAT(FormatElement<int64_t>(INT64_MIN),
              IsOkAndHolds(Eq("-9223372036854775808")));
  EXPECT_THAT(FormatElement<int64_t>(INT64_MAX),
              IsOkAndHolds(Eq("9223372036854775807")));
  EXPECT_THAT(FormatElement<uint64_t>(UINT64_MAX),
              IsOkAndHolds(Eq("18446744073709551615")));
  // Floats: exact binary values print minimally ("1.5"); long decimals are
  // rounded to six significant digits ("1123.57").
  EXPECT_THAT(FormatElement<float>(1.5f), IsOkAndHolds(Eq("1.5")));
  EXPECT_THAT(FormatElement<double>(1123.56789456789),
              IsOkAndHolds(Eq("1123.57")));
  EXPECT_THAT(FormatElement<double>(-1.5e-10), IsOkAndHolds(Eq("-1.5E-10")));
}
+
// Formatting with an opaque element type emits the value's raw bytes as hex
// in memory order (e.g. -12345 == 0xCFC7 formats as "C7CF"), ignoring the
// C++ type's numeric interpretation.
TEST(ElementStringUtilTest, FormatOpaqueElement) {
  EXPECT_THAT(FormatElement<uint8_t>(129, IREE_HAL_ELEMENT_TYPE_OPAQUE_8),
              IsOkAndHolds(Eq("81")));
  EXPECT_THAT(FormatElement<int16_t>(-12345, IREE_HAL_ELEMENT_TYPE_OPAQUE_16),
              IsOkAndHolds(Eq("C7CF")));
  EXPECT_THAT(FormatElement<int32_t>(0, IREE_HAL_ELEMENT_TYPE_OPAQUE_32),
              IsOkAndHolds(Eq("00000000")));
  EXPECT_THAT(FormatElement<uint64_t>(0x8899AABBCCDDEEFFull,
                                      IREE_HAL_ELEMENT_TYPE_OPAQUE_64),
              IsOkAndHolds(Eq("FFEEDDCCBBAA9988")));
}
+
+TEST(BufferElementsStringUtilTest, ParseBufferElements) {
+  // Empty:
+  std::vector<int8_t> buffer0(0);
+  IREE_EXPECT_OK(ParseBufferElements<int8_t>("", iree::span<int8_t>(buffer0)));
+  EXPECT_THAT(buffer0, Eq(std::vector<int8_t>{}));
+  std::vector<int8_t> buffer8(8, 123);
+  IREE_EXPECT_OK(ParseBufferElements<int8_t>("", iree::span<int8_t>(buffer8)));
+  EXPECT_THAT(buffer8, Eq(std::vector<int8_t>{0, 0, 0, 0, 0, 0, 0, 0}));
+  // Scalar:
+  std::vector<int8_t> buffer1(1);
+  IREE_EXPECT_OK(ParseBufferElements<int8_t>("1", iree::span<int8_t>(buffer1)));
+  EXPECT_THAT(buffer1, Eq(std::vector<int8_t>{1}));
+  // Splat:
+  IREE_EXPECT_OK(ParseBufferElements<int8_t>("3", iree::span<int8_t>(buffer8)));
+  EXPECT_THAT(buffer8, Eq(std::vector<int8_t>{3, 3, 3, 3, 3, 3, 3, 3}));
+  // 1:1:
+  IREE_EXPECT_OK(ParseBufferElements<int8_t>("2", iree::span<int8_t>(buffer1)));
+  EXPECT_THAT(buffer1, Eq(std::vector<int8_t>{2}));
+  std::vector<int16_t> buffer8i16(8);
+  IREE_EXPECT_OK(ParseBufferElements<int16_t>("0 1 2 3 4 5 6 7",
+                                              iree::span<int16_t>(buffer8i16)));
+  EXPECT_THAT(buffer8i16, Eq(std::vector<int16_t>{0, 1, 2, 3, 4, 5, 6, 7}));
+  std::vector<int32_t> buffer8i32(8);
+  IREE_EXPECT_OK(ParseBufferElements<int32_t>("[0 1 2 3] [4 5 6 7]",
+                                              iree::span<int32_t>(buffer8i32)));
+  EXPECT_THAT(buffer8i32, Eq(std::vector<int32_t>{0, 1, 2, 3, 4, 5, 6, 7}));
+}
+
+TEST(BufferElementsStringUtilTest, ParseBufferElementsOpaque) {
+  std::vector<uint16_t> buffer3i16(3);
+  IREE_EXPECT_OK(ParseBufferElements("0011 2233 4455",
+                                     IREE_HAL_ELEMENT_TYPE_OPAQUE_16,
+                                     iree::span<uint16_t>(buffer3i16)));
+  EXPECT_THAT(buffer3i16, Eq(std::vector<uint16_t>{0x1100, 0x3322, 0x5544}));
+}
+
+TEST(BufferElementsStringUtilTest, ParseBufferElementsInvalid) {
+  std::vector<int8_t> buffer0(0);
+  EXPECT_THAT(ParseBufferElements("abc", iree::span<int8_t>(buffer0)),
+              StatusIs(StatusCode::kOutOfRange));
+  std::vector<int8_t> buffer1(1);
+  EXPECT_THAT(ParseBufferElements("abc", iree::span<int8_t>(buffer1)),
+              StatusIs(StatusCode::kInvalidArgument));
+  std::vector<int8_t> buffer8(8);
+  EXPECT_THAT(ParseBufferElements("1 2 3", iree::span<int8_t>(buffer8)),
+              StatusIs(StatusCode::kOutOfRange));
+  std::vector<int8_t> buffer4(4);
+  EXPECT_THAT(ParseBufferElements("1 2 3 4 5", iree::span<int8_t>(buffer4)),
+              StatusIs(StatusCode::kOutOfRange));
+}
+
// Shape-driven overload: the element count comes from the Shape, and the
// returned vector is allocated by the parser (empty text -> zeros, single
// value -> splat, otherwise one value per element).
TEST(BufferElementsStringUtilTest, ParseBufferElementsShaped) {
  // Empty:
  EXPECT_THAT(ParseBufferElements<int8_t>("", Shape{2, 4}),
              IsOkAndHolds(Eq(std::vector<int8_t>{0, 0, 0, 0, 0, 0, 0, 0})));
  // Scalar:
  EXPECT_THAT(ParseBufferElements<int8_t>("", Shape{}),
              IsOkAndHolds(Eq(std::vector<int8_t>{0})));
  EXPECT_THAT(ParseBufferElements<int8_t>("1", Shape{}),
              IsOkAndHolds(Eq(std::vector<int8_t>{1})));
  // Splat:
  EXPECT_THAT(ParseBufferElements<int8_t>("3", Shape{2, 4}),
              IsOkAndHolds(Eq(std::vector<int8_t>{3, 3, 3, 3, 3, 3, 3, 3})));
  // 1:1:
  EXPECT_THAT(ParseBufferElements<int8_t>("2", Shape{1}),
              IsOkAndHolds(Eq(std::vector<int8_t>{2})));
  EXPECT_THAT(ParseBufferElements<int16_t>("0 1 2 3 4 5 6 7", Shape{2, 4}),
              IsOkAndHolds(Eq(std::vector<int16_t>{0, 1, 2, 3, 4, 5, 6, 7})));
  EXPECT_THAT(ParseBufferElements<int32_t>("[0 1 2 3] [4 5 6 7]", Shape{2, 4}),
              IsOkAndHolds(Eq(std::vector<int32_t>{0, 1, 2, 3, 4, 5, 6, 7})));
}
+
// Shape-driven overload error cases: unparseable text is kInvalidArgument;
// a value count that disagrees with the shape's element count is kOutOfRange.
TEST(BufferElementsStringUtilTest, ParseBufferElementsShapedInvalid) {
  EXPECT_THAT(ParseBufferElements<int8_t>("abc", Shape{}),
              StatusIs(StatusCode::kInvalidArgument));
  EXPECT_THAT(ParseBufferElements<int8_t>("1 2 3", Shape{2, 4}),
              StatusIs(StatusCode::kOutOfRange));
  EXPECT_THAT(ParseBufferElements<int8_t>("1 2 3 4 5", Shape{2, 2}),
              StatusIs(StatusCode::kOutOfRange));
}
+
// Formatting buffers: rank-0/rank-1 values are space-separated, and each
// innermost row of higher-rank shapes is wrapped in [brackets] with no
// separators between rows. The large case checks no elision happens by
// default (all 100x3 elements are printed).
TEST(BufferElementsStringUtilTest, FormatBufferElements) {
  EXPECT_THAT(FormatBufferElements<int8_t>({1}, Shape{}), IsOkAndHolds("1"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1}, Shape{1}), IsOkAndHolds("1"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{4}),
              IsOkAndHolds("1 2 3 4"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{2, 2}),
              IsOkAndHolds("[1 2][3 4]"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{4, 1}),
              IsOkAndHolds("[1][2][3][4]"));
  EXPECT_THAT(
      FormatBufferElements<int32_t>(std::vector<int32_t>(300, -99),
                                    Shape{100, 3}),
      IsOkAndHolds(
          "[-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99]"));
}
+
// Elision: the trailing max_element_count argument caps how many elements are
// printed; omitted elements are replaced with "..." (per innermost row for
// shaped output), and a generous cap prints everything with no marker.
TEST(BufferElementsStringUtilTest, FormatBufferElementsElided) {
  EXPECT_THAT(FormatBufferElements<int8_t>({1}, Shape{}, 0),
              IsOkAndHolds("..."));
  EXPECT_THAT(FormatBufferElements<int8_t>({1}, Shape{}, 1), IsOkAndHolds("1"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1}, Shape{}, 99123),
              IsOkAndHolds("1"));

  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{4}, 0),
              IsOkAndHolds("..."));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{4}, 1),
              IsOkAndHolds("1..."));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{4}, 3),
              IsOkAndHolds("1 2 3..."));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{4}, 99123),
              IsOkAndHolds("1 2 3 4"));

  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{2, 2}, 0),
              IsOkAndHolds("[...][...]"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{2, 2}, 1),
              IsOkAndHolds("[1...][...]"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{2, 2}, 3),
              IsOkAndHolds("[1 2][3...]"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{2, 2}, 99123),
              IsOkAndHolds("[1 2][3 4]"));
}
+
+TEST(BufferViewStringUtilTest, Parse) {
+  IREE_ASSERT_OK_AND_ASSIGN(auto allocator, Allocator::CreateHostLocal());
+
+  // Zero fill.
+  IREE_ASSERT_OK_AND_ASSIGN(auto bv0, BufferView::Parse("i8", allocator));
+  EXPECT_THAT(bv0.buffer().CloneData<int8_t>(),
+              IsOkAndHolds(Eq(std::vector<int8_t>{0})));
+
+  // Zero fill (empty value).
+  IREE_ASSERT_OK_AND_ASSIGN(auto bv1, BufferView::Parse("2x2xi8=", allocator));
+  EXPECT_THAT(bv1.buffer().CloneData<int8_t>(),
+              IsOkAndHolds(Eq(std::vector<int8_t>{0, 0, 0, 0})));
+
+  // Splat.
+  IREE_ASSERT_OK_AND_ASSIGN(auto bv2, BufferView::Parse("2x2xi8=3", allocator));
+  EXPECT_THAT(bv2.buffer().CloneData<int8_t>(),
+              IsOkAndHolds(Eq(std::vector<int8_t>{3, 3, 3, 3})));
+
+  // Flat list.
+  IREE_ASSERT_OK_AND_ASSIGN(auto bv3,
+                            BufferView::Parse("2x2xi8=1 2 3 4", allocator));
+  EXPECT_THAT(bv3.buffer().CloneData<int8_t>(),
+              IsOkAndHolds(Eq(std::vector<int8_t>{1, 2, 3, 4})));
+
+  // Whitespace and separators shouldn't matter.
+  IREE_ASSERT_OK_AND_ASSIGN(
+      auto bv4, BufferView::Parse("  2x2xi8 =  1,\n2 3\t,4", allocator));
+  EXPECT_THAT(bv4.buffer().CloneData<int8_t>(),
+              IsOkAndHolds(Eq(std::vector<int8_t>{1, 2, 3, 4})));
+
+  // Brackets are optional.
+  IREE_ASSERT_OK_AND_ASSIGN(
+      auto bv5, BufferView::Parse("4xi16=[[0][1][2]][3]", allocator));
+  EXPECT_THAT(bv5.buffer().CloneData<int16_t>(),
+              IsOkAndHolds(Eq(std::vector<int16_t>{0, 1, 2, 3})));
+}
+
// Malformed buffer view strings: missing/garbled shape-type prefixes are
// kInvalidArgument, while a valid prefix with too few values is kOutOfRange.
TEST(BufferViewStringUtilTest, ParseInvalid) {
  IREE_ASSERT_OK_AND_ASSIGN(auto allocator, Allocator::CreateHostLocal());

  // Incomplete.
  EXPECT_THAT(BufferView::Parse("", allocator),
              StatusIs(StatusCode::kInvalidArgument));
  EXPECT_THAT(BufferView::Parse("asdf", allocator),
              StatusIs(StatusCode::kInvalidArgument));
  EXPECT_THAT(BufferView::Parse("9x8=", allocator),
              StatusIs(StatusCode::kInvalidArgument));
  EXPECT_THAT(BufferView::Parse("=4", allocator),
              StatusIs(StatusCode::kInvalidArgument));

  // Partial data.
  EXPECT_THAT(BufferView::Parse("2x4xi32=5 3", allocator),
              StatusIs(StatusCode::kOutOfRange));
}
+
// NOTE(review): despite its name this test never calls BufferView::ToString —
// it repeats the FormatBufferElements expectations above verbatim. It may be
// intended as a placeholder; consider exercising BufferView::ToString directly
// (RoundTrip below does so indirectly) — TODO confirm intent.
TEST(BufferViewStringUtilTest, ToString) {
  EXPECT_THAT(FormatBufferElements<int8_t>({1}, Shape{}), IsOkAndHolds("1"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1}, Shape{1}), IsOkAndHolds("1"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{4}),
              IsOkAndHolds("1 2 3 4"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{2, 2}),
              IsOkAndHolds("[1 2][3 4]"));
  EXPECT_THAT(FormatBufferElements<int8_t>({1, 2, 3, 4}, Shape{4, 1}),
              IsOkAndHolds("[1][2][3][4]"));
  EXPECT_THAT(
      FormatBufferElements<int32_t>(std::vector<int32_t>(300, -99),
                                    Shape{100, 3}),
      IsOkAndHolds(
          "[-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
          "-99]"));
}
+
// Parse followed by ToString must reproduce the canonical input string
// exactly, across signed/unsigned integers, floats, opaque types, and large
// multi-row shapes. Inputs here are already in canonical form so equality is
// expected byte-for-byte.
TEST(BufferViewStringUtilTest, RoundTrip) {
  IREE_ASSERT_OK_AND_ASSIGN(auto allocator, Allocator::CreateHostLocal());
  // Parses source_value into a buffer view and expects ToString to emit the
  // identical text.
  auto expect_round_trip = [&](std::string source_value) {
    IREE_ASSERT_OK_AND_ASSIGN(auto buffer_view,
                              BufferView::Parse(source_value, allocator));
    EXPECT_THAT(buffer_view.ToString(), IsOkAndHolds(source_value));
  };

  expect_round_trip("i8=-8");
  expect_round_trip("si8=-8");
  expect_round_trip("ui8=239");
  expect_round_trip("4xi8=0 -1 2 3");
  expect_round_trip("4xsi8=0 -1 2 3");
  expect_round_trip("4xi16=0 -1 2 3");
  expect_round_trip("4xui16=0 1 2 3");
  expect_round_trip("2x2xi32=[0 1][2 3]");
  expect_round_trip("4xf16=0 0.5 2 3");
  expect_round_trip("4xf32=0 1.1 2 3");
  expect_round_trip("4xf64=0 1.1 2 3");
  expect_round_trip("1x2x3xi8=[[0 1 2][3 4 5]]");
  expect_round_trip("2x*16=AABB CCDD");
  expect_round_trip(
      "100x3xi16=[-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 "
      "-99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99][-99 -99 -99]");
}
+
+}  // namespace
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/utils/BUILD b/runtime/src/iree/hal/utils/BUILD
new file mode 100644
index 0000000..e01552a
--- /dev/null
+++ b/runtime/src/iree/hal/utils/BUILD
@@ -0,0 +1,77 @@
# Copyright 2021 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Utility libraries layered on top of the IREE HAL API and shared across
# driver implementations.

load("//build_tools/bazel:cc_binary_benchmark.bzl", "cc_binary_benchmark")
load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library", "iree_runtime_cc_test")

package(
    default_visibility = ["//visibility:public"],
    features = ["layering_check"],
    licenses = ["notice"],  # Apache 2.0
)

# Helpers for moving buffer contents between host and device memory.
iree_runtime_cc_library(
    name = "buffer_transfer",
    srcs = ["buffer_transfer.c"],
    hdrs = ["buffer_transfer.h"],
    visibility = ["//visibility:public"],
    deps = [
        "//runtime/src/iree/base",
        "//runtime/src/iree/base:tracing",
        "//runtime/src/iree/hal",
    ],
)

# Command buffer implementation that records commands for later replay.
iree_runtime_cc_library(
    name = "deferred_command_buffer",
    srcs = ["deferred_command_buffer.c"],
    hdrs = ["deferred_command_buffer.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":resource_set",
        "//runtime/src/iree/base",
        "//runtime/src/iree/base:tracing",
        "//runtime/src/iree/base/internal:arena",
        "//runtime/src/iree/hal",
    ],
)

# Arena-backed set for retaining HAL resource references.
iree_runtime_cc_library(
    name = "resource_set",
    srcs = ["resource_set.c"],
    hdrs = ["resource_set.h"],
    visibility = ["//visibility:public"],
    deps = [
        "//runtime/src/iree/base",
        "//runtime/src/iree/base:tracing",
        "//runtime/src/iree/base/internal:arena",
        "//runtime/src/iree/hal",
    ],
)

cc_binary_benchmark(
    name = "resource_set_benchmark",
    srcs = ["resource_set_benchmark.c"],
    deps = [
        ":resource_set",
        "//runtime/src/iree/base",
        "//runtime/src/iree/base/internal:prng",
        "//runtime/src/iree/hal",
        "//runtime/src/iree/testing:benchmark",
    ],
)

iree_runtime_cc_test(
    name = "resource_set_test",
    srcs = ["resource_set_test.cc"],
    deps = [
        ":resource_set",
        "//runtime/src/iree/base",
        "//runtime/src/iree/hal",
        "//runtime/src/iree/testing:gtest",
        "//runtime/src/iree/testing:gtest_main",
    ],
)
diff --git a/runtime/src/iree/hal/utils/CMakeLists.txt b/runtime/src/iree/hal/utils/CMakeLists.txt
new file mode 100644
index 0000000..1f589f5
--- /dev/null
+++ b/runtime/src/iree/hal/utils/CMakeLists.txt
@@ -0,0 +1,85 @@
################################################################################
# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
# runtime/src/iree/hal/utils/BUILD                                             #
#                                                                              #
# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
# CMake-only content.                                                          #
#                                                                              #
# To disable autogeneration for this file entirely, delete this header.        #
################################################################################

iree_add_all_subdirs()

iree_cc_library(
  NAME
    buffer_transfer
  HDRS
    "buffer_transfer.h"
  SRCS
    "buffer_transfer.c"
  DEPS
    iree::base
    iree::base::tracing
    iree::hal
  PUBLIC
)

iree_cc_library(
  NAME
    deferred_command_buffer
  HDRS
    "deferred_command_buffer.h"
  SRCS
    "deferred_command_buffer.c"
  DEPS
    ::resource_set
    iree::base
    iree::base::internal::arena
    iree::base::tracing
    iree::hal
  PUBLIC
)

iree_cc_library(
  NAME
    resource_set
  HDRS
    "resource_set.h"
  SRCS
    "resource_set.c"
  DEPS
    iree::base
    iree::base::internal::arena
    iree::base::tracing
    iree::hal
  PUBLIC
)

iree_cc_binary_benchmark(
  NAME
    resource_set_benchmark
  SRCS
    "resource_set_benchmark.c"
  DEPS
    ::resource_set
    iree::base
    iree::base::internal::prng
    iree::hal
    iree::testing::benchmark
  TESTONLY
)

iree_cc_test(
  NAME
    resource_set_test
  SRCS
    "resource_set_test.cc"
  DEPS
    ::resource_set
    iree::base
    iree::hal
    iree::testing::gtest
    iree::testing::gtest_main
)

### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###

# NOTE: this file is autogenerated from the sibling BUILD file. To change the
# targets above, edit runtime/src/iree/hal/utils/BUILD and re-run
# build_tools/bazel_to_cmake/bazel_to_cmake.py; only content below the
# preservation marker survives regeneration.
diff --git a/runtime/src/iree/hal/utils/buffer_transfer.c b/runtime/src/iree/hal/utils/buffer_transfer.c
new file mode 100644
index 0000000..553d049
--- /dev/null
+++ b/runtime/src/iree/hal/utils/buffer_transfer.c
@@ -0,0 +1,364 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/utils/buffer_transfer.h"
+
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_device_transfer_range implementations
+//===----------------------------------------------------------------------===//
+
+// Synchronously transfers a range between host/device buffers, choosing the
+// cheapest available path:
+//   1. both sides mappable -> zero-allocation mapped memcpy
+//   2. small host->device transfer -> inline command buffer update
+//   3. otherwise -> staging buffer allocation(s) + device copy (+ readback)
+// Blocks until the transfer completes or |timeout| elapses.
+IREE_API_EXPORT iree_status_t iree_hal_device_submit_transfer_range_and_wait(
+    iree_hal_device_t* device, iree_hal_transfer_buffer_t source,
+    iree_device_size_t source_offset, iree_hal_transfer_buffer_t target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout) {
+  // If the source and target are both mappable into host memory (or are host
+  // memory) then we can use the fast zero-alloc path. This may actually be
+  // slower than doing a device queue transfer depending on the size of the data
+  // and where the memory lives. For example, if we have two device buffers in
+  // device-local host-visible memory we'd be performing the transfer by pulling
+  // all the memory to the CPU and pushing it back again.
+  // TODO(benvanik): check for device-local -> device-local and avoid mapping.
+  bool is_source_mappable =
+      !source.device_buffer ||
+      (iree_all_bits_set(iree_hal_buffer_memory_type(source.device_buffer),
+                         IREE_HAL_MEMORY_TYPE_HOST_VISIBLE) &&
+       iree_all_bits_set(iree_hal_buffer_allowed_usage(source.device_buffer),
+                         IREE_HAL_BUFFER_USAGE_MAPPING));
+  bool is_target_mappable =
+      !target.device_buffer ||
+      (iree_all_bits_set(iree_hal_buffer_memory_type(target.device_buffer),
+                         IREE_HAL_MEMORY_TYPE_HOST_VISIBLE) &&
+       iree_all_bits_set(iree_hal_buffer_allowed_usage(target.device_buffer),
+                         IREE_HAL_BUFFER_USAGE_MAPPING));
+  if (is_source_mappable && is_target_mappable) {
+    return iree_hal_device_transfer_mappable_range(
+        device, source, source_offset, target, target_offset, data_length,
+        flags, timeout);
+  }
+
+  // If the source is a host buffer under 64KB then we can do a more efficient
+  // (though still relatively costly) update instead of needing a staging
+  // buffer.
+  if (!source.device_buffer && target.device_buffer &&
+      data_length <= IREE_HAL_COMMAND_BUFFER_MAX_UPDATE_SIZE) {
+    const iree_hal_transfer_command_t transfer_command = {
+        .type = IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE,
+        .update =
+            {
+                .source_buffer = source.host_buffer.data,
+                .source_offset = source_offset,
+                .target_buffer = target.device_buffer,
+                .target_offset = target_offset,
+                .length = data_length,
+            },
+    };
+    return iree_hal_device_transfer_and_wait(device, /*wait_semaphore=*/NULL,
+                                             /*wait_value=*/0ull, 1,
+                                             &transfer_command, timeout);
+  }
+
+  iree_status_t status = iree_ok_status();
+
+  // Allocate the staging buffer for upload to the device.
+  iree_hal_buffer_t* source_buffer = source.device_buffer;
+  if (!source_buffer) {
+    // Allocate staging memory with a copy of the host data. We only initialize
+    // the portion being transferred.
+    // TODO(benvanik): use import if supported to avoid the allocation/copy.
+    // TODO(benvanik): make this device-local + host-visible? can be better for
+    // uploads as we know we are never going to read it back.
+    const iree_hal_buffer_params_t source_params = {
+        .type = IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
+                IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+        .usage = IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_MAPPING,
+    };
+    status = iree_hal_allocator_allocate_buffer(
+        iree_hal_device_allocator(device), source_params, data_length,
+        iree_make_const_byte_span(source.host_buffer.data + source_offset,
+                                  data_length),
+        &source_buffer);
+    source_offset = 0;
+  }
+
+  // Allocate the staging buffer for download from the device.
+  // Guarded on |status| so that a failed upload staging allocation above is
+  // not silently overwritten (and its status leaked) by this allocation.
+  iree_hal_buffer_t* target_buffer = target.device_buffer;
+  if (iree_status_is_ok(status) && !target_buffer) {
+    // Allocate uninitialized staging memory for the transfer target.
+    // We only allocate enough for the portion we are transferring.
+    // TODO(benvanik): use import if supported to avoid the allocation/copy.
+    const iree_hal_buffer_params_t target_params = {
+        .type = IREE_HAL_MEMORY_TYPE_HOST_LOCAL |
+                IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE,
+        .usage = IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_MAPPING,
+    };
+    status = iree_hal_allocator_allocate_buffer(
+        iree_hal_device_allocator(device), target_params, data_length,
+        iree_const_byte_span_empty(), &target_buffer);
+    target_offset = 0;
+  }
+
+  // Issue synchronous device copy.
+  if (iree_status_is_ok(status)) {
+    const iree_hal_transfer_command_t transfer_command = {
+        .type = IREE_HAL_TRANSFER_COMMAND_TYPE_COPY,
+        .copy =
+            {
+                .source_buffer = source_buffer,
+                .source_offset = source_offset,
+                .target_buffer = target_buffer,
+                .target_offset = target_offset,
+                .length = data_length,
+            },
+    };
+    status = iree_hal_device_transfer_and_wait(device, /*wait_semaphore=*/NULL,
+                                               /*wait_value=*/0ull, 1,
+                                               &transfer_command, timeout);
+  }
+
+  // Read back the staging buffer into memory, if needed.
+  if (iree_status_is_ok(status) && !target.device_buffer) {
+    status = iree_hal_buffer_map_read(target_buffer, 0, target.host_buffer.data,
+                                      data_length);
+  }
+
+  // Discard staging buffers, if they were required.
+  // NOTE(review): the staging pointers may still be NULL if their allocation
+  // failed; assumes iree_hal_buffer_release is a no-op for NULL - confirm.
+  if (!source.device_buffer) iree_hal_buffer_release(source_buffer);
+  if (!target.device_buffer) iree_hal_buffer_release(target_buffer);
+
+  return status;
+}
+
+// Maps both source and target into host memory and performs the transfer with
+// a plain memcpy. Intended for buffers that are host-visible and mappable (or
+// raw host buffers); see the header for the non-overlap precondition.
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_mappable_range(
+    iree_hal_device_t* device, iree_hal_transfer_buffer_t source,
+    iree_device_size_t source_offset, iree_hal_transfer_buffer_t target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout) {
+  iree_status_t status = iree_ok_status();
+
+  // Map the source range for reading (device buffers) or wrap the raw host
+  // span directly in a mapping struct (no unmap required in that case).
+  iree_hal_buffer_mapping_t source_mapping = {{0}};
+  if (iree_status_is_ok(status)) {
+    if (source.device_buffer) {
+      status = iree_hal_buffer_map_range(
+          source.device_buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+          IREE_HAL_MEMORY_ACCESS_READ, source_offset, data_length,
+          &source_mapping);
+    } else {
+      source_mapping = (iree_hal_buffer_mapping_t){
+          .contents = source.host_buffer,
+      };
+    }
+  }
+
+  // Map the target range with discard-write access (prior contents of the
+  // mapped range are not needed), or wrap the raw host span.
+  iree_hal_buffer_mapping_t target_mapping = {{0}};
+  if (iree_status_is_ok(status)) {
+    if (target.device_buffer) {
+      status = iree_hal_buffer_map_range(
+          target.device_buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+          IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE, target_offset, data_length,
+          &target_mapping);
+    } else {
+      target_mapping = (iree_hal_buffer_mapping_t){
+          .contents = target.host_buffer,
+      };
+    }
+  }
+
+  iree_device_size_t adjusted_data_length = 0;
+  if (iree_status_is_ok(status)) {
+    // Adjust the data length based on the min we have.
+    if (data_length == IREE_WHOLE_BUFFER) {
+      // Whole buffer copy requested - that could mean either, so take the min.
+      adjusted_data_length = iree_min(source_mapping.contents.data_length,
+                                      target_mapping.contents.data_length);
+    } else {
+      // Specific length requested - validate that we have matching lengths.
+      // NOTE(review): IREE_ASSERT_EQ is presumably compiled out in release
+      // builds, so a length mismatch would go undetected there - confirm.
+      IREE_ASSERT_EQ(source_mapping.contents.data_length,
+                     target_mapping.contents.data_length);
+      adjusted_data_length = target_mapping.contents.data_length;
+    }
+
+    // Perform the copy, assuming there's anything to do.
+    // Precondition (per the header): ranges do not overlap, so memcpy is safe.
+    if (adjusted_data_length != 0) {
+      memcpy(target_mapping.contents.data, source_mapping.contents.data,
+             adjusted_data_length);
+    }
+  }
+
+  // Unmap (and flush, when required) even on failure, joining any unmap
+  // errors into the returned status. Host-span wrappers need no unmap.
+  if (source.device_buffer) {
+    status =
+        iree_status_join(status, iree_hal_buffer_unmap_range(&source_mapping));
+  }
+  if (target.device_buffer) {
+    // Non-host-coherent memory requires an explicit flush of the written
+    // bytes before unmapping.
+    if (adjusted_data_length > 0 &&
+        !iree_all_bits_set(iree_hal_buffer_memory_type(target.device_buffer),
+                           IREE_HAL_MEMORY_TYPE_HOST_COHERENT)) {
+      status = iree_status_join(
+          status, iree_hal_buffer_flush_range(&target_mapping, 0,
+                                              adjusted_data_length));
+    }
+    status =
+        iree_status_join(status, iree_hal_buffer_unmap_range(&target_mapping));
+  }
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_buffer_map_range implementations
+//===----------------------------------------------------------------------===//
+
+// Transient state tracking one emulated mapping: the host-local staging buffer
+// and its host mapping. Allocated by iree_hal_buffer_emulated_map_range,
+// stashed in mapping->impl.reserved[0], and consumed/freed by
+// iree_hal_buffer_emulated_unmap_range.
+typedef struct iree_hal_emulated_buffer_mapping_t {
+  iree_hal_buffer_t* host_local_buffer;
+  iree_hal_buffer_mapping_t host_local_mapping;
+} iree_hal_emulated_buffer_mapping_t;
+
+// Emulates iree_hal_buffer_map_range for buffers that are not host-mappable:
+// allocates a host-local staging buffer, optionally downloads the device
+// contents into it (unless DISCARD access), and hands back a mapping of the
+// staging memory. Must be paired with iree_hal_buffer_emulated_unmap_range,
+// which uploads any writes and frees the staging resources.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_emulated_map_range(
+    iree_hal_device_t* device, iree_hal_buffer_t* buffer,
+    iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(buffer);
+  IREE_ASSERT_ARGUMENT(mapping);
+
+  iree_hal_allocator_t* device_allocator = iree_hal_device_allocator(device);
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(device);
+
+  // We can't perform persistent mapping with this as we need to manage the
+  // staging buffer lifetime.
+  if (IREE_UNLIKELY(mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT)) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "emulated buffer mapping only possible with scoped mappings");
+  }
+
+  // No implementation should be using this emulated method with memory that is
+  // allocated as mappable.
+  // FIX: query the buffer's allowed *usage* bits here; the previous code
+  // tested IREE_HAL_BUFFER_USAGE_MAPPING against the *memory type* bits - a
+  // mismatched bitfield comparison (compare the equivalent usage checks in
+  // iree_hal_device_submit_transfer_range_and_wait).
+  if (IREE_UNLIKELY(iree_all_bits_set(iree_hal_buffer_allowed_usage(buffer),
+                                      IREE_HAL_BUFFER_USAGE_MAPPING))) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "emulated buffer mapping should not be used with mappable buffers");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)local_byte_length);
+
+  // NOTE: this is assuming that the host is going to be doing a lot of work
+  // on the mapped memory and wants read/write caching and such. If the user
+  // wants write combining on device memory and other things they should ensure
+  // this emulated mapping path is not hit.
+
+  // Create a transient struct we use to track the emulated operation.
+  // We could pack this into the mapping but this composes better - it's small
+  // and pooled by the host allocator anyway.
+  iree_hal_emulated_buffer_mapping_t* emulation_state = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, sizeof(*emulation_state),
+                                (void**)&emulation_state));
+
+  // Allocate the buffer we'll be using to stage our copy of the device memory.
+  // All devices should be able to satisfy this host-local + mapping request.
+  iree_status_t status = iree_hal_allocator_allocate_buffer(
+      device_allocator,
+      (iree_hal_buffer_params_t){
+          .type = IREE_HAL_MEMORY_TYPE_HOST_LOCAL,
+          .usage =
+              IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_MAPPING,
+      },
+      local_byte_length, iree_const_byte_span_empty(),
+      &emulation_state->host_local_buffer);
+
+  // We need to capture a copy of the device buffer to work with; unless the
+  // user was nice and said they don't care about the contents with the DISCARD
+  // bit. Ideally we'd also enable invalidate_range to specify subranges we want
+  // to map.
+  // NOTE(review): this reads mapping->buffer, so it assumes the caller has
+  // already populated it with |buffer| before dispatching here - confirm.
+  if (iree_status_is_ok(status) &&
+      !iree_all_bits_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) {
+    // Download (device->host) the data.
+    status = iree_hal_device_transfer_range(
+        device, iree_hal_make_device_transfer_buffer(mapping->buffer),
+        local_byte_offset,
+        iree_hal_make_device_transfer_buffer(
+            emulation_state->host_local_buffer),
+        0, local_byte_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+        iree_infinite_timeout());
+  }
+
+  if (iree_status_is_ok(status)) {
+    // Map the scratch buffer: map-ception.
+    // Code-wise it looks like this may loop back onto this emulated path
+    // but no implementation should be using this emulation if they have host
+    // local IREE_HAL_BUFFER_USAGE_MAPPING memory - and we check that above.
+    status = iree_hal_buffer_map_range(emulation_state->host_local_buffer,
+                                       IREE_HAL_MAPPING_MODE_SCOPED,
+                                       memory_access, 0, local_byte_length,
+                                       &emulation_state->host_local_mapping);
+  }
+
+  // Retain the scratch buffer for the duration of the mapping.
+  if (iree_status_is_ok(status)) {
+    // Note that we are giving back the host-local mapped contents to the user -
+    // they don't need to know it's from our staging buffer.
+    mapping->contents = emulation_state->host_local_mapping.contents;
+    mapping->impl.reserved[0] = (uint64_t)((uintptr_t)emulation_state);
+  } else {
+    // NOTE(review): if we failed before map_range succeeded this unmaps a
+    // never-mapped host_local_mapping; assumes iree_allocator_malloc
+    // zero-initializes and unmap_range tolerates an empty mapping - confirm.
+    status = iree_status_join(
+        status,
+        iree_hal_buffer_unmap_range(&emulation_state->host_local_mapping));
+    iree_hal_buffer_release(emulation_state->host_local_buffer);
+    iree_allocator_free(host_allocator, emulation_state);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Tears down a mapping produced by iree_hal_buffer_emulated_map_range: unmaps
+// the staging buffer, uploads its contents back to the device when the mapping
+// allowed writes, then releases the staging buffer and transient state.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_emulated_unmap_range(
+    iree_hal_device_t* device, iree_hal_buffer_t* buffer,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(buffer);
+  IREE_ASSERT_ARGUMENT(mapping);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)local_byte_length);
+  // Recover the transient state stashed in reserved[0] by emulated_map_range.
+  iree_hal_emulated_buffer_mapping_t* emulation_state =
+      (iree_hal_emulated_buffer_mapping_t*)((uintptr_t)
+                                                mapping->impl.reserved[0]);
+  IREE_ASSERT_NE(emulation_state, NULL);
+
+  // Unmap the scratch buffer first to make it available for copying (if
+  // needed).
+  iree_status_t status =
+      iree_hal_buffer_unmap_range(&emulation_state->host_local_mapping);
+
+  // If we were writing then we'll need to flush the range.
+  // Ideally we'd keep track of this on the mapping itself based on the user's
+  // calls to flush_range to limit how much we need to transfer.
+  if (iree_status_is_ok(status) &&
+      iree_all_bits_set(mapping->impl.allowed_access,
+                        IREE_HAL_MEMORY_ACCESS_WRITE)) {
+    // Upload (host->device) the data.
+    status = iree_hal_device_transfer_range(
+        device,
+        iree_hal_make_device_transfer_buffer(
+            emulation_state->host_local_buffer),
+        0, iree_hal_make_device_transfer_buffer(mapping->buffer),
+        local_byte_offset, local_byte_length,
+        IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout());
+  }
+
+  // Deallocate the scratch buffer and our emulation state.
+  // The staging resources are always freed, even if the upload above failed.
+  iree_hal_buffer_release(emulation_state->host_local_buffer);
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(device);
+  iree_allocator_free(host_allocator, emulation_state);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/hal/utils/buffer_transfer.h b/runtime/src/iree/hal/utils/buffer_transfer.h
new file mode 100644
index 0000000..2daac0a
--- /dev/null
+++ b/runtime/src/iree/hal/utils/buffer_transfer.h
@@ -0,0 +1,69 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_UTILS_BUFFER_TRANSFER_H_
+#define IREE_HAL_UTILS_BUFFER_TRANSFER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_hal_device_transfer_range implementations
+//===----------------------------------------------------------------------===//
+
+// Performs a full transfer operation on a device transfer queue.
+// This creates a transfer command buffer, submits it against the device, and
+// waits for it to complete synchronously. Implementations that can do this
+// cheaper are encouraged to do so.
+//
+// Precondition: source and target do not overlap.
+IREE_API_EXPORT iree_status_t iree_hal_device_submit_transfer_range_and_wait(
+    iree_hal_device_t* device, iree_hal_transfer_buffer_t source,
+    iree_device_size_t source_offset, iree_hal_transfer_buffer_t target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout);
+
+// Generic implementation of iree_hal_device_transfer_range for when the buffers
+// are mappable. In certain implementations even if buffers are mappable it's
+// often cheaper to still use the full queue transfers: instead of wasting CPU
+// cycles copying the memory (and possible PCIe round-trips) letting the device
+// do it is effectively free.
+//
+// Precondition: source and target do not overlap.
+IREE_API_EXPORT iree_status_t iree_hal_device_transfer_mappable_range(
+    iree_hal_device_t* device, iree_hal_transfer_buffer_t source,
+    iree_device_size_t source_offset, iree_hal_transfer_buffer_t target,
+    iree_device_size_t target_offset, iree_device_size_t data_length,
+    iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_buffer_map_range implementations
+//===----------------------------------------------------------------------===//
+
+// Generic implementation of iree_hal_buffer_map_range and unmap_range for when
+// the buffer is not mappable and a full device transfer is required. This will
+// allocate additional host-local buffers and submit copy commands.
+// Implementations able to do this more efficiently should do so.
+IREE_API_EXPORT iree_status_t iree_hal_buffer_emulated_map_range(
+    iree_hal_device_t* device, iree_hal_buffer_t* buffer,
+    iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping);
+// Must be passed a |mapping| that originated from
+// iree_hal_buffer_emulated_map_range above (the two share transient state
+// stashed in the mapping).
+IREE_API_EXPORT iree_status_t iree_hal_buffer_emulated_unmap_range(
+    iree_hal_device_t* device, iree_hal_buffer_t* buffer,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_UTILS_BUFFER_TRANSFER_H_
diff --git a/runtime/src/iree/hal/utils/deferred_command_buffer.c b/runtime/src/iree/hal/utils/deferred_command_buffer.c
new file mode 100644
index 0000000..347a222
--- /dev/null
+++ b/runtime/src/iree/hal/utils/deferred_command_buffer.c
@@ -0,0 +1,883 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/utils/deferred_command_buffer.h"
+
+#include "iree/base/internal/arena.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/utils/resource_set.h"
+
+//===----------------------------------------------------------------------===//
+// Command recording structures
+//===----------------------------------------------------------------------===//
+
+// Tag identifying which concrete command struct follows an
+// iree_hal_cmd_header_t in the recorded command list.
+// NOTE(review): the enum tag is iree_hal_command_type_e while the typedef is
+// iree_hal_cmd_type_t; consider renaming the tag for naming consistency.
+typedef enum iree_hal_command_type_e {
+  IREE_HAL_CMD_EXECUTION_BARRIER = 0,
+  IREE_HAL_CMD_SIGNAL_EVENT,
+  IREE_HAL_CMD_RESET_EVENT,
+  IREE_HAL_CMD_WAIT_EVENTS,
+  IREE_HAL_CMD_DISCARD_BUFFER,
+  IREE_HAL_CMD_FILL_BUFFER,
+  IREE_HAL_CMD_UPDATE_BUFFER,
+  IREE_HAL_CMD_COPY_BUFFER,
+  IREE_HAL_CMD_PUSH_CONSTANTS,
+  IREE_HAL_CMD_PUSH_DESCRIPTOR_SET,
+  IREE_HAL_CMD_BIND_DESCRIPTOR_SET,
+  IREE_HAL_CMD_DISPATCH,
+  IREE_HAL_CMD_DISPATCH_INDIRECT,
+} iree_hal_cmd_type_t;
+
+// Header prefixed to all commands, forming a linked-list.
+//
+// Each command is allocated from the arena and does *not* retain any resources.
+// We could elide some of these commands by keeping local state however that
+// requires knowing more about the target device (executable layouts, etc) and
+// prevents using this as a way to debug or benchmark command buffers. The
+// intent is that each command captures the exact information passed during the
+// call such that the target command buffer cannot tell they were deferred.
+//
+// As each command is variable sized we store pointers to the following command
+// to allow us to walk the list during replay. Storing just a size would be
+// insufficient as commands may be spread across many arena blocks from the
+// block pool.
+typedef struct iree_hal_cmd_header_t {
+  // Next command in the list or NULL if the end.
+  struct iree_hal_cmd_header_t* next;
+  // Type of the command that follows.
+  iree_hal_cmd_type_t type;
+} iree_hal_cmd_header_t;
+
+// Applies one recorded command (identified by cmd_header->type) against
+// |target_command_buffer| during replay.
+typedef iree_status_t (*iree_hal_cmd_apply_fn_t)(
+    iree_hal_command_buffer_t* target_command_buffer,
+    iree_hal_cmd_header_t* cmd_header);
+
+//===----------------------------------------------------------------------===//
+// Command list allocation and storage
+//===----------------------------------------------------------------------===//
+
+// A singly-linked list of commands allocated from an arena.
+// (Commands do not retain resources; see the note on iree_hal_cmd_header_t.)
+typedef struct iree_hal_cmd_list_t {
+  // Arena used to hold the recorded commands using block_pool for storage.
+  // Will be reset as the command buffer is re-recorded.
+  iree_arena_allocator_t arena;
+
+  // Head of the command list.
+  iree_hal_cmd_header_t* head;
+  // Tail of the command list (may be head).
+  iree_hal_cmd_header_t* tail;
+} iree_hal_cmd_list_t;
+
+// Initializes a new command list that allocates from the given |block_pool|.
+// Upon return the command list is ready for recording.
+// NOTE(review): |block_pool| presumably must outlive the command list since
+// the arena is backed by it - confirm.
+static void iree_hal_cmd_list_initialize(iree_arena_block_pool_t* block_pool,
+                                         iree_hal_cmd_list_t* out_cmd_list) {
+  iree_arena_initialize(block_pool, &out_cmd_list->arena);
+  out_cmd_list->head = NULL;
+  out_cmd_list->tail = NULL;
+}
+
+// Resets the command list and returns all arena blocks back to the block pool.
+// Upon return the command list is ready for recording.
+static void iree_hal_cmd_list_reset(iree_hal_cmd_list_t* cmd_list) {
+  // We could make reset retain a single block so as we know that we'll be
+  // adding more commands on this path and it would remove a round-trip through
+  // the pool.
+  iree_arena_reset(&cmd_list->arena);
+  // Clear head/tail so recording can begin again immediately.
+  cmd_list->head = NULL;
+  cmd_list->tail = NULL;
+}
+
+// Deinitializes the command list, preparing for destruction.
+// Currently identical to a reset: all arena blocks return to the pool.
+static void iree_hal_cmd_list_deinitialize(iree_hal_cmd_list_t* cmd_list) {
+  iree_hal_cmd_list_reset(cmd_list);
+}
+
+// Appends a new command to the command list and returns the base pointer to its
+// storage. Callers must cast to the appropriate type and populate all fields.
+// |command_size| is the total size of the concrete command struct, including
+// its leading iree_hal_cmd_header_t (the returned pointer is to that header,
+// whose |next| and |type| fields are filled in here).
+static iree_status_t iree_hal_cmd_list_append_command(
+    iree_hal_cmd_list_t* cmd_list, iree_hal_cmd_type_t command_type,
+    iree_host_size_t command_size, void** out_cmd) {
+  iree_hal_cmd_header_t* header = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_arena_allocate(&cmd_list->arena, command_size, (void**)&header));
+  header->next = NULL;
+  header->type = command_type;
+  // Link onto the tail (or start the list if empty).
+  if (!cmd_list->head) {
+    cmd_list->head = header;
+  } else if (cmd_list->tail) {
+    cmd_list->tail->next = header;
+  }
+  cmd_list->tail = header;
+  *out_cmd = header;
+  return iree_ok_status();
+}
+
+// Clones a source buffer and returns the pointer into the arena.
+// The cloned storage lives until the cmd_list arena is reset.
+static iree_status_t iree_hal_cmd_list_clone_data(iree_hal_cmd_list_t* cmd_list,
+                                                  const void* source_data,
+                                                  iree_host_size_t data_length,
+                                                  void** out_target_data) {
+  void* target_data = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_arena_allocate(&cmd_list->arena, data_length, &target_data));
+  memcpy(target_data, source_data, data_length);
+  *out_target_data = target_data;
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_deferred_command_buffer_t implementation
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_deferred_command_buffer_t {
+  // Must be first: the cast helper reinterprets the base pointer as this type.
+  iree_hal_command_buffer_t base;
+  iree_allocator_t host_allocator;
+
+  // Maintains a reference to all resources used within the command buffer.
+  // Reset on each begin.
+  iree_hal_resource_set_t* resource_set;
+
+  // All commands in encoding order.
+  iree_hal_cmd_list_t cmd_list;
+} iree_hal_deferred_command_buffer_t;
+
+// Defined at the bottom of the file; forward-declared for the cast helper.
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_deferred_command_buffer_vtable;
+
+// Downcasts |base_value| to the deferred type, asserting the vtable matches.
+static iree_hal_deferred_command_buffer_t*
+iree_hal_deferred_command_buffer_cast(iree_hal_command_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_deferred_command_buffer_vtable);
+  return (iree_hal_deferred_command_buffer_t*)base_value;
+}
+
+// Creates a deferred command buffer whose recorded commands and resource set
+// are allocated from |block_pool|-backed storage. On success a new command
+// buffer reference is returned in |out_command_buffer|.
+IREE_API_EXPORT iree_status_t iree_hal_deferred_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(block_pool);
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  *out_command_buffer = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_deferred_command_buffer_t* command_buffer = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(*command_buffer), (void**)&command_buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_command_buffer_initialize(
+        device, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
+        &iree_hal_deferred_command_buffer_vtable, &command_buffer->base);
+    command_buffer->host_allocator = host_allocator;
+    iree_hal_cmd_list_initialize(block_pool, &command_buffer->cmd_list);
+
+    status = iree_hal_resource_set_allocate(block_pool,
+                                            &command_buffer->resource_set);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_command_buffer = &command_buffer->base;
+  } else {
+    // NOTE(review): if the malloc above failed |command_buffer| is NULL and
+    // this passes a NULL-based pointer to destroy - confirm destroy tolerates
+    // that, or guard this call on |command_buffer| being non-NULL.
+    iree_hal_command_buffer_destroy(&command_buffer->base);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the recorded command list, the resource set, and the command buffer
+// storage itself.
+static void iree_hal_deferred_command_buffer_destroy(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  // Capture the allocator before freeing the struct that holds it.
+  iree_allocator_t host_allocator = command_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cmd_list_deinitialize(&command_buffer->cmd_list);
+  iree_hal_resource_set_free(command_buffer->resource_set);
+  iree_allocator_free(host_allocator, command_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns |command_buffer| when |vtable| identifies the deferred type,
+// otherwise NULL (dynamic-cast support for the HAL type system).
+static void* iree_hal_deferred_command_buffer_dyn_cast(
+    iree_hal_command_buffer_t* command_buffer, const void* vtable) {
+  if (vtable == &iree_hal_deferred_command_buffer_vtable) {
+    IREE_HAL_ASSERT_TYPE(command_buffer, vtable);
+    return command_buffer;
+  }
+  return NULL;
+}
+
+// Begins (re)recording: discards any previously recorded commands and drops
+// the references held by the resource set.
+static iree_status_t iree_hal_deferred_command_buffer_begin(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  iree_hal_cmd_list_reset(&command_buffer->cmd_list);
+  iree_hal_resource_set_reset(command_buffer->resource_set);
+  return iree_ok_status();
+}
+
+// Nothing to finalize: the recorded command list is kept as-is for later
+// application against a target command buffer.
+static iree_status_t iree_hal_deferred_command_buffer_end(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_EXECUTION_BARRIER
+//===----------------------------------------------------------------------===//
+
+// Recorded execution barrier; the barrier arrays point into arena-cloned
+// copies so the caller's storage need not outlive the recording call.
+typedef struct iree_hal_cmd_execution_barrier_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_execution_stage_t source_stage_mask;
+  iree_hal_execution_stage_t target_stage_mask;
+  iree_hal_execution_barrier_flags_t flags;
+  iree_host_size_t memory_barrier_count;
+  const iree_hal_memory_barrier_t* memory_barriers;
+  iree_host_size_t buffer_barrier_count;
+  const iree_hal_buffer_barrier_t* buffer_barriers;
+} iree_hal_cmd_execution_barrier_t;
+
+// Records an execution barrier, deep-copying both barrier arrays into the
+// command list arena.
+static iree_status_t iree_hal_deferred_command_buffer_execution_barrier(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  iree_hal_cmd_list_t* cmd_list =
+      &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+  iree_hal_cmd_execution_barrier_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      cmd_list, IREE_HAL_CMD_EXECUTION_BARRIER, sizeof(*cmd), (void**)&cmd));
+  cmd->source_stage_mask = source_stage_mask;
+  cmd->target_stage_mask = target_stage_mask;
+  cmd->flags = flags;
+  cmd->memory_barrier_count = memory_barrier_count;
+  cmd->memory_barriers = NULL;
+  cmd->buffer_barrier_count = buffer_barrier_count;
+  cmd->buffer_barriers = NULL;
+  // NOTE(review): if either clone below fails the command has already been
+  // appended with a non-zero count but NULL array; confirm callers discard
+  // the command buffer on recording errors rather than replaying it.
+  if (memory_barrier_count > 0) {
+    IREE_RETURN_IF_ERROR(iree_hal_cmd_list_clone_data(
+        cmd_list, memory_barriers,
+        sizeof(memory_barriers[0]) * memory_barrier_count,
+        (void**)&cmd->memory_barriers));
+  }
+  if (buffer_barrier_count > 0) {
+    IREE_RETURN_IF_ERROR(iree_hal_cmd_list_clone_data(
+        cmd_list, buffer_barriers,
+        sizeof(buffer_barriers[0]) * buffer_barrier_count,
+        (void**)&cmd->buffer_barriers));
+  }
+  return iree_ok_status();
+}
+
+// Replays a recorded execution barrier onto |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_execution_barrier(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_execution_barrier_t* cmd) {
+  return iree_hal_command_buffer_execution_barrier(
+      target_command_buffer, cmd->source_stage_mask, cmd->target_stage_mask,
+      cmd->flags, cmd->memory_barrier_count, cmd->memory_barriers,
+      cmd->buffer_barrier_count, cmd->buffer_barriers);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_SIGNAL_EVENT
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_signal_event.
+typedef struct iree_hal_cmd_signal_event_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_event_t* event;
+  iree_hal_execution_stage_t source_stage_mask;
+} iree_hal_cmd_signal_event_t;
+
+// Records a signal-event command; |event| is retained by the resource set so
+// it stays live until the command buffer is reset or released.
+static iree_status_t iree_hal_deferred_command_buffer_signal_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 1, &event));
+  iree_hal_cmd_signal_event_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      &command_buffer->cmd_list, IREE_HAL_CMD_SIGNAL_EVENT, sizeof(*cmd),
+      (void**)&cmd));
+  cmd->source_stage_mask = source_stage_mask;
+  cmd->event = event;
+  return iree_ok_status();
+}
+
+// Replays a recorded signal-event command against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_signal_event(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_signal_event_t* cmd) {
+  return iree_hal_command_buffer_signal_event(target_command_buffer, cmd->event,
+                                              cmd->source_stage_mask);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_RESET_EVENT
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_reset_event.
+typedef struct iree_hal_cmd_reset_event_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_event_t* event;
+  iree_hal_execution_stage_t source_stage_mask;
+} iree_hal_cmd_reset_event_t;
+
+// Records an event reset; |event| is retained by the resource set so it stays
+// live until the command buffer is reset or released.
+static iree_status_t iree_hal_deferred_command_buffer_reset_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+  // Retain the event before recording so the command never references a
+  // resource the set does not hold.
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 1, &event));
+  iree_hal_cmd_reset_event_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      cmd_list, IREE_HAL_CMD_RESET_EVENT, sizeof(*cmd), (void**)&cmd));
+  cmd->event = event;
+  cmd->source_stage_mask = source_stage_mask;
+  return iree_ok_status();
+}
+
+// Replays a recorded event reset against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_reset_event(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_reset_event_t* cmd) {
+  return iree_hal_command_buffer_reset_event(target_command_buffer, cmd->event,
+                                             cmd->source_stage_mask);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_WAIT_EVENTS
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_wait_events. The waited events
+// are stored inline in the trailing flexible array while the barrier lists are
+// cloned into the command list arena.
+typedef struct iree_hal_cmd_wait_events_t {
+  iree_hal_cmd_header_t header;
+  iree_host_size_t event_count;
+  iree_hal_execution_stage_t source_stage_mask;
+  iree_hal_execution_stage_t target_stage_mask;
+  iree_host_size_t memory_barrier_count;
+  const iree_hal_memory_barrier_t* memory_barriers;
+  iree_host_size_t buffer_barrier_count;
+  const iree_hal_buffer_barrier_t* buffer_barriers;
+  iree_hal_event_t* events[];
+} iree_hal_cmd_wait_events_t;
+
+// Records a wait on |event_count| events. All events are retained in the
+// resource set and the barrier arrays are copied so the caller's storage need
+// not outlive this call.
+static iree_status_t iree_hal_deferred_command_buffer_wait_events(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_host_size_t event_count, const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, event_count, events));
+  iree_hal_cmd_wait_events_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      cmd_list, IREE_HAL_CMD_WAIT_EVENTS,
+      sizeof(*cmd) + sizeof(cmd->events[0]) * event_count, (void**)&cmd));
+  cmd->event_count = event_count;
+  cmd->source_stage_mask = source_stage_mask;
+  cmd->target_stage_mask = target_stage_mask;
+  // Initialize the barrier fields to an empty state and only publish the
+  // counts after the storage has been successfully cloned; otherwise a failed
+  // clone below would leave a recorded command with a non-zero count but a
+  // NULL pointer that would be dereferenced during replay.
+  cmd->memory_barrier_count = 0;
+  cmd->memory_barriers = NULL;
+  cmd->buffer_barrier_count = 0;
+  cmd->buffer_barriers = NULL;
+  memcpy(cmd->events, events, sizeof(cmd->events[0]) * event_count);
+  if (memory_barrier_count > 0) {
+    IREE_RETURN_IF_ERROR(iree_hal_cmd_list_clone_data(
+        cmd_list, memory_barriers,
+        sizeof(memory_barriers[0]) * memory_barrier_count,
+        (void**)&cmd->memory_barriers));
+    cmd->memory_barrier_count = memory_barrier_count;
+  }
+  if (buffer_barrier_count > 0) {
+    IREE_RETURN_IF_ERROR(iree_hal_cmd_list_clone_data(
+        cmd_list, buffer_barriers,
+        sizeof(buffer_barriers[0]) * buffer_barrier_count,
+        (void**)&cmd->buffer_barriers));
+    cmd->buffer_barrier_count = buffer_barrier_count;
+  }
+  return iree_ok_status();
+}
+
+// Replays a recorded wait against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_wait_events(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_wait_events_t* cmd) {
+  return iree_hal_command_buffer_wait_events(
+      target_command_buffer, cmd->event_count,
+      (const iree_hal_event_t**)cmd->events, cmd->source_stage_mask,
+      cmd->target_stage_mask, cmd->memory_barrier_count, cmd->memory_barriers,
+      cmd->buffer_barrier_count, cmd->buffer_barriers);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_DISCARD_BUFFER
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_discard_buffer.
+typedef struct iree_hal_cmd_discard_buffer_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_buffer_t* buffer;
+} iree_hal_cmd_discard_buffer_t;
+
+// Records a discard; |buffer| is retained in the resource set.
+static iree_status_t iree_hal_deferred_command_buffer_discard_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 1, &buffer));
+  iree_hal_cmd_discard_buffer_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      &command_buffer->cmd_list, IREE_HAL_CMD_DISCARD_BUFFER, sizeof(*cmd),
+      (void**)&cmd));
+  cmd->buffer = buffer;
+  return iree_ok_status();
+}
+
+// Replays a recorded discard against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_discard_buffer(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_discard_buffer_t* cmd) {
+  return iree_hal_command_buffer_discard_buffer(target_command_buffer,
+                                                cmd->buffer);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_FILL_BUFFER
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_fill_buffer. The fill pattern
+// (at most sizeof(uint64_t) bytes) is stored inline by value.
+typedef struct iree_hal_cmd_fill_buffer_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_buffer_t* target_buffer;
+  iree_device_size_t target_offset;
+  iree_device_size_t length;
+  uint64_t pattern;
+  iree_host_size_t pattern_length;
+} iree_hal_cmd_fill_buffer_t;
+
+// Records a fill; the |pattern| bytes are copied into the command so the
+// caller's storage need not outlive this call. |target_buffer| is retained in
+// the resource set.
+static iree_status_t iree_hal_deferred_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+  iree_hal_cmd_fill_buffer_t* cmd = NULL;
+  if (pattern_length > sizeof(cmd->pattern)) {
+    // The inline storage is a uint64_t so lengths up to and including 8 bytes
+    // are accepted; the message must say <= to match the check above.
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "fill patterns must be <= 8 bytes");
+  }
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &target_buffer));
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      cmd_list, IREE_HAL_CMD_FILL_BUFFER, sizeof(*cmd), (void**)&cmd));
+  cmd->target_buffer = target_buffer;
+  cmd->target_offset = target_offset;
+  cmd->length = length;
+  memcpy(&cmd->pattern, pattern, pattern_length);
+  cmd->pattern_length = pattern_length;
+  return iree_ok_status();
+}
+
+// Replays a recorded fill against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_fill_buffer(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_fill_buffer_t* cmd) {
+  // The pattern is stored by value, so pass the address of the inline
+  // storage. NOTE: the previous (void**) cast was misleading -- the fill API
+  // takes a const void* pointer to pattern bytes, not a pointer-to-pointer.
+  return iree_hal_command_buffer_fill_buffer(
+      target_command_buffer, cmd->target_buffer, cmd->target_offset,
+      cmd->length, (const void*)&cmd->pattern, cmd->pattern_length);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_UPDATE_BUFFER
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_update_buffer. The source host
+// bytes are stored inline in the trailing flexible array.
+typedef struct iree_hal_cmd_update_buffer_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_buffer_t* target_buffer;
+  iree_device_size_t target_offset;
+  iree_device_size_t length;
+  uint8_t source_buffer[];
+} iree_hal_cmd_update_buffer_t;
+
+// Records an update; the |length| bytes at |source_buffer|+|source_offset|
+// are copied inline so the caller's host memory need not outlive this call.
+// NOTE(review): |length| is an iree_device_size_t used to size host storage in
+// the command list arena; very large updates would bloat recording -- confirm
+// expected sizes with callers.
+static iree_status_t iree_hal_deferred_command_buffer_update_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &target_buffer));
+  iree_hal_cmd_update_buffer_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      cmd_list, IREE_HAL_CMD_UPDATE_BUFFER,
+      sizeof(*cmd) + sizeof(cmd->source_buffer[0]) * length, (void**)&cmd));
+  cmd->target_buffer = target_buffer;
+  cmd->target_offset = target_offset;
+  cmd->length = length;
+  // Bake |source_offset| into the copy so replay can always start at 0.
+  memcpy(cmd->source_buffer, (const uint8_t*)source_buffer + source_offset,
+         sizeof(cmd->source_buffer[0]) * length);
+  return iree_ok_status();
+}
+
+// Replays a recorded update. The source offset is 0 because the recording
+// already applied it when cloning the bytes inline.
+static iree_status_t iree_hal_deferred_command_buffer_apply_update_buffer(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_update_buffer_t* cmd) {
+  return iree_hal_command_buffer_update_buffer(
+      target_command_buffer, cmd->source_buffer, 0, cmd->target_buffer,
+      cmd->target_offset, cmd->length);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_COPY_BUFFER
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_copy_buffer.
+typedef struct iree_hal_cmd_copy_buffer_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_buffer_t* source_buffer;
+  iree_device_size_t source_offset;
+  iree_hal_buffer_t* target_buffer;
+  iree_device_size_t target_offset;
+  iree_device_size_t length;
+} iree_hal_cmd_copy_buffer_t;
+
+// Records a buffer copy; both buffers are retained in the resource set.
+static iree_status_t iree_hal_deferred_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  const void* retained[2] = {source_buffer, target_buffer};
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 2, retained));
+  iree_hal_cmd_copy_buffer_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      &command_buffer->cmd_list, IREE_HAL_CMD_COPY_BUFFER, sizeof(*cmd),
+      (void**)&cmd));
+  cmd->length = length;
+  cmd->source_buffer = source_buffer;
+  cmd->source_offset = source_offset;
+  cmd->target_buffer = target_buffer;
+  cmd->target_offset = target_offset;
+  return iree_ok_status();
+}
+
+// Replays a recorded copy against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_copy_buffer(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_copy_buffer_t* cmd) {
+  return iree_hal_command_buffer_copy_buffer(
+      target_command_buffer, cmd->source_buffer, cmd->source_offset,
+      cmd->target_buffer, cmd->target_offset, cmd->length);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_PUSH_CONSTANTS
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_push_constants. The constant
+// bytes are stored inline in the trailing flexible array.
+typedef struct iree_hal_cmd_push_constants_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_executable_layout_t* executable_layout;
+  iree_host_size_t offset;
+  iree_host_size_t values_length;
+  uint8_t values[];
+} iree_hal_cmd_push_constants_t;
+
+// Records a push-constant update; the |values| bytes are copied inline and
+// |executable_layout| is retained in the resource set.
+static iree_status_t iree_hal_deferred_command_buffer_push_constants(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &executable_layout));
+  iree_hal_cmd_push_constants_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      cmd_list, IREE_HAL_CMD_PUSH_CONSTANTS,
+      sizeof(*cmd) + sizeof(cmd->values[0]) * values_length, (void**)&cmd));
+  cmd->executable_layout = executable_layout;
+  cmd->offset = offset;
+  cmd->values_length = values_length;
+  memcpy(cmd->values, values, sizeof(cmd->values[0]) * values_length);
+  return iree_ok_status();
+}
+
+// Replays a recorded push-constant update against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_push_constants(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_push_constants_t* cmd) {
+  return iree_hal_command_buffer_push_constants(
+      target_command_buffer, cmd->executable_layout, cmd->offset, cmd->values,
+      cmd->values_length);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_PUSH_DESCRIPTOR_SET
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_push_descriptor_set. The
+// bindings are stored inline in the trailing flexible array.
+typedef struct iree_hal_cmd_push_descriptor_set_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_executable_layout_t* executable_layout;
+  uint32_t set;
+  iree_host_size_t binding_count;
+  iree_hal_descriptor_set_binding_t bindings[];
+} iree_hal_cmd_push_descriptor_set_t;
+
+// Records a descriptor set push; the layout and each binding's buffer are
+// retained in the resource set and the binding array is copied inline.
+static iree_status_t iree_hal_deferred_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &executable_layout));
+  // NOTE(review): buffers are inserted one at a time; assumes every binding
+  // carries a valid buffer pointer -- confirm against callers.
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+        command_buffer->resource_set, 1, &bindings[i].buffer));
+  }
+  iree_hal_cmd_push_descriptor_set_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      cmd_list, IREE_HAL_CMD_PUSH_DESCRIPTOR_SET,
+      sizeof(*cmd) + sizeof(cmd->bindings[0]) * binding_count, (void**)&cmd));
+  cmd->executable_layout = executable_layout;
+  cmd->set = set;
+  cmd->binding_count = binding_count;
+  memcpy(cmd->bindings, bindings, sizeof(cmd->bindings[0]) * binding_count);
+  return iree_ok_status();
+}
+
+// Replays a recorded descriptor set push against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_push_descriptor_set(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_push_descriptor_set_t* cmd) {
+  return iree_hal_command_buffer_push_descriptor_set(
+      target_command_buffer, cmd->executable_layout, cmd->set,
+      cmd->binding_count, cmd->bindings);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_BIND_DESCRIPTOR_SET
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_bind_descriptor_set. The
+// dynamic offsets are stored inline in the trailing flexible array.
+typedef struct iree_hal_cmd_bind_descriptor_set_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_executable_layout_t* executable_layout;
+  uint32_t set;
+  iree_hal_descriptor_set_t* descriptor_set;
+  iree_host_size_t dynamic_offset_count;
+  iree_device_size_t dynamic_offsets[];
+} iree_hal_cmd_bind_descriptor_set_t;
+
+// Records a descriptor set binding; the layout and descriptor set are
+// retained in the resource set and the dynamic offsets are copied inline.
+static iree_status_t iree_hal_deferred_command_buffer_bind_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+  // Retain both resources with one batched insert.
+  const void* resources[2] = {executable_layout, descriptor_set};
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 2, resources));
+  iree_hal_cmd_bind_descriptor_set_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      cmd_list, IREE_HAL_CMD_BIND_DESCRIPTOR_SET,
+      sizeof(*cmd) + sizeof(cmd->dynamic_offsets[0]) * dynamic_offset_count,
+      (void**)&cmd));
+  cmd->executable_layout = executable_layout;
+  cmd->set = set;
+  cmd->descriptor_set = descriptor_set;
+  cmd->dynamic_offset_count = dynamic_offset_count;
+  memcpy(cmd->dynamic_offsets, dynamic_offsets,
+         sizeof(cmd->dynamic_offsets[0]) * dynamic_offset_count);
+  return iree_ok_status();
+}
+
+// Replays a recorded descriptor set binding against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_bind_descriptor_set(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_bind_descriptor_set_t* cmd) {
+  return iree_hal_command_buffer_bind_descriptor_set(
+      target_command_buffer, cmd->executable_layout, cmd->set,
+      cmd->descriptor_set, cmd->dynamic_offset_count, cmd->dynamic_offsets);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_DISPATCH
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_dispatch.
+typedef struct iree_hal_cmd_dispatch_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_executable_t* executable;
+  int32_t entry_point;
+  uint32_t workgroup_x;
+  uint32_t workgroup_y;
+  uint32_t workgroup_z;
+} iree_hal_cmd_dispatch_t;
+
+// Records a dispatch; |executable| is retained in the resource set.
+static iree_status_t iree_hal_deferred_command_buffer_dispatch(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &executable));
+  iree_hal_cmd_dispatch_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      &command_buffer->cmd_list, IREE_HAL_CMD_DISPATCH, sizeof(*cmd),
+      (void**)&cmd));
+  cmd->entry_point = entry_point;
+  cmd->executable = executable;
+  cmd->workgroup_z = workgroup_z;
+  cmd->workgroup_y = workgroup_y;
+  cmd->workgroup_x = workgroup_x;
+  return iree_ok_status();
+}
+
+// Replays a recorded dispatch against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_dispatch(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_dispatch_t* cmd) {
+  return iree_hal_command_buffer_dispatch(
+      target_command_buffer, cmd->executable, cmd->entry_point,
+      cmd->workgroup_x, cmd->workgroup_y, cmd->workgroup_z);
+}
+
+//===----------------------------------------------------------------------===//
+// IREE_HAL_CMD_DISPATCH_INDIRECT
+//===----------------------------------------------------------------------===//
+
+// Recorded payload for iree_hal_command_buffer_dispatch_indirect.
+typedef struct iree_hal_cmd_dispatch_indirect_t {
+  iree_hal_cmd_header_t header;
+  iree_hal_executable_t* executable;
+  int32_t entry_point;
+  iree_hal_buffer_t* workgroups_buffer;
+  iree_device_size_t workgroups_offset;
+} iree_hal_cmd_dispatch_indirect_t;
+
+// Records an indirect dispatch; the executable and workgroup-count buffer are
+// retained in the resource set.
+static iree_status_t iree_hal_deferred_command_buffer_dispatch_indirect(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer,
+    iree_device_size_t workgroups_offset) {
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      iree_hal_deferred_command_buffer_cast(base_command_buffer);
+  iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+  // Retain both resources with one batched insert.
+  const void* resources[2] = {executable, workgroups_buffer};
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 2, resources));
+  iree_hal_cmd_dispatch_indirect_t* cmd = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
+      cmd_list, IREE_HAL_CMD_DISPATCH_INDIRECT, sizeof(*cmd), (void**)&cmd));
+  cmd->executable = executable;
+  cmd->entry_point = entry_point;
+  cmd->workgroups_buffer = workgroups_buffer;
+  cmd->workgroups_offset = workgroups_offset;
+  return iree_ok_status();
+}
+
+// Replays a recorded indirect dispatch against |target_command_buffer|.
+static iree_status_t iree_hal_deferred_command_buffer_apply_dispatch_indirect(
+    iree_hal_command_buffer_t* target_command_buffer,
+    const iree_hal_cmd_dispatch_indirect_t* cmd) {
+  return iree_hal_command_buffer_dispatch_indirect(
+      target_command_buffer, cmd->executable, cmd->entry_point,
+      cmd->workgroups_buffer, cmd->workgroups_offset);
+}
+
+//===----------------------------------------------------------------------===//
+// Dynamic replay dispatch
+//===----------------------------------------------------------------------===//
+
+// Replay dispatch table indexed by the command type recorded in each
+// command's header. Each cast erases the per-command payload type down to the
+// generic apply signature; the entries must cover every IREE_HAL_CMD_* value
+// that can be recorded above.
+static const iree_hal_cmd_apply_fn_t iree_hal_cmd_apply_table[] = {
+    [IREE_HAL_CMD_EXECUTION_BARRIER] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_execution_barrier,
+    [IREE_HAL_CMD_SIGNAL_EVENT] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_signal_event,
+    [IREE_HAL_CMD_RESET_EVENT] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_reset_event,
+    [IREE_HAL_CMD_WAIT_EVENTS] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_wait_events,
+    [IREE_HAL_CMD_DISCARD_BUFFER] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_discard_buffer,
+    [IREE_HAL_CMD_FILL_BUFFER] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_fill_buffer,
+    [IREE_HAL_CMD_UPDATE_BUFFER] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_update_buffer,
+    [IREE_HAL_CMD_COPY_BUFFER] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_copy_buffer,
+    [IREE_HAL_CMD_PUSH_CONSTANTS] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_push_constants,
+    [IREE_HAL_CMD_PUSH_DESCRIPTOR_SET] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_push_descriptor_set,
+    [IREE_HAL_CMD_BIND_DESCRIPTOR_SET] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_bind_descriptor_set,
+    [IREE_HAL_CMD_DISPATCH] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_dispatch,
+    [IREE_HAL_CMD_DISPATCH_INDIRECT] = (iree_hal_cmd_apply_fn_t)
+        iree_hal_deferred_command_buffer_apply_dispatch_indirect,
+};
+
+// Replays every recorded command, in order, against |target_command_buffer|,
+// wrapping the sequence in begin/end and stopping at the first failure.
+IREE_API_EXPORT iree_status_t iree_hal_deferred_command_buffer_apply(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_command_buffer_t* target_command_buffer) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // NOTE(review): dyn_cast yields NULL when |base_command_buffer| is not a
+  // deferred command buffer, which would crash on the cmd_list access below --
+  // confirm all callers pass a deferred command buffer here.
+  iree_hal_deferred_command_buffer_t* command_buffer =
+      (iree_hal_deferred_command_buffer_t*)iree_hal_command_buffer_dyn_cast(
+          base_command_buffer, &iree_hal_deferred_command_buffer_vtable);
+  iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+
+  iree_status_t status = iree_hal_command_buffer_begin(target_command_buffer);
+  if (iree_status_is_ok(status)) {
+    // Each command's recorded header type selects its apply function from the
+    // dispatch table above.
+    for (iree_hal_cmd_header_t* cmd = cmd_list->head; cmd != NULL;
+         cmd = cmd->next) {
+      status = iree_hal_cmd_apply_table[cmd->type](target_command_buffer, cmd);
+      if (!iree_status_is_ok(status)) break;
+    }
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_command_buffer_end(target_command_buffer);
+  }
+
+  // One-shot command buffers can't be replayed so we can drop the memory
+  // immediately. As command buffers must remain live for the duration of their
+  // execution this prevents us from hanging on to the commands we will never
+  // use again.
+  if (iree_status_is_ok(status) &&
+      iree_all_bits_set(command_buffer->base.mode,
+                        IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) {
+    iree_hal_cmd_list_reset(cmd_list);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Virtual function table routing iree_hal_command_buffer_t calls to the
+// deferred recording implementation above.
+static const iree_hal_command_buffer_vtable_t
+    iree_hal_deferred_command_buffer_vtable = {
+        .destroy = iree_hal_deferred_command_buffer_destroy,
+        .dyn_cast = iree_hal_deferred_command_buffer_dyn_cast,
+        .begin = iree_hal_deferred_command_buffer_begin,
+        .end = iree_hal_deferred_command_buffer_end,
+        .execution_barrier = iree_hal_deferred_command_buffer_execution_barrier,
+        .signal_event = iree_hal_deferred_command_buffer_signal_event,
+        .reset_event = iree_hal_deferred_command_buffer_reset_event,
+        .wait_events = iree_hal_deferred_command_buffer_wait_events,
+        .discard_buffer = iree_hal_deferred_command_buffer_discard_buffer,
+        .fill_buffer = iree_hal_deferred_command_buffer_fill_buffer,
+        .update_buffer = iree_hal_deferred_command_buffer_update_buffer,
+        .copy_buffer = iree_hal_deferred_command_buffer_copy_buffer,
+        .push_constants = iree_hal_deferred_command_buffer_push_constants,
+        .push_descriptor_set =
+            iree_hal_deferred_command_buffer_push_descriptor_set,
+        .bind_descriptor_set =
+            iree_hal_deferred_command_buffer_bind_descriptor_set,
+        .dispatch = iree_hal_deferred_command_buffer_dispatch,
+        .dispatch_indirect = iree_hal_deferred_command_buffer_dispatch_indirect,
+};
diff --git a/runtime/src/iree/hal/utils/deferred_command_buffer.h b/runtime/src/iree/hal/utils/deferred_command_buffer.h
new file mode 100644
index 0000000..f1686f3
--- /dev/null
+++ b/runtime/src/iree/hal/utils/deferred_command_buffer.h
@@ -0,0 +1,62 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_UTILS_DEFERRED_COMMAND_BUFFER_H_
+#define IREE_HAL_UTILS_DEFERRED_COMMAND_BUFFER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/command_buffer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Forward declaration; only the pointer type is needed by this header.
+typedef struct iree_arena_block_pool_t iree_arena_block_pool_t;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_t deferred record/replay wrapper
+//===----------------------------------------------------------------------===//
+
+// Records an in-memory command buffer that can be replayed against a target
+// command buffer at a later time.
+//
+// Argument arrays (like push constants) and host buffers (like the source
+// buffer in iree_hal_command_buffer_update_buffer) that usually live on the
+// stack will be cloned. As with all command buffers the resources (buffers,
+// events, etc) referenced will not be retained and the caller must ensure that
+// all resource lifetimes outlive the command buffer.
+//
+// |block_pool| will be used to allocate the underlying storage and the blocks
+// will be retained until the command buffer is reset or released, or if
+// IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT is set after the first time the command
+// buffer is replayed. The block size of the pool can be whatever the caller
+// wants with the caveat being that smaller sizes may result in more oversized
+// allocations from the system. 16KB, 32KB, and 64KB are reasonable starting
+// points based on system availability.
+// NOTE: the |block_pool| must remain live for the lifetime of the command
+// buffers that use it.
+//
+// After recording iree_hal_deferred_command_buffer_apply can be used to replay
+// the sequence of commands against a target command buffer implementation.
+// The command buffer can be replayed multiple times.
+IREE_API_EXPORT iree_status_t iree_hal_deferred_command_buffer_create(
+    iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator,
+    iree_hal_command_buffer_t** out_command_buffer);
+
+// Replays a recorded |command_buffer| against a |target_command_buffer|.
+// If the command buffer was recorded in one-shot mode it will be reset upon
+// return.
+IREE_API_EXPORT iree_status_t iree_hal_deferred_command_buffer_apply(
+    iree_hal_command_buffer_t* command_buffer,
+    iree_hal_command_buffer_t* target_command_buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_UTILS_DEFERRED_COMMAND_BUFFER_H_
diff --git a/runtime/src/iree/hal/utils/resource_set.c b/runtime/src/iree/hal/utils/resource_set.c
new file mode 100644
index 0000000..14e5871
--- /dev/null
+++ b/runtime/src/iree/hal/utils/resource_set.c
@@ -0,0 +1,276 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/utils/resource_set.h"
+
+#include "iree/base/tracing.h"
+
+// Inlines the first chunk into the block using all of the remaining space.
+// This is a special case chunk that is released back to the pool with the
+// resource set and lets us avoid an additional allocation.
+static void iree_hal_resource_set_setup_inline_chunk(
+    iree_hal_resource_set_t* set) {
+  // The inline chunk lives immediately after the set header in the same block.
+  uint8_t* block_ptr = (uint8_t*)set + sizeof(*set);
+  iree_hal_resource_set_chunk_t* inlined_chunk =
+      (iree_hal_resource_set_chunk_t*)block_ptr;
+  // Flag bit marks the chunk as inline so it is never returned to the block
+  // pool on its own (it shares storage with the set header).
+  inlined_chunk->flags = IREE_HAL_RESOURCE_SET_CHUNK_FLAG_INLINE;
+  // Capacity is whatever remains of the block after the set and chunk headers,
+  // clamped to the maximum representable in the 16-bit capacity field.
+  inlined_chunk->capacity = (set->block_pool->total_block_size - sizeof(*set) -
+                             sizeof(*inlined_chunk)) /
+                            sizeof(iree_hal_resource_t*);
+  inlined_chunk->capacity = iree_min(inlined_chunk->capacity,
+                                     IREE_HAL_RESOURCE_SET_CHUNK_MAX_CAPACITY);
+  inlined_chunk->count = 0;
+  set->chunk_head = inlined_chunk;
+}
+
+// Allocates a new resource set from |block_pool|; the set header and its
+// first (inline) chunk share a single block acquired from the pool.
+IREE_API_EXPORT iree_status_t iree_hal_resource_set_allocate(
+    iree_arena_block_pool_t* block_pool, iree_hal_resource_set_t** out_set) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // We could allow larger sizes (would require widening the capacity/count
+  // fields in the chunk) but in real usage having even 64k is a bit too much.
+  IREE_ASSERT_LE(block_pool->total_block_size, 64 * 1024,
+                 "keep block sizes small for resource sets");
+
+  // Acquire block and place the set struct at the head.
+  // NOTE: |block| points at the arena bookkeeping struct at the *end* of the
+  // block storage; the usable region begins usable_block_size bytes before it.
+  iree_arena_block_t* block = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_arena_block_pool_acquire(block_pool, &block));
+  uint8_t* block_ptr = (uint8_t*)block - block_pool->usable_block_size;
+  iree_hal_resource_set_t* set = (iree_hal_resource_set_t*)block_ptr;
+  memset(set, 0, sizeof(*set));
+  set->block_pool = block_pool;
+  iree_hal_resource_set_setup_inline_chunk(set);
+
+  *out_set = set;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Releases every retained resource in |set| and returns the storage blocks to
+// the block pool. When |preserve_set| is true the block that holds the set
+// header (and its inline chunk) is kept alive so the set can be reused; when
+// false that block is released too and |set| becomes invalid on return.
+static void iree_hal_resource_set_release_blocks(iree_hal_resource_set_t* set,
+                                                 bool preserve_set) {
+  // Release all resources in all chunks and stitch together the blocks in a
+  // linked list. We do this first so that we can release all of the chunks back
+  // to the block pool in one operation. Ideally we'd maintain the linked list
+  // in our chunks but there's some weirdness with prefix/suffix header/footers
+  // that isn't worth the complexity.
+  iree_arena_block_t* block_head = NULL;
+  iree_arena_block_t* block_tail = NULL;
+  iree_hal_resource_set_chunk_t* chunk = set->chunk_head;
+  while (chunk) {
+    // Release all resources in the chunk.
+    for (iree_host_size_t i = 0; i < chunk->count; ++i) {
+      iree_hal_resource_release(chunk->resources[i]);
+    }
+    // Consume the chunk and add it to the block pool release linked list.
+    // NOTE: |next_chunk| is captured before we touch the block memory since
+    // the arena block struct may alias the tail of the chunk storage.
+    iree_hal_resource_set_chunk_t* next_chunk = chunk->next_chunk;
+    iree_arena_block_t* block = NULL;
+    if (iree_hal_resource_set_chunk_is_stored_inline(chunk)) {
+      // This is the inlined first chunk that also stores the set header.
+      // If we are not freeing the set then we don't release the block back to
+      // the pool.
+      if (preserve_set) {
+        // Don't release the block.
+        // The inline chunk is always last in the walk so we can stop here.
+        break;
+      } else {
+        block = (iree_arena_block_t*)((uint8_t*)set +
+                                      set->block_pool->usable_block_size);
+        next_chunk = NULL;
+      }
+    } else {
+      // A chunk acquired after the set was acquired.
+      block = (iree_arena_block_t*)((uint8_t*)chunk +
+                                    set->block_pool->usable_block_size);
+    }
+    block->next = block_head;
+    block_head = block;
+    if (!block_tail) block_tail = block;
+    chunk = next_chunk;
+  }
+
+  // Release all blocks back to the block pool in one operation.
+  // NOTE: this invalidates the |set| memory.
+  iree_arena_block_pool_t* block_pool = set->block_pool;
+  iree_arena_block_pool_release(block_pool, block_head, block_tail);
+}
+
+// Frees |set|, releasing all retained resources and returning all storage
+// (including the block holding the set itself) to the originating block pool.
+IREE_API_EXPORT void iree_hal_resource_set_free(iree_hal_resource_set_t* set) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Release all resources and the arena block used by the set.
+  // The set pointer is invalid after this call returns.
+  iree_hal_resource_set_release_blocks(set, /*preserve_set=*/false);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Resets |set| to its freshly-allocated empty state: all retained resources
+// are released, extra chunk blocks go back to the pool, and both the MRU and
+// the inline chunk are reinitialized. The set remains valid for reuse.
+IREE_API_EXPORT void iree_hal_resource_set_reset(iree_hal_resource_set_t* set) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Release all resources and the blocks besides the base set.
+  iree_hal_resource_set_release_blocks(set, /*preserve_set=*/true);
+
+  // Reset the set state.
+  // Clearing the MRU is required so stale (now-released) pointers cannot
+  // produce false hits on subsequent insertions.
+  memset(set->mru, 0, sizeof(set->mru));
+  iree_hal_resource_set_setup_inline_chunk(set);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Retains |resource| and adds it to the main |set| list.
+// Grows the chunk list with a new block from the pool when the head chunk is
+// full. Fails (without retaining) only if block acquisition fails.
+static iree_status_t iree_hal_resource_set_insert_retain(
+    iree_hal_resource_set_t* set, iree_hal_resource_t* resource) {
+  iree_hal_resource_set_chunk_t* chunk = set->chunk_head;
+  if (IREE_UNLIKELY(chunk->count + 1 > chunk->capacity)) {
+    // Ran out of room in the current chunk - acquire a new one and link it into
+    // the list of chunks.
+    iree_arena_block_t* block = NULL;
+    IREE_RETURN_IF_ERROR(
+        iree_arena_block_pool_acquire(set->block_pool, &block));
+    // The chunk header is placed at the start of the usable block region
+    // (|block| points at the arena struct at the end of the block).
+    chunk =
+        (iree_hal_resource_set_chunk_t*)((uint8_t*)block -
+                                         set->block_pool->usable_block_size);
+    // NOTE: assigning next_chunk also clears the inline flag bit (bit 0 of the
+    // union) since pool-allocated chunks are aligned.
+    chunk->next_chunk = set->chunk_head;
+    set->chunk_head = chunk;
+    // NOTE(review): capacity is derived from total_block_size, not
+    // usable_block_size, so the resource array can extend into the arena block
+    // footer region - presumably safe while the block is checked out of the
+    // pool, but worth confirming against the arena implementation.
+    chunk->capacity = (set->block_pool->total_block_size - sizeof(*chunk)) /
+                      sizeof(iree_hal_resource_t*);
+    chunk->capacity =
+        iree_min(chunk->capacity, IREE_HAL_RESOURCE_SET_CHUNK_MAX_CAPACITY);
+    chunk->count = 0;
+  }
+
+  // Retain and insert into the chunk.
+  chunk->resources[chunk->count++] = resource;
+  iree_hal_resource_retain(resource);
+  return iree_ok_status();
+}
+
+// Scans the lookaside for the resource pointer and updates the order if found.
+// If the resource was not found then it will be inserted into the main list as
+// well as the MRU.
+//
+// This performs a full scan over the MRU and if the resource is found will
+// move the resource to the front of the list before returning. Otherwise the
+// resource will be retained in the main source-of-truth list.
+//
+// Example (hit):
+//   +----+----+----+----+
+//   | AA | BB | CC | DD |  resource: CC
+//   +----+----+----+----+
+//   scan mru to find CC:
+//     found at mru[2]
+//     shift prefix down 1:
+//       +----+----+----+----+
+//       | AA | AA | BB | DD |
+//       +----+----+----+----+
+//     insert resource at front:
+//       +----+----+----+----+
+//       | CC | AA | BB | DD |
+//       +----+----+----+----+
+//
+// Example (miss):
+//   +----+----+----+----+
+//   | AA | BB | CC | DD |  resource: EE
+//   +----+----+----+----+
+//   scan mru to find EE: not found
+//   shift set down 1:
+//     +----+----+----+----+
+//     | AA | AA | BB | CC |
+//     +----+----+----+----+
+//   insert resource at front:
+//     +----+----+----+----+
+//     | EE | AA | BB | CC |
+//     +----+----+----+----+
+//   insert resource into main list
+//
+// The intent here is that we can model this behavior with SIMD ops to perform
+// both the scan and update using comparison, extraction, and permutation. The
+// best and worst case flows will load the entire MRU into registers from a
+// single cache line, do all the scanning and shifting in registers, and then
+// store back to the single cache line.
+//
+// Today, though, we leave this as an exercise to whoever comes across this :)
+// Notes:
+//   As the MRU is a fixed size we can unroll it entirely and avoid any looping.
+//   On a 32-bit system with uint32x4_t we only need 4 registers.
+//   On a 64-bit system with uint64x2_t we also only need 4 registers - though
+//   the MRU has half as many entries and we may want to go >1 cache line.
+//
+//   If we wanted to process more than one resource at a time we can specialize
+//   the code paths to handle 1/2/4/etc resources and process in batches with
+//   an optional remainder. This would increase the ratio of work performed on
+//   the loaded MRU registers before we do the shift/store.
+//
+//   The tree sequence we likely want is something like:
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u32
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u32
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u32
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u32
+//    or
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u64
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u64
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u64
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_u32
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u32
+//   This would yield whether the pointer was found, but instead of maxing at
+//   the end we can use the produced mask to extract out a single register with
+//   which positions are hits and use that to then permute the registers into
+//   the proper order. At the end we could use a table instruction to remap and
+//   extract out a byte/bitmap of the indices that we need to insert into the
+//   main set.
+//
+//   The shifting can be performed with
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u32
+//    https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u64
+//   This takes n low elements of LHS and rest from RHS and we can cascade them
+//   to shift down the whole MRU.
+//
+//   We can use SIMDE as a rosetta stone for getting neon/avx/wasm/etc:
+//   https://github.com/simd-everywhere/simde/blob/master/simde/arm/neon/ceq.h#L591
+// Inserts a single |resource|, first scanning the MRU lookaside (see the large
+// comment above for the intended algorithm) and only performing a retained
+// insertion into the chunk list on a miss.
+static iree_status_t iree_hal_resource_set_insert_1(
+    iree_hal_resource_set_t* set, iree_hal_resource_t* resource) {
+  // Scan and hope for a hit.
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(set->mru); ++i) {
+    if (set->mru[i] != resource) continue;
+    // Hit - keep the list sorted by most->least recently used.
+    // We shift the MRU down to make room at index 0 and store the
+    // resource there.
+    if (i > 0) {
+      // Shifts mru[0..i-1] down one slot; this overwrites mru[i] (the hit),
+      // which is then reinserted at the front. If i == 0 it is already MRU.
+      memmove(&set->mru[1], &set->mru[0], sizeof(set->mru[0]) * i);
+      set->mru[0] = resource;
+    }
+    return iree_ok_status();
+  }
+
+  // Miss - insert into the main list (slow path).
+  // Note that we do this before updating the MRU in case allocation fails - we
+  // don't want to keep the pointer around unless we've really retained it.
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert_retain(set, resource));
+
+  // Shift the MRU down and insert the new item at the head.
+  // The least-recently-used entry (last slot) falls off; it remains retained
+  // by the main chunk list so this only loses dedup ability, not the resource.
+  memmove(&set->mru[1], &set->mru[0],
+          sizeof(set->mru[0]) * (IREE_ARRAYSIZE(set->mru) - 1));
+  set->mru[0] = resource;
+
+  return iree_ok_status();
+}
+
+// Inserts |count| resources from the |resources| pointer array into |set|.
+// |resources| is treated as an array of iree_hal_resource_t* pointers; each
+// inserted resource is retained for at least the lifetime of the set.
+IREE_API_EXPORT iree_status_t
+iree_hal_resource_set_insert(iree_hal_resource_set_t* set,
+                             iree_host_size_t count, const void* resources) {
+  // For now we process one at a time. We should have a stride that lets us
+  // amortize the cost of doing the MRU update and insertion allocation by
+  // say slicing off 4/8/16/32 resources at a time etc. Today each miss that
+  // requires a full insertion goes down the whole path of checking chunk
+  // capacity and such.
+  iree_hal_resource_t* const* typed_resources =
+      (iree_hal_resource_t* const*)resources;
+  for (iree_host_size_t i = 0; i < count; ++i) {
+    // On failure resources [0, i) remain inserted/retained in the set.
+    IREE_RETURN_IF_ERROR(
+        iree_hal_resource_set_insert_1(set, typed_resources[i]));
+  }
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree/hal/utils/resource_set.h b/runtime/src/iree/hal/utils/resource_set.h
new file mode 100644
index 0000000..6f63ced
--- /dev/null
+++ b/runtime/src/iree/hal/utils/resource_set.h
@@ -0,0 +1,139 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_UTILS_RESOURCE_SET_H_
+#define IREE_HAL_UTILS_RESOURCE_SET_H_
+
+#include "iree/base/api.h"
+#include "iree/base/internal/arena.h"
+#include "iree/hal/resource.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Bit 0 of the next_chunk pointer indicates whether we are inlined into the
+// resource set block - the chunks are always aligned and the bit is unused.
+#define IREE_HAL_RESOURCE_SET_CHUNK_FLAG_INLINE 0x1
+
+// Capacity is limited by how many bits we reserve for the count.
+// Matches the 16-bit capacity/count fields below.
+#define IREE_HAL_RESOURCE_SET_CHUNK_MAX_CAPACITY 0xFFFFu
+
+// A chunk of resources within a resource set.
+// Chunks contain a fixed number of resources based on the block size of the
+// pool the set was allocated from.
+typedef struct iree_hal_resource_set_chunk_t {
+  // Next chunk in the chunk linked list.
+  // Bit 0 indicates whether this was an allocated block; 0 means that the
+  // chunk is stored within the parent resource set and should not be returned
+  // to the block pool. This works only because we know the blocks are allocated
+  // at an alignment >= 16 and we have a few bits to work with.
+  union {
+    struct iree_hal_resource_set_chunk_t* next_chunk;
+    uintptr_t flags;
+  };
+
+  // Retained resources - may be less than the capacity derived from the block
+  // pool block size. We keep the counts small here to reduce chunk overhead. We
+  // could recompute the capacity each time but at the point that we use even 1
+  // byte we've already consumed 4 (or 8) thanks to padding and should make use
+  // of the rest.
+  uint16_t capacity;
+  uint16_t count;
+  // Flexible array member; actual length is |capacity| pointers.
+  iree_hal_resource_t* resources[];
+} iree_hal_resource_set_chunk_t;
+
+// Returns true if the chunk is stored inline in the parent resource set.
+// Reads the flag bit aliased onto bit 0 of the next_chunk pointer.
+#define iree_hal_resource_set_chunk_is_stored_inline(chunk)      \
+  (((chunk)->flags & IREE_HAL_RESOURCE_SET_CHUNK_FLAG_INLINE) == \
+   IREE_HAL_RESOURCE_SET_CHUNK_FLAG_INLINE)
+
+// Number of elements in the most-recently-used resource list of a set.
+// The larger the number the greater the chance of having a hit but the more
+// expensive every miss will be.
+//
+// To try to keep the MRU in cache we size this based on how many pointers will
+// fit in a single cache line. This also makes it easier to author SIMD lookups
+// as we'll (in-theory) be able to load the entries into SIMD registers.
+//
+// Values for the platforms we specify for:
+//   32-bit: 64 / 4 = 16x4b ptrs (4 x uint32x4_t)
+//   64-bit: 64 / 8 = 8x8b ptrs (4 x uint64x2_t)
+// We could scale this up if we wanted but being able to unroll is nice.
+#define IREE_HAL_RESOURCE_SET_MRU_SIZE \
+  (iree_hardware_constructive_interference_size / sizeof(uintptr_t))
+
+// "Efficient" append-only set for retaining a set of resources.
+// This is a non-deterministic data structure that tries to reduce the amount of
+// overhead involved in tracking a reasonably-sized set of resources (~dozens to
+// hundreds). Set insertion may have false negatives and retain resources more
+// than strictly required by trading off the expense of precisely detecting
+// redundant insertions with the expense of an additional atomic operation.
+//
+// This tries to elide insertions by maintaining a most-recently-used list.
+// This optimizes for temporal locality of resources used (the same executables,
+// same buffers, etc) and is implemented to have a fixed cost regardless of
+// whether the values are found and should hopefully trigger enough to avoid the
+// subsequent full insertion that can introduce allocations and ref counting.
+// The idea is that if we can keep the MRU in cache and spend a dozen cycles to
+// manage it we only need to avoid a single cache miss that would occur doing
+// the full insertion. We care here because this is on the critical path of
+// command encoding and the parasitic cost of maintaining the set scales with
+// the number of commands issued. This never needs to be free, only as fast as
+// whatever user code may need to do to maintain proper lifetime - or as small
+// in terms of code-size.
+//
+// **WARNING**: thread-unsafe insertion: it's assumed that sets are constructed
+// by a single thread, sealed, and then released at once at a future time point.
+// Multiple threads needing to insert into a set should have their own sets and
+// then join them afterward.
+typedef struct iree_hal_resource_set_t {
+  // A small MRUish list of resources for quickly deduplicating insertions.
+  // We use this to perform an O(k) comparison traded off with the cost of a
+  // miss that results in an atomic inc/dec. We shouldn't make this
+  // more expensive than the additional cost of the retain/release.
+  //
+  // This lives at the head of the struct as it's used in 100% of insertions and
+  // if we can get lucky with it staying in cache we reduce a lot of memory
+  // traffic. Once we spill the MRU and go to main memory to add the resource
+  // we're going to have a cache miss and this way we avoid two (one for the
+  // set and one for the chunk).
+  //
+  // NOTE: entries are unretained aliases; ownership lives in the chunk list.
+  //
+  // TODO(benvanik): ensure alignment on the set - should be at
+  // iree_hardware_constructive_interference_size.
+  iree_hal_resource_t* mru[IREE_HAL_RESOURCE_SET_MRU_SIZE];
+
+  // Block pool used for allocating additional set storage slabs.
+  // NOTE: must outlive the set; the set itself lives in one of its blocks.
+  iree_arena_block_pool_t* block_pool;
+
+  // Linked list of storage chunks.
+  iree_hal_resource_set_chunk_t* chunk_head;
+} iree_hal_resource_set_t;
+
+// Allocates a new resource set from the given |block_pool|.
+// Resources can be inserted and are retained until the set is freed.
+// NOTE: |block_pool| must outlive the returned set.
+IREE_API_EXPORT iree_status_t iree_hal_resource_set_allocate(
+    iree_arena_block_pool_t* block_pool, iree_hal_resource_set_t** out_set);
+
+// Frees a resource set and releases all inserted resources.
+// The |set| itself will be returned back to the block pool it was allocated
+// from.
+IREE_API_EXPORT void iree_hal_resource_set_free(iree_hal_resource_set_t* set);
+
+// Resets the set to its initial empty state by releasing all owned resources.
+IREE_API_EXPORT void iree_hal_resource_set_reset(iree_hal_resource_set_t* set);
+
+// Inserts zero or more resources into the set.
+// |resources| is an array of |count| iree_hal_resource_t* pointers.
+// Each resource will be retained for at least the lifetime of the set.
+// See the thread-safety warning on iree_hal_resource_set_t: not thread-safe.
+IREE_API_EXPORT iree_status_t
+iree_hal_resource_set_insert(iree_hal_resource_set_t* set,
+                             iree_host_size_t count, const void* resources);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_UTILS_RESOURCE_SET_H_
diff --git a/runtime/src/iree/hal/utils/resource_set_benchmark.c b/runtime/src/iree/hal/utils/resource_set_benchmark.c
new file mode 100644
index 0000000..5b22f97
--- /dev/null
+++ b/runtime/src/iree/hal/utils/resource_set_benchmark.c
@@ -0,0 +1,287 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/prng.h"
+#include "iree/hal/api.h"
+#include "iree/hal/utils/resource_set.h"
+#include "iree/testing/benchmark.h"
+
+// Minimal iree_hal_resource_t implementation used purely as a refcounted
+// object for the benchmarks below; carries only the allocator needed to free
+// itself on destroy.
+typedef struct iree_hal_test_resource_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+} iree_hal_test_resource_t;
+
+// Vtable exposing only the destroy entry point required by the resource
+// release machinery.
+typedef struct iree_hal_test_resource_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_test_resource_t* resource);
+} iree_hal_test_resource_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_test_resource_vtable_t);
+
+// Forward declaration; defined after the destroy function below.
+static const iree_hal_test_resource_vtable_t iree_hal_test_resource_vtable;
+
+// Allocates a test resource with an initial reference count owned by the
+// caller; released via the normal iree_hal_resource_release path.
+static iree_status_t iree_hal_test_resource_create(
+    iree_allocator_t host_allocator, iree_hal_resource_t** out_resource) {
+  iree_hal_test_resource_t* test_resource = NULL;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      host_allocator, sizeof(*test_resource), (void**)&test_resource));
+  iree_hal_resource_initialize(&iree_hal_test_resource_vtable,
+                               &test_resource->resource);
+  // Stash the allocator so destroy can free from the same source.
+  test_resource->host_allocator = host_allocator;
+  *out_resource = (iree_hal_resource_t*)test_resource;
+  return iree_ok_status();
+}
+
+// Frees the test resource using the allocator captured at creation time.
+// Invoked by the resource release machinery when the refcount reaches zero.
+static void iree_hal_test_resource_destroy(iree_hal_test_resource_t* resource) {
+  // Copy out the allocator before freeing the struct that holds it.
+  iree_allocator_t host_allocator = resource->host_allocator;
+  iree_allocator_free(host_allocator, resource);
+}
+
+static const iree_hal_test_resource_vtable_t iree_hal_test_resource_vtable = {
+    /*.destroy=*/iree_hal_test_resource_destroy,
+};
+
+// Tests init/deinit performance when 0+ resources are in the set.
+// This is our worst-case with unique resources that never match the MRU.
+//
+// user_data is a count of elements to insert into each set.
+static iree_status_t iree_hal_resource_set_benchmark_lifecycle_n(
+    const iree_benchmark_def_t* benchmark_def,
+    iree_benchmark_state_t* benchmark_state) {
+  iree_allocator_t host_allocator = benchmark_state->host_allocator;
+
+  // Initialize the block pool we'll be serving from.
+  // Sized like we usually do it in the runtime for ~512-1024 elements.
+  iree_arena_block_pool_t block_pool;
+  iree_arena_block_pool_initialize(4096, host_allocator, &block_pool);
+
+  // Allocate the resources we'll be using - we keep them live so that we are
+  // measuring just the retain/release and set times instead of the timing of
+  // resource creation/deletion.
+  uint32_t count = (uint32_t)(uintptr_t)benchmark_def->user_data;
+  iree_hal_resource_t** resources = NULL;
+  if (count > 0) {
+    IREE_CHECK_OK(iree_allocator_malloc(host_allocator,
+                                        sizeof(iree_hal_resource_t*) * count,
+                                        (void**)&resources));
+  }
+  for (uint32_t i = 0; i < count; ++i) {
+    IREE_CHECK_OK(iree_hal_test_resource_create(host_allocator, &resources[i]));
+  }
+
+  // Create/insert/delete lifecycle.
+  // Each iteration measures a full allocate -> insert(count) -> free cycle.
+  while (iree_benchmark_keep_running(benchmark_state, /*batch_count=*/1)) {
+    iree_hal_resource_set_t* set = NULL;
+    IREE_CHECK_OK(iree_hal_resource_set_allocate(&block_pool, &set));
+    IREE_CHECK_OK(iree_hal_resource_set_insert(set, count, resources));
+    iree_hal_resource_set_free(set);
+  }
+
+  // Cleanup.
+  // NOTE(review): when count == 0 |resources| is NULL here; assumes freeing
+  // NULL with iree_allocator_free is a no-op - confirm against the allocator.
+  for (uint32_t i = 0; i < count; ++i) {
+    iree_hal_resource_release(resources[i]);
+  }
+  iree_allocator_free(host_allocator, resources);
+  iree_arena_block_pool_deinitialize(&block_pool);
+
+  return iree_ok_status();
+}
+
+// Tests insertion performance when either the MRU is used (n < MRU size) or
+// the worst-case performance when all resources are unique and guaranteed to
+// miss the MRU. Expect to see a cliff where we spill the MRU.
+//
+// user_data is a count of unique elements to insert.
+static iree_status_t iree_hal_resource_set_benchmark_insert_n(
+    const iree_benchmark_def_t* benchmark_def,
+    iree_benchmark_state_t* benchmark_state) {
+  iree_allocator_t host_allocator = benchmark_state->host_allocator;
+
+  // Initialize the block pool we'll be serving from.
+  // Sized like we usually do it in the runtime for ~512-1024 elements.
+  iree_arena_block_pool_t block_pool;
+  iree_arena_block_pool_initialize(4096, host_allocator, &block_pool);
+
+  // Create the empty set using the block pool for additional memory.
+  // The same set is reused across all timed iterations.
+  iree_hal_resource_set_t* set = NULL;
+  IREE_CHECK_OK(iree_hal_resource_set_allocate(&block_pool, &set));
+
+  // Allocate the resources we'll be using - we keep them live so that we are
+  // measuring just the retain/release and set times instead of the timing of
+  // resource creation/deletion.
+  uint32_t count = (uint32_t)(uintptr_t)benchmark_def->user_data;
+  iree_hal_resource_t** resources = NULL;
+  IREE_CHECK_OK(iree_allocator_malloc(host_allocator,
+                                      sizeof(iree_hal_resource_t*) * count,
+                                      (void**)&resources));
+  for (uint32_t i = 0; i < count; ++i) {
+    IREE_CHECK_OK(iree_hal_test_resource_create(host_allocator, &resources[i]));
+  }
+
+  // Insert the resources. After the first iteration these should all be hits.
+  while (iree_benchmark_keep_running(benchmark_state, /*batch_count=*/1)) {
+    IREE_CHECK_OK(iree_hal_resource_set_insert(set, count, resources));
+  }
+
+  // Cleanup.
+  // Resources are released first (dropping our references) and then the set
+  // free drops whatever references it retained during the benchmark.
+  for (uint32_t i = 0; i < count; ++i) {
+    iree_hal_resource_release(resources[i]);
+  }
+  iree_hal_resource_set_free(set);
+  iree_allocator_free(host_allocator, resources);
+  iree_arena_block_pool_deinitialize(&block_pool);
+
+  return iree_ok_status();
+}
+
+// Tests insertion into the set in a randomized order.
+// This lets us get a somewhat reasonable approximation of average performance.
+// In reality what the compiler spits out is non-random and often just
+// alternating A/B/C/B/A/C/A/B/C etc kind of sequences.
+//
+// This is the most important benchmark: if this is fast then we are :thumbsup:.
+//
+// user_data is a count of unique element pool to insert N times. The higher
+// the pool size the more likely we are to miss the MRU.
+static iree_status_t iree_hal_resource_set_benchmark_randomized_n(
+    const iree_benchmark_def_t* benchmark_def,
+    iree_benchmark_state_t* benchmark_state) {
+  iree_allocator_t host_allocator = benchmark_state->host_allocator;
+
+  // Initialize the block pool we'll be serving from.
+  // Sized like we usually do it in the runtime for ~512-1024 elements.
+  iree_arena_block_pool_t block_pool;
+  iree_arena_block_pool_initialize(4096, host_allocator, &block_pool);
+
+  // Allocate the resources we'll be using - we keep them live so that we are
+  // measuring just the retain/release and set times instead of the timing of
+  // resource creation/deletion.
+  uint32_t count = (uint32_t)(uintptr_t)benchmark_def->user_data;
+  iree_hal_resource_t** resources = NULL;
+  IREE_CHECK_OK(iree_allocator_malloc(host_allocator,
+                                      sizeof(iree_hal_resource_t*) * count,
+                                      (void**)&resources));
+  for (uint32_t i = 0; i < count; ++i) {
+    IREE_CHECK_OK(iree_hal_test_resource_create(host_allocator, &resources[i]));
+  }
+
+  // The same set is maintained; we'll eventually have all resources in the set
+  // and be testing the MRU hit %.
+  iree_hal_resource_set_t* set = NULL;
+  IREE_CHECK_OK(iree_hal_resource_set_allocate(&block_pool, &set));
+
+  // The PRNG we use to select the elements.
+  // Fixed seed keeps runs reproducible/comparable across invocations.
+  iree_prng_xoroshiro128_state_t prng = {0};
+  iree_prng_xoroshiro128_initialize(123ull, &prng);
+
+  // Insert N random resources into the set. To hide some of the overhead we do
+  // multiple insertions in each loop.
+  while (iree_benchmark_keep_running(benchmark_state, /*batch_count=*/256)) {
+    for (uint32_t i = 0; i < 256; ++i) {
+      uint32_t resource_idx =
+          iree_prng_xoroshiro128plus_next_uint32(&prng) % count;
+      iree_hal_resource_t* resource = resources[resource_idx];
+      IREE_CHECK_OK(iree_hal_resource_set_insert(set, 1, &resource));
+    }
+  }
+
+  // Cleanup.
+  iree_hal_resource_set_free(set);
+  for (uint32_t i = 0; i < count; ++i) {
+    iree_hal_resource_release(resources[i]);
+  }
+  iree_allocator_free(host_allocator, resources);
+  iree_arena_block_pool_deinitialize(&block_pool);
+
+  return iree_ok_status();
+}
+
+// Benchmark entry point: registers each variant with its element count encoded
+// in benchmark_def.user_data, then runs whichever benchmarks were requested on
+// the command line.
+int main(int argc, char** argv) {
+  iree_benchmark_initialize(&argc, argv);
+
+  // iree_hal_resource_set_benchmark_lifecycle_n
+  // Full allocate/insert/free cycles at increasing element counts.
+  {
+    iree_benchmark_def_t benchmark_def = {
+        .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
+                 IREE_BENCHMARK_FLAG_USE_REAL_TIME,
+        .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
+        .minimum_duration_ns = 0,
+        .iteration_count = 0,
+        .run = iree_hal_resource_set_benchmark_lifecycle_n,
+    };
+    benchmark_def.user_data = (void*)0u;
+    iree_benchmark_register(iree_make_cstring_view("lifecycle_0"),
+                            &benchmark_def);
+    benchmark_def.user_data = (void*)1u;
+    iree_benchmark_register(iree_make_cstring_view("lifecycle_1"),
+                            &benchmark_def);
+    benchmark_def.user_data = (void*)256u;
+    iree_benchmark_register(iree_make_cstring_view("lifecycle_256"),
+                            &benchmark_def);
+    benchmark_def.user_data = (void*)1024u;
+    iree_benchmark_register(iree_make_cstring_view("lifecycle_1024"),
+                            &benchmark_def);
+  }
+
+  // iree_hal_resource_set_benchmark_insert_n
+  // Repeated insertion of the same unique resources (MRU hit/miss cliff).
+  {
+    iree_benchmark_def_t benchmark_def = {
+        .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
+                 IREE_BENCHMARK_FLAG_USE_REAL_TIME,
+        .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
+        .minimum_duration_ns = 0,
+        .iteration_count = 0,
+        .run = iree_hal_resource_set_benchmark_insert_n,
+    };
+    benchmark_def.user_data = (void*)1u;
+    iree_benchmark_register(iree_make_cstring_view("insert_1"), &benchmark_def);
+    benchmark_def.user_data = (void*)5u;
+    iree_benchmark_register(iree_make_cstring_view("insert_5"), &benchmark_def);
+    benchmark_def.user_data = (void*)32u;
+    iree_benchmark_register(iree_make_cstring_view("insert_32"),
+                            &benchmark_def);
+    benchmark_def.user_data = (void*)64u;
+    iree_benchmark_register(iree_make_cstring_view("insert_64"),
+                            &benchmark_def);
+  }
+
+  // iree_hal_resource_set_benchmark_randomized_n
+  // Randomized insertion from pools of varying size (average-case behavior).
+  {
+    iree_benchmark_def_t benchmark_def = {
+        .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
+                 IREE_BENCHMARK_FLAG_USE_REAL_TIME,
+        .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
+        .minimum_duration_ns = 0,
+        .iteration_count = 0,
+        .run = iree_hal_resource_set_benchmark_randomized_n,
+    };
+    benchmark_def.user_data = (void*)1u;
+    iree_benchmark_register(iree_make_cstring_view("randomized_1"),
+                            &benchmark_def);
+    benchmark_def.user_data = (void*)4u;
+    iree_benchmark_register(iree_make_cstring_view("randomized_4"),
+                            &benchmark_def);
+    benchmark_def.user_data = (void*)8u;
+    iree_benchmark_register(iree_make_cstring_view("randomized_8"),
+                            &benchmark_def);
+    benchmark_def.user_data = (void*)32u;
+    iree_benchmark_register(iree_make_cstring_view("randomized_32"),
+                            &benchmark_def);
+    benchmark_def.user_data = (void*)256u;
+    iree_benchmark_register(iree_make_cstring_view("randomized_256"),
+                            &benchmark_def);
+    benchmark_def.user_data = (void*)4096u;
+    iree_benchmark_register(iree_make_cstring_view("randomized_4096"),
+                            &benchmark_def);
+  }
+
+  iree_benchmark_run_specified();
+  return 0;
+}
diff --git a/runtime/src/iree/hal/utils/resource_set_test.cc b/runtime/src/iree/hal/utils/resource_set_test.cc
new file mode 100644
index 0000000..021bb1b
--- /dev/null
+++ b/runtime/src/iree/hal/utils/resource_set_test.cc
@@ -0,0 +1,257 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/utils/resource_set.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace {
+
+using ::iree::testing::status::IsOkAndHolds;
+using ::iree::testing::status::StatusIs;
+using ::testing::Eq;
+
+typedef struct iree_hal_test_resource_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  uint32_t index;
+  uint32_t* live_bitmap;
+} iree_hal_test_resource_t;
+
+typedef struct iree_hal_test_resource_vtable_t {
+  void(IREE_API_PTR* destroy)(iree_hal_test_resource_t* resource);
+} iree_hal_test_resource_vtable_t;
+IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_test_resource_vtable_t);
+
+extern const iree_hal_test_resource_vtable_t iree_hal_test_resource_vtable;
+
+static iree_status_t iree_hal_test_resource_create(
+    uint32_t index, uint32_t* live_bitmap, iree_allocator_t host_allocator,
+    iree_hal_resource_t** out_resource) {
+  iree_hal_test_resource_t* test_resource = NULL;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      host_allocator, sizeof(*test_resource), (void**)&test_resource));
+  iree_hal_resource_initialize(&iree_hal_test_resource_vtable,
+                               &test_resource->resource);
+  test_resource->host_allocator = host_allocator;
+  test_resource->index = index;
+  test_resource->live_bitmap = live_bitmap;
+  *live_bitmap |= 1 << index;
+  *out_resource = (iree_hal_resource_t*)test_resource;
+  return iree_ok_status();
+}
+
+static void iree_hal_test_resource_destroy(iree_hal_test_resource_t* resource) {
+  iree_allocator_t host_allocator = resource->host_allocator;
+  *resource->live_bitmap &= ~(1 << resource->index);
+  iree_allocator_free(host_allocator, resource);
+}
+
+const iree_hal_test_resource_vtable_t iree_hal_test_resource_vtable = {
+    /*.destroy=*/iree_hal_test_resource_destroy,
+};
+
+struct ResourceSetTest : public ::testing::Test {
+  // We could check the allocator to ensure all memory is freed if we wanted to
+  // reduce the reliance on asan.
+  iree_allocator_t host_allocator = iree_allocator_system();
+  iree_arena_block_pool_t block_pool;
+
+  void SetUp() override {
+    memset(&block_pool, 0, sizeof(block_pool));
+    iree_arena_block_pool_initialize(128, host_allocator, &block_pool);
+  }
+
+  void TearDown() override {
+    // This may assert (or at least trigger asan) if there are blocks
+    // outstanding.
+    iree_arena_block_pool_deinitialize(&block_pool);
+  }
+};
+
+using resource_set_ptr = std::unique_ptr<iree_hal_resource_set_t,
+                                         decltype(&iree_hal_resource_set_free)>;
+static resource_set_ptr make_resource_set(iree_arena_block_pool_t* block_pool) {
+  iree_hal_resource_set_t* set = NULL;
+  IREE_CHECK_OK(iree_hal_resource_set_allocate(block_pool, &set));
+  return resource_set_ptr(set, iree_hal_resource_set_free);
+}
+
+// Tests a set that has no resources added to it.
+TEST_F(ResourceSetTest, Empty) {
+  iree_hal_resource_set_t* set = NULL;
+  IREE_ASSERT_OK(iree_hal_resource_set_allocate(&block_pool, &set));
+  iree_hal_resource_set_free(set);
+}
+
+// Tests insertion of a single resource.
+TEST_F(ResourceSetTest, Insert1) {
+  auto resource_set = make_resource_set(&block_pool);
+
+  // Create test resource; it'll set its bit in the live_bitmap.
+  iree_hal_resource_t* resource = NULL;
+  uint32_t live_bitmap = 0u;
+  IREE_ASSERT_OK(iree_hal_test_resource_create(0, &live_bitmap, host_allocator,
+                                               &resource));
+  EXPECT_EQ(live_bitmap, 1u);
+
+  // Insert the resource and drop the reference; it should still be live as the
+  // set retains it.
+  IREE_ASSERT_OK(
+      iree_hal_resource_set_insert(resource_set.get(), 1, &resource));
+  iree_hal_resource_release(resource);
+  EXPECT_EQ(live_bitmap, 1u);
+
+  // Drop the set and expect the resource to be destroyed as it loses its last
+  // reference.
+  resource_set.reset();
+  EXPECT_EQ(live_bitmap, 0u);
+}
+
+// Tests inserting multiple resources at a time.
+TEST_F(ResourceSetTest, Insert5) {
+  auto resource_set = make_resource_set(&block_pool);
+
+  // Allocate 5 resources - this lets us test for special paths that may handle
+  // 4 at a time (to fit in SIMD registers) as well as the leftovers.
+  iree_hal_resource_t* resources[5] = {NULL};
+  uint32_t live_bitmap = 0u;
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(resources); ++i) {
+    IREE_ASSERT_OK(iree_hal_test_resource_create(
+        i, &live_bitmap, host_allocator, &resources[i]));
+  }
+  EXPECT_EQ(live_bitmap, 0x1Fu);
+
+  // Transfer ownership of the resources to the set.
+  IREE_ASSERT_OK(iree_hal_resource_set_insert(
+      resource_set.get(), IREE_ARRAYSIZE(resources), resources));
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(resources); ++i) {
+    iree_hal_resource_release(resources[i]);
+  }
+  EXPECT_EQ(live_bitmap, 0x1Fu);
+
+  // Ensure the set releases the resources.
+  resource_set.reset();
+  EXPECT_EQ(live_bitmap, 0u);
+}
+
+// Tests inserting enough resources to force set growth. This is ensured by
+// choosing a sufficiently small block size such that even 32 elements triggers
+// a growth. Of course, real usage should have at least ~4KB for the block size.
+TEST_F(ResourceSetTest, InsertionGrowth) {
+  auto resource_set = make_resource_set(&block_pool);
+
+  // Allocate 32 resources (one for each bit in our live map).
+  iree_hal_resource_t* resources[32] = {NULL};
+  uint32_t live_bitmap = 0u;
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(resources); ++i) {
+    IREE_ASSERT_OK(iree_hal_test_resource_create(
+        i, &live_bitmap, host_allocator, &resources[i]));
+  }
+  EXPECT_EQ(live_bitmap, 0xFFFFFFFFu);
+
+  // Transfer ownership of the resources to the set.
+  IREE_ASSERT_OK(iree_hal_resource_set_insert(
+      resource_set.get(), IREE_ARRAYSIZE(resources), resources));
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(resources); ++i) {
+    iree_hal_resource_release(resources[i]);
+  }
+  EXPECT_EQ(live_bitmap, 0xFFFFFFFFu);
+
+  // Ensure the set releases the resources.
+  resource_set.reset();
+  EXPECT_EQ(live_bitmap, 0u);
+}
+
+// Tests insertion of resources multiple times to verify the MRU works.
+TEST_F(ResourceSetTest, RedundantInsertion) {
+  auto resource_set = make_resource_set(&block_pool);
+
+  // Allocate 32 resources (one for each bit in our live map).
+  // We want to be able to miss in the MRU.
+  iree_hal_resource_t* resources[32] = {NULL};
+  static_assert(IREE_ARRAYSIZE(resources) > IREE_HAL_RESOURCE_SET_MRU_SIZE,
+                "need to pick a value that lets us exceed the MRU capacity");
+  uint32_t live_bitmap = 0u;
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(resources); ++i) {
+    IREE_ASSERT_OK(iree_hal_test_resource_create(
+        i, &live_bitmap, host_allocator, &resources[i]));
+  }
+  EXPECT_EQ(live_bitmap, 0xFFFFFFFFu);
+
+  // NOTE: the only requirement of the MRU is that it's _mostly_ MRU - we may
+  // for performance reasons make it a little fuzzy to avoid additional
+  // shuffling. Today it's always a proper MRU and we check the pointers here.
+
+  // NOTE: the MRU size can vary across architectures; we know it should always
+  // be at least ~6 though so that's what we work with here.
+  static_assert(IREE_HAL_RESOURCE_SET_MRU_SIZE > 6,
+                "need at least enough elements to test with");
+
+  // Insert in sequence, MRU should contain:
+  //   31 30 29 28 27 ...
+  IREE_ASSERT_OK(iree_hal_resource_set_insert(
+      resource_set.get(), IREE_ARRAYSIZE(resources), resources));
+  EXPECT_EQ(resource_set->mru[0], resources[31]);
+  EXPECT_EQ(resource_set->mru[1], resources[30]);
+  EXPECT_EQ(resource_set->mru[2], resources[29]);
+  EXPECT_EQ(resource_set->mru[3], resources[28]);
+  EXPECT_EQ(resource_set->mru[4], resources[27]);
+
+  // Insert 31 again, MRU should remain the same as it's at the head.
+  IREE_ASSERT_OK(
+      iree_hal_resource_set_insert(resource_set.get(), 1, &resources[31]));
+  EXPECT_EQ(resource_set->mru[0], resources[31]);
+  EXPECT_EQ(resource_set->mru[1], resources[30]);
+  EXPECT_EQ(resource_set->mru[2], resources[29]);
+  EXPECT_EQ(resource_set->mru[3], resources[28]);
+  EXPECT_EQ(resource_set->mru[4], resources[27]);
+
+  // Insert 28 again, MRU should be updated to move it to the front:
+  //   28 31 30 29 27 ...
+  IREE_ASSERT_OK(
+      iree_hal_resource_set_insert(resource_set.get(), 1, &resources[28]));
+  EXPECT_EQ(resource_set->mru[0], resources[28]);
+  EXPECT_EQ(resource_set->mru[1], resources[31]);
+  EXPECT_EQ(resource_set->mru[2], resources[30]);
+  EXPECT_EQ(resource_set->mru[3], resources[29]);
+  EXPECT_EQ(resource_set->mru[4], resources[27]);
+
+  // Insert 0 again, which should be a miss as it fell off the end of the MRU:
+  //   0 28 31 30 29 27 ...
+  IREE_ASSERT_OK(
+      iree_hal_resource_set_insert(resource_set.get(), 1, &resources[0]));
+  EXPECT_EQ(resource_set->mru[0], resources[0]);
+  EXPECT_EQ(resource_set->mru[1], resources[28]);
+  EXPECT_EQ(resource_set->mru[2], resources[31]);
+  EXPECT_EQ(resource_set->mru[3], resources[30]);
+  EXPECT_EQ(resource_set->mru[4], resources[29]);
+  EXPECT_EQ(resource_set->mru[5], resources[27]);
+
+  // Release all of the resources - they should still be owned by the set.
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(resources); ++i) {
+    iree_hal_resource_release(resources[i]);
+  }
+  EXPECT_EQ(live_bitmap, 0xFFFFFFFFu);
+
+  // Ensure the set releases the resources.
+  resource_set.reset();
+  EXPECT_EQ(live_bitmap, 0u);
+}
+
+}  // namespace
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vmvx/BUILD b/runtime/src/iree/hal/vmvx/BUILD
new file mode 100644
index 0000000..0820c01
--- /dev/null
+++ b/runtime/src/iree/hal/vmvx/BUILD
@@ -0,0 +1,13 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# A VMVX (VM-based Vector eXtensions) runtime HAL backend.
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
diff --git a/runtime/src/iree/hal/vmvx/CMakeLists.txt b/runtime/src/iree/hal/vmvx/CMakeLists.txt
new file mode 100644
index 0000000..b1096fc
--- /dev/null
+++ b/runtime/src/iree/hal/vmvx/CMakeLists.txt
@@ -0,0 +1,13 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/vmvx/BUILD                                              #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/vmvx/cts/CMakeLists.txt b/runtime/src/iree/hal/vmvx/cts/CMakeLists.txt
new file mode 100644
index 0000000..8a116f9
--- /dev/null
+++ b/runtime/src/iree/hal/vmvx/cts/CMakeLists.txt
@@ -0,0 +1,40 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_hal_cts_test_suite(
+  DRIVER_NAME
+    vmvx
+  DRIVER_REGISTRATION_HDR
+    "runtime/src/iree/hal/vmvx/registration/driver_module.h"
+  DRIVER_REGISTRATION_FN
+    "iree_hal_vmvx_driver_module_register"
+  COMPILER_TARGET_BACKEND
+    "vmvx"
+  EXECUTABLE_FORMAT
+    "\"vmvx-bytecode-fb\""
+  DEPS
+    iree::hal::vmvx::registration
+)
+
+iree_hal_cts_test_suite(
+  DRIVER_NAME
+    vmvx-sync
+  DRIVER_REGISTRATION_HDR
+    "runtime/src/iree/hal/vmvx/registration/driver_module_sync.h"
+  DRIVER_REGISTRATION_FN
+    "iree_hal_vmvx_sync_driver_module_register"
+  COMPILER_TARGET_BACKEND
+    "vmvx"
+  EXECUTABLE_FORMAT
+    "\"vmvx-bytecode-fb\""
+  DEPS
+    iree::hal::vmvx::registration::sync
+  EXCLUDED_TESTS
+    # TODO(#4680): command buffer recording so that these can run on sync HAL
+    "command_buffer"
+    "event"
+    "semaphore_submission"
+)
diff --git a/runtime/src/iree/hal/vmvx/registration/BUILD b/runtime/src/iree/hal/vmvx/registration/BUILD
new file mode 100644
index 0000000..2bbe131
--- /dev/null
+++ b/runtime/src/iree/hal/vmvx/registration/BUILD
@@ -0,0 +1,71 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_cmake_extra_content(
+    content = """
+if(${IREE_HAL_DRIVER_VMVX})
+""",
+    inline = True,
+)
+
+iree_runtime_cc_library(
+    name = "registration",
+    srcs = ["driver_module.c"],
+    hdrs = ["driver_module.h"],
+    defines = [
+        "IREE_HAL_HAVE_VMVX_DRIVER_MODULE=1",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:task_driver",
+        "//runtime/src/iree/hal/local/loaders:vmvx_module_loader",
+        "//runtime/src/iree/task:api",
+        "//runtime/src/iree/vm",
+    ],
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+
+if(${IREE_HAL_DRIVER_VMVX_SYNC})
+""",
+    inline = True,
+)
+
+iree_runtime_cc_library(
+    name = "sync",
+    srcs = ["driver_module_sync.c"],
+    hdrs = ["driver_module_sync.h"],
+    defines = [
+        "IREE_HAL_HAVE_VMVX_SYNC_DRIVER_MODULE=1",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:sync_driver",
+        "//runtime/src/iree/hal/local/loaders:vmvx_module_loader",
+        "//runtime/src/iree/vm",
+    ],
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+""",
+    inline = True,
+)
diff --git a/runtime/src/iree/hal/vmvx/registration/CMakeLists.txt b/runtime/src/iree/hal/vmvx/registration/CMakeLists.txt
new file mode 100644
index 0000000..66ea6d2
--- /dev/null
+++ b/runtime/src/iree/hal/vmvx/registration/CMakeLists.txt
@@ -0,0 +1,60 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/vmvx/registration/BUILD                                 #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+if(${IREE_HAL_DRIVER_VMVX})
+
+iree_cc_library(
+  NAME
+    registration
+  HDRS
+    "driver_module.h"
+  SRCS
+    "driver_module.c"
+  DEPS
+    iree::base
+    iree::hal
+    iree::hal::local
+    iree::hal::local::loaders::vmvx_module_loader
+    iree::hal::local::task_driver
+    iree::task::api
+    iree::vm
+  DEFINES
+    "IREE_HAL_HAVE_VMVX_DRIVER_MODULE=1"
+  PUBLIC
+)
+
+endif()
+
+if(${IREE_HAL_DRIVER_VMVX_SYNC})
+
+iree_cc_library(
+  NAME
+    sync
+  HDRS
+    "driver_module_sync.h"
+  SRCS
+    "driver_module_sync.c"
+  DEPS
+    iree::base
+    iree::hal
+    iree::hal::local
+    iree::hal::local::loaders::vmvx_module_loader
+    iree::hal::local::sync_driver
+    iree::vm
+  DEFINES
+    "IREE_HAL_HAVE_VMVX_SYNC_DRIVER_MODULE=1"
+  PUBLIC
+)
+
+endif()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/vmvx/registration/driver_module.c b/runtime/src/iree/hal/vmvx/registration/driver_module.c
new file mode 100644
index 0000000..a1a0228
--- /dev/null
+++ b/runtime/src/iree/hal/vmvx/registration/driver_module.c
@@ -0,0 +1,98 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vmvx/registration/driver_module.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/loaders/vmvx_module_loader.h"
+#include "iree/hal/local/task_device.h"
+#include "iree/hal/local/task_driver.h"
+#include "iree/task/api.h"
+#include "iree/vm/api.h"
+
+// TODO(#4298): remove this driver registration and wrapper.
+
+// TODO(benvanik): replace with C flags.
+#define IREE_HAL_VMVX_WORKER_COUNT 0
+#define IREE_HAL_MAX_VMVX_WORKER_COUNT 16
+
+#define IREE_HAL_VMVX_DRIVER_ID 0x564D5658u  // VMVX
+
+static iree_status_t iree_hal_vmvx_driver_factory_enumerate(
+    void* self, const iree_hal_driver_info_t** out_driver_infos,
+    iree_host_size_t* out_driver_info_count) {
+  static const iree_hal_driver_info_t driver_infos[1] = {
+      {
+          .driver_id = IREE_HAL_VMVX_DRIVER_ID,
+          .driver_name = iree_string_view_literal("vmvx"),
+          .full_name = iree_string_view_literal("VM-based reference backend"),
+      },
+  };
+  *out_driver_info_count = IREE_ARRAYSIZE(driver_infos);
+  *out_driver_infos = driver_infos;
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vmvx_driver_factory_try_create(
+    void* self, iree_hal_driver_id_t driver_id, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver) {
+  if (driver_id != IREE_HAL_VMVX_DRIVER_ID) {
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "no driver with ID %016" PRIx64
+                            " is provided by this factory",
+                            driver_id);
+  }
+
+  iree_vm_instance_t* instance = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_instance_create(host_allocator, &instance));
+
+  iree_hal_task_device_params_t default_params;
+  iree_hal_task_device_params_initialize(&default_params);
+
+  iree_hal_executable_loader_t* vmvx_loader = NULL;
+  iree_status_t status = iree_hal_vmvx_module_loader_create(
+      instance, host_allocator, &vmvx_loader);
+  iree_hal_executable_loader_t* loaders[1] = {vmvx_loader};
+
+  iree_task_executor_t* executor = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_task_executor_create_from_flags(host_allocator, &executor);
+  }
+
+  iree_hal_allocator_t* device_allocator = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_allocator_create_heap(iree_make_cstring_view("vmvx"),
+                                            host_allocator, host_allocator,
+                                            &device_allocator);
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_task_driver_create(
+        iree_make_cstring_view("vmvx"), &default_params, executor,
+        IREE_ARRAYSIZE(loaders), loaders, device_allocator, host_allocator,
+        out_driver);
+  }
+
+  iree_hal_allocator_release(device_allocator);
+  iree_task_executor_release(executor);
+  iree_hal_executable_loader_release(vmvx_loader);
+  iree_vm_instance_release(instance);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t
+iree_hal_vmvx_driver_module_register(iree_hal_driver_registry_t* registry) {
+  static const iree_hal_driver_factory_t factory = {
+      .self = NULL,
+      .enumerate = iree_hal_vmvx_driver_factory_enumerate,
+      .try_create = iree_hal_vmvx_driver_factory_try_create,
+  };
+  return iree_hal_driver_registry_register_factory(registry, &factory);
+}
diff --git a/runtime/src/iree/hal/vmvx/registration/driver_module.h b/runtime/src/iree/hal/vmvx/registration/driver_module.h
new file mode 100644
index 0000000..578d9c2
--- /dev/null
+++ b/runtime/src/iree/hal/vmvx/registration/driver_module.h
@@ -0,0 +1,24 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VMVX_REGISTRATION_DRIVER_MODULE_H_
+#define IREE_HAL_VMVX_REGISTRATION_DRIVER_MODULE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+IREE_API_EXPORT iree_status_t
+iree_hal_vmvx_driver_module_register(iree_hal_driver_registry_t* registry);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VMVX_REGISTRATION_DRIVER_MODULE_H_
diff --git a/runtime/src/iree/hal/vmvx/registration/driver_module_sync.c b/runtime/src/iree/hal/vmvx/registration/driver_module_sync.c
new file mode 100644
index 0000000..6a5fc70
--- /dev/null
+++ b/runtime/src/iree/hal/vmvx/registration/driver_module_sync.c
@@ -0,0 +1,92 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vmvx/registration/driver_module_sync.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/loaders/vmvx_module_loader.h"
+#include "iree/hal/local/sync_device.h"
+#include "iree/hal/local/sync_driver.h"
+#include "iree/vm/api.h"
+
+// TODO(#4298): remove this driver registration and wrapper.
+
+// TODO(benvanik): replace with C flags.
+#define IREE_HAL_VMVX_WORKER_COUNT 0
+#define IREE_HAL_MAX_VMVX_WORKER_COUNT 16
+
+#define IREE_HAL_VMVX_SYNC_DRIVER_ID 0x53564D58u  // SVMX
+
+static iree_status_t iree_hal_vmvx_sync_driver_factory_enumerate(
+    void* self, const iree_hal_driver_info_t** out_driver_infos,
+    iree_host_size_t* out_driver_info_count) {
+  static const iree_hal_driver_info_t driver_infos[1] = {
+      {
+          .driver_id = IREE_HAL_VMVX_SYNC_DRIVER_ID,
+          .driver_name = iree_string_view_literal("vmvx-sync"),
+          .full_name = iree_string_view_literal(
+              "synchronous VM-based reference backend"),
+      },
+  };
+  *out_driver_info_count = IREE_ARRAYSIZE(driver_infos);
+  *out_driver_infos = driver_infos;
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vmvx_sync_driver_factory_try_create(
+    void* self, iree_hal_driver_id_t driver_id, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver) {
+  if (driver_id != IREE_HAL_VMVX_SYNC_DRIVER_ID) {
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "no driver with ID %016" PRIx64
+                            " is provided by this factory",
+                            driver_id);
+  }
+
+  iree_vm_instance_t* instance = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_instance_create(host_allocator, &instance));
+
+  iree_hal_executable_loader_t* vmvx_loader = NULL;
+  iree_status_t status = iree_hal_vmvx_module_loader_create(
+      instance, host_allocator, &vmvx_loader);
+  iree_hal_executable_loader_t* loaders[1] = {vmvx_loader};
+
+  iree_hal_allocator_t* device_allocator = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_allocator_create_heap(iree_make_cstring_view("vmvx"),
+                                            host_allocator, host_allocator,
+                                            &device_allocator);
+  }
+
+  // Set parameters for the device created in the next step.
+  iree_hal_sync_device_params_t default_params;
+  iree_hal_sync_device_params_initialize(&default_params);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_sync_driver_create(
+        iree_make_cstring_view("vmvx"), &default_params,
+        IREE_ARRAYSIZE(loaders), loaders, device_allocator, host_allocator,
+        out_driver);
+  }
+
+  iree_hal_allocator_release(device_allocator);
+  iree_hal_executable_loader_release(vmvx_loader);
+  iree_vm_instance_release(instance);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_vmvx_sync_driver_module_register(
+    iree_hal_driver_registry_t* registry) {
+  static const iree_hal_driver_factory_t factory = {
+      .self = NULL,
+      .enumerate = iree_hal_vmvx_sync_driver_factory_enumerate,
+      .try_create = iree_hal_vmvx_sync_driver_factory_try_create,
+  };
+  return iree_hal_driver_registry_register_factory(registry, &factory);
+}
diff --git a/runtime/src/iree/hal/vmvx/registration/driver_module_sync.h b/runtime/src/iree/hal/vmvx/registration/driver_module_sync.h
new file mode 100644
index 0000000..a73006d
--- /dev/null
+++ b/runtime/src/iree/hal/vmvx/registration/driver_module_sync.h
@@ -0,0 +1,26 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VMVX_REGISTRATION_DRIVER_MODULE_SYNC_H_
+#define IREE_HAL_VMVX_REGISTRATION_DRIVER_MODULE_SYNC_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// DEPRECATED: this entire driver will be removed soon.
+// TODO(#3580): remove this entire driver w/ iree_hal_executable_library_t.
+IREE_API_EXPORT iree_status_t
+iree_hal_vmvx_sync_driver_module_register(iree_hal_driver_registry_t* registry);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VMVX_REGISTRATION_DRIVER_MODULE_SYNC_H_
diff --git a/runtime/src/iree/hal/vulkan/BUILD b/runtime/src/iree/hal/vulkan/BUILD
new file mode 100644
index 0000000..743310a
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/BUILD
@@ -0,0 +1,143 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# HAL implementation using Vulkan and (likely) SPIR-V executables.
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library", "iree_runtime_cc_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_cmake_extra_content(
+    content = """
+if(NOT ${IREE_HAL_DRIVER_VULKAN})
+  return()
+endif()
+""",
+)
+
+iree_runtime_cc_library(
+    name = "vulkan",
+    srcs = [
+        "api.cc",
+        "builtin_executables.cc",
+        "builtin_executables.h",
+        "command_queue.h",
+        "debug_reporter.cc",
+        "debug_reporter.h",
+        "descriptor_pool_cache.cc",
+        "descriptor_pool_cache.h",
+        "descriptor_set_arena.cc",
+        "descriptor_set_arena.h",
+        "direct_command_buffer.cc",
+        "direct_command_buffer.h",
+        "direct_command_queue.cc",
+        "direct_command_queue.h",
+        "emulated_semaphore.cc",
+        "emulated_semaphore.h",
+        "extensibility_util.cc",
+        "extensibility_util.h",
+        "handle_util.h",
+        "internal_vk_mem_alloc.cc",
+        "internal_vk_mem_alloc.h",
+        "native_descriptor_set.cc",
+        "native_descriptor_set.h",
+        "native_descriptor_set_layout.cc",
+        "native_descriptor_set_layout.h",
+        "native_event.cc",
+        "native_event.h",
+        "native_executable.cc",
+        "native_executable.h",
+        "native_executable_layout.cc",
+        "native_executable_layout.h",
+        "native_semaphore.cc",
+        "native_semaphore.h",
+        "nop_executable_cache.cc",
+        "nop_executable_cache.h",
+        "serializing_command_queue.cc",
+        "serializing_command_queue.h",
+        "status_util.c",
+        "status_util.h",
+        "timepoint_util.cc",
+        "timepoint_util.h",
+        "tracing.cc",
+        "tracing.h",
+        "vma_allocator.cc",
+        "vma_allocator.h",
+        "vma_buffer.cc",
+        "vma_buffer.h",
+        "vulkan_device.cc",
+        "vulkan_driver.cc",
+        "vulkan_headers.h",
+    ],
+    hdrs = [
+        # TODO(benvanik): hide all but api.h.
+        "api.h",
+        "vulkan_device.h",
+        "vulkan_driver.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":dynamic_symbols",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:logging",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:arena",
+        "//runtime/src/iree/base/internal:synchronization",
+        "//runtime/src/iree/base/internal/flatcc:parsing",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/utils:buffer_transfer",
+        "//runtime/src/iree/hal/utils:resource_set",
+        "//runtime/src/iree/hal/vulkan/builtin",
+        "//runtime/src/iree/hal/vulkan/util:arena",
+        "//runtime/src/iree/hal/vulkan/util:intrusive_list",
+        "//runtime/src/iree/hal/vulkan/util:ref_ptr",
+        "//runtime/src/iree/schemas:spirv_executable_def_c_fbs",
+        "@vulkan_headers",
+        "@vulkan_memory_allocator//:impl_header_only",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "dynamic_symbols",
+    srcs = [
+        "dynamic_symbols.cc",
+        "vulkan_headers.h",
+    ],
+    hdrs = [
+        "dynamic_symbols.h",
+    ],
+    textual_hdrs = [
+        "dynamic_symbol_tables.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal:dynamic_library",
+        "//runtime/src/iree/hal/vulkan/util:ref_ptr",
+        "@vulkan_headers",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "dynamic_symbols_test",
+    srcs = ["dynamic_symbols_test.cc"],
+    tags = ["driver=vulkan"],
+    deps = [
+        ":dynamic_symbols",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
diff --git a/runtime/src/iree/hal/vulkan/CMakeLists.txt b/runtime/src/iree/hal/vulkan/CMakeLists.txt
new file mode 100644
index 0000000..17862ee
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/CMakeLists.txt
@@ -0,0 +1,134 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/vulkan/BUILD                                            #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+if(NOT ${IREE_HAL_DRIVER_VULKAN})
+  return()
+endif()
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    vulkan
+  HDRS
+    "api.h"
+    "vulkan_device.h"
+    "vulkan_driver.h"
+  SRCS
+    "api.cc"
+    "builtin_executables.cc"
+    "builtin_executables.h"
+    "command_queue.h"
+    "debug_reporter.cc"
+    "debug_reporter.h"
+    "descriptor_pool_cache.cc"
+    "descriptor_pool_cache.h"
+    "descriptor_set_arena.cc"
+    "descriptor_set_arena.h"
+    "direct_command_buffer.cc"
+    "direct_command_buffer.h"
+    "direct_command_queue.cc"
+    "direct_command_queue.h"
+    "emulated_semaphore.cc"
+    "emulated_semaphore.h"
+    "extensibility_util.cc"
+    "extensibility_util.h"
+    "handle_util.h"
+    "internal_vk_mem_alloc.cc"
+    "internal_vk_mem_alloc.h"
+    "native_descriptor_set.cc"
+    "native_descriptor_set.h"
+    "native_descriptor_set_layout.cc"
+    "native_descriptor_set_layout.h"
+    "native_event.cc"
+    "native_event.h"
+    "native_executable.cc"
+    "native_executable.h"
+    "native_executable_layout.cc"
+    "native_executable_layout.h"
+    "native_semaphore.cc"
+    "native_semaphore.h"
+    "nop_executable_cache.cc"
+    "nop_executable_cache.h"
+    "serializing_command_queue.cc"
+    "serializing_command_queue.h"
+    "status_util.c"
+    "status_util.h"
+    "timepoint_util.cc"
+    "timepoint_util.h"
+    "tracing.cc"
+    "tracing.h"
+    "vma_allocator.cc"
+    "vma_allocator.h"
+    "vma_buffer.cc"
+    "vma_buffer.h"
+    "vulkan_device.cc"
+    "vulkan_driver.cc"
+    "vulkan_headers.h"
+  DEPS
+    ::dynamic_symbols
+    Vulkan::Headers
+    iree::base
+    iree::base::cc
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::arena
+    iree::base::internal::flatcc::parsing
+    iree::base::internal::synchronization
+    iree::base::logging
+    iree::base::tracing
+    iree::hal
+    iree::hal::utils::buffer_transfer
+    iree::hal::utils::resource_set
+    iree::hal::vulkan::builtin
+    iree::hal::vulkan::util::arena
+    iree::hal::vulkan::util::intrusive_list
+    iree::hal::vulkan::util::ref_ptr
+    iree::schemas::spirv_executable_def_c_fbs
+    vulkan_memory_allocator
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    dynamic_symbols
+  HDRS
+    "dynamic_symbols.h"
+  TEXTUAL_HDRS
+    "dynamic_symbol_tables.h"
+  SRCS
+    "dynamic_symbols.cc"
+    "vulkan_headers.h"
+  DEPS
+    Vulkan::Headers
+    iree::base
+    iree::base::cc
+    iree::base::core_headers
+    iree::base::internal::dynamic_library
+    iree::base::tracing
+    iree::hal::vulkan::util::ref_ptr
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    dynamic_symbols_test
+  SRCS
+    "dynamic_symbols_test.cc"
+  DEPS
+    ::dynamic_symbols
+    iree::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+  LABELS
+    "driver=vulkan"
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/vulkan/api.cc b/runtime/src/iree/hal/vulkan/api.cc
new file mode 100644
index 0000000..f05a296
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/api.cc
@@ -0,0 +1,77 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/api.h"
+
+#include <cstring>
+#include <functional>
+#include <string>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+using namespace iree::hal::vulkan;
+
+// TODO(benvanik): move these into the appropriate files and delete this .cc.
+
+//===----------------------------------------------------------------------===//
+// iree::hal::vulkan::DynamicSymbols
+//===----------------------------------------------------------------------===//
+
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_syms_create(
+    void* vkGetInstanceProcAddr_fn, iree_allocator_t host_allocator,
+    iree_hal_vulkan_syms_t** out_syms) {
+  IREE_TRACE_SCOPE0("iree_hal_vulkan_syms_create");
+  IREE_ASSERT_ARGUMENT(out_syms);
+  *out_syms = nullptr;
+
+  iree::ref_ptr<iree::hal::vulkan::DynamicSymbols> syms;
+  IREE_RETURN_IF_ERROR(DynamicSymbols::Create(
+      [&vkGetInstanceProcAddr_fn](const char* function_name) {
+        // Only resolve vkGetInstanceProcAddr, rely on syms->LoadFromInstance()
+        // and/or syms->LoadFromDevice() for further loading.
+        std::string fn = "vkGetInstanceProcAddr";
+        if (strncmp(function_name, fn.data(), fn.size()) == 0) {
+          return reinterpret_cast<PFN_vkVoidFunction>(vkGetInstanceProcAddr_fn);
+        }
+        return reinterpret_cast<PFN_vkVoidFunction>(NULL);
+      },
+      &syms));
+
+  *out_syms = reinterpret_cast<iree_hal_vulkan_syms_t*>(syms.release());
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_syms_create_from_system_loader(
+    iree_allocator_t host_allocator, iree_hal_vulkan_syms_t** out_syms) {
+  IREE_TRACE_SCOPE0("iree_hal_vulkan_syms_create_from_system_loader");
+  IREE_ASSERT_ARGUMENT(out_syms);
+  *out_syms = nullptr;
+
+  iree::ref_ptr<iree::hal::vulkan::DynamicSymbols> syms;
+  IREE_RETURN_IF_ERROR(DynamicSymbols::CreateFromSystemLoader(&syms));
+  *out_syms = reinterpret_cast<iree_hal_vulkan_syms_t*>(syms.release());
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT void iree_hal_vulkan_syms_retain(iree_hal_vulkan_syms_t* syms) {
+  IREE_ASSERT_ARGUMENT(syms);
+  auto* handle = reinterpret_cast<DynamicSymbols*>(syms);
+  if (handle) {
+    handle->AddReference();
+  }
+}
+
+IREE_API_EXPORT void iree_hal_vulkan_syms_release(
+    iree_hal_vulkan_syms_t* syms) {
+  IREE_ASSERT_ARGUMENT(syms);
+  auto* handle = reinterpret_cast<DynamicSymbols*>(syms);
+  if (handle) {
+    handle->ReleaseReference();
+  }
+}
diff --git a/runtime/src/iree/hal/vulkan/api.h b/runtime/src/iree/hal/vulkan/api.h
new file mode 100644
index 0000000..126b4f4
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/api.h
@@ -0,0 +1,268 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// See iree/base/api.h for documentation on the API conventions used.
+
+#ifndef IREE_HAL_VULKAN_API_H_
+#define IREE_HAL_VULKAN_API_H_
+
+#include <stdint.h>
+
+// clang-format off: must be included before all other headers.
+#include "iree/hal/vulkan/vulkan_headers.h"
+// clang-format on
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_hal_vulkan_device_t extensibility util
+//===----------------------------------------------------------------------===//
+
+// TODO(benvanik): replace with feature list (easier to version).
+// Bitfield that defines sets of Vulkan features.
+enum iree_hal_vulkan_feature_bits_t {
+  // Use VK_LAYER_KHRONOS_standard_validation to validate Vulkan API usage.
+  // Has a significant performance penalty and is *not* a security mechanism.
+  IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS = 1u << 0,
+
+  // Use VK_EXT_debug_utils, record markers, and log errors.
+  IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS = 1u << 1,
+
+  // Enables tracing of command buffers when IREE tracing is enabled.
+  // May take advantage of additional extensions for more accurate timing or
+  // hardware-specific performance counters.
+  //
+  // NOTE: tracing has a non-trivial overhead and will skew the timing of
+  // submissions and introduce false barriers between dispatches. Use this to
+  // identify slow dispatches and refine from there; be wary of whole-program
+  // tracing with this enabled.
+  IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING = 1u << 2,
+};
+typedef uint32_t iree_hal_vulkan_features_t;
+
+// Describes the type of a set of Vulkan extensions.
+typedef enum iree_hal_vulkan_extensibility_set_e {
+  // A set of required instance layer names. These must all be enabled on
+  // the VkInstance for IREE to function.
+  IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_REQUIRED = 0,
+
+  // A set of optional instance layer names. If omitted fallbacks may be
+  // used or debugging features may not be available.
+  IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL,
+
+  // A set of required instance extension names. These must all be enabled on
+  // the VkInstance for IREE to function.
+  IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_REQUIRED,
+
+  // A set of optional instance extension names. If omitted fallbacks may be
+  // used or debugging features may not be available.
+  IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
+
+  // A set of required device extension names. These must all be enabled on
+  // the VkDevice for IREE to function.
+  IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
+
+  // A set of optional device extension names. If omitted fallbacks may be
+  // used or debugging features may not be available.
+  IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
+
+  IREE_HAL_VULKAN_EXTENSIBILITY_SET_COUNT,  // used for sizing lookup tables
+} iree_hal_vulkan_extensibility_set_t;
+
+// Queries the names of the Vulkan layers and extensions used for a given set of
+// IREE |requested_features|. All devices used by IREE must have the required
+// layers and extensions as defined by these sets. Optional layers and
+// extensions will be used when needed and otherwise have fallbacks for when
+// they are not available.
+//
+// Instance extensions should be enabled on VkInstances passed to
+// |iree_hal_vulkan_driver_create_using_instance| and device extensions should
+// be enabled on VkDevices passed to |iree_hal_vulkan_driver_wrap_device|.
+//
+// |string_capacity| defines the number of elements available in
+// |out_string_values| and |out_string_count| will be set with the actual number
+// of strings returned. If |string_capacity| is too small then
+// IREE_STATUS_OUT_OF_RANGE will be returned with the required capacity in
+// |out_string_count|. To only query the required capacity then
+// |out_string_values| may be passed as NULL.
+//
+// The returned strings originate from the _EXTENSION_NAME Vulkan macros
+// (such as 'VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME') and have a
+// lifetime matching whatever module they are defined in.
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_query_extensibility_set(
+    iree_hal_vulkan_features_t requested_features,
+    iree_hal_vulkan_extensibility_set_t set, iree_host_size_t string_capacity,
+    const char** out_string_values, iree_host_size_t* out_string_count);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_vulkan_syms_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_vulkan_syms_t iree_hal_vulkan_syms_t;
+
+// Loads Vulkan functions by invoking |vkGetInstanceProcAddr|.
+//
+// |vkGetInstanceProcAddr| can be obtained in whatever way suits the calling
+// application, such as via `dlsym` or `GetProcAddress` when dynamically
+// loading Vulkan, or `reinterpret_cast<void*>(&vkGetInstanceProcAddr)` when
+// statically linking Vulkan.
+//
+// |out_syms| must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_syms_create(
+    void* vkGetInstanceProcAddr_fn, iree_allocator_t host_allocator,
+    iree_hal_vulkan_syms_t** out_syms);
+
+// Loads Vulkan functions from the Vulkan loader.
+// This will look for a Vulkan loader on the system (like libvulkan.so) and
+// dlsym the functions from that.
+//
+// |out_syms| must be released by the caller with iree_hal_vulkan_syms_release.
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_syms_create_from_system_loader(
+    iree_allocator_t host_allocator, iree_hal_vulkan_syms_t** out_syms);
+
+// Retains the given |syms| for the caller.
+IREE_API_EXPORT void iree_hal_vulkan_syms_retain(iree_hal_vulkan_syms_t* syms);
+
+// Releases the given |syms| from the caller.
+IREE_API_EXPORT void iree_hal_vulkan_syms_release(iree_hal_vulkan_syms_t* syms);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_vulkan_device_t
+//===----------------------------------------------------------------------===//
+
+// A set of queues within a specific queue family on a VkDevice.
+typedef struct iree_hal_vulkan_queue_set_t {
+  // The index of a particular queue family on a VkPhysicalDevice, as described
+  // by vkGetPhysicalDeviceQueueFamilyProperties.
+  uint32_t queue_family_index;
+
+  // Bitfield of queue indices within the queue family at |queue_family_index|.
+  uint64_t queue_indices;
+} iree_hal_vulkan_queue_set_t;
+
+// TODO(benvanik): replace with flag list (easier to version).
+enum iree_hal_vulkan_device_flag_bits_t {
+  // Uses timeline semaphore emulation even if native support exists.
+  // May be removed in future versions when timeline semaphores can be assumed
+  // present on all platforms (looking at you, Android ಠ_ಠ).
+  IREE_HAL_VULKAN_DEVICE_FORCE_TIMELINE_SEMAPHORE_EMULATION = 1u << 0,
+};
+typedef uint32_t iree_hal_vulkan_device_flags_t;
+
+typedef struct iree_hal_vulkan_device_options_t {
+  // Flags controlling device behavior.
+  iree_hal_vulkan_device_flags_t flags;
+} iree_hal_vulkan_device_options_t;
+
+IREE_API_EXPORT void iree_hal_vulkan_device_options_initialize(
+    iree_hal_vulkan_device_options_t* out_options);
+
+// Creates a Vulkan HAL device that wraps an existing VkDevice.
+//
+// HAL devices created in this way may share Vulkan resources and synchronize
+// within the same physical VkPhysicalDevice and logical VkDevice directly.
+//
+// |logical_device| is expected to have been created with all extensions
+// returned by |iree_hal_vulkan_get_extensions| and
+// IREE_HAL_VULKAN_DEVICE_REQUIRED using the features provided during driver
+// creation.
+//
+// |instance_syms| must have at least the instance-specific functions resolved
+// and device symbols will be queried from |logical_device| as needed.
+//
+// The device will schedule commands against the queues in
+// |compute_queue_set| and (if set) |transfer_queue_set|.
+//
+// Applications may choose how these queues are created and selected in order
+// to control how commands submitted by this device are prioritized and
+// scheduled. For example, a low priority queue could be provided to one IREE
+// device for background processing or a high priority queue could be provided
+// for latency-sensitive processing.
+//
+// Dedicated compute queues (no graphics capabilities) are preferred within
+// |compute_queue_set|, if they are available.
+// Similarly, dedicated transfer queues (no compute or graphics) are preferred
+// within |transfer_queue_set|.
+// The queue sets can be the same.
+//
+// |out_device| must be released by the caller (see |iree_hal_device_release|).
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_wrap_device(
+    iree_string_view_t identifier,
+    const iree_hal_vulkan_device_options_t* options,
+    const iree_hal_vulkan_syms_t* instance_syms, VkInstance instance,
+    VkPhysicalDevice physical_device, VkDevice logical_device,
+    const iree_hal_vulkan_queue_set_t* compute_queue_set,
+    const iree_hal_vulkan_queue_set_t* transfer_queue_set,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_vulkan_driver_t
+//===----------------------------------------------------------------------===//
+
+// Vulkan driver creation options.
+typedef struct iree_hal_vulkan_driver_options_t {
+  // Vulkan version that will be requested, e.g. `VK_API_VERSION_1_0`.
+  // Driver creation will fail if the required version is not available.
+  uint32_t api_version;
+
+  // IREE features used to configure the VkInstance and VkDevices created using
+  // it. These are used to populate the active Vulkan layers and extensions when
+  // the instance and its devices are created.
+  iree_hal_vulkan_features_t requested_features;
+
+  // TODO(benvanik): remove this single setting - it would be nice instead to
+  // pass a list to force device enumeration/matrix expansion or omit entirely
+  // to have auto-discovered options based on capabilities. Right now this
+  // forces all devices - even if from different vendors - to have the same
+  // options.
+  // Options to use for all devices created by the driver.
+  iree_hal_vulkan_device_options_t device_options;
+
+  // TODO(benvanik): change to something more canonically vulkan (like
+  // VkPhysicalDeviceProperties::deviceID).
+  // Index of the default Vulkan device to use within the list of available
+  // devices. Devices are discovered via vkEnumeratePhysicalDevices then
+  // considered "available" if compatible with the |requested_features|.
+  int default_device_index;
+} iree_hal_vulkan_driver_options_t;
+
+IREE_API_EXPORT void iree_hal_vulkan_driver_options_initialize(
+    iree_hal_vulkan_driver_options_t* out_options);
+
+// Creates a Vulkan HAL driver that manages its own VkInstance.
+//
+// |out_driver| must be released by the caller (see |iree_hal_driver_release|).
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_vulkan_driver_options_t* options,
+    iree_hal_vulkan_syms_t* syms, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver);
+
+// Creates a Vulkan HAL driver that shares an existing VkInstance.
+//
+// |instance| is expected to have been created with all extensions returned by
+// the instance-specific |iree_hal_vulkan_query_extensibility_set| queries.
+//
+// |instance| must remain valid for the life of |out_driver| and |out_driver|
+// itself must be released by the caller (see |iree_hal_driver_release|).
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_driver_create_using_instance(
+    iree_string_view_t identifier,
+    const iree_hal_vulkan_driver_options_t* options,
+    iree_hal_vulkan_syms_t* instance_syms, VkInstance instance,
+    iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_API_H_
diff --git a/runtime/src/iree/hal/vulkan/builtin/BUILD b/runtime/src/iree/hal/vulkan/builtin/BUILD
new file mode 100644
index 0000000..083e92f
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/builtin/BUILD
@@ -0,0 +1,24 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+c_embed_data(
+    name = "builtin",
+    srcs = [
+        "fill_unaligned.spv",
+    ],
+    c_file_output = "builtin_shaders_spv.c",
+    flatten = True,
+    h_file_output = "builtin_shaders_spv.h",
+    identifier = "builtin_shaders_spv",
+)
diff --git a/runtime/src/iree/hal/vulkan/builtin/CMakeLists.txt b/runtime/src/iree/hal/vulkan/builtin/CMakeLists.txt
new file mode 100644
index 0000000..b2c5b2e
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/builtin/CMakeLists.txt
@@ -0,0 +1,28 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/vulkan/builtin/BUILD                                    #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_c_embed_data(
+  NAME
+    builtin
+  SRCS
+    "fill_unaligned.spv"
+  C_FILE_OUTPUT
+    "builtin_shaders_spv.c"
+  H_FILE_OUTPUT
+    "builtin_shaders_spv.h"
+  IDENTIFIER
+    "builtin_shaders_spv"
+  FLATTEN
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/vulkan/builtin/compile_shaders.sh b/runtime/src/iree/hal/vulkan/builtin/compile_shaders.sh
new file mode 100644
index 0000000..fd5f571
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/builtin/compile_shaders.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Compiles input .glsl files into output .spv binary files. As these files are
+# updated infrequently and their binary sizes are small, we check in both files
+# and don't take a hard dependency on the shader compiler tool.
+#
+# To use, ensure `glslc` is on your PATH (such as by installing the Vulkan SDK
+# or building it from its source at https://github.com/google/shaderc) and run
+# the script.
+
+set -e
+set -x
+
+BUILTIN_DIR="$(dirname $0)"
+
+glslc \
+  -Os -fshader-stage=compute -mfmt=bin \
+  ${BUILTIN_DIR}/fill_unaligned.glsl \
+  -o ${BUILTIN_DIR}/fill_unaligned.spv
diff --git a/runtime/src/iree/hal/vulkan/builtin/fill_unaligned.glsl b/runtime/src/iree/hal/vulkan/builtin/fill_unaligned.glsl
new file mode 100644
index 0000000..9ba434e
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/builtin/fill_unaligned.glsl
@@ -0,0 +1,64 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#version 450
+
+// Polyfill for buffer fills that are not aligned to 4 byte offsets or lengths.
+// This only implements the unaligned edges of fill operations. vkCmdFillBuffer
+// should be used for the aligned interior (if any).
+//
+// Repeats the 4 byte value |fill_pattern| into |output_elements|, between
+// |fill_offset_bytes| and |fill_offset_bytes| + |fill_length_bytes|.
+
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+layout(set = 3, binding = 0) buffer OutputBuffer { uint output_elements[]; };
+
+layout(push_constant) uniform Constants {
+  // TODO(scotttodd): low and high for 8 byte pattern
+  uint fill_pattern;
+  uint fill_pattern_width;  // should be 1 or 2 (or 8 later on)
+  uint fill_offset_bytes;   // must be aligned to pattern width
+  uint fill_length_bytes;
+} input_constants;
+
+void FillBufferUnalignedHelper(uint fill_offset_bytes, uint fill_length_bytes) {
+  uint fill_aligned_offset = fill_offset_bytes % 4;
+  uint fill_aligned_start_bytes = fill_offset_bytes - fill_aligned_offset;
+  uint fill_aligned_start_index = fill_aligned_start_bytes / 4;
+
+  uint shifted_pattern = 0x00000000;
+  if (input_constants.fill_pattern_width == 1) {
+    // Shift the pattern into each segment that is within the fill range.
+    uint fill_start = fill_aligned_offset;
+    uint fill_end = min(4, fill_start + fill_length_bytes);
+    for (uint i = fill_start; i < fill_end; ++i) {
+      shifted_pattern |= input_constants.fill_pattern << (8 * i);
+    }
+  } else if (input_constants.fill_pattern_width == 2) {
+    // Shift the pattern into the only supported segment in the fill range.
+    shifted_pattern = input_constants.fill_pattern << (8 * fill_aligned_offset);
+  }
+  output_elements[fill_aligned_start_index] = shifted_pattern;
+}
+
+void main() {
+  uint start_byte = input_constants.fill_offset_bytes;
+  uint end_byte =
+      input_constants.fill_offset_bytes + input_constants.fill_length_bytes;
+
+  // Unaligned start fill, if needed.
+  if (start_byte % 4 != 0 || input_constants.fill_length_bytes < 4) {
+    FillBufferUnalignedHelper(start_byte, input_constants.fill_length_bytes);
+  }
+  // Unaligned end fill, if needed.
+  if ((end_byte % 4 != 0) &&
+      (start_byte % 4 + input_constants.fill_length_bytes > 4)) {
+    uint end_rounded_down = (end_byte / 4) * 4;
+    uint length_end = end_byte - end_rounded_down;
+    FillBufferUnalignedHelper(end_rounded_down, length_end);
+  }
+}
diff --git a/runtime/src/iree/hal/vulkan/builtin/fill_unaligned.spv b/runtime/src/iree/hal/vulkan/builtin/fill_unaligned.spv
new file mode 100644
index 0000000..d457e5d
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/builtin/fill_unaligned.spv
Binary files differ
diff --git a/runtime/src/iree/hal/vulkan/builtin_executables.cc b/runtime/src/iree/hal/vulkan/builtin_executables.cc
new file mode 100644
index 0000000..c7695b0
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/builtin_executables.cc
@@ -0,0 +1,201 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/builtin_executables.h"
+
+#include <cstddef>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/builtin/builtin_shaders_spv.h"
+#include "iree/hal/vulkan/native_descriptor_set.h"
+#include "iree/hal/vulkan/native_descriptor_set_layout.h"
+#include "iree/hal/vulkan/native_executable_layout.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+typedef struct iree_hal_vulkan_builtin_fill_unaligned_constants_t {
+  uint32_t fill_pattern;
+  uint32_t fill_pattern_width;
+  uint32_t fill_offset_bytes;
+  uint32_t fill_length_bytes;
+} iree_hal_vulkan_builtin_fill_unaligned_constants_t;
+
+static_assert(sizeof(iree_hal_vulkan_builtin_fill_unaligned_constants_t) ==
+                  IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT,
+              "push constant count must match struct size");
+
+}  // namespace
+
+BuiltinExecutables::BuiltinExecutables(VkDeviceHandle* logical_device)
+    : logical_device_(logical_device) {}
+
+BuiltinExecutables::~BuiltinExecutables() {
+  if (pipeline_ != VK_NULL_HANDLE) {
+    logical_device_->syms()->vkDestroyPipeline(*logical_device_, pipeline_,
+                                               logical_device_->allocator());
+  }
+
+  if (executable_layout_) {
+    iree_hal_executable_layout_destroy(executable_layout_);
+  }
+
+  for (size_t i = 0; i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT; ++i) {
+    iree_hal_descriptor_set_layout_release(descriptor_set_layouts_[i]);
+  }
+}
+
+iree_status_t BuiltinExecutables::InitializeExecutables() {
+  IREE_TRACE_SCOPE();
+
+  // Create descriptor set layouts for our compute pipeline.
+  // Even though we're just using one set, we still need to create layout
+  // bindings for those preceding it.
+  for (size_t i = 0; i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT; ++i) {
+    iree_hal_descriptor_set_layout_t* layout = NULL;
+    iree_hal_descriptor_set_layout_binding_t layout_binding;
+    layout_binding.binding = 0;
+    layout_binding.type = IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    IREE_RETURN_IF_ERROR(iree_hal_vulkan_native_descriptor_set_layout_create(
+        logical_device_,
+        i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET
+            ? IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE
+            : IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_PUSH_ONLY,
+        /*binding_count=*/1, &layout_binding, &layout));
+    descriptor_set_layouts_[i] = layout;
+  }
+
+  iree_status_t status = iree_ok_status();
+
+  // Create shader module.
+  VkShaderModule fill_unaligned_shader = VK_NULL_HANDLE;
+  if (iree_status_is_ok(status)) {
+    VkShaderModuleCreateInfo shader_create_info;
+    shader_create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+    shader_create_info.pNext = NULL;
+    shader_create_info.flags = 0;
+    shader_create_info.codeSize = builtin_shaders_spv_create()[0].size;
+    shader_create_info.pCode =
+        (const uint32_t*)builtin_shaders_spv_create()[0].data;
+    status = VK_RESULT_TO_STATUS(logical_device_->syms()->vkCreateShaderModule(
+        *logical_device_, &shader_create_info, logical_device_->allocator(),
+        &fill_unaligned_shader));
+  }
+
+  // Create pipeline layout.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_native_executable_layout_create(
+        logical_device_, IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT / 4,
+        IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT, descriptor_set_layouts_,
+        &executable_layout_);
+  }
+
+  // Create pipeline.
+  if (iree_status_is_ok(status)) {
+    VkComputePipelineCreateInfo pipeline_create_info;
+    pipeline_create_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    pipeline_create_info.pNext = NULL;
+    pipeline_create_info.flags = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT;
+    pipeline_create_info.layout =
+        iree_hal_vulkan_native_executable_layout_handle(executable_layout_);
+    pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE;
+    pipeline_create_info.basePipelineIndex = 0;
+    VkPipelineShaderStageCreateInfo* stage_create_info =
+        &pipeline_create_info.stage;
+    stage_create_info->sType =
+        VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stage_create_info->pNext = NULL;
+    stage_create_info->flags = 0;
+    stage_create_info->stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    stage_create_info->module = fill_unaligned_shader;
+    stage_create_info->pName = "main";
+    stage_create_info->pSpecializationInfo = NULL;
+    status =
+        VK_RESULT_TO_STATUS(logical_device_->syms()->vkCreateComputePipelines(
+            *logical_device_, /*pipeline_cache=*/VK_NULL_HANDLE,
+            /*pipeline_count=*/1, &pipeline_create_info,
+            logical_device_->allocator(), &pipeline_));
+  }
+
+  // Destroy shader module now that the pipeline is created.
+  if (fill_unaligned_shader != VK_NULL_HANDLE) {
+    logical_device_->syms()->vkDestroyShaderModule(
+        *logical_device_, fill_unaligned_shader, logical_device_->allocator());
+  }
+
+  return status;
+}
+
+iree_status_t BuiltinExecutables::FillBufferUnaligned(
+    VkCommandBuffer command_buffer, DescriptorSetArena* descriptor_set_arena,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length, const void* push_constants_to_restore) {
+  IREE_TRACE_SCOPE();
+
+  iree_hal_vulkan_builtin_fill_unaligned_constants_t constants;
+  switch (pattern_length) {
+    case 1:
+      constants.fill_pattern = *static_cast<const uint8_t*>(pattern);
+      break;
+    case 2:
+      constants.fill_pattern = *static_cast<const uint16_t*>(pattern);
+      break;
+    case 4:
+      constants.fill_pattern = *static_cast<const uint32_t*>(pattern);
+      break;
+    default:
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "pattern length (%" PRIhsz
+                              ") is not a power of two or is too large",
+                              pattern_length);
+  }
+
+  iree_hal_descriptor_set_binding_t binding;
+  binding.binding = 0;
+  binding.buffer = target_buffer;
+  binding.offset = 0;
+  binding.length = IREE_WHOLE_BUFFER;
+  IREE_RETURN_IF_ERROR(descriptor_set_arena->BindDescriptorSet(
+      command_buffer, executable_layout_,
+      IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET, /*binding_count=*/1, &binding));
+
+  logical_device_->syms()->vkCmdBindPipeline(
+      command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_);
+
+  constants.fill_pattern_width = pattern_length;
+  constants.fill_offset_bytes = target_offset;
+  constants.fill_length_bytes = length;
+  logical_device_->syms()->vkCmdPushConstants(
+      command_buffer,
+      iree_hal_vulkan_native_executable_layout_handle(executable_layout_),
+      VK_SHADER_STAGE_COMPUTE_BIT, /*offset=*/0,
+      sizeof(iree_hal_vulkan_builtin_fill_unaligned_constants_t), &constants);
+
+  // TODO(scotttodd): insert memory barrier if we need to do dispatch<->dispatch
+  //   synchronization. The barriers inserted normally by callers would be for
+  //   transfer<->dispatch.
+
+  logical_device_->syms()->vkCmdDispatch(command_buffer, 1, 1, 1);
+
+  // Restore push constants.
+  logical_device_->syms()->vkCmdPushConstants(
+      command_buffer,
+      iree_hal_vulkan_native_executable_layout_handle(executable_layout_),
+      VK_SHADER_STAGE_COMPUTE_BIT, /*offset=*/0,
+      sizeof(iree_hal_vulkan_builtin_fill_unaligned_constants_t),
+      push_constants_to_restore);
+
+  return iree_ok_status();
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/builtin_executables.h b/runtime/src/iree/hal/vulkan/builtin_executables.h
new file mode 100644
index 0000000..ea25102
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/builtin_executables.h
@@ -0,0 +1,69 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_BUILTIN_EXECUTABLES_H_
+#define IREE_HAL_VULKAN_BUILTIN_EXECUTABLES_H_
+
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/descriptor_set_arena.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// The `maxBoundDescriptorSets` limit is 4 on many devices we support and we
+// want to avoid conflicts with what the compiler uses, so we'll expect the
+// compiler to have reserved the index 3 for our exclusive use.
+#define IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT 4
+#define IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET 3
+
+#define IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT 16
+
+// Manages the pipeline state for built-in compute shaders that emulate
+// operations missing from the core Vulkan API (currently only unaligned
+// buffer fills).
+// NOTE(review): pipelines are created in InitializeExecutables(), not the
+// constructor - callers must initialize before issuing any fills.
+class BuiltinExecutables {
+ public:
+  BuiltinExecutables(VkDeviceHandle* logical_device);
+  ~BuiltinExecutables();
+
+  // Dynamically-loaded Vulkan symbols of the owning logical device.
+  const ref_ptr<DynamicSymbols>& syms() const {
+    return logical_device_->syms();
+  }
+
+  // Creates the descriptor set layouts, executable (pipeline) layout, and
+  // compute pipeline used by the built-in shaders. Must be called once before
+  // any other operation.
+  iree_status_t InitializeExecutables();
+
+  // Fills a buffer without 4 byte offset or length requirements.
+  //
+  // This only implements the unaligned edges of fills, vkCmdFillBuffer should
+  // be used for the aligned interior (if any).
+  //
+  // |push_constants_to_restore| will be pushed using vkCmdPushConstants over
+  // the bytes used by this call.
+  iree_status_t FillBufferUnaligned(
+      VkCommandBuffer command_buffer, DescriptorSetArena* descriptor_set_arena,
+      iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+      iree_device_size_t length, const void* pattern,
+      iree_host_size_t pattern_length, const void* push_constants_to_restore);
+
+ private:
+  // Unowned; must remain live for the lifetime of this object.
+  VkDeviceHandle* logical_device_ = NULL;
+
+  // One layout slot per descriptor set index up to and including the builtin
+  // set. NOTE(review): presumably lower-index slots hold placeholder layouts;
+  // confirm in the .cc implementation.
+  iree_hal_descriptor_set_layout_t*
+      descriptor_set_layouts_[IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT] = {
+          NULL};
+  iree_hal_executable_layout_t* executable_layout_ = NULL;
+  VkPipeline pipeline_ = VK_NULL_HANDLE;
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_BUILTIN_EXECUTABLES_H_
diff --git a/runtime/src/iree/hal/vulkan/command_queue.h b/runtime/src/iree/hal/vulkan/command_queue.h
new file mode 100644
index 0000000..b8d73aa
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/command_queue.h
@@ -0,0 +1,78 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_COMMAND_QUEUE_H_
+#define IREE_HAL_VULKAN_COMMAND_QUEUE_H_
+
+#include <string>
+
+#include "iree/base/internal/synchronization.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/tracing.h"
+#include "iree/hal/vulkan/util/arena.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// Abstract device queue onto which command buffer batches are submitted.
+// Subclasses provide the concrete submission strategy via Submit/WaitIdle.
+// Access to the underlying VkQueue is serialized with queue_mutex_ since
+// VkQueue has no internal synchronization.
+class CommandQueue {
+ public:
+  virtual ~CommandQueue() {
+    IREE_TRACE_SCOPE0("CommandQueue::dtor");
+    // Drain all outstanding work so the VkQueue is idle before the mutex is
+    // torn down and the queue handle is abandoned.
+    iree_slim_mutex_lock(&queue_mutex_);
+    syms()->vkQueueWaitIdle(queue_);
+    iree_slim_mutex_unlock(&queue_mutex_);
+    iree_slim_mutex_deinitialize(&queue_mutex_);
+  }
+
+  // Dynamically-loaded Vulkan symbols of the owning logical device.
+  const ref_ptr<DynamicSymbols>& syms() const {
+    return logical_device_->syms();
+  }
+
+  VkQueue handle() const { return queue_; }
+
+  iree_hal_vulkan_tracing_context_t* tracing_context() {
+    return tracing_context_;
+  }
+  void set_tracing_context(iree_hal_vulkan_tracing_context_t* tracing_context) {
+    tracing_context_ = tracing_context;
+  }
+
+  // True if the queue was created with dispatch (compute) capability.
+  bool can_dispatch() const {
+    return iree_all_bits_set(supported_categories_,
+                             IREE_HAL_COMMAND_CATEGORY_DISPATCH);
+  }
+  // Submits |batch_count| batches of command buffers for execution.
+  virtual iree_status_t Submit(iree_host_size_t batch_count,
+                               const iree_hal_submission_batch_t* batches) = 0;
+
+  // Blocks until all previously-submitted work retires or |timeout| elapses.
+  virtual iree_status_t WaitIdle(iree_timeout_t timeout) = 0;
+
+ protected:
+  CommandQueue(VkDeviceHandle* logical_device,
+               iree_hal_command_category_t supported_categories, VkQueue queue)
+      : logical_device_(logical_device),
+        supported_categories_(supported_categories),
+        queue_(queue) {
+    iree_slim_mutex_initialize(&queue_mutex_);
+  }
+
+  // Unowned; must outlive the queue.
+  VkDeviceHandle* logical_device_;
+  const iree_hal_command_category_t supported_categories_;
+
+  // Optional tracing context; nullptr when tracing is disabled.
+  iree_hal_vulkan_tracing_context_t* tracing_context_ = nullptr;
+
+  // VkQueue needs to be externally synchronized.
+  iree_slim_mutex_t queue_mutex_;
+  VkQueue queue_ IREE_GUARDED_BY(queue_mutex_);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_COMMAND_QUEUE_H_
diff --git a/runtime/src/iree/hal/vulkan/cts/CMakeLists.txt b/runtime/src/iree/hal/vulkan/cts/CMakeLists.txt
new file mode 100644
index 0000000..17faa3b
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/cts/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Instantiates the shared HAL Conformance Test Suite (CTS) against the Vulkan
+# driver. Test kernels are compiled through the vulkan-spirv compiler backend
+# and loaded as SPIR-V ("SPVE") executables.
+iree_hal_cts_test_suite(
+  DRIVER_NAME
+    vulkan
+  DRIVER_REGISTRATION_HDR
+    "runtime/src/iree/hal/vulkan/registration/driver_module.h"
+  DRIVER_REGISTRATION_FN
+    "iree_hal_vulkan_driver_module_register"
+  COMPILER_TARGET_BACKEND
+    "vulkan-spirv"
+  EXECUTABLE_FORMAT
+    "\"SPVE\""
+  DEPS
+    iree::hal::vulkan::registration
+  EXCLUDED_TESTS
+    # Non-push descriptor sets are not implemented in the Vulkan backend yet.
+    "descriptor_set"
+)
diff --git a/runtime/src/iree/hal/vulkan/debug_reporter.cc b/runtime/src/iree/hal/vulkan/debug_reporter.cc
new file mode 100644
index 0000000..6777596
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/debug_reporter.cc
@@ -0,0 +1,127 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/debug_reporter.h"
+
+#include <cstddef>
+#include <ostream>
+
+#include "iree/base/logging.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/status_util.h"
+
+// Internal state for a registered debug reporter: the owning instance, the
+// dynamically-resolved symbols used to create/destroy the messenger, and the
+// VK_EXT_debug_utils messenger handle itself.
+struct iree_hal_vulkan_debug_reporter_t {
+  iree_allocator_t host_allocator;
+  VkInstance instance;
+  iree::hal::vulkan::DynamicSymbols* syms;  // unowned
+  const VkAllocationCallbacks* allocation_callbacks;  // optional per Vulkan
+  VkDebugUtilsMessengerEXT messenger;
+};
+
+// NOTE: |user_data| may be nullptr if we are being called during instance
+// creation. Otherwise it is a pointer to the DebugReporter instance.
+//
+// NOTE: this callback must be thread safe and must be careful not to reach too
+// far outside of the call - it is called in-context from arbitrary threads with
+// some amount of Vulkan state on the stack. Assume that creating or deleting
+// Vulkan objects, issuing most Vulkan commands, etc are off-limits.
+static VKAPI_ATTR VkBool32 VKAPI_CALL
+iree_hal_vulkan_debug_utils_message_callback(
+    VkDebugUtilsMessageSeverityFlagBitsEXT message_severity,
+    VkDebugUtilsMessageTypeFlagsEXT message_type,
+    const VkDebugUtilsMessengerCallbackDataEXT* callback_data,
+    void* user_data) {
+  // Route error-severity messages into the error log; everything else goes
+  // to verbose logging so it can be filtered out in normal runs.
+  const char* message_text = callback_data->pMessage;
+  const bool is_error =
+      (message_severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) != 0;
+  if (is_error) {
+    IREE_LOG(ERROR) << message_text;
+  } else {
+    IREE_VLOG(1) << message_text;
+  }
+  // VK_TRUE is reserved for future use; always return VK_FALSE.
+  return VK_FALSE;
+}
+
+// Populates |create_info| with an instance-agnostic callback.
+// This can be used during instance creation by chaining the |create_info| to
+// VkInstanceCreateInfo::pNext.
+//
+// Only use if VK_EXT_debug_utils is present.
+static void iree_hal_vulkan_debug_reporter_populate_create_info(
+    VkDebugUtilsMessengerCreateInfoEXT* out_create_info) {
+  // Build locally and copy out in one shot; every field is written so the
+  // output is fully initialized.
+  VkDebugUtilsMessengerCreateInfoEXT info;
+  info.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
+  info.pNext = nullptr;
+  info.flags = 0;
+
+  // TODO(benvanik): only enable the severities that logging has enabled.
+  info.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT |
+                         VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT |
+                         VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT |
+                         VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
+
+  // TODO(benvanik): allow filtering by category as a flag.
+  info.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT |
+                     VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT |
+                     VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
+
+  info.pfnUserCallback = iree_hal_vulkan_debug_utils_message_callback;
+  info.pUserData = nullptr;
+
+  *out_create_info = info;
+}
+
+// Creates a VK_EXT_debug_utils messenger on |instance| that routes Vulkan
+// debug messages into IREE logging. On failure the partially-initialized
+// reporter is freed and |out_reporter| is left unset.
+iree_status_t iree_hal_vulkan_debug_reporter_allocate(
+    VkInstance instance, iree::hal::vulkan::DynamicSymbols* syms,
+    const VkAllocationCallbacks* allocation_callbacks,
+    iree_allocator_t host_allocator,
+    iree_hal_vulkan_debug_reporter_t** out_reporter) {
+  IREE_ASSERT_ARGUMENT(instance);
+  IREE_ASSERT_ARGUMENT(syms);
+  IREE_ASSERT_ARGUMENT(out_reporter);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate our struct first as we need to pass the pointer to the userdata
+  // of the messager instance when we create it.
+  iree_hal_vulkan_debug_reporter_t* reporter = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, sizeof(*reporter),
+                                (void**)&reporter));
+  reporter->host_allocator = host_allocator;
+  reporter->instance = instance;
+  reporter->syms = syms;
+  reporter->allocation_callbacks = allocation_callbacks;
+
+  VkDebugUtilsMessengerCreateInfoEXT create_info;
+  iree_hal_vulkan_debug_reporter_populate_create_info(&create_info);
+  // Unlike the instance-creation-time chaining path we have a reporter here
+  // to hand to the callback as user data.
+  create_info.pUserData = reporter;
+  iree_status_t status = VK_RESULT_TO_STATUS(
+      syms->vkCreateDebugUtilsMessengerEXT(
+          instance, &create_info, allocation_callbacks, &reporter->messenger),
+      "vkCreateDebugUtilsMessengerEXT");
+
+  if (iree_status_is_ok(status)) {
+    *out_reporter = reporter;
+  } else {
+    iree_hal_vulkan_debug_reporter_free(reporter);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys the messenger (if it was successfully created) and releases
+// |reporter|. Safe to call with NULL.
+// NOTE(review): the VK_NULL_HANDLE check assumes |messenger| was
+// zero-initialized when creation failed - confirm iree_allocator_malloc
+// zeroes allocations.
+void iree_hal_vulkan_debug_reporter_free(
+    iree_hal_vulkan_debug_reporter_t* reporter) {
+  if (!reporter) return;
+  iree_allocator_t host_allocator = reporter->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  if (reporter->messenger != VK_NULL_HANDLE) {
+    reporter->syms->vkDestroyDebugUtilsMessengerEXT(
+        reporter->instance, reporter->messenger,
+        reporter->allocation_callbacks);
+  }
+  iree_allocator_free(host_allocator, reporter);
+
+  IREE_TRACE_ZONE_END(z0);
+}
diff --git a/runtime/src/iree/hal/vulkan/debug_reporter.h b/runtime/src/iree/hal/vulkan/debug_reporter.h
new file mode 100644
index 0000000..5ddf85c
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/debug_reporter.h
@@ -0,0 +1,36 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_DEBUG_REPORTER_H_
+#define IREE_HAL_VULKAN_DEBUG_REPORTER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+
+// A debug reporter that works with the VK_EXT_debug_utils extension.
+// One reporter should be created per VkInstance to receive callbacks from the
+// API and route them to our logging systems.
+//
+// Since creating a reporter requires a VkInstance it's not possible to report
+// on messages during instance creation. To work around this it's possible to
+// pass a *CreateInfo struct to vkCreateInstance as part of the
+// VkInstanceCreateInfo::pNext chain. The callback will only be used this way
+// during the creation call after which users can create the real
+// instance-specific reporter.
+typedef struct iree_hal_vulkan_debug_reporter_t
+    iree_hal_vulkan_debug_reporter_t;
+
+// Creates a debug reporter for |instance| and registers its debug-utils
+// messenger with the Vulkan API. |syms| and |allocation_callbacks| are
+// unowned and must remain valid for the reporter's lifetime.
+iree_status_t iree_hal_vulkan_debug_reporter_allocate(
+    VkInstance instance, iree::hal::vulkan::DynamicSymbols* syms,
+    const VkAllocationCallbacks* allocation_callbacks,
+    iree_allocator_t host_allocator,
+    iree_hal_vulkan_debug_reporter_t** out_reporter);
+
+// Unregisters the messenger (if created) and frees |reporter|; NULL is a
+// no-op.
+void iree_hal_vulkan_debug_reporter_free(
+    iree_hal_vulkan_debug_reporter_t* reporter);
+
+#endif  // IREE_HAL_VULKAN_DEBUG_REPORTER_H_
diff --git a/runtime/src/iree/hal/vulkan/descriptor_pool_cache.cc b/runtime/src/iree/hal/vulkan/descriptor_pool_cache.cc
new file mode 100644
index 0000000..a62d4d1
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/descriptor_pool_cache.cc
@@ -0,0 +1,102 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/descriptor_pool_cache.h"
+
+#include <array>
+#include <cstdint>
+#include <ostream>
+
+#include "iree/base/logging.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+// Upper bound on descriptor sets a single pool can serve before callers must
+// acquire another pool.
+// TODO(benvanik): be more conservative with descriptor set count or allow
+// chaining in the command buffer when pools run out.
+static constexpr int kMaxDescriptorSets = 4096;
+
+}  // namespace
+
+// Aborts if the group still owns pools: Reset() must be called (and its
+// status handled) before destruction so release failures are observable.
+DescriptorSetGroup::~DescriptorSetGroup() {
+  IREE_CHECK(descriptor_pools_.empty())
+      << "DescriptorSetGroup must be reset explicitly";
+}
+
+// Returns all pools held by the group to the owning cache and empties the
+// group. Safe to call on a default-constructed (cache-less) group.
+iree_status_t DescriptorSetGroup::Reset() {
+  IREE_TRACE_SCOPE0("DescriptorSetGroup::Reset");
+
+  // A default-constructed group has no cache and nothing to release.
+  DescriptorPoolCache* cache = descriptor_pool_cache_;
+  if (cache) {
+    IREE_RETURN_IF_ERROR(cache->ReleaseDescriptorPools(descriptor_pools_));
+  }
+  descriptor_pools_.clear();
+  return iree_ok_status();
+}
+
+// |logical_device| is unowned and must outlive the cache.
+DescriptorPoolCache::DescriptorPoolCache(VkDeviceHandle* logical_device)
+    : logical_device_(logical_device) {}
+
+// Creates a descriptor pool sized for kMaxDescriptorSets sets of
+// |descriptor_type| descriptors with up to |max_descriptor_count| per set.
+// NOTE(review): despite the "cache" name this currently always creates a new
+// pool; the TODO below tracks actual reuse.
+iree_status_t DescriptorPoolCache::AcquireDescriptorPool(
+    VkDescriptorType descriptor_type, int max_descriptor_count,
+    DescriptorPool* out_descriptor_pool) {
+  IREE_TRACE_SCOPE0("DescriptorPoolCache::AcquireDescriptorPool");
+
+  // TODO(benvanik): lookup in cache.
+
+  VkDescriptorPoolCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+  create_info.pNext = nullptr;
+  create_info.flags = 0;
+  create_info.maxSets = kMaxDescriptorSets;
+  // Single pool-size entry: each pool serves exactly one descriptor type.
+  std::array<VkDescriptorPoolSize, 1> pool_sizes;
+  pool_sizes[0].type = descriptor_type;
+  pool_sizes[0].descriptorCount = max_descriptor_count * create_info.maxSets;
+  create_info.poolSizeCount = static_cast<uint32_t>(pool_sizes.size());
+  create_info.pPoolSizes = pool_sizes.data();
+
+  DescriptorPool descriptor_pool;
+  descriptor_pool.descriptor_type = descriptor_type;
+  descriptor_pool.handle = VK_NULL_HANDLE;
+
+  VK_RETURN_IF_ERROR(syms().vkCreateDescriptorPool(
+                         *logical_device_, &create_info,
+                         logical_device_->allocator(), &descriptor_pool.handle),
+                     "vkCreateDescriptorPool");
+
+  *out_descriptor_pool = descriptor_pool;
+  return iree_ok_status();
+}
+
+// Resets each pool (recycling all sets allocated from it) and then destroys
+// it.
+// NOTE(review): a vkResetDescriptorPool failure returns early and the current
+// and remaining pool handles are never destroyed - confirm acceptable.
+iree_status_t DescriptorPoolCache::ReleaseDescriptorPools(
+    const std::vector<DescriptorPool>& descriptor_pools) {
+  IREE_TRACE_SCOPE0("DescriptorPoolCache::ReleaseDescriptorPools");
+
+  for (const auto& descriptor_pool : descriptor_pools) {
+    // Always reset immediately. We could do this on allocation instead however
+    // this leads to better errors when using the validation layers as we'll
+    // throw if there are in-flight command buffers using the sets in the pool.
+    VK_RETURN_IF_ERROR(syms().vkResetDescriptorPool(*logical_device_,
+                                                    descriptor_pool.handle, 0),
+                       "vkResetDescriptorPool");
+
+    // TODO(benvanik): release to cache.
+    syms().vkDestroyDescriptorPool(*logical_device_, descriptor_pool.handle,
+                                   logical_device_->allocator());
+  }
+
+  return iree_ok_status();
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/descriptor_pool_cache.h b/runtime/src/iree/hal/vulkan/descriptor_pool_cache.h
new file mode 100644
index 0000000..9e4259e
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/descriptor_pool_cache.h
@@ -0,0 +1,97 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_DESCRIPTOR_POOL_CACHE_H_
+#define IREE_HAL_VULKAN_DESCRIPTOR_POOL_CACHE_H_
+
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+class DescriptorPoolCache;
+
+// A descriptor pool with a single descriptor type of some number.
+// We only support a single descriptor type for now as we only generate SPIR-V
+// that uses a single type.
+// Plain value type copied freely between the cache and set groups; it has no
+// destructor and does not itself manage the VkDescriptorPool lifetime.
+struct DescriptorPool {
+  // Type of the descriptor in the set.
+  VkDescriptorType descriptor_type = VK_DESCRIPTOR_TYPE_MAX_ENUM;
+  // Pool handle.
+  VkDescriptorPool handle = VK_NULL_HANDLE;
+};
+
+// A group of descriptor sets allocated and released together.
+// The group must be explicitly reset with Reset() prior to disposing.
+class DescriptorSetGroup final {
+ public:
+  DescriptorSetGroup() = default;
+  DescriptorSetGroup(DescriptorPoolCache* descriptor_pool_cache,
+                     std::vector<DescriptorPool> descriptor_pools)
+      : descriptor_pool_cache_(descriptor_pool_cache),
+        descriptor_pools_(std::move(descriptor_pools)) {}
+  // Move-only: a group uniquely owns its pools until Reset().
+  DescriptorSetGroup(const DescriptorSetGroup&) = delete;
+  DescriptorSetGroup& operator=(const DescriptorSetGroup&) = delete;
+  DescriptorSetGroup(DescriptorSetGroup&& other) noexcept
+      : descriptor_pool_cache_(std::move(other.descriptor_pool_cache_)),
+        descriptor_pools_(std::move(other.descriptor_pools_)) {}
+  // NOTE(review): move assignment swaps rather than releasing, so the
+  // moved-from group takes ownership of this group's previous pools and must
+  // itself be Reset() before destruction - confirm intended.
+  DescriptorSetGroup& operator=(DescriptorSetGroup&& other) {
+    std::swap(descriptor_pool_cache_, other.descriptor_pool_cache_);
+    std::swap(descriptor_pools_, other.descriptor_pools_);
+    return *this;
+  }
+  ~DescriptorSetGroup();
+
+  // Returns all held pools to the cache; must be called before destruction.
+  iree_status_t Reset();
+
+ private:
+  DescriptorPoolCache* descriptor_pool_cache_;
+  std::vector<DescriptorPool> descriptor_pools_;
+};
+
+// A "cache" (or really, pool) of descriptor pools. These pools are allocated
+// as needed to satisfy different descriptor size requirements and are given
+// to command buffers during recording to write descriptor updates and bind
+// resources. After the descriptors in the pool are no longer used (all
+// command buffers using descriptor sets allocated from the pool have retired)
+// the pool is returned here to be reused in the future.
+class DescriptorPoolCache final {
+ public:
+  explicit DescriptorPoolCache(VkDeviceHandle* logical_device);
+
+  // The (unowned) logical device pools are created on.
+  VkDeviceHandle* logical_device() const { return logical_device_; }
+  const DynamicSymbols& syms() const { return *logical_device_->syms(); }
+
+  // Acquires a new descriptor pool for use by the caller.
+  // The pool will have been reset and have all descriptor sets available.
+  // When all sets allocated from the pool are no longer in use it must be
+  // returned to the cache with ReleaseDescriptorPool.
+  iree_status_t AcquireDescriptorPool(VkDescriptorType descriptor_type,
+                                      int max_descriptor_count,
+                                      DescriptorPool* out_descriptor_pool);
+
+  // Releases descriptor pools back to the cache. The pools will be reset
+  // immediately and must no longer be in use by any in-flight command.
+  iree_status_t ReleaseDescriptorPools(
+      const std::vector<DescriptorPool>& descriptor_pools);
+
+ private:
+  // Unowned; must outlive the cache.
+  VkDeviceHandle* logical_device_;
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_DESCRIPTOR_POOL_CACHE_H_
diff --git a/runtime/src/iree/hal/vulkan/descriptor_set_arena.cc b/runtime/src/iree/hal/vulkan/descriptor_set_arena.cc
new file mode 100644
index 0000000..cefa6bc
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/descriptor_set_arena.cc
@@ -0,0 +1,259 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/descriptor_set_arena.h"
+
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+
+#include "iree/base/internal/math.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/extensibility_util.h"
+#include "iree/hal/vulkan/native_descriptor_set_layout.h"
+#include "iree/hal/vulkan/native_executable_layout.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/vma_buffer.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+// Builds one VkWriteDescriptorSet per binding describing the storage-buffer
+// updates for |dst_set|, allocating the transient structs from |arena|.
+// The returned |out_infos| pointers remain valid only until the arena is next
+// Reset().
+static void PopulateDescriptorSetWriteInfos(
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings, VkDescriptorSet dst_set,
+    Arena* arena, iree_host_size_t* out_info_count,
+    VkWriteDescriptorSet** out_infos) {
+  // NOTE: resetting invalidates spans handed out by previous calls using the
+  // same arena.
+  arena->Reset();
+  auto buffer_infos =
+      arena->AllocateSpan<VkDescriptorBufferInfo>(binding_count);
+  auto write_infos = arena->AllocateSpan<VkWriteDescriptorSet>(binding_count);
+
+  // Use the unsigned host-size type for the index to match |binding_count|
+  // and avoid a signed/unsigned comparison (previously `int i`).
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    const auto& binding = bindings[i];
+
+    auto& buffer_info = buffer_infos[i];
+    buffer_info.buffer = iree_hal_vulkan_vma_buffer_handle(
+        iree_hal_buffer_allocated_buffer(binding.buffer));
+    buffer_info.offset =
+        iree_hal_buffer_byte_offset(binding.buffer) + binding.offset;
+    if (binding.length == IREE_WHOLE_BUFFER) {
+      buffer_info.range = VK_WHOLE_SIZE;
+    } else {
+      // Round up to a multiple of 32-bit. 32-bit is the most native bitwidth on
+      // GPUs; it has the best support compared to other bitwidths. We use VMA
+      // to manage GPU memory for us and VMA should already handled proper
+      // alignment when performing allocations; here we just need to provide the
+      // proper "view" to Vulkan drivers over the allocated memory.
+      //
+      // Note this is needed because we can see unusal buffers like
+      // tensor<3xi8>. Depending on GPU capabilities, this might not always be
+      // directly supported by the hardware. Under such circumstances, we need
+      // to emulate i8 support with i32. Shader CodeGen takes care of that: the
+      // shader will read the buffer as tensor<i32> and perform bit shifts to
+      // extract each byte and conduct computations. The extra additional byte
+      // is read but not really used by the shader. Here in application we need
+      // to match the ABI and provide the buffer as 32-bit aligned, otherwise
+      // the whole read by the shader is considered as out of bounds per the
+      // Vulkan spec. See
+      // https://github.com/google/iree/issues/2022#issuecomment-640617234 for
+      // more details.
+      buffer_info.range = iree_device_align(
+          std::min(binding.length, iree_hal_buffer_byte_length(binding.buffer) -
+                                       binding.offset),
+          4);
+    }
+
+    auto& write_info = write_infos[i];
+    write_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    write_info.pNext = nullptr;
+    write_info.dstSet = dst_set;
+    write_info.dstBinding = binding.binding;
+    write_info.dstArrayElement = 0;
+    write_info.descriptorCount = 1;
+    write_info.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    write_info.pImageInfo = nullptr;
+    write_info.pBufferInfo = &buffer_info;
+    write_info.pTexelBufferView = nullptr;
+  }
+
+  *out_info_count = write_infos.size();
+  *out_infos = write_infos.data();
+}
+
+// Builds a VkDescriptorSetAllocateInfo requesting a single set of
+// |set_layout| from |descriptor_pool|.
+// WARNING(review): the returned struct's pSetLayouts points at the local
+// |set_layout_handle| and dangles once this function returns; callers must
+// not use the returned value. This helper appears unused in this file -
+// confirm before relying on it.
+static VkDescriptorSetAllocateInfo PopulateDescriptorSetsAllocateInfo(
+    const DescriptorPool& descriptor_pool,
+    iree_hal_descriptor_set_layout_t* set_layout) {
+  VkDescriptorSetAllocateInfo allocate_info;
+  allocate_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+  allocate_info.pNext = nullptr;
+  allocate_info.descriptorPool = descriptor_pool.handle;
+
+  VkDescriptorSetLayout set_layout_handle =
+      iree_hal_vulkan_native_descriptor_set_layout_handle(set_layout);
+  allocate_info.descriptorSetCount = 1;
+  allocate_info.pSetLayouts = &set_layout_handle;
+
+  return allocate_info;
+}
+
+}  // namespace
+
+// |descriptor_pool_cache| (and its logical device) are unowned and must
+// outlive the arena.
+DescriptorSetArena::DescriptorSetArena(
+    DescriptorPoolCache* descriptor_pool_cache)
+    : logical_device_(descriptor_pool_cache->logical_device()),
+      descriptor_pool_cache_(descriptor_pool_cache) {}
+
+// Returns any pools still held (i.e. Flush() was never called) back to the
+// cache; release failures are ignored as destructors cannot propagate status.
+DescriptorSetArena::~DescriptorSetArena() {
+  if (!used_descriptor_pools_.empty()) {
+    iree_status_ignore(
+        descriptor_pool_cache_->ReleaseDescriptorPools(used_descriptor_pools_));
+    used_descriptor_pools_.clear();
+  }
+}
+
+// Allocates a descriptor set for |bindings|, writes the buffer bindings into
+// it, and binds it to |command_buffer| at index |set|. Uses push descriptors
+// when the device supports them; otherwise allocates from pow2-bucketed
+// descriptor pools, retrying once with a fresh pool on exhaustion.
+iree_status_t DescriptorSetArena::BindDescriptorSet(
+    VkCommandBuffer command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  // Always prefer using push descriptors when available as we can avoid the
+  // additional API overhead of updating/resetting pools.
+  if (logical_device_->enabled_extensions().push_descriptors) {
+    PushDescriptorSet(command_buffer, executable_layout, set, binding_count,
+                      bindings);
+    return iree_ok_status();
+  }
+
+  IREE_TRACE_SCOPE0("DescriptorSetArena::BindDescriptorSet");
+
+  auto* set_layout =
+      iree_hal_vulkan_native_executable_layout_set(executable_layout, set);
+
+  // Pick a bucket based on the number of descriptors required.
+  // NOTE: right now we are 1:1 with bindings.
+  uint32_t required_descriptor_count = static_cast<int>(binding_count * 1);
+  uint32_t max_descriptor_count =
+      std::max(8u, iree_math_round_up_to_pow2_u32(required_descriptor_count));
+  // Buckets are keyed by pow2 capacity starting at 8 (bucket 0 = 8,
+  // bucket 1 = 16, ...).
+  uint32_t bucket =
+      iree_math_count_trailing_zeros_u32(max_descriptor_count >> 3);
+  if (bucket >= descriptor_pool_buckets_.size()) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "too many descriptors required: %u (max=%u)",
+                            required_descriptor_count,
+                            (1 << (descriptor_pool_buckets_.size() + 3)));
+  }
+  if (descriptor_pool_buckets_[bucket].handle == VK_NULL_HANDLE) {
+    // Acquire a pool for this max_descriptor_count bucket.
+    IREE_RETURN_IF_ERROR(descriptor_pool_cache_->AcquireDescriptorPool(
+        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, max_descriptor_count,
+        &descriptor_pool_buckets_[bucket]));
+    used_descriptor_pools_.push_back(descriptor_pool_buckets_[bucket]);
+  }
+
+  VkDescriptorSetAllocateInfo allocate_info;
+  allocate_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+  allocate_info.pNext = nullptr;
+  allocate_info.descriptorPool = descriptor_pool_buckets_[bucket].handle;
+  VkDescriptorSetLayout set_layout_handle =
+      iree_hal_vulkan_native_descriptor_set_layout_handle(set_layout);
+  allocate_info.descriptorSetCount = 1;
+  allocate_info.pSetLayouts = &set_layout_handle;
+
+  VkDescriptorSet descriptor_set = VK_NULL_HANDLE;
+  VkResult result = syms().vkAllocateDescriptorSets(
+      *logical_device_, &allocate_info, &descriptor_set);
+  if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
+    // Allocation failed because the pool is either out of descriptors or too
+    // fragmented. Acquire a fresh pool for this bucket and retry once; only
+    // the pool handle in allocate_info needs refreshing (previously this path
+    // rebuilt a shadowing duplicate of the entire struct).
+    IREE_RETURN_IF_ERROR(descriptor_pool_cache_->AcquireDescriptorPool(
+        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, max_descriptor_count,
+        &descriptor_pool_buckets_[bucket]));
+    used_descriptor_pools_.push_back(descriptor_pool_buckets_[bucket]);
+
+    allocate_info.descriptorPool = descriptor_pool_buckets_[bucket].handle;
+    descriptor_set = VK_NULL_HANDLE;
+    VK_RETURN_IF_ERROR(syms().vkAllocateDescriptorSets(
+                           *logical_device_, &allocate_info, &descriptor_set),
+                       "vkAllocateDescriptorSets");
+  }
+
+  // Get a list of VkWriteDescriptorSet structs with all bound buffers.
+  iree_host_size_t write_info_count = 0;
+  VkWriteDescriptorSet* write_infos = NULL;
+  PopulateDescriptorSetWriteInfos(binding_count, bindings, descriptor_set,
+                                  &scratch_arena_, &write_info_count,
+                                  &write_infos);
+
+  // This is the reason why push descriptor sets are good.
+  // We can't batch these effectively as we don't know prior to recording what
+  // descriptor sets we will need and what buffers they will point to (without
+  // doing just as much work as actually recording the buffer to try to find
+  // out).
+  syms().vkUpdateDescriptorSets(*logical_device_,
+                                static_cast<uint32_t>(write_info_count),
+                                write_infos, 0, nullptr);
+
+  // Bind the descriptor set.
+  syms().vkCmdBindDescriptorSets(
+      command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
+      iree_hal_vulkan_native_executable_layout_handle(executable_layout), set,
+      1, &descriptor_set, 0, nullptr);
+
+  return iree_ok_status();
+}
+
+// Records a vkCmdPushDescriptorSetKHR carrying |bindings| directly into the
+// command buffer. Requires the push descriptor extension; the caller
+// (BindDescriptorSet) checks availability before dispatching here.
+void DescriptorSetArena::PushDescriptorSet(
+    VkCommandBuffer command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  IREE_TRACE_SCOPE0("DescriptorSetArena::PushDescriptorSet");
+  VkPipelineLayout device_executable_layout =
+      iree_hal_vulkan_native_executable_layout_handle(executable_layout);
+
+  // Get a list of VkWriteDescriptorSet structs with all bound buffers.
+  iree_host_size_t write_info_count = 0;
+  VkWriteDescriptorSet* write_infos = NULL;
+  PopulateDescriptorSetWriteInfos(binding_count, bindings, VK_NULL_HANDLE,
+                                  &scratch_arena_, &write_info_count,
+                                  &write_infos);
+
+  // Fast path using push descriptors. These are pooled internally by the
+  // command buffer and prevent the need for our own pooling mechanisms.
+  syms().vkCmdPushDescriptorSetKHR(
+      command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device_executable_layout,
+      set, static_cast<uint32_t>(write_info_count), write_infos);
+}
+
+// Transfers ownership of all pools used since the last Flush() into a
+// DescriptorSetGroup; the group must be Reset() once the descriptor sets are
+// no longer in use so the pools return to the cache.
+DescriptorSetGroup DescriptorSetArena::Flush() {
+  IREE_TRACE_SCOPE0("DescriptorSetArena::Flush");
+
+  if (used_descriptor_pools_.empty()) {
+    // No resources to free.
+    return DescriptorSetGroup{};
+  }
+
+  // Forget the per-bucket handles so subsequent binds acquire fresh pools.
+  for (auto& bucket : descriptor_pool_buckets_) {
+    bucket = {};
+  }
+  return DescriptorSetGroup(descriptor_pool_cache_,
+                            std::move(used_descriptor_pools_));
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/descriptor_set_arena.h b/runtime/src/iree/hal/vulkan/descriptor_set_arena.h
new file mode 100644
index 0000000..4805a2c
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/descriptor_set_arena.h
@@ -0,0 +1,76 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_DESCRIPTOR_SET_ARENA_H_
+#define IREE_HAL_VULKAN_DESCRIPTOR_SET_ARENA_H_
+
+#include <stdint.h>
+
+#include <array>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/base/status_cc.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/descriptor_pool_cache.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/native_executable.h"
+#include "iree/hal/vulkan/util/arena.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// A reusable arena for allocating descriptor sets and batching updates.
+// NOTE(review): no internal synchronization is visible here; presumably used
+// from a single recording thread at a time - confirm with callers.
+class DescriptorSetArena final {
+ public:
+  explicit DescriptorSetArena(DescriptorPoolCache* descriptor_pool_cache);
+  ~DescriptorSetArena();
+
+  // Allocates and binds a descriptor set from the arena.
+  // The command buffer will have the descriptor set containing |bindings| bound
+  // to it.
+  iree_status_t BindDescriptorSet(
+      VkCommandBuffer command_buffer,
+      iree_hal_executable_layout_t* executable_layout, uint32_t set,
+      iree_host_size_t binding_count,
+      const iree_hal_descriptor_set_binding_t* bindings);
+
+  // Flushes all pending writes to descriptor sets allocated from the arena and
+  // returns a group that - when dropped - will release the descriptor sets
+  // back to the pools they were allocated from. The group must therefore be
+  // kept live while any submission referencing the sets is in flight.
+  DescriptorSetGroup Flush();
+
+ private:
+  // Dynamically-resolved Vulkan API symbols of the logical device.
+  const DynamicSymbols& syms() const { return *logical_device_->syms(); }
+
+  // Pushes the descriptor set to the command buffer, if supported.
+  void PushDescriptorSet(VkCommandBuffer command_buffer,
+                         iree_hal_executable_layout_t* executable_layout,
+                         uint32_t set, iree_host_size_t binding_count,
+                         const iree_hal_descriptor_set_binding_t* bindings);
+
+  // Unowned device used for symbol resolution and pool allocation.
+  VkDeviceHandle* logical_device_;
+  DescriptorPoolCache* descriptor_pool_cache_;
+
+  // Arena used for temporary binding information used during allocation.
+  Arena scratch_arena_;
+
+  // A list of pools acquired on demand as different descriptor counts are
+  // needed. Allocation granularity is max_descriptor_count=[8, 16, 32, 64].
+  std::array<DescriptorPool, 4> descriptor_pool_buckets_;
+
+  // All pools that have been used during allocation.
+  std::vector<DescriptorPool> used_descriptor_pools_;
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_DESCRIPTOR_SET_ARENA_H_
diff --git a/runtime/src/iree/hal/vulkan/direct_command_buffer.cc b/runtime/src/iree/hal/vulkan/direct_command_buffer.cc
new file mode 100644
index 0000000..515bbc0
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/direct_command_buffer.cc
@@ -0,0 +1,856 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/direct_command_buffer.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/inline_array.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/logging.h"
+#include "iree/base/status_cc.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/utils/resource_set.h"
+#include "iree/hal/vulkan/descriptor_set_arena.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/native_descriptor_set.h"
+#include "iree/hal/vulkan/native_event.h"
+#include "iree/hal/vulkan/native_executable.h"
+#include "iree/hal/vulkan/native_executable_layout.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+#include "iree/hal/vulkan/vma_buffer.h"
+
+using namespace iree::hal::vulkan;
+
+// Command buffer implementation that directly maps to VkCommandBuffer.
+// This records the commands on the calling thread without additional threading
+// indirection.
+// Command buffer implementation that directly maps to VkCommandBuffer.
+// This records the commands on the calling thread without additional threading
+// indirection.
+typedef struct iree_hal_vulkan_direct_command_buffer_t {
+  iree_hal_command_buffer_t base;
+  // Device that owns the native command buffer; also provides the host
+  // allocator and the dynamic symbol table.
+  VkDeviceHandle* logical_device;
+  iree_hal_vulkan_tracing_context_t* tracing_context;
+  // Block pool backing the resource set allocations.
+  iree_arena_block_pool_t* block_pool;
+
+  // Pool the native |handle| was allocated from; the handle is returned to it
+  // on destroy.
+  VkCommandPoolHandle* command_pool;
+  VkCommandBuffer handle;
+
+  // Unowned; resolved from |logical_device| at allocation time.
+  DynamicSymbols* syms;
+
+  // Maintains a reference to all resources used within the command buffer.
+  // Reset on each begin.
+  iree_hal_resource_set_t* resource_set;
+
+  // TODO(benvanik): may grow large - should try to reclaim or reuse.
+  DescriptorSetArena descriptor_set_arena;
+
+  // The current descriptor set group in use by the command buffer, if any.
+  // This must remain valid until all in-flight submissions of the command
+  // buffer complete.
+  DescriptorSetGroup descriptor_set_group;
+
+  BuiltinExecutables* builtin_executables;
+
+  // Shadow copy of push constants used during normal operation, for restoring
+  // after builtin_executables uses vkCmdPushConstants. Size must be greater
+  // than or equal to the push constant memory used by builtin_executables.
+  // TODO(scotttodd): use [maxPushConstantsSize - 16, maxPushConstantsSize]
+  //                  instead of [0, 16] to reduce frequency of updates
+  uint8_t push_constants_storage[IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT];
+} iree_hal_vulkan_direct_command_buffer_t;
+
+namespace {
+extern const iree_hal_command_buffer_vtable_t
+    iree_hal_vulkan_direct_command_buffer_vtable;
+}  // namespace
+
+// Downcasts the base command buffer to the Vulkan implementation type.
+// Type-checked in debug builds only; the cast itself is unconditional.
+static iree_hal_vulkan_direct_command_buffer_t*
+iree_hal_vulkan_direct_command_buffer_cast(
+    iree_hal_command_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value,
+                       &iree_hal_vulkan_direct_command_buffer_vtable);
+  return reinterpret_cast<iree_hal_vulkan_direct_command_buffer_t*>(
+      base_value);
+}
+
+// Allocates a native command buffer from |command_pool| and wraps it in an
+// iree_hal_command_buffer_t. On success the caller owns |out_command_buffer|;
+// destroying it returns the native handle to the pool.
+iree_status_t iree_hal_vulkan_direct_command_buffer_allocate(
+    iree_hal_device_t* device,
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree::hal::vulkan::VkCommandPoolHandle* command_pool,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_vulkan_tracing_context_t* tracing_context,
+    iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
+    iree::hal::vulkan::BuiltinExecutables* builtin_executables,
+    iree_arena_block_pool_t* block_pool,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(command_pool);
+  IREE_ASSERT_ARGUMENT(descriptor_pool_cache);
+  IREE_ASSERT_ARGUMENT(block_pool);
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Acquire the native VkCommandBuffer first so that a failure here avoids
+  // the host allocation below entirely.
+  VkCommandBufferAllocateInfo allocate_info;
+  allocate_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+  allocate_info.pNext = NULL;
+  allocate_info.commandPool = *command_pool;
+  allocate_info.commandBufferCount = 1;
+  allocate_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+
+  VkCommandBuffer handle = VK_NULL_HANDLE;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, command_pool->Allocate(&allocate_info, &handle));
+
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(logical_device->host_allocator(),
+                            sizeof(*command_buffer), (void**)&command_buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_command_buffer_initialize(
+        device, mode, command_categories, queue_affinity,
+        &iree_hal_vulkan_direct_command_buffer_vtable, &command_buffer->base);
+    command_buffer->logical_device = logical_device;
+    command_buffer->tracing_context = tracing_context;
+    command_buffer->block_pool = block_pool;
+    command_buffer->command_pool = command_pool;
+    command_buffer->handle = handle;
+    command_buffer->syms = logical_device->syms().get();
+
+    // The storage came from a C allocator so the C++ members must be
+    // constructed in place; destroy() runs their destructors manually.
+    new (&command_buffer->descriptor_set_arena)
+        DescriptorSetArena(descriptor_pool_cache);
+    new (&command_buffer->descriptor_set_group) DescriptorSetGroup();
+
+    command_buffer->builtin_executables = builtin_executables;
+    status = iree_hal_resource_set_allocate(block_pool,
+                                            &command_buffer->resource_set);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_command_buffer = &command_buffer->base;
+  } else {
+    // Return the native handle to the pool on any failure.
+    // NOTE(review): if resource_set allocation failed, the malloc'ed struct
+    // is not freed here - confirm whether that (rare) leak is intentional.
+    command_pool->Free(handle);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases state retained from a prior recording: descriptor sets are
+// returned to their pools and the resource set drops its references.
+static void iree_hal_vulkan_direct_command_buffer_reset(
+    iree_hal_vulkan_direct_command_buffer_t* command_buffer) {
+  // NOTE: we require that command buffers not be recorded while they are
+  // in-flight so this is safe.
+  IREE_IGNORE_ERROR(command_buffer->descriptor_set_group.Reset());
+  iree_hal_resource_set_reset(command_buffer->resource_set);
+}
+
+// Returns true if |command_buffer| is a Vulkan direct command buffer.
+bool iree_hal_vulkan_direct_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer) {
+  // dyn_cast yields NULL for any other implementation.
+  return iree_hal_command_buffer_dyn_cast(
+             command_buffer, &iree_hal_vulkan_direct_command_buffer_vtable) !=
+         NULL;
+}
+
+// vtable-based dynamic cast: returns |command_buffer| when |vtable| is ours,
+// NULL otherwise.
+static void* iree_hal_vulkan_direct_command_buffer_dyn_cast(
+    iree_hal_command_buffer_t* command_buffer, const void* vtable) {
+  if (vtable != &iree_hal_vulkan_direct_command_buffer_vtable) return NULL;
+  IREE_HAL_ASSERT_TYPE(command_buffer, vtable);
+  return command_buffer;
+}
+
+static void iree_hal_vulkan_direct_command_buffer_destroy(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  // Capture the allocator before the struct is freed below.
+  iree_allocator_t host_allocator =
+      command_buffer->logical_device->host_allocator();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Release retained descriptor sets/resources, then return the native
+  // handle to the pool it was allocated from.
+  iree_hal_vulkan_direct_command_buffer_reset(command_buffer);
+  command_buffer->command_pool->Free(command_buffer->handle);
+
+  // These members were placement-new'ed in allocate() so their destructors
+  // must be invoked manually before the raw storage is freed.
+  command_buffer->descriptor_set_group.~DescriptorSetGroup();
+  command_buffer->descriptor_set_arena.~DescriptorSetArena();
+
+  iree_hal_resource_set_free(command_buffer->resource_set);
+  iree_allocator_free(host_allocator, command_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the native VkCommandBuffer backing |base_command_buffer|.
+// NOTE(review): dyn_cast returns NULL for foreign command buffer types and
+// this would dereference it - callers are assumed to pass the right type.
+VkCommandBuffer iree_hal_vulkan_direct_command_buffer_handle(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  void* impl = iree_hal_command_buffer_dyn_cast(
+      base_command_buffer, &iree_hal_vulkan_direct_command_buffer_vtable);
+  return ((iree_hal_vulkan_direct_command_buffer_t*)impl)->handle;
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_begin(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+
+  // Drop any state retained from a previous recording before restarting.
+  iree_hal_vulkan_direct_command_buffer_reset(command_buffer);
+
+  VkCommandBufferBeginInfo begin_info;
+  begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+  begin_info.pNext = NULL;
+  // ONE_TIME_SUBMIT hints the driver the buffer is submitted once and then
+  // reset, letting it skip replay optimizations.
+  begin_info.flags = iree_all_bits_set(command_buffer->base.mode,
+                                       IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)
+                         ? VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
+                         : 0;
+  begin_info.pInheritanceInfo = NULL;
+  VK_RETURN_IF_ERROR(command_buffer->syms->vkBeginCommandBuffer(
+                         command_buffer->handle, &begin_info),
+                     "vkBeginCommandBuffer");
+
+  // Open a tracing zone spanning the whole command buffer; closed in end().
+  IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL(
+      command_buffer->tracing_context, command_buffer->handle,
+      /*file_name=*/NULL, 0,
+      /*line=*/0, /*func_name=*/NULL, 0,
+      "iree_hal_vulkan_direct_command_buffer",
+      strlen("iree_hal_vulkan_direct_command_buffer"));
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_end(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+
+  // Close the tracing zone opened in begin() while recording is still active.
+  IREE_VULKAN_TRACE_ZONE_END(command_buffer->tracing_context,
+                             command_buffer->handle);
+
+  VK_RETURN_IF_ERROR(
+      command_buffer->syms->vkEndCommandBuffer(command_buffer->handle),
+      "vkEndCommandBuffer");
+
+  // Flush all pending descriptor set writes (if any). The returned group
+  // keeps the allocated descriptor sets live until the next reset.
+  command_buffer->descriptor_set_group =
+      command_buffer->descriptor_set_arena.Flush();
+
+  return iree_ok_status();
+}
+
+static void iree_hal_vulkan_direct_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  // Mirror the debug group into the tracing timeline, with source location
+  // when the caller provided one.
+  IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL(
+      command_buffer->tracing_context, command_buffer->handle,
+      location ? location->file.data : NULL, location ? location->file.size : 0,
+      location ? location->line : 0, /*func_name=*/NULL, 0, label.data,
+      label.size);
+  // vkCmdBeginDebugUtilsLabelEXT is optional (debug utils extension); skip
+  // the label when the symbol was not resolved.
+  if (command_buffer->syms->vkCmdBeginDebugUtilsLabelEXT) {
+    // The API needs a NUL-terminated string; truncate to the local buffer.
+    char label_buffer[128];
+    snprintf(label_buffer, sizeof(label_buffer), "%.*s", (int)label.size,
+             label.data);
+    VkDebugUtilsLabelEXT label_info = {
+        /*.sType=*/VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,
+        /*.pNext=*/NULL,
+        /*.pLabelName=*/label_buffer,
+        // RGBA normalized from 8-bit channels to [0.0, 1.0].
+        /*.color=*/
+        {
+            /*r=*/label_color.r / 255.0f,
+            /*g=*/label_color.g / 255.0f,
+            /*b=*/label_color.b / 255.0f,
+            /*a=*/label_color.a / 255.0f,
+        },
+    };
+    command_buffer->syms->vkCmdBeginDebugUtilsLabelEXT(command_buffer->handle,
+                                                       &label_info);
+  }
+}
+
+static void iree_hal_vulkan_direct_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  // The debug utils label call is optional; the tracing zone is always
+  // closed regardless.
+  DynamicSymbols* device_syms = command_buffer->syms;
+  if (device_syms->vkCmdEndDebugUtilsLabelEXT != NULL) {
+    device_syms->vkCmdEndDebugUtilsLabelEXT(command_buffer->handle);
+  }
+  IREE_VULKAN_TRACE_ZONE_END(command_buffer->tracing_context,
+                             command_buffer->handle);
+}
+
+// Maps iree_hal_execution_stage_t bits onto their VkPipelineStageFlags
+// equivalents; unset stages contribute nothing.
+static VkPipelineStageFlags iree_hal_vulkan_convert_pipeline_stage_flags(
+    iree_hal_execution_stage_t stage_mask) {
+  VkPipelineStageFlags flags = 0;
+  if (iree_any_bit_set(stage_mask, IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE)) {
+    flags |= VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+  }
+  if (iree_any_bit_set(stage_mask, IREE_HAL_EXECUTION_STAGE_COMMAND_PROCESS)) {
+    flags |= VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
+  }
+  if (iree_any_bit_set(stage_mask, IREE_HAL_EXECUTION_STAGE_DISPATCH)) {
+    flags |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+  }
+  if (iree_any_bit_set(stage_mask, IREE_HAL_EXECUTION_STAGE_TRANSFER)) {
+    flags |= VK_PIPELINE_STAGE_TRANSFER_BIT;
+  }
+  if (iree_any_bit_set(stage_mask, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE)) {
+    flags |= VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+  }
+  if (iree_any_bit_set(stage_mask, IREE_HAL_EXECUTION_STAGE_HOST)) {
+    flags |= VK_PIPELINE_STAGE_HOST_BIT;
+  }
+  return flags;
+}
+
+// Maps iree_hal_access_scope_t bits onto their VkAccessFlags equivalents;
+// unset scopes contribute nothing.
+static VkAccessFlags iree_hal_vulkan_convert_access_mask(
+    iree_hal_access_scope_t access_mask) {
+  VkAccessFlags flags = 0;
+  if (iree_any_bit_set(access_mask,
+                       IREE_HAL_ACCESS_SCOPE_INDIRECT_COMMAND_READ)) {
+    flags |= VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
+  }
+  if (iree_any_bit_set(access_mask, IREE_HAL_ACCESS_SCOPE_CONSTANT_READ)) {
+    flags |= VK_ACCESS_UNIFORM_READ_BIT;
+  }
+  if (iree_any_bit_set(access_mask, IREE_HAL_ACCESS_SCOPE_DISPATCH_READ)) {
+    flags |= VK_ACCESS_SHADER_READ_BIT;
+  }
+  if (iree_any_bit_set(access_mask, IREE_HAL_ACCESS_SCOPE_DISPATCH_WRITE)) {
+    flags |= VK_ACCESS_SHADER_WRITE_BIT;
+  }
+  if (iree_any_bit_set(access_mask, IREE_HAL_ACCESS_SCOPE_TRANSFER_READ)) {
+    flags |= VK_ACCESS_TRANSFER_READ_BIT;
+  }
+  if (iree_any_bit_set(access_mask, IREE_HAL_ACCESS_SCOPE_TRANSFER_WRITE)) {
+    flags |= VK_ACCESS_TRANSFER_WRITE_BIT;
+  }
+  if (iree_any_bit_set(access_mask, IREE_HAL_ACCESS_SCOPE_HOST_READ)) {
+    flags |= VK_ACCESS_HOST_READ_BIT;
+  }
+  if (iree_any_bit_set(access_mask, IREE_HAL_ACCESS_SCOPE_HOST_WRITE)) {
+    flags |= VK_ACCESS_HOST_WRITE_BIT;
+  }
+  if (iree_any_bit_set(access_mask, IREE_HAL_ACCESS_SCOPE_MEMORY_READ)) {
+    flags |= VK_ACCESS_MEMORY_READ_BIT;
+  }
+  if (iree_any_bit_set(access_mask, IREE_HAL_ACCESS_SCOPE_MEMORY_WRITE)) {
+    flags |= VK_ACCESS_MEMORY_WRITE_BIT;
+  }
+  return flags;
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_execution_barrier(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  iree_allocator_t host_allocator =
+      command_buffer->logical_device->host_allocator();
+
+  // Convert the IREE memory barriers into VkMemoryBarrier structs in a
+  // temporary inline array (deinitialized before return).
+  // NOTE(review): loop index is int while the count is iree_host_size_t -
+  // fine for realistic barrier counts but inconsistent with file style.
+  iree_inline_array(VkMemoryBarrier, memory_barrier_infos, memory_barrier_count,
+                    host_allocator);
+  for (int i = 0; i < memory_barrier_count; ++i) {
+    const auto& memory_barrier = memory_barriers[i];
+    VkMemoryBarrier* info = iree_inline_array_at(memory_barrier_infos, i);
+    info->sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+    info->pNext = NULL;
+    info->srcAccessMask =
+        iree_hal_vulkan_convert_access_mask(memory_barrier.source_scope);
+    info->dstAccessMask =
+        iree_hal_vulkan_convert_access_mask(memory_barrier.target_scope);
+  }
+
+  // Convert the per-buffer barriers, resolving each HAL buffer to its
+  // underlying VkBuffer.
+  iree_inline_array(VkBufferMemoryBarrier, buffer_barrier_infos,
+                    buffer_barrier_count, host_allocator);
+  for (int i = 0; i < buffer_barrier_count; ++i) {
+    const auto& buffer_barrier = buffer_barriers[i];
+    VkBufferMemoryBarrier* info = iree_inline_array_at(buffer_barrier_infos, i);
+    info->sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+    info->pNext = NULL;
+    info->srcAccessMask =
+        iree_hal_vulkan_convert_access_mask(buffer_barrier.source_scope);
+    info->dstAccessMask =
+        iree_hal_vulkan_convert_access_mask(buffer_barrier.target_scope);
+    // No queue family ownership transfer.
+    info->srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    info->dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    info->buffer = iree_hal_vulkan_vma_buffer_handle(
+        iree_hal_buffer_allocated_buffer(buffer_barrier.buffer));
+    info->offset = buffer_barrier.offset;
+    info->size = buffer_barrier.length;
+  }
+
+  command_buffer->syms->vkCmdPipelineBarrier(
+      command_buffer->handle,
+      iree_hal_vulkan_convert_pipeline_stage_flags(source_stage_mask),
+      iree_hal_vulkan_convert_pipeline_stage_flags(target_stage_mask),
+      /*dependencyFlags=*/0, (uint32_t)memory_barrier_count,
+      iree_inline_array_data(memory_barrier_infos),
+      (uint32_t)buffer_barrier_count,
+      iree_inline_array_data(buffer_barrier_infos), 0, NULL);
+
+  // Safe to release the temporary arrays once the command is recorded.
+  iree_inline_array_deinitialize(memory_barrier_infos);
+  iree_inline_array_deinitialize(buffer_barrier_infos);
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_signal_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+
+  // Retain the event for the lifetime of the command buffer.
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 1, &event));
+
+  VkEvent event_handle = iree_hal_vulkan_native_event_handle(event);
+  VkPipelineStageFlags stage_flags =
+      iree_hal_vulkan_convert_pipeline_stage_flags(source_stage_mask);
+  command_buffer->syms->vkCmdSetEvent(command_buffer->handle, event_handle,
+                                      stage_flags);
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_reset_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+
+  // Retain the event for the lifetime of the command buffer.
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 1, &event));
+
+  VkEvent event_handle = iree_hal_vulkan_native_event_handle(event);
+  VkPipelineStageFlags stage_flags =
+      iree_hal_vulkan_convert_pipeline_stage_flags(source_stage_mask);
+  command_buffer->syms->vkCmdResetEvent(command_buffer->handle, event_handle,
+                                        stage_flags);
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_wait_events(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_host_size_t event_count, const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  iree_allocator_t host_allocator =
+      command_buffer->logical_device->host_allocator();
+
+  // Retain all waited-on events for the lifetime of the command buffer.
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, event_count, events));
+
+  // Gather the native VkEvent handles in a temporary inline array.
+  iree_inline_array(VkEvent, event_handles, event_count, host_allocator);
+  for (int i = 0; i < event_count; ++i) {
+    *iree_inline_array_at(event_handles, i) =
+        iree_hal_vulkan_native_event_handle(events[i]);
+  }
+
+  // Convert the IREE memory barriers into VkMemoryBarrier structs.
+  iree_inline_array(VkMemoryBarrier, memory_barrier_infos, memory_barrier_count,
+                    host_allocator);
+  for (int i = 0; i < memory_barrier_count; ++i) {
+    const auto& memory_barrier = memory_barriers[i];
+    VkMemoryBarrier* info = iree_inline_array_at(memory_barrier_infos, i);
+    info->sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+    info->pNext = NULL;
+    info->srcAccessMask =
+        iree_hal_vulkan_convert_access_mask(memory_barrier.source_scope);
+    info->dstAccessMask =
+        iree_hal_vulkan_convert_access_mask(memory_barrier.target_scope);
+  }
+
+  // Convert the per-buffer barriers, resolving each HAL buffer to its
+  // underlying VkBuffer.
+  iree_inline_array(VkBufferMemoryBarrier, buffer_barrier_infos,
+                    buffer_barrier_count, host_allocator);
+  for (int i = 0; i < buffer_barrier_count; ++i) {
+    const auto& buffer_barrier = buffer_barriers[i];
+    VkBufferMemoryBarrier* info = iree_inline_array_at(buffer_barrier_infos, i);
+    info->sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+    info->pNext = NULL;
+    info->srcAccessMask =
+        iree_hal_vulkan_convert_access_mask(buffer_barrier.source_scope);
+    info->dstAccessMask =
+        iree_hal_vulkan_convert_access_mask(buffer_barrier.target_scope);
+    // No queue family ownership transfer.
+    info->srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    info->dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    info->buffer = iree_hal_vulkan_vma_buffer_handle(
+        iree_hal_buffer_allocated_buffer(buffer_barrier.buffer));
+    info->offset = buffer_barrier.offset;
+    info->size = buffer_barrier.length;
+  }
+
+  command_buffer->syms->vkCmdWaitEvents(
+      command_buffer->handle, (uint32_t)event_count,
+      iree_inline_array_data(event_handles),
+      iree_hal_vulkan_convert_pipeline_stage_flags(source_stage_mask),
+      iree_hal_vulkan_convert_pipeline_stage_flags(target_stage_mask),
+      (uint32_t)memory_barrier_count,
+      iree_inline_array_data(memory_barrier_infos),
+      (uint32_t)buffer_barrier_count,
+      iree_inline_array_data(buffer_barrier_infos), 0, NULL);
+
+  // Safe to release the temporary arrays once the command is recorded.
+  iree_inline_array_deinitialize(event_handles);
+  iree_inline_array_deinitialize(memory_barrier_infos);
+  iree_inline_array_deinitialize(buffer_barrier_infos);
+
+  return iree_ok_status();
+}
+
+// No-op on this backend: discard is an optimization hint only.
+static iree_status_t iree_hal_vulkan_direct_command_buffer_discard_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+  // NOTE: we could use this to prevent queue family transitions.
+  return iree_ok_status();
+}
+
+// Splats a pattern value of 1, 2, or 4 bytes out to a 4 byte value by
+// repeating it across the word (multiplication by 0x01010101/0x00010001 is
+// equivalent to the shift-or chain for values that fit the element width).
+static uint32_t iree_hal_vulkan_splat_pattern(const void* pattern,
+                                              size_t pattern_length) {
+  switch (pattern_length) {
+    case 1: {
+      const uint32_t byte_value = *static_cast<const uint8_t*>(pattern);
+      return byte_value * 0x01010101u;
+    }
+    case 2: {
+      const uint32_t word_value = *static_cast<const uint16_t*>(pattern);
+      return word_value * 0x00010001u;
+    }
+    case 4:
+      return *static_cast<const uint32_t*>(pattern);
+    default:
+      // Callers validate the pattern length; unreachable in practice.
+      return 0;
+  }
+}
+
+// Fills |length| bytes at |target_offset| of |target_buffer| with a repeated
+// 1/2/4-byte |pattern|. Unaligned head/tail bytes are handled via a builtin
+// compute dispatch since vkCmdFillBuffer requires 4-byte alignment.
+static iree_status_t iree_hal_vulkan_direct_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  VkBuffer target_device_buffer = iree_hal_vulkan_vma_buffer_handle(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+
+  // Retain the target buffer for the lifetime of the command buffer.
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &target_buffer));
+
+  // vkCmdFillBuffer requires a 4 byte alignment for the offset, pattern, and
+  // length. We use a polyfill here that fills the unaligned start and end of
+  // fill operations, if needed.
+
+  if (target_offset % 4 != 0 || length % 4 != 0) {
+    // TODO(scotttodd): only restore push constants that have been modified?
+    //                  (this can pass uninitialized memory right now, which
+    //                   *should* be safe but is wasteful)
+    // The builtin dispatch clobbers push constants; it restores them from our
+    // shadow |push_constants_storage| afterward.
+    IREE_RETURN_IF_ERROR(
+        command_buffer->builtin_executables->FillBufferUnaligned(
+            command_buffer->handle, &(command_buffer->descriptor_set_arena),
+            target_buffer, target_offset, length, pattern, pattern_length,
+            command_buffer->push_constants_storage));
+
+    // Continue using vkCmdFillBuffer below, but only for the inner aligned
+    // portion of the fill operation.
+    // For example:
+    //   original offset 2, length 8
+    //   aligned  offset 4, length 4
+    // [0x00,0x00,0xAB,0xAB | 0xAB,0xAB,0xAB,0xAB | 0xAB,0xAB,0x00,0x00]
+    //            <-------> <---------------------> <------->
+    //            unaligned     vkCmdFillBuffer     unaligned
+    iree_device_size_t aligned_target_offset =
+        iree_device_align(target_offset, 4);
+    iree_device_size_t target_end = target_offset + length;
+    iree_device_size_t rounded_down_target_end = (target_end / 4) * 4;
+    // Shrink |length| by the unaligned head and tail byte counts.
+    length -= (aligned_target_offset - target_offset) +
+              (target_end - rounded_down_target_end);
+    target_offset = aligned_target_offset;
+  }
+
+  if (length > 0) {
+    // Note that vkCmdFillBuffer only accepts 4-byte aligned values so we need
+    // to splat out our variable-length pattern.
+    target_offset += iree_hal_buffer_byte_offset(target_buffer);
+    uint32_t dword_pattern =
+        iree_hal_vulkan_splat_pattern(pattern, pattern_length);
+    command_buffer->syms->vkCmdFillBuffer(command_buffer->handle,
+                                          target_device_buffer, target_offset,
+                                          length, dword_pattern);
+  }
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_update_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  VkBuffer target_device_buffer = iree_hal_vulkan_vma_buffer_handle(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+
+  // Retain the target buffer for the lifetime of the command buffer.
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &target_buffer));
+
+  // Vulkan only allows updates of <= 65536 because you really, really, really
+  // shouldn't do large updates like this (as it wastes command buffer space
+  // and may be slower than just using write-through mapped memory). The
+  // recommendation in the spec for larger updates is to split the single
+  // update into multiple updates over the entire desired range.
+  const uint8_t* source_ptr =
+      static_cast<const uint8_t*>(source_buffer) + source_offset;
+  iree_device_size_t write_offset =
+      iree_hal_buffer_byte_offset(target_buffer) + target_offset;
+  iree_device_size_t bytes_remaining = length;
+  while (bytes_remaining > 0) {
+    const iree_device_size_t chunk_length =
+        bytes_remaining < (iree_device_size_t)65536u
+            ? bytes_remaining
+            : (iree_device_size_t)65536u;
+    command_buffer->syms->vkCmdUpdateBuffer(command_buffer->handle,
+                                            target_device_buffer, write_offset,
+                                            chunk_length, source_ptr);
+    source_ptr += chunk_length;
+    write_offset += chunk_length;
+    bytes_remaining -= chunk_length;
+  }
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+
+  // Resolve the VkBuffers backing both HAL buffers.
+  VkBuffer source_device_buffer = iree_hal_vulkan_vma_buffer_handle(
+      iree_hal_buffer_allocated_buffer(source_buffer));
+  VkBuffer target_device_buffer = iree_hal_vulkan_vma_buffer_handle(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+
+  // Retain both buffers for the lifetime of the command buffer.
+  const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
+  IREE_RETURN_IF_ERROR(
+      iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
+
+  // Offsets are relative to the HAL buffers; fold in each allocation's byte
+  // offset to address the underlying device memory.
+  VkBufferCopy copy_region;
+  copy_region.srcOffset =
+      iree_hal_buffer_byte_offset(source_buffer) + source_offset;
+  copy_region.dstOffset =
+      iree_hal_buffer_byte_offset(target_buffer) + target_offset;
+  copy_region.size = length;
+  command_buffer->syms->vkCmdCopyBuffer(command_buffer->handle,
+                                        source_device_buffer,
+                                        target_device_buffer, 1, &copy_region);
+
+  return iree_ok_status();
+}
+
+// Records a push constant update and mirrors the updated range into
+// |push_constants_storage| so builtin executables can restore the constants
+// after clobbering them with their own vkCmdPushConstants calls.
+static iree_status_t iree_hal_vulkan_direct_command_buffer_push_constants(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+
+  iree_host_size_t storage_size =
+      IREE_ARRAYSIZE(command_buffer->push_constants_storage);
+  if (offset < storage_size) {
+    // Copy only the bytes that land inside the shadow storage. The previous
+    // expression `std::min(values_length, storage_size) - offset` underflowed
+    // (unsigned) whenever values_length < offset, causing an out-of-bounds
+    // memcpy; clamping the remaining capacity instead is always in-bounds.
+    memcpy(command_buffer->push_constants_storage + offset, values,
+           std::min(values_length, storage_size - offset));
+  }
+
+  // The full range is still pushed to the device even if it exceeds the
+  // shadow storage (only the builtin-clobbered prefix needs restoring).
+  command_buffer->syms->vkCmdPushConstants(
+      command_buffer->handle,
+      iree_hal_vulkan_native_executable_layout_handle(executable_layout),
+      VK_SHADER_STAGE_COMPUTE_BIT, (uint32_t)offset, (uint32_t)values_length,
+      values);
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+
+  // Retain each bound buffer until the command buffer completes.
+  // TODO(benvanik): batch insert by getting the resources in their own list.
+  for (iree_host_size_t binding = 0; binding < binding_count; ++binding) {
+    IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+        command_buffer->resource_set, 1, &bindings[binding].buffer));
+  }
+
+  // Either allocate, update, and bind a descriptor set or use push descriptor
+  // sets to use the command buffer pool when supported.
+  return command_buffer->descriptor_set_arena.BindDescriptorSet(
+      command_buffer->handle, executable_layout, set, binding_count, bindings);
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_bind_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_layout_t* executable_layout, uint32_t set,
+    iree_hal_descriptor_set_t* descriptor_set,
+    iree_host_size_t dynamic_offset_count,
+    const iree_device_size_t* dynamic_offsets) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+  iree_allocator_t host_allocator =
+      command_buffer->logical_device->host_allocator();
+
+  // Retain the descriptor set until the command buffer completes.
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &descriptor_set));
+
+  // Vulkan takes uint32_t as the size here, unlike everywhere else.
+  // NOTE: loop index widened from `int` to iree_host_size_t to match the
+  // count type (avoids signed/unsigned comparison and truncation; also
+  // consistent with the other loops in this file).
+  iree_inline_array(uint32_t, dynamic_offsets_i32, dynamic_offset_count,
+                    host_allocator);
+  for (iree_host_size_t i = 0; i < dynamic_offset_count; ++i) {
+    *iree_inline_array_at(dynamic_offsets_i32, i) =
+        (uint32_t)dynamic_offsets[i];
+  }
+
+  VkDescriptorSet descriptor_sets[1] = {
+      iree_hal_vulkan_native_descriptor_set_handle(descriptor_set),
+  };
+  command_buffer->syms->vkCmdBindDescriptorSets(
+      command_buffer->handle, VK_PIPELINE_BIND_POINT_COMPUTE,
+      iree_hal_vulkan_native_executable_layout_handle(executable_layout), set,
+      (uint32_t)IREE_ARRAYSIZE(descriptor_sets), descriptor_sets,
+      (uint32_t)dynamic_offset_count,
+      iree_inline_array_data(dynamic_offsets_i32));
+
+  iree_inline_array_deinitialize(dynamic_offsets_i32);
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+
+  // Tracing-only: open a GPU zone annotated with the entry point's original
+  // source location (file/line/function, when available).
+  IREE_TRACE({
+    iree_hal_vulkan_source_location_t source_location;
+    iree_hal_vulkan_native_executable_entry_point_source_location(
+        executable, entry_point, &source_location);
+    IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL(
+        command_buffer->tracing_context, command_buffer->handle,
+        source_location.file_name.data, source_location.file_name.size,
+        source_location.line, /*func_name=*/NULL, 0,
+        source_location.func_name.data, source_location.func_name.size);
+  });
+
+  // Retain the executable until the command buffer has completed.
+  // NOTE(review): the error returns below exit with the trace zone still
+  // open — confirm the tracing macros tolerate an unbalanced zone.
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, 1, &executable));
+
+  // Get the compiled and linked pipeline for the specified entry point and
+  // bind it to the command buffer.
+  VkPipeline pipeline_handle = VK_NULL_HANDLE;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_vulkan_native_executable_pipeline_for_entry_point(
+          executable, entry_point, &pipeline_handle));
+  command_buffer->syms->vkCmdBindPipeline(
+      command_buffer->handle, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_handle);
+
+  // Record the dispatch with a statically-known workgroup count.
+  command_buffer->syms->vkCmdDispatch(command_buffer->handle, workgroup_x,
+                                      workgroup_y, workgroup_z);
+
+  IREE_VULKAN_TRACE_ZONE_END(command_buffer->tracing_context,
+                             command_buffer->handle);
+
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch_indirect(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    iree_hal_buffer_t* workgroups_buffer,
+    iree_device_size_t workgroups_offset) {
+  iree_hal_vulkan_direct_command_buffer_t* command_buffer =
+      iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+
+  // Retain the executable and the buffer holding the workgroup counts until
+  // the command buffer has completed.
+  const void* resources[2] = {executable, workgroups_buffer};
+  IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+      command_buffer->resource_set, IREE_ARRAYSIZE(resources), resources));
+
+  // Open a GPU trace zone annotated with the entry point's original source
+  // location. NOTE(review): unlike dispatch above this is not wrapped in
+  // IREE_TRACE — confirm whether that is intentional.
+  iree_hal_vulkan_source_location_t source_location;
+  iree_hal_vulkan_native_executable_entry_point_source_location(
+      executable, entry_point, &source_location);
+  IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL(
+      command_buffer->tracing_context, command_buffer->handle,
+      source_location.file_name.data, source_location.file_name.size,
+      source_location.line, /*func_name=*/NULL, 0,
+      source_location.func_name.data, source_location.func_name.size);
+
+  // Get the compiled and linked pipeline for the specified entry point and
+  // bind it to the command buffer.
+  VkPipeline pipeline_handle = VK_NULL_HANDLE;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_vulkan_native_executable_pipeline_for_entry_point(
+          executable, entry_point, &pipeline_handle));
+  command_buffer->syms->vkCmdBindPipeline(
+      command_buffer->handle, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_handle);
+
+  // The workgroup count is read from the buffer at execution time; fold the
+  // HAL buffer's base byte offset into the caller-provided offset.
+  VkBuffer workgroups_device_buffer = iree_hal_vulkan_vma_buffer_handle(
+      iree_hal_buffer_allocated_buffer(workgroups_buffer));
+  workgroups_offset += iree_hal_buffer_byte_offset(workgroups_buffer);
+  command_buffer->syms->vkCmdDispatchIndirect(
+      command_buffer->handle, workgroups_device_buffer, workgroups_offset);
+
+  IREE_VULKAN_TRACE_ZONE_END(command_buffer->tracing_context,
+                             command_buffer->handle);
+
+  return iree_ok_status();
+}
+
+namespace {
+// Function table wiring the C HAL command buffer interface to the static
+// implementations above; entries match iree_hal_command_buffer_vtable_t
+// field order (annotated inline with /*.field=*/ comments).
+const iree_hal_command_buffer_vtable_t
+    iree_hal_vulkan_direct_command_buffer_vtable = {
+        /*.destroy=*/iree_hal_vulkan_direct_command_buffer_destroy,
+        /*.dyn_cast=*/iree_hal_vulkan_direct_command_buffer_dyn_cast,
+        /*.begin=*/iree_hal_vulkan_direct_command_buffer_begin,
+        /*.end=*/iree_hal_vulkan_direct_command_buffer_end,
+        /*.begin_debug_group=*/
+        iree_hal_vulkan_direct_command_buffer_begin_debug_group,
+        /*.end_debug_group=*/
+        iree_hal_vulkan_direct_command_buffer_end_debug_group,
+        /*.execution_barrier=*/
+        iree_hal_vulkan_direct_command_buffer_execution_barrier,
+        /*.signal_event=*/
+        iree_hal_vulkan_direct_command_buffer_signal_event,
+        /*.reset_event=*/iree_hal_vulkan_direct_command_buffer_reset_event,
+        /*.wait_events=*/iree_hal_vulkan_direct_command_buffer_wait_events,
+        /*.discard_buffer=*/
+        iree_hal_vulkan_direct_command_buffer_discard_buffer,
+        /*.fill_buffer=*/iree_hal_vulkan_direct_command_buffer_fill_buffer,
+        /*.update_buffer=*/
+        iree_hal_vulkan_direct_command_buffer_update_buffer,
+        /*.copy_buffer=*/iree_hal_vulkan_direct_command_buffer_copy_buffer,
+        /*.push_constants=*/
+        iree_hal_vulkan_direct_command_buffer_push_constants,
+        /*.push_descriptor_set=*/
+        iree_hal_vulkan_direct_command_buffer_push_descriptor_set,
+        /*.bind_descriptor_set=*/
+        iree_hal_vulkan_direct_command_buffer_bind_descriptor_set,
+        /*.dispatch=*/iree_hal_vulkan_direct_command_buffer_dispatch,
+        /*.dispatch_indirect=*/
+        iree_hal_vulkan_direct_command_buffer_dispatch_indirect,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/direct_command_buffer.h b/runtime/src/iree/hal/vulkan/direct_command_buffer.h
new file mode 100644
index 0000000..57c15ad
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/direct_command_buffer.h
@@ -0,0 +1,52 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_DIRECT_COMMAND_BUFFER_H_
+#define IREE_HAL_VULKAN_DIRECT_COMMAND_BUFFER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/builtin_executables.h"
+#include "iree/hal/vulkan/descriptor_pool_cache.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/tracing.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Forward declaration: only pointers to the block pool are used below, so
+// the full definition is not required in this header.
+typedef struct iree_arena_block_pool_t iree_arena_block_pool_t;
+
+// Creates a command buffer that directly records into a VkCommandBuffer.
+//
+// NOTE: the |block_pool| must remain live for the lifetime of the command
+// buffers that use it.
+iree_status_t iree_hal_vulkan_direct_command_buffer_allocate(
+    iree_hal_device_t* device,
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree::hal::vulkan::VkCommandPoolHandle* command_pool,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_vulkan_tracing_context_t* tracing_context,
+    iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
+    iree::hal::vulkan::BuiltinExecutables* builtin_executables,
+    iree_arena_block_pool_t* block_pool,
+    iree_hal_command_buffer_t** out_command_buffer);
+
+// Returns the native Vulkan VkCommandBuffer handle.
+VkCommandBuffer iree_hal_vulkan_direct_command_buffer_handle(
+    iree_hal_command_buffer_t* command_buffer);
+
+// Returns true if |command_buffer| is a Vulkan command buffer.
+bool iree_hal_vulkan_direct_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_DIRECT_COMMAND_BUFFER_H_
diff --git a/runtime/src/iree/hal/vulkan/direct_command_queue.cc b/runtime/src/iree/hal/vulkan/direct_command_queue.cc
new file mode 100644
index 0000000..1a132a8
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/direct_command_queue.cc
@@ -0,0 +1,197 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/direct_command_queue.h"
+
+#include <cstdint>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/direct_command_buffer.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/native_semaphore.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/tracing.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// Thin constructor: all state lives in the CommandQueue base (logical device,
+// supported categories, and the VkQueue handle).
+DirectCommandQueue::DirectCommandQueue(
+    VkDeviceHandle* logical_device,
+    iree_hal_command_category_t supported_categories, VkQueue queue)
+    : CommandQueue(logical_device, supported_categories, queue) {}
+
+DirectCommandQueue::~DirectCommandQueue() = default;
+
+// Translates one HAL submission batch into a VkSubmitInfo with an attached
+// VkTimelineSemaphoreSubmitInfo. All parallel arrays are allocated from
+// |arena| so they outlive this call (required until submission completes).
+iree_status_t DirectCommandQueue::TranslateBatchInfo(
+    const iree_hal_submission_batch_t* batch, VkSubmitInfo* submit_info,
+    VkTimelineSemaphoreSubmitInfo* timeline_submit_info, Arena* arena) {
+  // TODO(benvanik): see if we can go to finer-grained stages.
+  // For example, if this was just queue ownership transfers then we can use
+  // the pseudo-stage of VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT.
+  VkPipelineStageFlags dst_stage_mask =
+      VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+
+  // Mirror the HAL wait semaphore list into parallel arrays of Vulkan
+  // handles, payload values, and per-semaphore stage masks.
+  auto wait_semaphore_handles =
+      arena->AllocateSpan<VkSemaphore>(batch->wait_semaphores.count);
+  auto wait_semaphore_values =
+      arena->AllocateSpan<uint64_t>(batch->wait_semaphores.count);
+  auto wait_dst_stage_masks =
+      arena->AllocateSpan<VkPipelineStageFlags>(batch->wait_semaphores.count);
+  for (iree_host_size_t i = 0; i < batch->wait_semaphores.count; ++i) {
+    wait_semaphore_handles[i] = iree_hal_vulkan_native_semaphore_handle(
+        batch->wait_semaphores.semaphores[i]);
+    wait_semaphore_values[i] = batch->wait_semaphores.payload_values[i];
+    wait_dst_stage_masks[i] = dst_stage_mask;
+  }
+
+  // Mirror the HAL signal semaphore list the same way (no stage masks here).
+  auto signal_semaphore_handles =
+      arena->AllocateSpan<VkSemaphore>(batch->signal_semaphores.count);
+  auto signal_semaphore_values =
+      arena->AllocateSpan<uint64_t>(batch->signal_semaphores.count);
+  for (iree_host_size_t i = 0; i < batch->signal_semaphores.count; ++i) {
+    signal_semaphore_handles[i] = iree_hal_vulkan_native_semaphore_handle(
+        batch->signal_semaphores.semaphores[i]);
+    signal_semaphore_values[i] = batch->signal_semaphores.payload_values[i];
+  }
+
+  // Gather the raw VkCommandBuffer handles for each batch command buffer.
+  auto command_buffer_handles =
+      arena->AllocateSpan<VkCommandBuffer>(batch->command_buffer_count);
+  for (iree_host_size_t i = 0; i < batch->command_buffer_count; ++i) {
+    command_buffer_handles[i] =
+        iree_hal_vulkan_direct_command_buffer_handle(batch->command_buffers[i]);
+  }
+
+  // Populate the base submit info; the timeline semaphore payload values are
+  // chained via pNext below.
+  submit_info->sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  submit_info->pNext = timeline_submit_info;
+  submit_info->waitSemaphoreCount =
+      static_cast<uint32_t>(wait_semaphore_handles.size());
+  submit_info->pWaitSemaphores = wait_semaphore_handles.data();
+  submit_info->pWaitDstStageMask = wait_dst_stage_masks.data();
+  submit_info->commandBufferCount =
+      static_cast<uint32_t>(command_buffer_handles.size());
+  submit_info->pCommandBuffers = command_buffer_handles.data();
+  submit_info->signalSemaphoreCount =
+      static_cast<uint32_t>(signal_semaphore_handles.size());
+  submit_info->pSignalSemaphores = signal_semaphore_handles.data();
+
+  timeline_submit_info->sType =
+      VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
+  timeline_submit_info->pNext = nullptr;
+  timeline_submit_info->waitSemaphoreValueCount =
+      static_cast<uint32_t>(wait_semaphore_values.size());
+  timeline_submit_info->pWaitSemaphoreValues = wait_semaphore_values.data();
+  timeline_submit_info->signalSemaphoreValueCount =
+      static_cast<uint32_t>(signal_semaphore_values.size());
+  timeline_submit_info->pSignalSemaphoreValues = signal_semaphore_values.data();
+
+  return iree_ok_status();
+}
+
+// Submits |batch_count| batches to the VkQueue in a single vkQueueSubmit.
+iree_status_t DirectCommandQueue::Submit(
+    iree_host_size_t batch_count, const iree_hal_submission_batch_t* batches) {
+  IREE_TRACE_SCOPE0("DirectCommandQueue::Submit");
+
+  // Map the submission batches to VkSubmitInfos.
+  // Note that we must keep all arrays referenced alive until submission
+  // completes and since there are a bunch of them we use an arena.
+  Arena arena(4 * 1024);
+  auto submit_infos = arena.AllocateSpan<VkSubmitInfo>(batch_count);
+  auto timeline_submit_infos =
+      arena.AllocateSpan<VkTimelineSemaphoreSubmitInfo>(batch_count);
+  // NOTE: index type matches |batch_count| (was `int`, a signed/unsigned and
+  // width mismatch against iree_host_size_t).
+  for (iree_host_size_t i = 0; i < batch_count; ++i) {
+    IREE_RETURN_IF_ERROR(TranslateBatchInfo(&batches[i], &submit_infos[i],
+                                            &timeline_submit_infos[i], &arena));
+  }
+
+  // Hold the queue mutex for the duration of the submit to serialize access
+  // to the VkQueue, then propagate the result directly (the previous
+  // RETURN_IF_ERROR + return-ok pair was equivalent but redundant).
+  iree_slim_mutex_lock(&queue_mutex_);
+  iree_status_t status = VK_RESULT_TO_STATUS(
+      syms()->vkQueueSubmit(queue_, static_cast<uint32_t>(submit_infos.size()),
+                            submit_infos.data(), VK_NULL_HANDLE),
+      "vkQueueSubmit");
+  iree_slim_mutex_unlock(&queue_mutex_);
+  return status;
+}
+
+// Blocks until the queue has drained or |timeout| is reached.
+iree_status_t DirectCommandQueue::WaitIdle(iree_timeout_t timeout) {
+  iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
+  if (deadline_ns == IREE_TIME_INFINITE_FUTURE) {
+    // Fast path for using vkQueueWaitIdle, which is usually cheaper (as it
+    // requires fewer calls into the driver).
+    IREE_TRACE_SCOPE0("DirectCommandQueue::WaitIdle#vkQueueWaitIdle");
+    iree_slim_mutex_lock(&queue_mutex_);
+    iree_status_t status =
+        VK_RESULT_TO_STATUS(syms()->vkQueueWaitIdle(queue_), "vkQueueWaitIdle");
+    iree_slim_mutex_unlock(&queue_mutex_);
+    iree_hal_vulkan_tracing_context_collect(tracing_context(), VK_NULL_HANDLE);
+    return status;
+  }
+
+  IREE_TRACE_SCOPE0("DirectCommandQueue::WaitIdle#Fence");
+
+  // Convert the deadline to a relative timeout *before* creating the fence:
+  // the DEADLINE_EXCEEDED early return below previously executed after
+  // vkCreateFence and leaked the fence. The INFINITE_FUTURE case is handled
+  // by the fast path above and needs no branch here.
+  uint64_t timeout_ns;
+  if (deadline_ns == IREE_TIME_INFINITE_PAST) {
+    // Do not wait.
+    timeout_ns = 0;
+  } else {
+    // Convert to relative time in nanoseconds.
+    // The implementation may not wait with this granularity (like by 10000x).
+    iree_time_t now_ns = iree_time_now();
+    if (deadline_ns < now_ns) {
+      return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+    }
+    timeout_ns = (uint64_t)(deadline_ns - now_ns);
+  }
+
+  // Create a new fence just for this wait. This keeps us thread-safe as the
+  // behavior of wait+reset is racey.
+  VkFenceCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+  create_info.pNext = nullptr;
+  create_info.flags = 0;
+  VkFence fence = VK_NULL_HANDLE;
+  VK_RETURN_IF_ERROR(
+      syms()->vkCreateFence(*logical_device_, &create_info,
+                            logical_device_->allocator(), &fence),
+      "vkCreateFence");
+
+  // Submit an empty batch that signals the fence once all prior work retires.
+  iree_slim_mutex_lock(&queue_mutex_);
+  iree_status_t status = VK_RESULT_TO_STATUS(
+      syms()->vkQueueSubmit(queue_, 0, nullptr, fence), "vkQueueSubmit");
+  iree_slim_mutex_unlock(&queue_mutex_);
+
+  if (iree_status_is_ok(status)) {
+    VkResult result = syms()->vkWaitForFences(*logical_device_, 1, &fence,
+                                              VK_TRUE, timeout_ns);
+    switch (result) {
+      case VK_SUCCESS:
+        status = iree_ok_status();
+        break;
+      case VK_TIMEOUT:
+        status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+        break;
+      default:
+        status = VK_RESULT_TO_STATUS(result, "vkWaitForFences");
+        break;
+    }
+  }
+
+  // Always destroy the fence, regardless of the wait outcome.
+  syms()->vkDestroyFence(*logical_device_, fence, logical_device_->allocator());
+
+  iree_hal_vulkan_tracing_context_collect(tracing_context(), VK_NULL_HANDLE);
+
+  return status;
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/direct_command_queue.h b/runtime/src/iree/hal/vulkan/direct_command_queue.h
new file mode 100644
index 0000000..5ff9a68
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/direct_command_queue.h
@@ -0,0 +1,43 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_DIRECT_COMMAND_QUEUE_H_
+#define IREE_HAL_VULKAN_DIRECT_COMMAND_QUEUE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/command_queue.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/util/arena.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// Command queue implementation directly maps to VkQueue.
+class DirectCommandQueue final : public CommandQueue {
+ public:
+  DirectCommandQueue(VkDeviceHandle* logical_device,
+                     iree_hal_command_category_t supported_categories,
+                     VkQueue queue);
+  ~DirectCommandQueue() override;
+
+  // Submits |batch_count| batches of command buffers to the queue.
+  iree_status_t Submit(iree_host_size_t batch_count,
+                       const iree_hal_submission_batch_t* batches) override;
+
+  // Blocks until the queue is idle or |timeout| elapses.
+  iree_status_t WaitIdle(iree_timeout_t timeout) override;
+
+ private:
+  // Translates one HAL submission batch into a VkSubmitInfo (plus timeline
+  // semaphore chain), allocating parallel arrays from |arena|.
+  iree_status_t TranslateBatchInfo(
+      const iree_hal_submission_batch_t* batch, VkSubmitInfo* submit_info,
+      VkTimelineSemaphoreSubmitInfo* timeline_submit_info, Arena* arena);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_DIRECT_COMMAND_QUEUE_H_
diff --git a/runtime/src/iree/hal/vulkan/dynamic_symbol_tables.h b/runtime/src/iree/hal/vulkan/dynamic_symbol_tables.h
new file mode 100644
index 0000000..e819a7e
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/dynamic_symbol_tables.h
@@ -0,0 +1,501 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Use these tables whenever enumerating all functions in the Vulkan API is
+// required. In most cases IREE_VULKAN_DYNAMIC_SYMBOL_TABLES is the right
+// choice (includes both common and enabled platform-specific functions).
+//
+// Table macros are designed to take two macros: one for each instance-specific
+// function and one for each device-specific function. These macros are also
+// passed a requirement flag that enables compile-time exclusion of methods that
+// are not used in the binary. If you find yourself getting compilation errors
+// on missing methods you probably need to change it in the tables below from
+// EXCLUDED to REQUIRED or OPTIONAL.
+//
+// Define to get instance-specific functions:
+// #define INS_PFN(requirement, function_name)
+//
+// Define to get device-specific functions:
+// #define DEV_PFN(requirement, function_name)
+//
+// requirement is one of REQUIRED, OPTIONAL, or EXCLUDED.
+
+#ifndef IREE_HAL_VULKAN_DYNAMIC_SYMBOL_TABLES_H_
+#define IREE_HAL_VULKAN_DYNAMIC_SYMBOL_TABLES_H_
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// Defines the list of symbols that can be queried from vkGetInstanceProcAddr
+// before Vulkan instance creation.
+// INS_PFN is expanded as INS_PFN(requirement, function_name) for each entry;
+// requirement is REQUIRED or OPTIONAL (see the file comment above).
+#define IREE_VULKAN_DYNAMIC_SYMBOL_INSTANCELESS_TABLE(INS_PFN) \
+  INS_PFN(REQUIRED, vkCreateInstance)                          \
+  INS_PFN(REQUIRED, vkEnumerateInstanceExtensionProperties)    \
+  INS_PFN(REQUIRED, vkEnumerateInstanceLayerProperties)        \
+  INS_PFN(OPTIONAL, vkEnumerateInstanceVersion)
+
+// Defines the list of instance/device symbols that are queried from
+// vkGetInstanceProcAddr/vkGetDeviceProcAddr after Vulkan instance/device
+// creation.
+#define IREE_VULKAN_DYNAMIC_SYMBOL_COMMON_TABLE(INS_PFN, DEV_PFN)       \
+  DEV_PFN(REQUIRED, vkBeginCommandBuffer)                               \
+  DEV_PFN(EXCLUDED, vkCmdBeginConditionalRenderingEXT)                  \
+  DEV_PFN(OPTIONAL, vkCmdBeginDebugUtilsLabelEXT)                       \
+  DEV_PFN(EXCLUDED, vkCmdBeginQuery)                                    \
+  DEV_PFN(EXCLUDED, vkCmdBeginQueryIndexedEXT)                          \
+  DEV_PFN(EXCLUDED, vkCmdBeginRenderPass)                               \
+  DEV_PFN(EXCLUDED, vkCmdBeginRenderPass2KHR)                           \
+  DEV_PFN(EXCLUDED, vkCmdBeginTransformFeedbackEXT)                     \
+  DEV_PFN(REQUIRED, vkCmdBindDescriptorSets)                            \
+  DEV_PFN(EXCLUDED, vkCmdBindIndexBuffer)                               \
+  DEV_PFN(REQUIRED, vkCmdBindPipeline)                                  \
+  DEV_PFN(EXCLUDED, vkCmdBindShadingRateImageNV)                        \
+  DEV_PFN(EXCLUDED, vkCmdBindTransformFeedbackBuffersEXT)               \
+  DEV_PFN(EXCLUDED, vkCmdBindVertexBuffers)                             \
+  DEV_PFN(EXCLUDED, vkCmdBlitImage)                                     \
+  DEV_PFN(EXCLUDED, vkCmdBuildAccelerationStructureNV)                  \
+  DEV_PFN(EXCLUDED, vkCmdClearAttachments)                              \
+  DEV_PFN(EXCLUDED, vkCmdClearColorImage)                               \
+  DEV_PFN(EXCLUDED, vkCmdClearDepthStencilImage)                        \
+  DEV_PFN(EXCLUDED, vkCmdCopyAccelerationStructureNV)                   \
+  DEV_PFN(REQUIRED, vkCmdCopyBuffer)                                    \
+  DEV_PFN(EXCLUDED, vkCmdCopyBufferToImage)                             \
+  DEV_PFN(EXCLUDED, vkCmdCopyImage)                                     \
+  DEV_PFN(EXCLUDED, vkCmdCopyImageToBuffer)                             \
+  DEV_PFN(EXCLUDED, vkCmdCopyQueryPoolResults)                          \
+  DEV_PFN(EXCLUDED, vkCmdDebugMarkerBeginEXT)                           \
+  DEV_PFN(EXCLUDED, vkCmdDebugMarkerEndEXT)                             \
+  DEV_PFN(EXCLUDED, vkCmdDebugMarkerInsertEXT)                          \
+  DEV_PFN(REQUIRED, vkCmdDispatch)                                      \
+  DEV_PFN(EXCLUDED, vkCmdDispatchBase)                                  \
+  DEV_PFN(EXCLUDED, vkCmdDispatchBaseKHR)                               \
+  DEV_PFN(REQUIRED, vkCmdDispatchIndirect)                              \
+  DEV_PFN(EXCLUDED, vkCmdDraw)                                          \
+  DEV_PFN(EXCLUDED, vkCmdDrawIndexed)                                   \
+  DEV_PFN(EXCLUDED, vkCmdDrawIndexedIndirect)                           \
+  DEV_PFN(EXCLUDED, vkCmdDrawIndexedIndirectCountAMD)                   \
+  DEV_PFN(EXCLUDED, vkCmdDrawIndexedIndirectCountKHR)                   \
+  DEV_PFN(EXCLUDED, vkCmdDrawIndirect)                                  \
+  DEV_PFN(EXCLUDED, vkCmdDrawIndirectByteCountEXT)                      \
+  DEV_PFN(EXCLUDED, vkCmdDrawIndirectCountAMD)                          \
+  DEV_PFN(EXCLUDED, vkCmdDrawIndirectCountKHR)                          \
+  DEV_PFN(EXCLUDED, vkCmdDrawMeshTasksIndirectCountNV)                  \
+  DEV_PFN(EXCLUDED, vkCmdDrawMeshTasksIndirectNV)                       \
+  DEV_PFN(EXCLUDED, vkCmdDrawMeshTasksNV)                               \
+  DEV_PFN(EXCLUDED, vkCmdEndConditionalRenderingEXT)                    \
+  DEV_PFN(OPTIONAL, vkCmdEndDebugUtilsLabelEXT)                         \
+  DEV_PFN(EXCLUDED, vkCmdEndQuery)                                      \
+  DEV_PFN(EXCLUDED, vkCmdEndQueryIndexedEXT)                            \
+  DEV_PFN(EXCLUDED, vkCmdEndRenderPass)                                 \
+  DEV_PFN(EXCLUDED, vkCmdEndRenderPass2KHR)                             \
+  DEV_PFN(EXCLUDED, vkCmdEndTransformFeedbackEXT)                       \
+  DEV_PFN(REQUIRED, vkCmdExecuteCommands)                               \
+  DEV_PFN(REQUIRED, vkCmdFillBuffer)                                    \
+  DEV_PFN(OPTIONAL, vkCmdInsertDebugUtilsLabelEXT)                      \
+  DEV_PFN(EXCLUDED, vkCmdNextSubpass)                                   \
+  DEV_PFN(EXCLUDED, vkCmdNextSubpass2KHR)                               \
+  DEV_PFN(REQUIRED, vkCmdPipelineBarrier)                               \
+  DEV_PFN(EXCLUDED, vkCmdProcessCommandsNVX)                            \
+  DEV_PFN(REQUIRED, vkCmdPushConstants)                                 \
+  DEV_PFN(OPTIONAL, vkCmdPushDescriptorSetKHR)                          \
+  DEV_PFN(EXCLUDED, vkCmdPushDescriptorSetWithTemplateKHR)              \
+  DEV_PFN(EXCLUDED, vkCmdReserveSpaceForCommandsNVX)                    \
+  DEV_PFN(REQUIRED, vkCmdResetEvent)                                    \
+  DEV_PFN(REQUIRED, vkCmdResetQueryPool)                                \
+  DEV_PFN(EXCLUDED, vkCmdResolveImage)                                  \
+  DEV_PFN(EXCLUDED, vkCmdSetBlendConstants)                             \
+  DEV_PFN(EXCLUDED, vkCmdSetCheckpointNV)                               \
+  DEV_PFN(EXCLUDED, vkCmdSetCoarseSampleOrderNV)                        \
+  DEV_PFN(EXCLUDED, vkCmdSetDepthBias)                                  \
+  DEV_PFN(EXCLUDED, vkCmdSetDepthBounds)                                \
+  DEV_PFN(EXCLUDED, vkCmdSetDeviceMask)                                 \
+  DEV_PFN(EXCLUDED, vkCmdSetDeviceMaskKHR)                              \
+  DEV_PFN(EXCLUDED, vkCmdSetDiscardRectangleEXT)                        \
+  DEV_PFN(REQUIRED, vkCmdSetEvent)                                      \
+  DEV_PFN(EXCLUDED, vkCmdSetExclusiveScissorNV)                         \
+  DEV_PFN(EXCLUDED, vkCmdSetLineWidth)                                  \
+  DEV_PFN(EXCLUDED, vkCmdSetSampleLocationsEXT)                         \
+  DEV_PFN(EXCLUDED, vkCmdSetScissor)                                    \
+  DEV_PFN(EXCLUDED, vkCmdSetStencilCompareMask)                         \
+  DEV_PFN(EXCLUDED, vkCmdSetStencilReference)                           \
+  DEV_PFN(EXCLUDED, vkCmdSetStencilWriteMask)                           \
+  DEV_PFN(EXCLUDED, vkCmdSetViewport)                                   \
+  DEV_PFN(EXCLUDED, vkCmdSetViewportShadingRatePaletteNV)               \
+  DEV_PFN(EXCLUDED, vkCmdSetViewportWScalingNV)                         \
+  DEV_PFN(EXCLUDED, vkCmdTraceRaysNV)                                   \
+  DEV_PFN(REQUIRED, vkCmdUpdateBuffer)                                  \
+  DEV_PFN(REQUIRED, vkCmdWaitEvents)                                    \
+  DEV_PFN(EXCLUDED, vkCmdWriteAccelerationStructuresPropertiesNV)       \
+  DEV_PFN(EXCLUDED, vkCmdWriteBufferMarkerAMD)                          \
+  DEV_PFN(REQUIRED, vkCmdWriteTimestamp)                                \
+  DEV_PFN(REQUIRED, vkEndCommandBuffer)                                 \
+  DEV_PFN(EXCLUDED, vkResetCommandBuffer)                               \
+  DEV_PFN(EXCLUDED, vkAcquireNextImage2KHR)                             \
+  DEV_PFN(EXCLUDED, vkAcquireNextImageKHR)                              \
+  DEV_PFN(REQUIRED, vkAllocateCommandBuffers)                           \
+  DEV_PFN(REQUIRED, vkAllocateDescriptorSets)                           \
+  DEV_PFN(REQUIRED, vkAllocateMemory)                                   \
+  DEV_PFN(EXCLUDED, vkBindAccelerationStructureMemoryNV)                \
+  DEV_PFN(REQUIRED, vkBindBufferMemory)                                 \
+  DEV_PFN(EXCLUDED, vkBindBufferMemory2)                                \
+  DEV_PFN(EXCLUDED, vkBindBufferMemory2KHR)                             \
+  DEV_PFN(REQUIRED, vkBindImageMemory)                                  \
+  DEV_PFN(EXCLUDED, vkBindImageMemory2)                                 \
+  DEV_PFN(EXCLUDED, vkBindImageMemory2KHR)                              \
+  DEV_PFN(EXCLUDED, vkCompileDeferredNV)                                \
+  DEV_PFN(EXCLUDED, vkCreateAccelerationStructureNV)                    \
+  DEV_PFN(REQUIRED, vkCreateBuffer)                                     \
+  DEV_PFN(REQUIRED, vkCreateBufferView)                                 \
+  DEV_PFN(REQUIRED, vkCreateCommandPool)                                \
+  DEV_PFN(REQUIRED, vkCreateComputePipelines)                           \
+  DEV_PFN(REQUIRED, vkCreateDescriptorPool)                             \
+  DEV_PFN(REQUIRED, vkCreateDescriptorSetLayout)                        \
+  DEV_PFN(EXCLUDED, vkCreateDescriptorUpdateTemplate)                   \
+  DEV_PFN(EXCLUDED, vkCreateDescriptorUpdateTemplateKHR)                \
+  DEV_PFN(REQUIRED, vkCreateEvent)                                      \
+  DEV_PFN(REQUIRED, vkCreateFence)                                      \
+  DEV_PFN(EXCLUDED, vkCreateFramebuffer)                                \
+  DEV_PFN(EXCLUDED, vkCreateGraphicsPipelines)                          \
+  DEV_PFN(REQUIRED, vkCreateImage)                                      \
+  DEV_PFN(EXCLUDED, vkCreateImageView)                                  \
+  DEV_PFN(EXCLUDED, vkCreateIndirectCommandsLayoutNVX)                  \
+  DEV_PFN(EXCLUDED, vkCreateObjectTableNVX)                             \
+  DEV_PFN(REQUIRED, vkCreatePipelineCache)                              \
+  DEV_PFN(REQUIRED, vkCreatePipelineLayout)                             \
+  DEV_PFN(REQUIRED, vkCreateQueryPool)                                  \
+  DEV_PFN(EXCLUDED, vkCreateRayTracingPipelinesNV)                      \
+  DEV_PFN(EXCLUDED, vkCreateRenderPass)                                 \
+  DEV_PFN(EXCLUDED, vkCreateRenderPass2KHR)                             \
+  DEV_PFN(EXCLUDED, vkCreateSampler)                                    \
+  DEV_PFN(EXCLUDED, vkCreateSamplerYcbcrConversion)                     \
+  DEV_PFN(EXCLUDED, vkCreateSamplerYcbcrConversionKHR)                  \
+  DEV_PFN(REQUIRED, vkCreateSemaphore)                                  \
+  DEV_PFN(REQUIRED, vkCreateShaderModule)                               \
+  DEV_PFN(EXCLUDED, vkCreateSharedSwapchainsKHR)                        \
+  DEV_PFN(EXCLUDED, vkCreateSwapchainKHR)                               \
+  DEV_PFN(EXCLUDED, vkCreateValidationCacheEXT)                         \
+  DEV_PFN(EXCLUDED, vkDebugMarkerSetObjectNameEXT)                      \
+  DEV_PFN(EXCLUDED, vkDebugMarkerSetObjectTagEXT)                       \
+  DEV_PFN(EXCLUDED, vkDestroyAccelerationStructureNV)                   \
+  DEV_PFN(REQUIRED, vkDestroyBuffer)                                    \
+  DEV_PFN(REQUIRED, vkDestroyBufferView)                                \
+  DEV_PFN(REQUIRED, vkDestroyCommandPool)                               \
+  DEV_PFN(REQUIRED, vkDestroyDescriptorPool)                            \
+  DEV_PFN(REQUIRED, vkDestroyDescriptorSetLayout)                       \
+  DEV_PFN(EXCLUDED, vkDestroyDescriptorUpdateTemplate)                  \
+  DEV_PFN(EXCLUDED, vkDestroyDescriptorUpdateTemplateKHR)               \
+  DEV_PFN(REQUIRED, vkDestroyDevice)                                    \
+  DEV_PFN(REQUIRED, vkDestroyEvent)                                     \
+  DEV_PFN(REQUIRED, vkDestroyFence)                                     \
+  DEV_PFN(EXCLUDED, vkDestroyFramebuffer)                               \
+  DEV_PFN(REQUIRED, vkDestroyImage)                                     \
+  DEV_PFN(EXCLUDED, vkDestroyImageView)                                 \
+  DEV_PFN(EXCLUDED, vkDestroyIndirectCommandsLayoutNVX)                 \
+  DEV_PFN(EXCLUDED, vkDestroyObjectTableNVX)                            \
+  DEV_PFN(REQUIRED, vkDestroyPipeline)                                  \
+  DEV_PFN(REQUIRED, vkDestroyPipelineCache)                             \
+  DEV_PFN(REQUIRED, vkDestroyPipelineLayout)                            \
+  DEV_PFN(REQUIRED, vkDestroyQueryPool)                                 \
+  DEV_PFN(EXCLUDED, vkDestroyRenderPass)                                \
+  DEV_PFN(EXCLUDED, vkDestroySampler)                                   \
+  DEV_PFN(EXCLUDED, vkDestroySamplerYcbcrConversion)                    \
+  DEV_PFN(EXCLUDED, vkDestroySamplerYcbcrConversionKHR)                 \
+  DEV_PFN(REQUIRED, vkDestroySemaphore)                                 \
+  DEV_PFN(REQUIRED, vkDestroyShaderModule)                              \
+  DEV_PFN(EXCLUDED, vkDestroySwapchainKHR)                              \
+  DEV_PFN(EXCLUDED, vkDestroyValidationCacheEXT)                        \
+  DEV_PFN(REQUIRED, vkDeviceWaitIdle)                                   \
+  DEV_PFN(EXCLUDED, vkDisplayPowerControlEXT)                           \
+  DEV_PFN(REQUIRED, vkFlushMappedMemoryRanges)                          \
+  DEV_PFN(REQUIRED, vkFreeCommandBuffers)                               \
+  DEV_PFN(REQUIRED, vkFreeDescriptorSets)                               \
+  DEV_PFN(REQUIRED, vkFreeMemory)                                       \
+  DEV_PFN(EXCLUDED, vkGetAccelerationStructureHandleNV)                 \
+  DEV_PFN(EXCLUDED, vkGetAccelerationStructureMemoryRequirementsNV)     \
+  DEV_PFN(EXCLUDED, vkGetBufferDeviceAddressEXT)                        \
+  DEV_PFN(REQUIRED, vkGetBufferMemoryRequirements)                      \
+  DEV_PFN(EXCLUDED, vkGetBufferMemoryRequirements2)                     \
+  DEV_PFN(EXCLUDED, vkGetBufferMemoryRequirements2KHR)                  \
+  DEV_PFN(OPTIONAL, vkGetCalibratedTimestampsEXT)                       \
+  DEV_PFN(EXCLUDED, vkGetDescriptorSetLayoutSupport)                    \
+  DEV_PFN(EXCLUDED, vkGetDescriptorSetLayoutSupportKHR)                 \
+  DEV_PFN(EXCLUDED, vkGetDeviceGroupPeerMemoryFeatures)                 \
+  DEV_PFN(EXCLUDED, vkGetDeviceGroupPeerMemoryFeaturesKHR)              \
+  DEV_PFN(EXCLUDED, vkGetDeviceGroupPresentCapabilitiesKHR)             \
+  DEV_PFN(EXCLUDED, vkGetDeviceGroupSurfacePresentModesKHR)             \
+  DEV_PFN(EXCLUDED, vkGetDeviceMemoryCommitment)                        \
+  DEV_PFN(REQUIRED, vkGetDeviceQueue)                                   \
+  DEV_PFN(EXCLUDED, vkGetDeviceQueue2)                                  \
+  DEV_PFN(REQUIRED, vkGetEventStatus)                                   \
+  DEV_PFN(OPTIONAL, vkGetFenceFdKHR)                                    \
+  DEV_PFN(REQUIRED, vkGetFenceStatus)                                   \
+  DEV_PFN(EXCLUDED, vkGetImageDrmFormatModifierPropertiesEXT)           \
+  DEV_PFN(REQUIRED, vkGetImageMemoryRequirements)                       \
+  DEV_PFN(EXCLUDED, vkGetImageMemoryRequirements2)                      \
+  DEV_PFN(EXCLUDED, vkGetImageMemoryRequirements2KHR)                   \
+  DEV_PFN(EXCLUDED, vkGetImageSparseMemoryRequirements)                 \
+  DEV_PFN(EXCLUDED, vkGetImageSparseMemoryRequirements2)                \
+  DEV_PFN(EXCLUDED, vkGetImageSparseMemoryRequirements2KHR)             \
+  DEV_PFN(EXCLUDED, vkGetImageSubresourceLayout)                        \
+  DEV_PFN(EXCLUDED, vkGetImageViewHandleNVX)                            \
+  DEV_PFN(EXCLUDED, vkGetMemoryFdKHR)                                   \
+  DEV_PFN(EXCLUDED, vkGetMemoryFdPropertiesKHR)                         \
+  DEV_PFN(EXCLUDED, vkGetMemoryHostPointerPropertiesEXT)                \
+  DEV_PFN(EXCLUDED, vkGetPastPresentationTimingGOOGLE)                  \
+  DEV_PFN(REQUIRED, vkGetPipelineCacheData)                             \
+  DEV_PFN(REQUIRED, vkGetQueryPoolResults)                              \
+  DEV_PFN(EXCLUDED, vkGetRayTracingShaderGroupHandlesNV)                \
+  DEV_PFN(EXCLUDED, vkGetRefreshCycleDurationGOOGLE)                    \
+  DEV_PFN(EXCLUDED, vkGetRenderAreaGranularity)                         \
+  DEV_PFN(OPTIONAL, vkGetSemaphoreFdKHR)                                \
+  DEV_PFN(EXCLUDED, vkGetShaderInfoAMD)                                 \
+  DEV_PFN(EXCLUDED, vkGetSwapchainCounterEXT)                           \
+  DEV_PFN(EXCLUDED, vkGetSwapchainImagesKHR)                            \
+  DEV_PFN(EXCLUDED, vkGetSwapchainStatusKHR)                            \
+  DEV_PFN(EXCLUDED, vkGetValidationCacheDataEXT)                        \
+  DEV_PFN(OPTIONAL, vkImportFenceFdKHR)                                 \
+  DEV_PFN(OPTIONAL, vkImportSemaphoreFdKHR)                             \
+  DEV_PFN(REQUIRED, vkInvalidateMappedMemoryRanges)                     \
+  DEV_PFN(REQUIRED, vkMapMemory)                                        \
+  DEV_PFN(REQUIRED, vkMergePipelineCaches)                              \
+  DEV_PFN(EXCLUDED, vkMergeValidationCachesEXT)                         \
+  DEV_PFN(EXCLUDED, vkRegisterDeviceEventEXT)                           \
+  DEV_PFN(EXCLUDED, vkRegisterDisplayEventEXT)                          \
+  DEV_PFN(EXCLUDED, vkRegisterObjectsNVX)                               \
+  DEV_PFN(EXCLUDED, vkResetCommandPool)                                 \
+  DEV_PFN(REQUIRED, vkResetDescriptorPool)                              \
+  DEV_PFN(REQUIRED, vkResetEvent)                                       \
+  DEV_PFN(REQUIRED, vkResetFences)                                      \
+  DEV_PFN(OPTIONAL, vkResetQueryPool)                                   \
+  DEV_PFN(OPTIONAL, vkResetQueryPoolEXT)                                \
+  DEV_PFN(OPTIONAL, vkSetDebugUtilsObjectNameEXT)                       \
+  DEV_PFN(OPTIONAL, vkSetDebugUtilsObjectTagEXT)                        \
+  DEV_PFN(REQUIRED, vkSetEvent)                                         \
+  DEV_PFN(EXCLUDED, vkSetHdrMetadataEXT)                                \
+  DEV_PFN(EXCLUDED, vkSetLocalDimmingAMD)                               \
+  DEV_PFN(EXCLUDED, vkTrimCommandPool)                                  \
+  DEV_PFN(EXCLUDED, vkTrimCommandPoolKHR)                               \
+  DEV_PFN(REQUIRED, vkUnmapMemory)                                      \
+  DEV_PFN(EXCLUDED, vkUnregisterObjectsNVX)                             \
+  DEV_PFN(EXCLUDED, vkUpdateDescriptorSetWithTemplate)                  \
+  DEV_PFN(EXCLUDED, vkUpdateDescriptorSetWithTemplateKHR)               \
+  DEV_PFN(REQUIRED, vkUpdateDescriptorSets)                             \
+  DEV_PFN(REQUIRED, vkWaitForFences)                                    \
+                                                                        \
+  DEV_PFN(OPTIONAL, vkGetSemaphoreCounterValue)                         \
+  DEV_PFN(OPTIONAL, vkGetSemaphoreCounterValueKHR)                      \
+  DEV_PFN(OPTIONAL, vkWaitSemaphores)                                   \
+  DEV_PFN(OPTIONAL, vkWaitSemaphoresKHR)                                \
+  DEV_PFN(OPTIONAL, vkSignalSemaphore)                                  \
+  DEV_PFN(OPTIONAL, vkSignalSemaphoreKHR)                               \
+                                                                        \
+  INS_PFN(EXCLUDED, vkCreateDebugReportCallbackEXT)                     \
+  INS_PFN(OPTIONAL, vkCreateDebugUtilsMessengerEXT)                     \
+  INS_PFN(EXCLUDED, vkCreateDisplayPlaneSurfaceKHR)                     \
+  INS_PFN(EXCLUDED, vkCreateHeadlessSurfaceEXT)                         \
+  INS_PFN(EXCLUDED, vkDebugReportMessageEXT)                            \
+  INS_PFN(EXCLUDED, vkDestroyDebugReportCallbackEXT)                    \
+  INS_PFN(OPTIONAL, vkDestroyDebugUtilsMessengerEXT)                    \
+  INS_PFN(REQUIRED, vkDestroyInstance)                                  \
+  INS_PFN(EXCLUDED, vkDestroySurfaceKHR)                                \
+  INS_PFN(EXCLUDED, vkEnumeratePhysicalDeviceGroups)                    \
+  INS_PFN(EXCLUDED, vkEnumeratePhysicalDeviceGroupsKHR)                 \
+  INS_PFN(REQUIRED, vkEnumeratePhysicalDevices)                         \
+  INS_PFN(EXCLUDED, vkSubmitDebugUtilsMessageEXT)                       \
+  INS_PFN(REQUIRED, vkCreateDevice)                                     \
+  INS_PFN(EXCLUDED, vkCreateDisplayModeKHR)                             \
+  INS_PFN(REQUIRED, vkEnumerateDeviceExtensionProperties)               \
+  INS_PFN(REQUIRED, vkEnumerateDeviceLayerProperties)                   \
+  INS_PFN(EXCLUDED, vkGetDisplayModeProperties2KHR)                     \
+  INS_PFN(EXCLUDED, vkGetDisplayModePropertiesKHR)                      \
+  INS_PFN(EXCLUDED, vkGetDisplayPlaneCapabilities2KHR)                  \
+  INS_PFN(EXCLUDED, vkGetDisplayPlaneCapabilitiesKHR)                   \
+  INS_PFN(EXCLUDED, vkGetDisplayPlaneSupportedDisplaysKHR)              \
+  INS_PFN(OPTIONAL, vkGetPhysicalDeviceCalibrateableTimeDomainsEXT)     \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)   \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceDisplayPlaneProperties2KHR)      \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceDisplayPlanePropertiesKHR)       \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceDisplayProperties2KHR)           \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceDisplayPropertiesKHR)            \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceExternalBufferProperties)        \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceExternalBufferPropertiesKHR)     \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceExternalFenceProperties)         \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceExternalFencePropertiesKHR)      \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceExternalImageFormatPropertiesNV) \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceExternalSemaphoreProperties)     \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceExternalSemaphorePropertiesKHR)  \
+  INS_PFN(REQUIRED, vkGetPhysicalDeviceFeatures)                        \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceFeatures2)                       \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceFeatures2KHR)                    \
+  INS_PFN(REQUIRED, vkGetPhysicalDeviceFormatProperties)                \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceFormatProperties2)               \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceFormatProperties2KHR)            \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceGeneratedCommandsPropertiesNVX)  \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceImageFormatProperties)           \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceImageFormatProperties2)          \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceImageFormatProperties2KHR)       \
+  INS_PFN(REQUIRED, vkGetPhysicalDeviceMemoryProperties)                \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceMemoryProperties2)               \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceMemoryProperties2KHR)            \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceMultisamplePropertiesEXT)        \
+  INS_PFN(EXCLUDED, vkGetPhysicalDevicePresentRectanglesKHR)            \
+  INS_PFN(REQUIRED, vkGetPhysicalDeviceProperties)                      \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceProperties2)                     \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceProperties2KHR)                  \
+  INS_PFN(REQUIRED, vkGetPhysicalDeviceQueueFamilyProperties)           \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceQueueFamilyProperties2)          \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceQueueFamilyProperties2KHR)       \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSparseImageFormatProperties)     \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSparseImageFormatProperties2)    \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSparseImageFormatProperties2KHR) \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSurfaceCapabilities2EXT)         \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSurfaceCapabilities2KHR)         \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSurfaceCapabilitiesKHR)          \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSurfaceFormats2KHR)              \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSurfaceFormatsKHR)               \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSurfacePresentModesKHR)          \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSurfaceSupportKHR)               \
+  INS_PFN(EXCLUDED, vkReleaseDisplayEXT)                                \
+  DEV_PFN(EXCLUDED, vkGetQueueCheckpointDataNV)                         \
+  DEV_PFN(OPTIONAL, vkQueueBeginDebugUtilsLabelEXT)                     \
+  DEV_PFN(EXCLUDED, vkQueueBindSparse)                                  \
+  DEV_PFN(OPTIONAL, vkQueueEndDebugUtilsLabelEXT)                       \
+  DEV_PFN(OPTIONAL, vkQueueInsertDebugUtilsLabelEXT)                    \
+  DEV_PFN(EXCLUDED, vkQueuePresentKHR)                                  \
+  DEV_PFN(REQUIRED, vkQueueSubmit)                                      \
+  DEV_PFN(REQUIRED, vkQueueWaitIdle)
+
+// Per-platform symbol tables: each macro expands to the platform-specific
+// instance/device entries when the corresponding VK_USE_PLATFORM_* macro is
+// defined by the Vulkan headers, and expands to nothing otherwise so the
+// combined tables below remain valid on every platform.
+#ifdef VK_USE_PLATFORM_ANDROID_KHR
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_ANDROID_KHR(INS_PFN, DEV_PFN) \
+  DEV_PFN(OPTIONAL, vkGetAndroidHardwareBufferPropertiesANDROID)       \
+  DEV_PFN(OPTIONAL, vkGetMemoryAndroidHardwareBufferANDROID)           \
+  INS_PFN(EXCLUDED, vkCreateAndroidSurfaceKHR)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_ANDROID_KHR(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_ANDROID_KHR
+
+// Google Games Platform surface creation (swapchain unused; EXCLUDED).
+#ifdef VK_USE_PLATFORM_GGP
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_GGP(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkCreateStreamDescriptorSurfaceGGP)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_GGP(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_GGP
+
+// MoltenVK iOS surface creation (swapchain unused; EXCLUDED).
+#ifdef VK_USE_PLATFORM_IOS_MVK
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_IOS_MVK(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkCreateIOSSurfaceMVK)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_IOS_MVK(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_IOS_MVK
+
+// NOTE(review): the macro name was previously misspelled "FUSCHIA"; renamed
+// to FUCHSIA to match the VK_USE_PLATFORM_FUCHSIA guard and the
+// vkCreateImagePipeSurfaceFUCHSIA symbol. The only use is in
+// IREE_VULKAN_DYNAMIC_SYMBOL_PLATFORM_TABLES below, updated in tandem.
+#ifdef VK_USE_PLATFORM_FUCHSIA
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_FUCHSIA(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkCreateImagePipeSurfaceFUCHSIA)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_FUCHSIA(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_FUCHSIA
+
+#ifdef VK_USE_PLATFORM_MACOS_MVK
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_MACOS_MVK(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkCreateMacOSSurfaceMVK)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_MACOS_MVK(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_MACOS_MVK
+
+#ifdef VK_USE_PLATFORM_METAL_EXT
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_METAL_EXT(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkCreateMetalSurfaceEXT)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_METAL_EXT(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_METAL_EXT
+
+#ifdef VK_USE_PLATFORM_VI_NN
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_VI_NN(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkCreateViSurfaceNN)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_VI_NN(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_VI_NN
+
+#ifdef VK_USE_PLATFORM_WAYLAND_KHR
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_WAYLAND_KHR(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkCreateWaylandSurfaceKHR)                         \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceWaylandPresentationSupportKHR)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_WAYLAND_KHR(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_WAYLAND_KHR
+
+#ifdef VK_USE_PLATFORM_WIN32_KHR
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_WIN32_KHR(INS_PFN, DEV_PFN) \
+  DEV_PFN(EXCLUDED, vkAcquireFullScreenExclusiveModeEXT)             \
+  DEV_PFN(EXCLUDED, vkGetDeviceGroupSurfacePresentModes2EXT)         \
+  DEV_PFN(REQUIRED, vkGetFenceWin32HandleKHR)                        \
+  DEV_PFN(EXCLUDED, vkGetMemoryWin32HandleKHR)                       \
+  DEV_PFN(EXCLUDED, vkGetMemoryWin32HandleNV)                        \
+  DEV_PFN(EXCLUDED, vkGetMemoryWin32HandlePropertiesKHR)             \
+  DEV_PFN(REQUIRED, vkGetSemaphoreWin32HandleKHR)                    \
+  DEV_PFN(REQUIRED, vkImportFenceWin32HandleKHR)                     \
+  DEV_PFN(REQUIRED, vkImportSemaphoreWin32HandleKHR)                 \
+  DEV_PFN(EXCLUDED, vkReleaseFullScreenExclusiveModeEXT)             \
+  INS_PFN(EXCLUDED, vkCreateWin32SurfaceKHR)                         \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceSurfacePresentModes2EXT)      \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceWin32PresentationSupportKHR)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_WIN32_KHR(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_WIN32_KHR
+
+#ifdef VK_USE_PLATFORM_XCB_KHR
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_XCB_KHR(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkCreateXcbSurfaceKHR)                         \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceXcbPresentationSupportKHR)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_XCB_KHR(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_XCB_KHR
+
+#ifdef VK_USE_PLATFORM_XLIB_KHR
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_XLIB_KHR(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkCreateXlibSurfaceKHR)                         \
+  INS_PFN(EXCLUDED, vkGetPhysicalDeviceXlibPresentationSupportKHR)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_XLIB_KHR(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_XLIB_KHR
+
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_XLIB_XRANDR_EXT(INS_PFN, DEV_PFN) \
+  INS_PFN(EXCLUDED, vkAcquireXlibDisplayEXT)                               \
+  INS_PFN(EXCLUDED, vkGetRandROutputDisplayEXT)
+#else
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_XLIB_XRANDR_EXT(INS_PFN, DEV_PFN)
+#endif  // VK_USE_PLATFORM_XLIB_XRANDR_EXT
+
+// Concatenation of all per-platform tables above. Platforms whose
+// VK_USE_PLATFORM_* macro is not defined contribute nothing.
+#define IREE_VULKAN_DYNAMIC_SYMBOL_PLATFORM_TABLES(INS_PFN, DEV_PFN) \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_ANDROID_KHR(INS_PFN, DEV_PFN)     \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_GGP(INS_PFN, DEV_PFN)             \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_IOS_MVK(INS_PFN, DEV_PFN)         \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_FUCHSIA(INS_PFN, DEV_PFN)         \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_MACOS_MVK(INS_PFN, DEV_PFN)       \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_METAL_EXT(INS_PFN, DEV_PFN)       \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_VI_NN(INS_PFN, DEV_PFN)           \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_WAYLAND_KHR(INS_PFN, DEV_PFN)     \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_WIN32_KHR(INS_PFN, DEV_PFN)       \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_XCB_KHR(INS_PFN, DEV_PFN)         \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_XLIB_KHR(INS_PFN, DEV_PFN)        \
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLE_XLIB_XRANDR_EXT(INS_PFN, DEV_PFN)
+
+// All functions that require an instance (or device) to resolve: the common
+// table plus whatever platform tables are enabled at compile time.
+#define IREE_VULKAN_DYNAMIC_SYMBOL_INSTANCE_DEVICE_TABLES(INS_PFN, DEV_PFN) \
+  IREE_VULKAN_DYNAMIC_SYMBOL_COMMON_TABLE(INS_PFN, DEV_PFN)                 \
+  IREE_VULKAN_DYNAMIC_SYMBOL_PLATFORM_TABLES(INS_PFN, DEV_PFN)
+
+// Every symbol known to the loader, including the instanceless bootstrap
+// functions that can be resolved with a NULL instance.
+#define IREE_VULKAN_DYNAMIC_SYMBOL_TABLES(INS_PFN, DEV_PFN) \
+  IREE_VULKAN_DYNAMIC_SYMBOL_INSTANCELESS_TABLE(INS_PFN)    \
+  IREE_VULKAN_DYNAMIC_SYMBOL_COMMON_TABLE(INS_PFN, DEV_PFN) \
+  IREE_VULKAN_DYNAMIC_SYMBOL_PLATFORM_TABLES(INS_PFN, DEV_PFN)
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_DYNAMIC_SYMBOL_TABLES_H_
diff --git a/runtime/src/iree/hal/vulkan/dynamic_symbols.cc b/runtime/src/iree/hal/vulkan/dynamic_symbols.cc
new file mode 100644
index 0000000..1b2bc9d
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/dynamic_symbols.cc
@@ -0,0 +1,265 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/dynamic_symbols.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbol_tables.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// Read-only table of function pointer information designed to be in .rdata.
+// To reduce binary size this structure is packed (knowing that we won't have
+// gigabytes of function pointers :).
+struct FunctionPtrInfo {
+  // Name of the function (like 'vkSomeFunction').
+  const char* function_name;
+  // 1 if the function pointer can be resolved via vkGetDeviceProcAddr.
+  uint32_t is_device : 1;
+  // 1 if the function is required and the loader should bail if not found.
+  uint32_t is_required : 1;
+  // TODO(benvanik): remove from table by manually walking sizeof(uintptr_t).
+  // An offset in bytes from the base of &syms to where the PFN_vkSomeFunction
+  // member is located. Populated via offsetof(DynamicSymbols, ...) by the
+  // expander macros below; 30 bits is far more than any realistic struct size.
+  uint32_t member_offset : 30;
+} IREE_ATTRIBUTE_PACKED;
+
+namespace {
+
+// X-macro expanders: REQUIRED/OPTIONAL produce a FunctionPtrInfo aggregate
+// initializer for a symbol table entry while EXCLUDED expands to nothing.
+// The INS_/DEV_ wrappers tag entries as instance- or device-resolvable and
+// dispatch to the REQUIRED/OPTIONAL/EXCLUDED expander via token pasting.
+#define REQUIRED_PFN_FUNCTION_PTR(function_name, is_device) \
+  {#function_name, is_device, 1, offsetof(DynamicSymbols, function_name)},
+#define OPTIONAL_PFN_FUNCTION_PTR(function_name, is_device) \
+  {#function_name, is_device, 0, offsetof(DynamicSymbols, function_name)},
+#define EXCLUDED_PFN_FUNCTION_PTR(function_name, is_device)
+#define INS_PFN_FUNCTION_PTR(requirement, function_name) \
+  requirement##_PFN_FUNCTION_PTR(function_name, 0)
+#define DEV_PFN_FUNCTION_PTR(requirement, function_name) \
+  requirement##_PFN_FUNCTION_PTR(function_name, 1)
+
+// Defines the table of mandatory FunctionPtrInfos resolved prior to instance
+// creation. These are safe to call with no instance parameter and should be
+// exported by all loaders/ICDs.
+static constexpr const FunctionPtrInfo kInstancelessFunctionPtrInfos[] = {
+    IREE_VULKAN_DYNAMIC_SYMBOL_INSTANCELESS_TABLE(INS_PFN_FUNCTION_PTR)};
+
+// Defines the table of FunctionPtrInfos for dynamic loading that must wait
+// until an instance has been created to be resolved.
+static constexpr const FunctionPtrInfo kDynamicFunctionPtrInfos[] = {
+    IREE_VULKAN_DYNAMIC_SYMBOL_INSTANCE_DEVICE_TABLES(INS_PFN_FUNCTION_PTR,
+                                                      DEV_PFN_FUNCTION_PTR)};
+
+// Candidate file names for the Vulkan loader shared library, selected per
+// target platform at compile time and tried in order by
+// iree_dynamic_library_load_from_files.
+static const char* kVulkanLoaderSearchNames[] = {
+#if defined(IREE_PLATFORM_ANDROID)
+    "libvulkan.so",
+#elif defined(IREE_PLATFORM_IOS) || defined(IREE_PLATFORM_MACOS)
+    "libvulkan.dylib",
+#elif defined(IREE_PLATFORM_WINDOWS)
+    "vulkan-1.dll",
+#else
+    "libvulkan.so.1",
+#endif  // IREE_PLATFORM_ANDROID
+};
+
+// Resolves the bootstrap vkGetInstanceProcAddr entry point via
+// |get_proc_addr| and then uses it (with a VK_NULL_HANDLE instance) to
+// populate all instanceless function pointers in |syms|.
+//
+// Returns IREE_STATUS_UNAVAILABLE if vkGetInstanceProcAddr cannot be found
+// or any mandatory instanceless function fails to resolve.
+iree_status_t ResolveFunctions(
+    DynamicSymbols* syms, const DynamicSymbols::GetProcAddrFn& get_proc_addr) {
+  // Resolve the method the shared object uses to resolve other functions.
+  // Some libraries will export all symbols while others will only export this
+  // single function.
+  syms->vkGetInstanceProcAddr = reinterpret_cast<PFN_vkGetInstanceProcAddr>(
+      get_proc_addr("vkGetInstanceProcAddr"));
+
+#if defined(IREE_PLATFORM_ANDROID)
+  // Since Android 8 Oreo, Android re-architected the OS framework with project
+  // Treble. Framework libraries and vendor libraries have a more strict and
+  // clear separation. Their dependencies are carefully scrutinized and only
+  // selected cases are allowed. This is enforced with linker namespaces.
+  //
+  // /data/local/tmp is the preferred directory for automating native binary
+  // tests built using NDK toolchain. They should be allowed to access libraries
+  // like libvulkan.so for their functionality. However, there was an issue
+  // with fully treblized Android 10 where /data/local/tmp did not have access
+  // to the linker namespaces needed by libvulkan.so. This is fixed via
+  // https://android.googlesource.com/platform/system/linkerconfig/+/296da5b1eb88a3527ee76352c2d987f82f3252eb
+  //
+  // But as typically in the Android system, it takes a long time to see the
+  // fix getting propagated, if ever. A known workaround is to symlink the
+  // vendor Vulkan implementation under /vendor/lib[64]/hw/vulkan.*.so as
+  // libvulkan.so under /data/local/tmp and use LD_LIBRARY_PATH=/data/local/tmp
+  // when invoking the test binaries. This effectively bypasses the Android
+  // Vulkan loader. This is fine for ARM Mali GPUs, whose driver exposes
+  // the symbol `vkGetInstanceProcAddr`. But for Qualcomm Adreno GPUs,
+  // the Vulkan implementation library does not directly expose the symbol.
+  // Instead it's hidden as `qglinternal::vkGetInstanceProcAddr`. So try to
+  // see whether we can get this symbol. This is a reasonable workaround
+  // as otherwise it means we need to wrap. every. single. binary. test.
+  // as. a. full-blown. Android. app.
+  if (!syms->vkGetInstanceProcAddr) {
+    syms->vkGetInstanceProcAddr =
+        reinterpret_cast<PFN_vkGetInstanceProcAddr>(get_proc_addr(
+            // C++ mangled name for "qglinternal::vkGetInstanceProcAddr"
+            "_ZN11qglinternal21vkGetInstanceProcAddrEP12VkInstance_TPKc"));
+  }
+#endif  // IREE_PLATFORM_ANDROID
+
+  if (!syms->vkGetInstanceProcAddr) {
+    return iree_make_status(
+        IREE_STATUS_UNAVAILABLE,
+        "required method vkGetInstanceProcAddr not found in provided Vulkan "
+        "library (did you pick the wrong file?)");
+  }
+
+  // Resolve the mandatory functions that we need to create instances.
+  // If the provided |get_proc_addr| cannot resolve these then it's not a loader
+  // or ICD we want to use, anyway.
+  for (int i = 0; i < IREE_ARRAYSIZE(kInstancelessFunctionPtrInfos); ++i) {
+    const auto& function_ptr = kInstancelessFunctionPtrInfos[i];
+    // member_offset is a byte offset into DynamicSymbols (from offsetof in the
+    // table expanders); compute the address of the PFN member to populate.
+    auto* member_ptr = reinterpret_cast<PFN_vkVoidFunction*>(
+        reinterpret_cast<uint8_t*>(syms) + function_ptr.member_offset);
+    *member_ptr =
+        syms->vkGetInstanceProcAddr(VK_NULL_HANDLE, function_ptr.function_name);
+    if (*member_ptr == nullptr) {
+      return iree_make_status(
+          IREE_STATUS_UNAVAILABLE,
+          "mandatory Vulkan function %s not available; invalid loader/ICD?",
+          function_ptr.function_name);
+    }
+  }
+
+  return iree_ok_status();
+}
+
+}  // namespace
+
+// static
+iree_status_t DynamicSymbols::Create(const GetProcAddrFn& get_proc_addr,
+                                     ref_ptr<DynamicSymbols>* out_syms) {
+  IREE_TRACE_SCOPE0("DynamicSymbols::Create");
+  // Allocate the symbol table and resolve every instanceless function
+  // through the caller-provided lookup callback.
+  ref_ptr<DynamicSymbols> symbols = make_ref<DynamicSymbols>();
+  IREE_RETURN_IF_ERROR(ResolveFunctions(symbols.get(), get_proc_addr));
+  // Alias core entry points to their KHR extension fallbacks where needed.
+  symbols->FixupExtensionFunctions();
+  *out_syms = std::move(symbols);
+  return iree_ok_status();
+}
+
+// static
+// Loads the Vulkan loader shared library by trying the platform-specific
+// kVulkanLoaderSearchNames and resolves all symbols from it. The library
+// handle is stored on the returned DynamicSymbols and released in its
+// destructor, so it stays alive as long as any resolved pointer may be used.
+iree_status_t DynamicSymbols::CreateFromSystemLoader(
+    ref_ptr<DynamicSymbols>* out_syms) {
+  IREE_TRACE_SCOPE0("DynamicSymbols::CreateFromSystemLoader");
+
+  iree_dynamic_library_t* loader_library = NULL;
+  iree_status_t status = iree_dynamic_library_load_from_files(
+      IREE_ARRAYSIZE(kVulkanLoaderSearchNames), kVulkanLoaderSearchNames,
+      IREE_DYNAMIC_LIBRARY_FLAG_NONE, iree_allocator_system(), &loader_library);
+  // Map NOT_FOUND to UNAVAILABLE with a friendlier message; other failures
+  // propagate as-is.
+  if (iree_status_is_not_found(status)) {
+    iree_status_ignore(status);
+    return iree_make_status(
+        IREE_STATUS_UNAVAILABLE,
+        "Vulkan runtime library not available; ensure installed and on path");
+  } else if (!iree_status_is_ok(status)) {
+    return status;
+  }
+
+  // Transfer the library handle to the symbols object immediately so it is
+  // released by ~DynamicSymbols even if resolution below fails.
+  auto syms = make_ref<DynamicSymbols>();
+  syms->loader_library_ = loader_library;
+
+  // The lambda swallows per-symbol lookup failures and returns NULL so that
+  // optional symbols simply resolve to nullptr; ResolveFunctions decides
+  // which missing symbols are fatal.
+  IREE_RETURN_IF_ERROR(
+      ResolveFunctions(syms.get(), [loader_library](const char* function_name) {
+        PFN_vkVoidFunction fn = NULL;
+        iree_status_t status = iree_dynamic_library_lookup_symbol(
+            loader_library, function_name, (void**)&fn);
+        if (!iree_status_is_ok(status)) {
+          IREE_IGNORE_ERROR(status);
+          return (PFN_vkVoidFunction)NULL;
+        }
+        return fn;
+      }));
+  syms->FixupExtensionFunctions();
+
+  *out_syms = std::move(syms);
+  return iree_ok_status();
+}
+
+// Resolves symbols using only an instance: delegates to LoadFromDevice with
+// VK_NULL_HANDLE so every entry (including device-scoped ones) is resolved
+// through vkGetInstanceProcAddr.
+iree_status_t DynamicSymbols::LoadFromInstance(VkInstance instance) {
+  IREE_TRACE_SCOPE0("DynamicSymbols::LoadFromInstance");
+  return LoadFromDevice(instance, VK_NULL_HANDLE);
+}
+
+// Resolves all functions in the instance/device tables. Entries tagged
+// is_device are resolved via vkGetDeviceProcAddr when |device| is non-null
+// and via vkGetInstanceProcAddr otherwise (the LoadFromInstance path).
+// Missing REQUIRED entries fail with IREE_STATUS_UNAVAILABLE; OPTIONAL and
+// platform-excluded entries may remain nullptr.
+iree_status_t DynamicSymbols::LoadFromDevice(VkInstance instance,
+                                             VkDevice device) {
+  IREE_TRACE_SCOPE0("DynamicSymbols::LoadFromDevice");
+
+  if (!instance) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "instance must have been created and a default "
+                            "instance proc lookup function is required");
+  }
+
+  // Setup the lookup methods first. The rest of the syms uses these to
+  // resolve function pointers.
+  this->vkGetDeviceProcAddr = reinterpret_cast<PFN_vkGetDeviceProcAddr>(
+      this->vkGetInstanceProcAddr(instance, "vkGetDeviceProcAddr"));
+  if (!this->vkGetDeviceProcAddr) {
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "required Vulkan function vkGetDeviceProcAddr not "
+                            "available; invalid driver handle?");
+  }
+
+  // Load the rest of the functions.
+  for (int i = 0; i < IREE_ARRAYSIZE(kDynamicFunctionPtrInfos); ++i) {
+    const auto& function_ptr = kDynamicFunctionPtrInfos[i];
+    // member_offset is a byte offset into this struct (from offsetof in the
+    // table expanders); compute the address of the PFN member to populate.
+    auto* member_ptr = reinterpret_cast<PFN_vkVoidFunction*>(
+        reinterpret_cast<uint8_t*>(this) + function_ptr.member_offset);
+    if (function_ptr.is_device && device) {
+      *member_ptr =
+          this->vkGetDeviceProcAddr(device, function_ptr.function_name);
+    } else {
+      *member_ptr =
+          this->vkGetInstanceProcAddr(instance, function_ptr.function_name);
+    }
+    if (*member_ptr == nullptr && function_ptr.is_required) {
+      return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                              "required Vulkan function %s not available",
+                              function_ptr.function_name);
+    }
+  }
+
+  FixupExtensionFunctions();
+
+  return iree_ok_status();
+}
+
+DynamicSymbols::DynamicSymbols() = default;
+
+// Releases the Vulkan loader library handle when this instance loaded it
+// (CreateFromSystemLoader sets loader_library_; other creation paths leave
+// it null).
+DynamicSymbols::~DynamicSymbols() {
+  if (loader_library_) {
+    iree_dynamic_library_release(loader_library_);
+  }
+}
+
+// Backfills core timeline-semaphore entry points with their KHR extension
+// equivalents when only the extension variants were resolved, letting callers
+// always use the core names.
+void DynamicSymbols::FixupExtensionFunctions() {
+  if (!this->vkGetSemaphoreCounterValue) {
+    this->vkGetSemaphoreCounterValue = this->vkGetSemaphoreCounterValueKHR;
+  }
+  if (!this->vkWaitSemaphores) {
+    this->vkWaitSemaphores = this->vkWaitSemaphoresKHR;
+  }
+  if (!this->vkSignalSemaphore) {
+    this->vkSignalSemaphore = this->vkSignalSemaphoreKHR;
+  }
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/dynamic_symbols.h b/runtime/src/iree/hal/vulkan/dynamic_symbols.h
new file mode 100644
index 0000000..d02c0ea
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/dynamic_symbols.h
@@ -0,0 +1,128 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_DYNAMIC_SYMBOLS_H_
+#define IREE_HAL_VULKAN_DYNAMIC_SYMBOLS_H_
+
+// clang-format off: must be included before all other headers.
+#include "iree/hal/vulkan/vulkan_headers.h"  // IWYU pragma: export
+// clang-format on
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/dynamic_library.h"
+#include "iree/base/status_cc.h"
+#include "iree/hal/vulkan/dynamic_symbol_tables.h"  // IWYU pragma: export
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+struct FunctionPtrInfo;
+
+// Dynamic Vulkan function loader for use with vulkan.hpp.
+// This loader is a subset of the DispatchLoaderDynamic implementation that only
+// loads functions we are interested in (a compute-specific subset) and avoids
+// extensions we will never use.
+//
+// This exposes all Vulkan methods as function pointer members. Optional
+// methods will be nullptr if not present. Excluded methods will be omitted.
+//
+// DynamicSymbols instances are designed to be passed to vulkan.hpp methods as
+// the last argument, though they may also be called directly.
+// **Always make sure to pass the loader to vulkan.hpp methods!**
+//
+// Loading is performed by walking a table of required and optional functions
+// (defined in dynamic_symbol_tables.h) and populating the member function
+// pointers exposed on this struct when available. For example, if the
+// vkSomeFunction method is marked in the table as OPTIONAL the loader will
+// attempt to lookup the function and if successful set the
+// DynamicSymbols::vkSomeFunction pointer to the resolved address. If the
+// function is not found then it will be set to nullptr so users can check for
+// function availability.
+//
+// Documentation:
+// https://github.com/KhronosGroup/Vulkan-Hpp#extensions--per-device-function-pointers
+//
+// Usage:
+//  IREE_ASSIGN_OR_RETURN(auto syms, DynamicSymbols::CreateFromSystemLoader());
+//  VkInstance instance = VK_NULL_HANDLE;
+//  syms->vkCreateInstance(..., &instance);
+//  IREE_RETURN_IF_ERROR(syms->LoadFromInstance(instance));
+struct DynamicSymbols : public RefObject<DynamicSymbols> {
+  // Callback used during Create() to resolve a function pointer by name.
+  using GetProcAddrFn =
+      std::function<PFN_vkVoidFunction(const char* function_name)>;
+
+  DynamicSymbols();
+  ~DynamicSymbols();
+
+  // Creates the dynamic symbol table using the given |get_proc_addr| to resolve
+  // the vkCreateInstance function.
+  //
+  // After the instance is created the caller must use LoadFromInstance (or
+  // LoadFromDevice) to load the remaining symbols.
+  static iree_status_t Create(const GetProcAddrFn& get_proc_addr,
+                              ref_ptr<DynamicSymbols>* out_syms);
+
+  // Loads all required and optional Vulkan functions from the Vulkan loader.
+  // This will look for a Vulkan loader on the system (like libvulkan.so) and
+  // dlsym the functions from that.
+  //
+  // The loaded function pointers will point to thunks in the ICD. This may
+  // enable additional debug checking and more readable stack traces (as
+  // errors come from within the ICD, where we have symbols).
+  static iree_status_t CreateFromSystemLoader(
+      ref_ptr<DynamicSymbols>* out_syms);
+
+  // Loads all required and optional Vulkan functions from the given instance.
+  //
+  // The loaded function pointers will point to thunks in the ICD. This may
+  // enable additional debug checking and more readable stack traces (as
+  // errors come from within the ICD, where we have symbols).
+  iree_status_t LoadFromInstance(VkInstance instance);
+
+  // Loads all required and optional Vulkan functions from the given device,
+  // falling back to the instance when required.
+  //
+  // This attempts to directly query the methods from the device, bypassing any
+  // ICD or shim layers. These methods will generally have less overhead at
+  // runtime as they need not jump through the various trampolines.
+  iree_status_t LoadFromDevice(VkInstance instance, VkDevice device);
+
+  // Define members for each function pointer.
+  // See dynamic_symbol_tables.h for the full list of methods.
+  //
+  // Each required and optional function in the loader tables will expand to
+  // the following member, such as for example 'vkSomeFunction':
+  //   PFN_vkSomeFunction vkSomeFunction;
+#define REQUIRED_PFN(function_name) PFN_##function_name function_name = nullptr
+#define OPTIONAL_PFN(function_name) PFN_##function_name function_name = nullptr
+#define EXCLUDED_PFN(function_name)
+#define PFN_MEMBER(requirement, function_name) requirement##_PFN(function_name);
+  // The two bootstrap resolvers are declared explicitly; all remaining
+  // members are stamped out from the tables in dynamic_symbol_tables.h.
+  REQUIRED_PFN(vkGetInstanceProcAddr);
+  REQUIRED_PFN(vkGetDeviceProcAddr);
+  IREE_VULKAN_DYNAMIC_SYMBOL_TABLES(PFN_MEMBER, PFN_MEMBER);
+#undef REQUIRED_PFN
+#undef OPTIONAL_PFN
+#undef EXCLUDED_PFN
+#undef PFN_MEMBER
+
+ private:
+  // Promotes KHR-suffixed extension entry points into the core-named members
+  // when the core symbols were not resolved.
+  void FixupExtensionFunctions();
+
+  // Optional Vulkan Loader dynamic library.
+  iree_dynamic_library_t* loader_library_ = nullptr;
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_DYNAMIC_SYMBOLS_H_
diff --git a/runtime/src/iree/hal/vulkan/dynamic_symbols_test.cc b/runtime/src/iree/hal/vulkan/dynamic_symbols_test.cc
new file mode 100644
index 0000000..4d96c92
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/dynamic_symbols_test.cc
@@ -0,0 +1,63 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/dynamic_symbols.h"
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+namespace {
+
+// Builds the VkApplicationInfo for the test instance. Every field is
+// assigned explicitly so the struct carries no uninitialized state.
+VkApplicationInfo GetApplicationInfo() {
+  VkApplicationInfo info = {};
+  info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+  info.pNext = nullptr;
+  info.pApplicationName = "IREE-ML-TEST";
+  info.applicationVersion = 0;
+  info.pEngineName = "IREE";
+  info.engineVersion = 0;
+  info.apiVersion = VK_API_VERSION_1_0;
+  return info;
+}
+
+// Builds a minimal VkInstanceCreateInfo referencing |app_info| with no
+// layers or extensions enabled.
+VkInstanceCreateInfo GetInstanceCreateInfo(VkApplicationInfo* app_info) {
+  VkInstanceCreateInfo info = {};
+  info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+  info.pNext = nullptr;
+  info.flags = 0;
+  info.pApplicationInfo = app_info;
+  info.enabledLayerCount = 0;
+  info.ppEnabledLayerNames = nullptr;
+  info.enabledExtensionCount = 0;
+  info.ppEnabledExtensionNames = nullptr;
+  return info;
+}
+
+// End-to-end smoke test: resolves symbols via the system Vulkan loader,
+// uses them to create/destroy a VkInstance, and loads the instance-level
+// symbols. Requires a working Vulkan loader/ICD on the host.
+TEST(DynamicSymbolsTest, CreateFromSystemLoader) {
+  iree::ref_ptr<iree::hal::vulkan::DynamicSymbols> syms;
+  IREE_ASSERT_OK(DynamicSymbols::CreateFromSystemLoader(&syms));
+
+  // Create and destroy a VkInstance using the symbols. This is mainly testing
+  // that the symbols were loaded successfully and are actually able to be used.
+  VkApplicationInfo app_info = GetApplicationInfo();
+  VkInstanceCreateInfo create_info = GetInstanceCreateInfo(&app_info);
+  VkInstance instance = VK_NULL_HANDLE;
+  ASSERT_EQ(VK_SUCCESS, syms->vkCreateInstance(
+                            &create_info, /*pAllocator=*/nullptr, &instance));
+
+  IREE_ASSERT_OK(syms->LoadFromInstance(instance));
+
+  syms->vkDestroyInstance(instance, /*pAllocator=*/nullptr);
+}
+
+}  // namespace
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/emulated_semaphore.cc b/runtime/src/iree/hal/vulkan/emulated_semaphore.cc
new file mode 100644
index 0000000..f83f20e
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/emulated_semaphore.cc
@@ -0,0 +1,649 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/emulated_semaphore.h"
+
+#include <atomic>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/logging.h"
+#include "iree/base/status_cc.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/serializing_command_queue.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/intrusive_list.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+// Scoped guard for an iree_slim_mutex_t: acquires the lock on construction
+// and releases it on destruction (the slim-mutex analogue of
+// std::lock_guard). Thread-safety analysis is suppressed since the C mutex
+// type carries no capability annotations the analyzer can track here.
+class RAIILock {
+ public:
+  explicit RAIILock(iree_slim_mutex_t* mutex)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis)
+      : mutex_(mutex) {
+    iree_slim_mutex_lock(mutex_);
+  }
+  ~RAIILock() IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    iree_slim_mutex_unlock(mutex_);
+  }
+
+ private:
+  iree_slim_mutex_t* mutex_;
+};
+
+}  // namespace
+
+// Emulates a Vulkan timeline semaphore using binary VkSemaphores (time
+// points) acquired from a TimePointSemaphorePool plus VkFences to observe
+// GPU-side progress.
+class EmulatedTimelineSemaphore final {
+ public:
+  EmulatedTimelineSemaphore(VkDeviceHandle* logical_device,
+                            TimePointSemaphorePool* semaphore_pool,
+                            iree_host_size_t command_queue_count,
+                            iree::hal::vulkan::CommandQueue** command_queues,
+                            uint64_t initial_value);
+
+  ~EmulatedTimelineSemaphore();
+
+  // Returns the current timeline value, or the stored failure status when the
+  // timeline has been poisoned (value == UINT64_MAX).
+  iree_status_t Query(uint64_t* out_value);
+
+  // Advances the timeline to |value| from the host side.
+  iree_status_t Signal(uint64_t value);
+
+  // Blocks until the timeline reaches |value| or |timeout| elapses.
+  iree_status_t Wait(uint64_t value, iree_timeout_t timeout);
+
+  // Poisons the timeline with |status| (takes ownership of |status|).
+  void Fail(iree_status_t status);
+
+  // Gets a binary semaphore for waiting on the timeline to advance to the given
+  // |value|. The semaphore returned won't be waited by anyone else. Returns
+  // VK_NULL_HANDLE if no available semaphores for the given |value|.
+  // |wait_fence| is the fence associated with the queue submission that waiting
+  // on this semaphore.
+  VkSemaphore GetWaitSemaphore(uint64_t value,
+                               const ref_ptr<TimePointFence>& wait_fence);
+
+  // Cancels the waiting attempt on the given binary |semaphore|. This allows
+  // the |semaphore| to be waited by others.
+  iree_status_t CancelWaitSemaphore(VkSemaphore semaphore);
+
+  // Gets a binary semaphore for signaling the timeline to the given |value|.
+  // |value| must be smaller than the current timeline value. |signal_fence| is
+  // the fence associated with the queue submission that signals this semaphore.
+  iree_status_t GetSignalSemaphore(uint64_t value,
+                                   const ref_ptr<TimePointFence>& signal_fence,
+                                   VkSemaphore* out_handle);
+
+ private:
+  // Tries to advance the timeline to the given |to_upper_value| without
+  // blocking and returns whether the |to_upper_value| is reached.
+  iree_status_t TryToAdvanceTimeline(uint64_t to_upper_value,
+                                     bool* out_reached_upper_value);
+  // Similar to the above, but also returns the fences that are known to have
+  // already signaled via |signaled_fences|.
+  iree_status_t TryToAdvanceTimeline(uint64_t to_upper_value,
+                                     bool* out_reached_upper_value,
+                                     std::vector<VkFence>* out_signaled_fences);
+
+  // Last known signaled timeline value; UINT64_MAX doubles as the failure
+  // sentinel (see status_ below).
+  std::atomic<uint64_t> signaled_value_;
+
+  VkDeviceHandle* logical_device_;
+  TimePointSemaphorePool* semaphore_pool_;
+
+  iree_host_size_t command_queue_count_;
+  CommandQueue** command_queues_;
+
+  mutable iree_slim_mutex_t mutex_;
+
+  // A list of outstanding semaphores used to emulate time points.
+  //
+  // The life time of each semaphore is in one of the following state:
+  //
+  // * Unused state: value = UINT64_MAX, signal/wait fence = nullptr. This is
+  //   the state of the semaphore when it's initially acquired from the pool and
+  //   not put in the queue for emulating a time point yet.
+  // * Pending state: signaled value < value < UINT64_MAX, signal fence =
+  //   <some-fence>, wait fence == nullptr. This is the state of the semaphore
+  //   when it's put into the GPU queue for emulating a time point.
+  // * Pending and waiting state: signaled value < value < UINT64_MAX, signal
+  //   fence = <some-fence>, wait fence == <some-fence>. This is the state of
+  //   the semaphore when it's put into the GPU queue for emulating a time
+  //   point and there is another queue submission waiting on it in GPU.
+  // * Signaled and not ever waited state: value <= signaled value, signal/wait
+  //   fence = nullptr. This is the state of the semaphore when we know it's
+  //   already signaled on GPU and there is no waiters for it.
+  // * Signaled and waiting state: value <= signaled value, signal fence =
+  //   nullptr, wait fence = <some-fence>. This is the state of the semaphore
+  //   when we know it's already signaled on GPU and there is still one queue
+  //   submission on GPU is waiting for it.
+  IntrusiveList<TimePointSemaphore> outstanding_semaphores_
+      IREE_GUARDED_BY(mutex_);
+
+  // NOTE: We only need to access this status (and thus take the lock) when we
+  // want to either signal failure or query the status in the case of the
+  // semaphore being set to UINT64_MAX.
+  iree_status_t status_ IREE_GUARDED_BY(mutex_) = iree_ok_status();
+};
+
+// Constructor: records the device/pool/queue references (not owned) and seeds
+// the timeline at |initial_value|. The mutex guards outstanding_semaphores_
+// and status_.
+EmulatedTimelineSemaphore::EmulatedTimelineSemaphore(
+    VkDeviceHandle* logical_device, TimePointSemaphorePool* semaphore_pool,
+    iree_host_size_t command_queue_count, CommandQueue** command_queues,
+    uint64_t initial_value)
+    : signaled_value_(initial_value),
+      logical_device_(logical_device),
+      semaphore_pool_(semaphore_pool),
+      command_queue_count_(command_queue_count),
+      command_queues_(command_queues) {
+  iree_slim_mutex_initialize(&mutex_);
+}
+
+EmulatedTimelineSemaphore::~EmulatedTimelineSemaphore() {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::dtor");
+  // Drain the timeline as far as possible; any outstanding time point at this
+  // stage is a caller bug and trips the CHECK below.
+  IREE_CHECK_OK(
+      TryToAdvanceTimeline(UINT64_MAX, /*out_reached_upper_value=*/NULL));
+
+  iree_slim_mutex_lock(&mutex_);
+  IREE_CHECK(outstanding_semaphores_.empty())
+      << "Destroying an emulated timeline semaphore without first waiting on "
+         "outstanding signals";
+  // Free any stored failure status before tearing down the mutex.
+  iree_status_free(status_);
+  iree_slim_mutex_unlock(&mutex_);
+  iree_slim_mutex_deinitialize(&mutex_);
+}
+
+// Returns the current signaled timeline value in |out_value|, first giving
+// the timeline a chance to advance. UINT64_MAX is the failure sentinel: when
+// seen, the stored failure status is cloned and returned instead of a value.
+iree_status_t EmulatedTimelineSemaphore::Query(uint64_t* out_value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Query");
+  IREE_DVLOG(2) << "EmulatedTimelineSemaphore::Query";
+  IREE_RETURN_IF_ERROR(
+      TryToAdvanceTimeline(UINT64_MAX, /*out_reached_upper_value=*/NULL));
+  uint64_t value = signaled_value_.load();
+  IREE_DVLOG(2) << "Current timeline value: " << value;
+  if (value == UINT64_MAX) {
+    RAIILock locker(&mutex_);
+    // Clone so the caller owns its copy and status_ remains intact.
+    return iree_status_clone(status_);
+  }
+  *out_value = value;
+  return iree_ok_status();
+}
+
+// Host-side signal: advances the timeline to |value| and nudges all command
+// queues so submissions blocked on the new value can proceed. Values must be
+// signaled in strictly increasing order (CHECKed).
+iree_status_t EmulatedTimelineSemaphore::Signal(uint64_t value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Signal");
+  IREE_DVLOG(2) << "EmulatedTimelineSemaphore::Signal";
+  auto signaled_value = signaled_value_.exchange(value);
+  IREE_DVLOG(2) << "Previous value: " << signaled_value
+                << "; new value: " << value;
+  // Make sure the previous signaled value is smaller than the new value.
+  IREE_CHECK(signaled_value < value)
+      << "Attempting to signal a timeline value out of order; trying " << value
+      << " but " << signaled_value << " already signaled";
+
+  // Inform the device to make progress given we have a new value signaled now.
+  for (iree_host_size_t i = 0; i < command_queue_count_; ++i) {
+    IREE_RETURN_IF_ERROR(((SerializingCommandQueue*)command_queues_[i])
+                             ->AdvanceQueueSubmission());
+  }
+
+  return iree_ok_status();
+}
+
+// Blocks until the timeline reaches |value| or |timeout| expires. Polls the
+// timeline, then waits on the signal fence of the first time point at or
+// beyond |value| when one exists.
+iree_status_t EmulatedTimelineSemaphore::Wait(uint64_t value,
+                                              iree_timeout_t timeout) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Wait");
+  IREE_DVLOG(2) << "EmulatedTimelineSemaphore::Wait";
+
+  iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
+
+  VkFence fence = VK_NULL_HANDLE;
+  do {
+    IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Wait#loop");
+    // First try to advance the timeline without blocking to see whether we've
+    // already reached the desired value.
+    bool reached_desired_value = false;
+    IREE_RETURN_IF_ERROR(TryToAdvanceTimeline(value, &reached_desired_value));
+    if (reached_desired_value) return iree_ok_status();
+
+    // We must wait now. Find the first emulated time point that has a value >=
+    // the desired value so we can wait on its associated signal fence to make
+    // sure the timeline is advanced to the desired value.
+    RAIILock locker(&mutex_);
+    auto semaphore = outstanding_semaphores_.begin();
+    for (; semaphore != outstanding_semaphores_.end(); ++semaphore) {
+      if ((*semaphore)->value >= value) break;
+    }
+    if (semaphore != outstanding_semaphores_.end()) {
+      if (!(*semaphore)->signal_fence) {
+        return iree_make_status(IREE_STATUS_INTERNAL,
+                                "timeline should have a signal fence for the "
+                                "first time point beyond the signaled value");
+      }
+      IREE_DVLOG(2) << "Found timepoint semaphore " << *semaphore
+                    << " (value: " << (*semaphore)->value
+                    << ") to wait for desired timeline value: " << value;
+      fence = (*semaphore)->signal_fence->value();
+      // Found; we can break the loop and proceed to waiting now.
+      break;
+    }
+    // TODO(antiagainst): figure out a better way instead of the busy loop here.
+  } while (iree_time_now() < deadline_ns);
+
+  if (fence == VK_NULL_HANDLE) {
+    // NOTE: not an error; it may be expected that the semaphore is not ready.
+    return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  }
+
+  // Wait on the fence for the remaining time budget, then fold the newly
+  // signaled fence(s) back into the timeline state.
+  uint64_t timeout_ns =
+      static_cast<uint64_t>(iree_absolute_deadline_to_timeout_ns(deadline_ns));
+  VK_RETURN_IF_ERROR(logical_device_->syms()->vkWaitForFences(
+                         *logical_device_, /*fenceCount=*/1, &fence,
+                         /*waitAll=*/true, timeout_ns),
+                     "vkWaitForFences");
+
+  return TryToAdvanceTimeline(value, /*out_reached_upper_value=*/NULL);
+}
+
+// Poisons the timeline: stores |status| (taking ownership) and pins the
+// signaled value to the UINT64_MAX failure sentinel. Only the first failure
+// is kept; later calls drop their status.
+void EmulatedTimelineSemaphore::Fail(iree_status_t status) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Fail");
+  RAIILock locker(&mutex_);
+  if (status_) {
+    // Already failed: keep the first failure status. The incoming |status| is
+    // owned by us and must be released or it leaks (the original code
+    // returned without freeing it).
+    iree_status_ignore(status);
+    return;
+  }
+  status_ = status;
+  signaled_value_.store(UINT64_MAX);
+}
+
+// Returns a binary semaphore whose pending value exceeds |value|, attaching
+// |wait_fence| to it, or VK_NULL_HANDLE when no suitable time point exists.
+VkSemaphore EmulatedTimelineSemaphore::GetWaitSemaphore(
+    uint64_t value, const ref_ptr<TimePointFence>& wait_fence) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::GetWaitSemaphore");
+  IREE_DVLOG(2) << "EmulatedTimelineSemaphore::GetWaitSemaphore";
+
+  RAIILock locker(&mutex_);
+
+  VkSemaphore semaphore = VK_NULL_HANDLE;
+  for (TimePointSemaphore* point : outstanding_semaphores_) {
+    // NOTE(review): the header documents that the returned semaphore "won't
+    // be waited by anyone else", which suggests this should select a point
+    // with NO wait fence (!point->wait_fence); as written it selects one that
+    // already has a waiter and overwrites its fence — confirm intent.
+    if (point->value > value && point->wait_fence) {
+      point->wait_fence = add_ref(wait_fence);
+      semaphore = point->semaphore;
+      break;
+    }
+  }
+
+  IREE_DVLOG(2) << "Binary VkSemaphore to wait on for timeline value (" << value
+                << ") and wait fence (" << wait_fence.get()
+                << "): " << semaphore;
+
+  return semaphore;
+}
+
+// Cancels a previously registered wait on |semaphore| by clearing its wait
+// fence, making the time point available to other waiters again.
+iree_status_t EmulatedTimelineSemaphore::CancelWaitSemaphore(
+    VkSemaphore semaphore) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::CancelWaitSemaphore");
+  IREE_DVLOG(2) << "EmulatedTimelineSemaphore::CancelWaitSemaphore";
+
+  RAIILock locker(&mutex_);
+  for (TimePointSemaphore* point : outstanding_semaphores_) {
+    if (point->semaphore == semaphore) {
+      if (!point->wait_fence) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "time point wasn't waited before");
+      }
+      point->wait_fence = nullptr;
+      IREE_DVLOG(2) << "Cancelled waiting on binary VkSemaphore: " << semaphore;
+      return iree_ok_status();
+    }
+  }
+  return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                          "no time point for the given semaphore");
+}
+
+// Acquires a pooled binary semaphore that will signal the timeline to |value|
+// and inserts it (sorted by value) into the outstanding list. |signal_fence|
+// is attached so GPU completion can be observed.
+iree_status_t EmulatedTimelineSemaphore::GetSignalSemaphore(
+    uint64_t value, const ref_ptr<TimePointFence>& signal_fence,
+    VkSemaphore* out_handle) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::GetSignalSemaphore");
+  IREE_DVLOG(2) << "EmulatedTimelineSemaphore::GetSignalSemaphore";
+
+  if (signaled_value_.load() >= value) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "timeline semaphore already signaled past %" PRIu64,
+                            value);
+  }
+
+  RAIILock locker(&mutex_);
+
+  // Find the insertion point that keeps |outstanding_semaphores_| sorted by
+  // value. (Fixed: the loop previously never advanced the iterator, spinning
+  // forever whenever the first outstanding value was <= |value|.)
+  auto insertion_point = outstanding_semaphores_.begin();
+  while (insertion_point != outstanding_semaphores_.end() &&
+         (*insertion_point)->value <= value) {
+    ++insertion_point;
+  }
+
+  TimePointSemaphore* semaphore = NULL;
+  IREE_RETURN_IF_ERROR(semaphore_pool_->Acquire(&semaphore));
+  semaphore->value = value;
+  semaphore->signal_fence = add_ref(signal_fence);
+  if (semaphore->wait_fence) {
+    return iree_make_status(
+        IREE_STATUS_INTERNAL,
+        "newly acquired time point semaphore should not have waiters");
+  }
+  outstanding_semaphores_.insert(insertion_point, semaphore);
+  IREE_DVLOG(2) << "Timepoint semaphore to signal for timeline value (" << value
+                << ") and wait fence (" << signal_fence.get()
+                << "): " << semaphore
+                << " (binary VkSemaphore: " << semaphore->semaphore << ")";
+
+  *out_handle = semaphore->semaphore;
+  return iree_ok_status();
+}
+
+// Two-argument overload: advances the timeline and then, with no lock held,
+// notifies the command queues of fences discovered to have signaled.
+iree_status_t EmulatedTimelineSemaphore::TryToAdvanceTimeline(
+    uint64_t to_upper_value, bool* out_reached_upper_value) {
+  std::vector<VkFence> signaled_fences;
+  iree_status_t status = TryToAdvanceTimeline(
+      to_upper_value, out_reached_upper_value, &signaled_fences);
+  // Inform the queue that some fences are known to have signaled. This should
+  // happen here instead of inside the other TryToAdvanceTimeline to avoid
+  // potential mutex deadlock, given here we are not holding a mutex anymore.
+  if (!signaled_fences.empty()) {
+    for (iree_host_size_t i = 0; i < command_queue_count_; ++i) {
+      auto* queue = static_cast<SerializingCommandQueue*>(command_queues_[i]);
+      queue->SignalFences(signaled_fences);
+    }
+  }
+  return status;
+}
+
+// Core resolve loop: walks the sorted outstanding time points from the front,
+// folds signaled fences into the timeline value, recycles resolved
+// semaphores, and captures the first fence error as the timeline failure.
+iree_status_t EmulatedTimelineSemaphore::TryToAdvanceTimeline(
+    uint64_t to_upper_value, bool* out_reached_upper_value,
+    std::vector<VkFence>* out_signaled_fences) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::TryToAdvanceTimeline");
+  IREE_DVLOG(3) << "EmulatedTimelineSemaphore::TryToAdvanceTimeline";
+  if (out_reached_upper_value) *out_reached_upper_value = false;
+
+  uint64_t past_value = signaled_value_.load();
+  IREE_DVLOG(3) << "Current timeline value: " << past_value
+                << "; desired timeline value: " << to_upper_value;
+
+  // Fast path for when already signaled past the desired value.
+  if (past_value >= to_upper_value) {
+    if (out_reached_upper_value) *out_reached_upper_value = true;
+    return iree_ok_status();
+  }
+
+  // We hold the lock during the entire resolve process so that we can resolve
+  // to the furthest possible value.
+  RAIILock locker(&mutex_);
+
+  IREE_DVLOG(3) << "# outstanding semaphores: "
+                << outstanding_semaphores_.size();
+
+  // The timeline has not signaled past the desired value and there is no
+  // binary semaphore pending on GPU yet: certainly the timeline cannot
+  // advance to the desired value.
+  if (outstanding_semaphores_.empty()) return iree_ok_status();
+
+  IntrusiveList<TimePointSemaphore> resolved_semaphores;
+
+  // Clears a signal fence, recording its raw VkFence for the caller so the
+  // queues can be told it signaled (once the lock is released).
+  auto clear_signal_fence =
+      [&out_signaled_fences](ref_ptr<TimePointFence>& fence) {
+        if (fence) {
+          if (out_signaled_fences)
+            out_signaled_fences->push_back(fence->value());
+          fence.reset();
+        }
+      };
+
+  bool keep_resolving = true;
+  bool reached_desired_value = false;
+  while (keep_resolving && !outstanding_semaphores_.empty()) {
+    auto* semaphore = outstanding_semaphores_.front();
+    IREE_DVLOG(3) << "Looking at timepoint semaphore " << semaphore << "..";
+    IREE_DVLOG(3) << "  value: " << semaphore->value;
+    IREE_DVLOG(3) << "  VkSemaphore: " << semaphore->semaphore;
+    IREE_DVLOG(3) << "  signal fence: " << semaphore->signal_fence.get();
+    IREE_DVLOG(3) << "  wait fence: " << semaphore->wait_fence.get();
+
+    // If the current semaphore is for a value beyond our upper limit, then
+    // early exit so that we don't spend time dealing with signals we don't yet
+    // care about. This can prevent live lock where one thread is signaling
+    // fences as fast/faster than another thread can consume them.
+    if (semaphore->value > to_upper_value) {
+      keep_resolving = false;
+      reached_desired_value = true;
+      break;
+    }
+
+    // If the current semaphore is for a value not greater than the past
+    // signaled value, then we know it was signaled previously. But there might
+    // be a waiter on it on GPU.
+    if (semaphore->value <= past_value) {
+      if (semaphore->signal_fence) {
+        return iree_make_status(IREE_STATUS_INTERNAL,
+                                "timeline should already signaled past this "
+                                "time point and cleared the signal fence");
+      }
+
+      // If there are no waiters, we can recycle this semaphore now. If there
+      // exists one waiter, then query its status and recycle on success. We
+      // only handle success status here. Others will be handled when the fence
+      // is checked for other semaphores' signaling status for the same queue
+      // submission.
+      if (!semaphore->wait_fence ||
+          semaphore->wait_fence->GetStatus() == VK_SUCCESS) {
+        clear_signal_fence(semaphore->signal_fence);
+        semaphore->wait_fence = nullptr;
+        outstanding_semaphores_.erase(semaphore);
+        resolved_semaphores.push_back(semaphore);
+        IREE_DVLOG(3) << "Resolved and recycling semaphore " << semaphore;
+      }
+
+      // NOTE(review): when the wait fence exists and is not yet VK_SUCCESS
+      // nothing changes before this continue, so the loop re-examines the
+      // same front element — this busy-polls GetStatus() until the fence
+      // resolves; confirm that is intended.
+      continue;
+    }
+
+    // This semaphore represents a value greater than the known previously
+    // signaled value. We don't know its status so we need to really query now.
+
+    if (!semaphore->signal_fence) {
+      return iree_make_status(IREE_STATUS_INTERNAL,
+                              "status of this time point in the timeline "
+                              "should still be pending with a singal fence");
+    }
+    VkResult signal_status = semaphore->signal_fence->GetStatus();
+
+    switch (signal_status) {
+      case VK_SUCCESS:
+        IREE_DVLOG(3) << "..semaphore signaled";
+        signaled_value_.store(semaphore->value);
+        clear_signal_fence(semaphore->signal_fence);
+        // If no waiters, we can recycle this semaphore now.
+        if (!semaphore->wait_fence) {
+          semaphore->wait_fence = nullptr;
+          outstanding_semaphores_.erase(semaphore);
+          resolved_semaphores.push_back(semaphore);
+          IREE_DVLOG(3) << "Resolved and recycling semaphore " << semaphore;
+        }
+        break;
+      case VK_NOT_READY:
+        // The fence has not been signaled yet so this is the furthest time
+        // point we can go in this timeline.
+        keep_resolving = false;
+        IREE_DVLOG(3) << "..semaphore not yet signaled";
+        break;
+      default:
+        // Fence indicates an error (device lost, out of memory, etc).
+        // Propagate this back to our status (and thus any waiters).
+        // Since we only take the first error we find we skip all remaining
+        // fences.
+        keep_resolving = false;
+        clear_signal_fence(semaphore->signal_fence);
+        status_ = VK_RESULT_TO_STATUS(signal_status, "signal status");
+        signaled_value_.store(UINT64_MAX);
+        break;
+    }
+  }
+
+  IREE_DVLOG(3) << "Releasing " << resolved_semaphores.size()
+                << " resolved semaphores; " << outstanding_semaphores_.size()
+                << " still outstanding";
+  semaphore_pool_->ReleaseResolved(&resolved_semaphores);
+  if (!iree_status_is_ok(status_)) {
+    for (iree_host_size_t i = 0; i < command_queue_count_; ++i) {
+      ((SerializingCommandQueue*)command_queues_[i])->AbortQueueSubmission();
+    }
+    semaphore_pool_->ReleaseUnresolved(&outstanding_semaphores_);
+    return status_;
+  }
+
+  if (out_reached_upper_value) *out_reached_upper_value = reached_desired_value;
+  return iree_ok_status();
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+using namespace iree::hal::vulkan;
+
+// Wrap the C++ type above so that we have a somewhat normal C interface.
+// Porting the above to C is ideal but since this is just a fallback layer I'm
+// not sure it's worth it (given that we may require Vulkan 1.2 with timeline
+// semaphores built in at some point soon).
+// C wrapper exposing the C++ EmulatedTimelineSemaphore through the
+// iree_hal_semaphore_t C API.
+typedef struct iree_hal_vulkan_emulated_semaphore_t {
+  iree_hal_resource_t resource;    // must be first so base pointers alias
+  iree_allocator_t host_allocator; // allocator this wrapper was malloc'ed from
+  EmulatedTimelineSemaphore* handle;  // owned C++ implementation (delete'd on destroy)
+} iree_hal_vulkan_emulated_semaphore_t;
+
+namespace {
+extern const iree_hal_semaphore_vtable_t
+    iree_hal_vulkan_emulated_semaphore_vtable;
+}  // namespace
+
+// Downcasts a base HAL semaphore to the wrapped C++ implementation after
+// vtable-checking the type in debug builds.
+static EmulatedTimelineSemaphore* iree_hal_vulkan_emulated_semaphore_cast(
+    iree_hal_semaphore_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_emulated_semaphore_vtable);
+  auto* wrapper =
+      reinterpret_cast<iree_hal_vulkan_emulated_semaphore_t*>(base_value);
+  return wrapper->handle;
+}
+
+// Allocates the C wrapper from the device's host allocator and constructs the
+// C++ EmulatedTimelineSemaphore inside it. On success |out_semaphore| owns
+// the new semaphore.
+iree_status_t iree_hal_vulkan_emulated_semaphore_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree::hal::vulkan::TimePointSemaphorePool* semaphore_pool,
+    iree_host_size_t command_queue_count,
+    iree::hal::vulkan::CommandQueue** command_queues, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore) {
+  iree_hal_vulkan_emulated_semaphore_t* semaphore = NULL;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(logical_device->host_allocator(),
+                                             sizeof(*semaphore),
+                                             (void**)&semaphore));
+  // Wire up the vtable so the C API can dispatch to the functions below.
+  iree_hal_resource_initialize(&iree_hal_vulkan_emulated_semaphore_vtable,
+                               &semaphore->resource);
+  semaphore->host_allocator = logical_device->host_allocator();
+  semaphore->handle = new EmulatedTimelineSemaphore(
+      logical_device, semaphore_pool, command_queue_count, command_queues,
+      initial_value);
+
+  *out_semaphore = (iree_hal_semaphore_t*)semaphore;
+  return iree_ok_status();
+}
+
+// vtable destroy: deletes the owned C++ implementation and then frees the
+// wrapper from the allocator it was created with.
+static void iree_hal_vulkan_emulated_semaphore_destroy(
+    iree_hal_semaphore_t* base_semaphore) {
+  iree_hal_vulkan_emulated_semaphore_t* semaphore =
+      (iree_hal_vulkan_emulated_semaphore_t*)base_semaphore;
+  // Capture the allocator before freeing the struct that holds it.
+  iree_allocator_t host_allocator = semaphore->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  delete semaphore->handle;
+  iree_allocator_free(host_allocator, semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Thunk to EmulatedTimelineSemaphore::GetWaitSemaphore. |out_handle| may be
+// VK_NULL_HANDLE when no suitable time point exists.
+iree_status_t iree_hal_vulkan_emulated_semaphore_acquire_wait_handle(
+    iree_hal_semaphore_t* base_semaphore, uint64_t value,
+    const iree::ref_ptr<iree::hal::vulkan::TimePointFence>& wait_fence,
+    VkSemaphore* out_handle) {
+  *out_handle = iree_hal_vulkan_emulated_semaphore_cast(base_semaphore)
+                    ->GetWaitSemaphore(value, wait_fence);
+  return iree_ok_status();
+}
+
+// Thunk to EmulatedTimelineSemaphore::CancelWaitSemaphore.
+iree_status_t iree_hal_vulkan_emulated_semaphore_cancel_wait_handle(
+    iree_hal_semaphore_t* base_semaphore, VkSemaphore handle) {
+  return iree_hal_vulkan_emulated_semaphore_cast(base_semaphore)
+      ->CancelWaitSemaphore(handle);
+}
+
+// Thunk to EmulatedTimelineSemaphore::GetSignalSemaphore.
+iree_status_t iree_hal_vulkan_emulated_semaphore_acquire_signal_handle(
+    iree_hal_semaphore_t* base_semaphore, uint64_t value,
+    const iree::ref_ptr<iree::hal::vulkan::TimePointFence>& signal_fence,
+    VkSemaphore* out_handle) {
+  return iree_hal_vulkan_emulated_semaphore_cast(base_semaphore)
+      ->GetSignalSemaphore(value, signal_fence, out_handle);
+}
+
+// vtable query: thunk to EmulatedTimelineSemaphore::Query.
+static iree_status_t iree_hal_vulkan_emulated_semaphore_query(
+    iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) {
+  return iree_hal_vulkan_emulated_semaphore_cast(base_semaphore)
+      ->Query(out_value);
+}
+
+// vtable signal: thunk to EmulatedTimelineSemaphore::Signal.
+static iree_status_t iree_hal_vulkan_emulated_semaphore_signal(
+    iree_hal_semaphore_t* base_semaphore, uint64_t new_value) {
+  return iree_hal_vulkan_emulated_semaphore_cast(base_semaphore)
+      ->Signal(new_value);
+}
+
+// vtable fail: thunk to EmulatedTimelineSemaphore::Fail, which stores
+// |status| as the timeline's failure state.
+static void iree_hal_vulkan_emulated_semaphore_fail(
+    iree_hal_semaphore_t* base_semaphore, iree_status_t status) {
+  iree_hal_vulkan_emulated_semaphore_cast(base_semaphore)->Fail(status);
+}
+
+// vtable wait: thunk to EmulatedTimelineSemaphore::Wait.
+static iree_status_t iree_hal_vulkan_emulated_semaphore_wait(
+    iree_hal_semaphore_t* base_semaphore, uint64_t value,
+    iree_timeout_t timeout) {
+  return iree_hal_vulkan_emulated_semaphore_cast(base_semaphore)
+      ->Wait(value, timeout);
+}
+
+// Waits on a list of emulated semaphores sequentially, sharing one deadline.
+// With VK_SEMAPHORE_WAIT_ANY_BIT it returns after the first semaphore is
+// satisfied (but see the TODO: a slow first semaphore still blocks the rest).
+iree_status_t iree_hal_vulkan_emulated_semaphore_multi_wait(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
+    VkSemaphoreWaitFlags wait_flags) {
+  // TODO(antiagainst): We actually should get the fences associated with the
+  // emulated timeline semaphores so that we can wait them in a bunch. This
+  // implementation is problematic if we want to wait any and we have the
+  // first semaphore taking extra long time but the following ones signal
+  // quickly.
+  for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) {
+    IREE_RETURN_IF_ERROR(iree_hal_vulkan_emulated_semaphore_wait(
+        semaphore_list->semaphores[i], semaphore_list->payload_values[i],
+        timeout));
+    if (wait_flags & VK_SEMAPHORE_WAIT_ANY_BIT) return iree_ok_status();
+  }
+  return iree_ok_status();
+}
+
+namespace {
+// Dispatch table binding the C HAL semaphore interface to the thunks above;
+// referenced by iree_hal_resource_initialize in ..._semaphore_create.
+const iree_hal_semaphore_vtable_t iree_hal_vulkan_emulated_semaphore_vtable = {
+    /*.destroy=*/iree_hal_vulkan_emulated_semaphore_destroy,
+    /*.query=*/iree_hal_vulkan_emulated_semaphore_query,
+    /*.signal=*/iree_hal_vulkan_emulated_semaphore_signal,
+    /*.fail=*/iree_hal_vulkan_emulated_semaphore_fail,
+    /*.wait=*/
+    iree_hal_vulkan_emulated_semaphore_wait,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/emulated_semaphore.h b/runtime/src/iree/hal/vulkan/emulated_semaphore.h
new file mode 100644
index 0000000..ac7c62c
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/emulated_semaphore.h
@@ -0,0 +1,161 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_EMULATED_SEMAPHORE_H_
+#define IREE_HAL_VULKAN_EMULATED_SEMAPHORE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/command_queue.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/timepoint_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a timeline semaphore emulated via `VkFence`s and binary
+// `VkSemaphore`s.
+//
+// Vulkan provides several explicit synchronization primitives: fences,
+// (binary/timeline) semaphores, events, pipeline barriers, and render passes.
+// See "6. Synchronization and Cache Control" of the Vulkan specification
+// for the details.
+//
+// Render passes are for graphics pipelines so IREE does not care about them.
+// Pipeline barriers synchronize control within a command buffer at a single
+// point. Fences, (binary/timeline) semaphores, and events are synchronization
+// primitives that have separate signal and wait operations. Events are more
+// fine-grained compared to fences and semaphores given that they can be
+// signaled or waited within a command buffer while fences and semaphores are
+//   at queue submissions. Each of them has its own usage requirements:
+//
+// * Fences must be signaled on GPU and waited on CPU. Fences must be reset
+//   before reuse.
+// * Binary semaphores must be signaled on GPU and waited on GPU. They do not
+//   support wait-before-signal submission order. More importantly, binary
+//   semaphore wait also unsignals the semaphore. So binary semaphore signals
+//   and waits should occur in discrete 1:1 pairs.
+// * Timeline semaphores can be signaled on CPU or GPU and waited on CPU or GPU.
+//   They support wait-before-signal submission order. Timeline semaphores do
+//   not need to be reset.
+//
+// It's clear that timeline semaphore is more flexible than fences and binary
+// semaphores: it unifies GPU and CPU synchronization with a single primitive.
+// But it's not always available: it requires the VK_KHR_timeline_semaphore
+// or Vulkan 1.2. When it's not available, it can be emulated via `VkFence`s
+// and binary `VkSemaphore`s. The emulation needs to provide the functionality
+// timeline semaphores and also not violate the usage requirements of `VkFence`s
+// and binary `VkSemaphore`s.
+//
+// The basic idea is to create a timeline object with time points to emulate the
+// timeline semaphore, which consists of a monotonically increasing 64-bit
+// integer value. Each time point represents a specific signaled/waited integer
+// value of the timeline semaphore; each time point can associate with binary
+// `VkSemaphore`s and/or `VkFence`s for emulating the synchronization.
+//
+// Concretely, for each of the possible signal -> wait scenarios timeline
+// semaphore supports:
+//
+// ### GPU -> GPU (via `vkQueueSubmit`)
+//
+// Each `vkQueueSubmit` can attach a `VkTimelineSemaphoreSubmitInfo` to describe
+// the timeline semaphore values signaled and waited. Each of the signaled
+// will be a time point and emulated by a binary `VkSemaphore`. We submit the
+// binary `VkSemaphore`s to the GPU under the hood. For the waited values, the
+// situation is more complicated because of the differences between binary and
+// timeline semaphores:
+//
+// * Binary semaphore signal-wait relationship is strictly 1:1, unlike timeline
+//   semaphore where we can have 1:N cases. This means for a specific binary
+//   `VkSemaphore` used to emulate a signaled time point, we can have at most
+//   one subsequent `vkQueueSubmit` waits on it. We need other mechanisms for
+//   additional waits. A simple way is to involve the CPU and don't submit
+//   the additional work to queue until the desired value is already signaled
+//   past. This requires `VkFence`s for letting the CPU know the status of
+//   GPU progress, but `VkFence` is needed anyway because of GPU -> CPU
+//   synchronization.
+// * Binary semaphores do not support wait-before-signal submission order.
+//   This means we need to put the submission into a self-managed queue if the
+//   binary semaphores used to emulate the time points waited by the submission
+//   are not submitted to GPU yet.
+//
+// ### GPU -> CPU (via `vkWaitSemaphores`)
+//
+// Without timeline semaphore, we need to use fences to let CPU wait on GPU
+// progress. So this direction can be emulated by `vkWaitForFences`. It means we
+// need to associate a `VkFence` with the given waited timeline semaphores.
+// Because we don't know whether a particular `vkQueueSubmit` with timeline
+// semaphores will be later waited on by CPU beforehand, we need to bundle each
+// of them with a `VkFence` just in case they will be waited on later.
+//
+// ### CPU -> GPU (via `vkSignalSemaphore`)
+//
+// This direction can be handled by bumping the signaled timeline value and
+// scanning the self-managed queue to submit more work to GPU if possible.
+//
+// ### CPU -> CPU (via `vkWaitSemaphores`)
+//
+// This is similar to CPU -> GPU direction; we just need to enable other threads
+// on CPU side and let them progress.
+//
+// The implementation is inspired by the Vulkan-ExtensionLayer project:
+// https://github.com/KhronosGroup/Vulkan-ExtensionLayer. We don't handle all
+// the aspects of the full spec though given that IREE only uses a subset of
+// synchronization primitives. So this should not be treated as a full
+// emulation of the Vulkan spec and thus does not substitute
+// Vulkan-ExtensionLayer.
+iree_status_t iree_hal_vulkan_emulated_semaphore_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree::hal::vulkan::TimePointSemaphorePool* semaphore_pool,
+    iree_host_size_t command_queue_count,
+    iree::hal::vulkan::CommandQueue** command_queues, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore);
+
+// Acquires a binary semaphore for waiting on the timeline to advance to the
+// given |value|. The semaphore returned won't be waited on by anyone else.
+// |wait_fence| is the fence associated with the queue submission that is waiting
+// on this semaphore.
+//
+// Returns VK_NULL_HANDLE if there are no available semaphores for the given
+// |value|.
+iree_status_t iree_hal_vulkan_emulated_semaphore_acquire_wait_handle(
+    iree_hal_semaphore_t* semaphore, uint64_t value,
+    const iree::ref_ptr<iree::hal::vulkan::TimePointFence>& wait_fence,
+    VkSemaphore* out_handle);
+
+// Cancels the waiting attempt on the given binary |semaphore|. This allows
+// the |semaphore| to be waited on by others.
+iree_status_t iree_hal_vulkan_emulated_semaphore_cancel_wait_handle(
+    iree_hal_semaphore_t* semaphore, VkSemaphore handle);
+
+// Acquires a binary semaphore for signaling the timeline to the given |value|.
+// |value| must be larger than the current timeline value. |signal_fence| is
+// the fence associated with the queue submission that signals this semaphore.
+iree_status_t iree_hal_vulkan_emulated_semaphore_acquire_signal_handle(
+    iree_hal_semaphore_t* semaphore, uint64_t value,
+    const iree::ref_ptr<iree::hal::vulkan::TimePointFence>& signal_fence,
+    VkSemaphore* out_handle);
+
+// Performs a multi-wait on one or more semaphores.
+// By default this is an all-wait but |wait_flags| may contain
+// VK_SEMAPHORE_WAIT_ANY_BIT to change to an any-wait.
+//
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if the wait does not complete before
+// |timeout| elapses.
+iree_status_t iree_hal_vulkan_emulated_semaphore_multi_wait(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
+    VkSemaphoreWaitFlags wait_flags);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_EMULATED_SEMAPHORE_H_
diff --git a/runtime/src/iree/hal/vulkan/extensibility_util.cc b/runtime/src/iree/hal/vulkan/extensibility_util.cc
new file mode 100644
index 0000000..a3574b0
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/extensibility_util.cc
@@ -0,0 +1,233 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/extensibility_util.h"
+
+#include <cstdint>
+#include <cstring>
+
+#include "iree/hal/vulkan/status_util.h"
+
+// Returns true if |layers| contains a layer matching |layer_name|.
+static bool iree_hal_vulkan_layer_list_contains(uint32_t layer_count,
+                                                const VkLayerProperties* layers,
+                                                const char* layer_name) {
+  for (uint32_t i = 0; i < layer_count; ++i) {
+    if (strcmp(layer_name, layers[i].layerName) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static iree_status_t iree_hal_vulkan_match_available_layers(
+    iree_host_size_t available_layers_count,
+    const VkLayerProperties* available_layers,
+    const iree_hal_vulkan_string_list_t* required_layers,
+    const iree_hal_vulkan_string_list_t* optional_layers,
+    iree_hal_vulkan_string_list_t* out_enabled_layers) {
+  memset(out_enabled_layers->values, 0,
+         (required_layers->count + optional_layers->count) *
+             sizeof(out_enabled_layers->values[0]));
+
+  for (iree_host_size_t i = 0; i < required_layers->count; ++i) {
+    const char* layer_name = required_layers->values[i];
+    if (!iree_hal_vulkan_layer_list_contains(available_layers_count,
+                                             available_layers, layer_name)) {
+      return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                              "required layer %s not available", layer_name);
+    }
+    out_enabled_layers->values[out_enabled_layers->count++] = layer_name;
+  }
+
+  for (iree_host_size_t i = 0; i < optional_layers->count; ++i) {
+    const char* layer_name = optional_layers->values[i];
+    if (iree_hal_vulkan_layer_list_contains(available_layers_count,
+                                            available_layers, layer_name)) {
+      out_enabled_layers->values[out_enabled_layers->count++] = layer_name;
+    }
+  }
+
+  return iree_ok_status();
+}
+
+iree_status_t iree_hal_vulkan_match_available_instance_layers(
+    const iree::hal::vulkan::DynamicSymbols* syms,
+    const iree_hal_vulkan_string_list_t* required_layers,
+    const iree_hal_vulkan_string_list_t* optional_layers, iree::Arena* arena,
+    iree_hal_vulkan_string_list_t* out_enabled_layers) {
+  uint32_t layer_property_count = 0;
+  VK_RETURN_IF_ERROR(
+      syms->vkEnumerateInstanceLayerProperties(&layer_property_count, NULL),
+      "vkEnumerateInstanceLayerProperties");
+  VkLayerProperties* layer_properties =
+      (VkLayerProperties*)arena->AllocateBytes(layer_property_count *
+                                               sizeof(VkLayerProperties));
+  VK_RETURN_IF_ERROR(syms->vkEnumerateInstanceLayerProperties(
+                         &layer_property_count, layer_properties),
+                     "vkEnumerateInstanceLayerProperties");
+  out_enabled_layers->count = 0;
+  out_enabled_layers->values = (const char**)arena->AllocateBytes(
+      (required_layers->count + optional_layers->count) *
+      sizeof(out_enabled_layers->values[0]));
+  return iree_hal_vulkan_match_available_layers(
+      layer_property_count, layer_properties, required_layers, optional_layers,
+      out_enabled_layers);
+}
+
+// Returns true if |extensions| contains an extension matching |extension_name|.
+static bool iree_hal_vulkan_extension_list_contains(
+    uint32_t extension_count, const VkExtensionProperties* extensions,
+    const char* extension_name) {
+  for (uint32_t i = 0; i < extension_count; ++i) {
+    if (strcmp(extension_name, extensions[i].extensionName) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static iree_status_t iree_hal_vulkan_match_available_extensions(
+    iree_host_size_t available_extension_count,
+    const VkExtensionProperties* available_extensions,
+    const iree_hal_vulkan_string_list_t* required_extensions,
+    const iree_hal_vulkan_string_list_t* optional_extensions,
+    iree_hal_vulkan_string_list_t* out_enabled_extensions) {
+  memset(out_enabled_extensions->values, 0,
+         (required_extensions->count + optional_extensions->count) *
+             sizeof(out_enabled_extensions->values[0]));
+
+  for (iree_host_size_t i = 0; i < required_extensions->count; ++i) {
+    const char* extension_name = required_extensions->values[i];
+    if (!iree_hal_vulkan_extension_list_contains(
+            available_extension_count, available_extensions, extension_name)) {
+      return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                              "required extension %s not available",
+                              extension_name);
+    }
+    out_enabled_extensions->values[out_enabled_extensions->count++] =
+        extension_name;
+  }
+
+  for (iree_host_size_t i = 0; i < optional_extensions->count; ++i) {
+    const char* extension_name = optional_extensions->values[i];
+    if (iree_hal_vulkan_extension_list_contains(
+            available_extension_count, available_extensions, extension_name)) {
+      out_enabled_extensions->values[out_enabled_extensions->count++] =
+          extension_name;
+    }
+  }
+
+  return iree_ok_status();
+}
+
+iree_status_t iree_hal_vulkan_match_available_instance_extensions(
+    const iree::hal::vulkan::DynamicSymbols* syms,
+    const iree_hal_vulkan_string_list_t* required_extensions,
+    const iree_hal_vulkan_string_list_t* optional_extensions,
+    iree::Arena* arena, iree_hal_vulkan_string_list_t* out_enabled_extensions) {
+  uint32_t extension_property_count = 0;
+  VK_RETURN_IF_ERROR(syms->vkEnumerateInstanceExtensionProperties(
+                         NULL, &extension_property_count, NULL),
+                     "vkEnumerateInstanceExtensionProperties");
+  VkExtensionProperties* extension_properties =
+      (VkExtensionProperties*)arena->AllocateBytes(
+          extension_property_count * sizeof(VkExtensionProperties));
+  VK_RETURN_IF_ERROR(syms->vkEnumerateInstanceExtensionProperties(
+                         NULL, &extension_property_count, extension_properties),
+                     "vkEnumerateInstanceExtensionProperties");
+  out_enabled_extensions->count = 0;
+  out_enabled_extensions->values = (const char**)arena->AllocateBytes(
+      (required_extensions->count + optional_extensions->count) *
+      sizeof(out_enabled_extensions->values[0]));
+  return iree_hal_vulkan_match_available_extensions(
+      extension_property_count, extension_properties, required_extensions,
+      optional_extensions, out_enabled_extensions);
+}
+
+iree_status_t iree_hal_vulkan_match_available_device_extensions(
+    const iree::hal::vulkan::DynamicSymbols* syms,
+    VkPhysicalDevice physical_device,
+    const iree_hal_vulkan_string_list_t* required_extensions,
+    const iree_hal_vulkan_string_list_t* optional_extensions,
+    iree::Arena* arena, iree_hal_vulkan_string_list_t* out_enabled_extensions) {
+  uint32_t extension_property_count = 0;
+  VK_RETURN_IF_ERROR(
+      syms->vkEnumerateDeviceExtensionProperties(
+          physical_device, NULL, &extension_property_count, NULL),
+      "vkEnumerateDeviceExtensionProperties");
+  VkExtensionProperties* extension_properties =
+      (VkExtensionProperties*)arena->AllocateBytes(
+          extension_property_count * sizeof(VkExtensionProperties));
+  VK_RETURN_IF_ERROR(syms->vkEnumerateDeviceExtensionProperties(
+                         physical_device, NULL, &extension_property_count,
+                         extension_properties),
+                     "vkEnumerateDeviceExtensionProperties");
+  out_enabled_extensions->count = 0;
+  out_enabled_extensions->values = (const char**)arena->AllocateBytes(
+      (required_extensions->count + optional_extensions->count) *
+      sizeof(out_enabled_extensions->values[0]));
+  return iree_hal_vulkan_match_available_extensions(
+      extension_property_count, extension_properties, required_extensions,
+      optional_extensions, out_enabled_extensions);
+}
+
+iree_hal_vulkan_instance_extensions_t
+iree_hal_vulkan_populate_enabled_instance_extensions(
+    const iree_hal_vulkan_string_list_t* enabled_extensions) {
+  iree_hal_vulkan_instance_extensions_t extensions;
+  memset(&extensions, 0, sizeof(extensions));
+  for (iree_host_size_t i = 0; i < enabled_extensions->count; ++i) {
+    const char* extension_name = enabled_extensions->values[i];
+    if (strcmp(extension_name, VK_EXT_DEBUG_UTILS_EXTENSION_NAME) == 0) {
+      extensions.debug_utils = true;
+    }
+  }
+  return extensions;
+}
+
+iree_hal_vulkan_device_extensions_t
+iree_hal_vulkan_populate_enabled_device_extensions(
+    const iree_hal_vulkan_string_list_t* enabled_extensions) {
+  iree_hal_vulkan_device_extensions_t extensions;
+  memset(&extensions, 0, sizeof(extensions));
+  for (iree_host_size_t i = 0; i < enabled_extensions->count; ++i) {
+    const char* extension_name = enabled_extensions->values[i];
+    if (strcmp(extension_name, VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME) == 0) {
+      extensions.push_descriptors = true;
+    } else if (strcmp(extension_name,
+                      VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME) == 0) {
+      extensions.timeline_semaphore = true;
+    } else if (strcmp(extension_name, VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME) ==
+               0) {
+      extensions.host_query_reset = true;
+    } else if (strcmp(extension_name,
+                      VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0) {
+      extensions.calibrated_timestamps = true;
+    }
+  }
+  return extensions;
+}
+
+iree_hal_vulkan_device_extensions_t
+iree_hal_vulkan_infer_enabled_device_extensions(
+    const iree::hal::vulkan::DynamicSymbols* device_syms) {
+  iree_hal_vulkan_device_extensions_t extensions;
+  memset(&extensions, 0, sizeof(extensions));
+  if (device_syms->vkCmdPushDescriptorSetKHR) {
+    extensions.push_descriptors = true;
+  }
+  if (device_syms->vkSignalSemaphore || device_syms->vkSignalSemaphoreKHR) {
+    extensions.timeline_semaphore = true;
+  }
+  if (device_syms->vkResetQueryPoolEXT) {
+    extensions.host_query_reset = true;
+  }
+  if (device_syms->vkGetCalibratedTimestampsEXT) {
+    extensions.calibrated_timestamps = true;
+  }
+  return extensions;
+}
diff --git a/runtime/src/iree/hal/vulkan/extensibility_util.h b/runtime/src/iree/hal/vulkan/extensibility_util.h
new file mode 100644
index 0000000..f436988
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/extensibility_util.h
@@ -0,0 +1,97 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_EXTENSIBILITY_UTIL_H_
+#define IREE_HAL_VULKAN_EXTENSIBILITY_UTIL_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/vulkan/api.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/util/arena.h"
+
+// A list of NUL-terminated strings (so they can be passed directly to Vulkan).
+typedef struct iree_hal_vulkan_string_list_t {
+  iree_host_size_t count;
+  const char** values;
+} iree_hal_vulkan_string_list_t;
+
+// Populates |out_enabled_layers| with all layers that are both available in the
+// implementation and |required_layers| and |optional_layers| lists.
+// |out_enabled_layers| must have capacity at least the sum of
+// |required_layers|.count and |optional_layers|.count.
+// Returns failure if any |required_layers| are unavailable.
+iree_status_t iree_hal_vulkan_match_available_instance_layers(
+    const iree::hal::vulkan::DynamicSymbols* syms,
+    const iree_hal_vulkan_string_list_t* required_layers,
+    const iree_hal_vulkan_string_list_t* optional_layers, iree::Arena* arena,
+    iree_hal_vulkan_string_list_t* out_enabled_layers);
+
+// Populates |out_enabled_extensions| with all extensions that are both
+// available in the implementation and |required_extensions| and
+// |optional_extensions| lists. |out_enabled_extensions| must have capacity at
+// least the sum of |required_extensions|.count and |optional_extensions|.count.
+// Returns failure if any |required_extensions| are unavailable.
+iree_status_t iree_hal_vulkan_match_available_instance_extensions(
+    const iree::hal::vulkan::DynamicSymbols* syms,
+    const iree_hal_vulkan_string_list_t* required_extensions,
+    const iree_hal_vulkan_string_list_t* optional_extensions,
+    iree::Arena* arena, iree_hal_vulkan_string_list_t* out_enabled_extensions);
+
+// Populates |out_enabled_extensions| with all extensions that are both
+// available in the implementation and |required_extensions| and
+// |optional_extensions| lists. |out_enabled_extensions| must have capacity at
+// least the sum of |required_extensions|.count and |optional_extensions|.count.
+// Returns failure if any |required_extensions| are unavailable.
+iree_status_t iree_hal_vulkan_match_available_device_extensions(
+    const iree::hal::vulkan::DynamicSymbols* syms,
+    VkPhysicalDevice physical_device,
+    const iree_hal_vulkan_string_list_t* required_extensions,
+    const iree_hal_vulkan_string_list_t* optional_extensions,
+    iree::Arena* arena, iree_hal_vulkan_string_list_t* out_enabled_extensions);
+
+// Bits for enabled instance extensions.
+// We must use this to query support instead of just detecting symbol names as
+// ICDs will resolve the functions sometimes even if they don't support the
+// extension (or we didn't ask for it to be enabled).
+typedef struct iree_hal_vulkan_instance_extensions_t {
+  // VK_EXT_debug_utils is enabled and a debug messenger is registered.
+  // https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/chap44.html#VK_EXT_debug_utils
+  bool debug_utils : 1;
+} iree_hal_vulkan_instance_extensions_t;
+
+// Returns a bitfield with all of the provided extension names.
+iree_hal_vulkan_instance_extensions_t
+iree_hal_vulkan_populate_enabled_instance_extensions(
+    const iree_hal_vulkan_string_list_t* enabled_extensions);
+
+// Bits for enabled device extensions.
+// We must use this to query support instead of just detecting symbol names as
+// ICDs will resolve the functions sometimes even if they don't support the
+// extension (or we didn't ask for it to be enabled).
+typedef struct iree_hal_vulkan_device_extensions_t {
+  // VK_KHR_push_descriptor is enabled and vkCmdPushDescriptorSetKHR is valid.
+  bool push_descriptors : 1;
+  // VK_KHR_timeline_semaphore is enabled.
+  bool timeline_semaphore : 1;
+  // VK_EXT_host_query_reset is enabled.
+  bool host_query_reset : 1;
+  // VK_EXT_calibrated_timestamps is enabled.
+  bool calibrated_timestamps : 1;
+} iree_hal_vulkan_device_extensions_t;
+
+// Returns a bitfield with all of the provided extension names.
+iree_hal_vulkan_device_extensions_t
+iree_hal_vulkan_populate_enabled_device_extensions(
+    const iree_hal_vulkan_string_list_t* enabled_extensions);
+
+// Returns a bitfield with the extensions that are (likely) available on the
+// device symbols. This is less reliable than setting the bits directly when
+// the known set of extensions is available.
+iree_hal_vulkan_device_extensions_t
+iree_hal_vulkan_infer_enabled_device_extensions(
+    const iree::hal::vulkan::DynamicSymbols* device_syms);
+
+#endif  // IREE_HAL_VULKAN_EXTENSIBILITY_UTIL_H_
diff --git a/runtime/src/iree/hal/vulkan/handle_util.h b/runtime/src/iree/hal/vulkan/handle_util.h
new file mode 100644
index 0000000..0cff882
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/handle_util.h
@@ -0,0 +1,166 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Helpers for wrapping Vulkan handles that don't require us to wrap every type.
+// This keeps our compilation time reasonable (as the vulkancpp library is
+// insane) while giving us nice safety around cleanup and ensuring we use
+// dynamic symbols and consistent allocators.
+//
+// Do not add functionality beyond handle management to these types. Keep our
+// Vulkan usage mostly functional and C-like to ensure minimal code size and
+// readability.
+
+#ifndef IREE_HAL_VULKAN_HANDLE_UTIL_H_
+#define IREE_HAL_VULKAN_HANDLE_UTIL_H_
+
+// clang-format off: must be included before all other headers.
+#include "iree/hal/vulkan/vulkan_headers.h"  // IWYU pragma: export
+// clang-format on
+
+#include "iree/base/internal/synchronization.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/extensibility_util.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+template <class T, class U = T>
+constexpr T exchange(T& obj, U&& new_value) {
+  T old_value = std::move(obj);
+  obj = std::forward<U>(new_value);
+  return old_value;
+}
+
+class VkDeviceHandle : public RefObject<VkDeviceHandle> {
+ public:
+  VkDeviceHandle(DynamicSymbols* syms,
+                 iree_hal_vulkan_device_extensions_t enabled_extensions,
+                 bool owns_device, iree_allocator_t host_allocator,
+                 const VkAllocationCallbacks* allocator = nullptr)
+      : syms_(add_ref(syms)),
+        enabled_extensions_(enabled_extensions),
+        owns_device_(owns_device),
+        allocator_(allocator),
+        host_allocator_(host_allocator) {}
+  ~VkDeviceHandle() { reset(); }
+
+  VkDeviceHandle(const VkDeviceHandle&) = delete;
+  VkDeviceHandle& operator=(const VkDeviceHandle&) = delete;
+  VkDeviceHandle(VkDeviceHandle&& other) noexcept
+      : value_(exchange(other.value_, static_cast<VkDevice>(VK_NULL_HANDLE))),
+        syms_(std::move(other.syms_)),
+        enabled_extensions_(other.enabled_extensions_),
+        owns_device_(other.owns_device_),
+        allocator_(other.allocator_),
+        host_allocator_(other.host_allocator_) {}
+
+  void reset() {
+    if (value_ == VK_NULL_HANDLE) return;
+    if (owns_device_) {
+      syms_->vkDestroyDevice(value_, allocator_);
+    }
+    value_ = VK_NULL_HANDLE;
+  }
+
+  VkDevice value() const noexcept { return value_; }
+  VkDevice* mutable_value() noexcept { return &value_; }
+  operator VkDevice() const noexcept { return value_; }
+
+  const ref_ptr<DynamicSymbols>& syms() const noexcept { return syms_; }
+  const VkAllocationCallbacks* allocator() const noexcept { return allocator_; }
+  iree_allocator_t host_allocator() const noexcept { return host_allocator_; }
+
+  const iree_hal_vulkan_device_extensions_t& enabled_extensions() const {
+    return enabled_extensions_;
+  }
+
+ private:
+  VkDevice value_ = VK_NULL_HANDLE;
+  ref_ptr<DynamicSymbols> syms_;
+  iree_hal_vulkan_device_extensions_t enabled_extensions_;
+  bool owns_device_;
+  const VkAllocationCallbacks* allocator_ = nullptr;
+  iree_allocator_t host_allocator_;
+};
+
+class VkCommandPoolHandle {
+ public:
+  explicit VkCommandPoolHandle(VkDeviceHandle* logical_device)
+      : logical_device_(logical_device) {
+    iree_slim_mutex_initialize(&mutex_);
+  }
+  ~VkCommandPoolHandle() {
+    reset();
+    iree_slim_mutex_deinitialize(&mutex_);
+  }
+
+  VkCommandPoolHandle(const VkCommandPoolHandle&) = delete;
+  VkCommandPoolHandle& operator=(const VkCommandPoolHandle&) = delete;
+  VkCommandPoolHandle(VkCommandPoolHandle&& other) noexcept
+      : logical_device_(std::move(other.logical_device_)),
+        value_(exchange(other.value_,
+                        static_cast<VkCommandPool>(VK_NULL_HANDLE))) {}
+  VkCommandPoolHandle& operator=(VkCommandPoolHandle&& other) {
+    std::swap(logical_device_, other.logical_device_);
+    std::swap(value_, other.value_);
+    return *this;
+  }
+
+  void reset() {
+    if (value_ == VK_NULL_HANDLE) return;
+    syms()->vkDestroyCommandPool(*logical_device_, value_, allocator());
+    value_ = VK_NULL_HANDLE;
+  }
+
+  VkCommandPool value() const noexcept { return value_; }
+  VkCommandPool* mutable_value() noexcept { return &value_; }
+  operator VkCommandPool() const noexcept { return value_; }
+
+  const VkDeviceHandle* logical_device() const noexcept {
+    return logical_device_;
+  }
+  const ref_ptr<DynamicSymbols>& syms() const noexcept {
+    return logical_device_->syms();
+  }
+  const VkAllocationCallbacks* allocator() const noexcept {
+    return logical_device_->allocator();
+  }
+
+  iree_status_t Allocate(const VkCommandBufferAllocateInfo* allocate_info,
+                         VkCommandBuffer* out_handle) {
+    iree_slim_mutex_lock(&mutex_);
+    iree_status_t status =
+        VK_RESULT_TO_STATUS(syms()->vkAllocateCommandBuffers(
+                                *logical_device_, allocate_info, out_handle),
+                            "vkAllocateCommandBuffers");
+    iree_slim_mutex_unlock(&mutex_);
+    return status;
+  }
+
+  void Free(VkCommandBuffer handle) {
+    iree_slim_mutex_lock(&mutex_);
+    syms()->vkFreeCommandBuffers(*logical_device_, value_, 1, &handle);
+    iree_slim_mutex_unlock(&mutex_);
+  }
+
+ private:
+  VkDeviceHandle* logical_device_;
+  VkCommandPool value_ = VK_NULL_HANDLE;
+
+  // Vulkan command pools are not thread safe and require external
+  // synchronization. Since we allow arbitrary threads to allocate and
+  // deallocate the HAL command buffers we need to externally synchronize.
+  iree_slim_mutex_t mutex_;
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_HANDLE_UTIL_H_
diff --git a/runtime/src/iree/hal/vulkan/internal_vk_mem_alloc.cc b/runtime/src/iree/hal/vulkan/internal_vk_mem_alloc.cc
new file mode 100644
index 0000000..145afd9
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/internal_vk_mem_alloc.cc
@@ -0,0 +1,62 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Only compile if an external implementation has not been otherwise linked.
+#if !defined(VULKAN_MEMORY_ALLOCATOR_EXTERNAL_IMPL)
+
+#include <ostream>
+
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/logging.h"
+
+#define VMA_ASSERT IREE_DCHECK
+#define VMA_HEAVY_ASSERT IREE_DCHECK
+
+// NOTE: logging is disabled by default as unless you are debugging VMA itself
+// the information is not useful and just slows things down.
+#if 0
+#define VMA_DEBUG_LOG(...) _IREE_LOG_INFO << __VA_ARGS__
+#else
+#define VMA_DEBUG_LOG(...)
+#endif  // 0: VMA debug logging disabled
+
+// Use iree_slim_mutex_t for VMA_MUTEX.
+class IreeVmaMutex {
+ public:
+  IreeVmaMutex() { iree_slim_mutex_initialize(&mutex_); }
+  ~IreeVmaMutex() { iree_slim_mutex_deinitialize(&mutex_); }
+
+  void Lock() { iree_slim_mutex_lock(&mutex_); }
+  void Unlock() { iree_slim_mutex_unlock(&mutex_); }
+  bool TryLock() { return iree_slim_mutex_try_lock(&mutex_); }
+
+ private:
+  iree_slim_mutex_t mutex_;
+};
+#define VMA_MUTEX IreeVmaMutex
+
+// Use iree_slim_mutex_t for VMA_RW_MUTEX.
+class IreeVmaRWMutex {
+ public:
+  IreeVmaRWMutex() { iree_slim_mutex_initialize(&mutex_); }
+  ~IreeVmaRWMutex() { iree_slim_mutex_deinitialize(&mutex_); }
+
+  void LockRead() { iree_slim_mutex_lock(&mutex_); }
+  void UnlockRead() { iree_slim_mutex_unlock(&mutex_); }
+  bool TryLockRead() { return iree_slim_mutex_try_lock(&mutex_); }
+  void LockWrite() { iree_slim_mutex_lock(&mutex_); }
+  void UnlockWrite() { iree_slim_mutex_unlock(&mutex_); }
+  bool TryLockWrite() { return iree_slim_mutex_try_lock(&mutex_); }
+
+ private:
+  iree_slim_mutex_t mutex_;
+};
+#define VMA_RW_MUTEX IreeVmaRWMutex
+
+#define VMA_IMPLEMENTATION
+#include "iree/hal/vulkan/internal_vk_mem_alloc.h"
+
+#endif  // !VULKAN_MEMORY_ALLOCATOR_EXTERNAL_IMPL
diff --git a/runtime/src/iree/hal/vulkan/internal_vk_mem_alloc.h b/runtime/src/iree/hal/vulkan/internal_vk_mem_alloc.h
new file mode 100644
index 0000000..1f50682
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/internal_vk_mem_alloc.h
@@ -0,0 +1,23 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Wraps <vk_mem_alloc.h> with the IREE-specific configuration macros so that
+// all includers see a consistently-configured VMA API.
+
+#ifndef IREE_HAL_VULKAN_INTERNAL_VK_MEM_ALLOC_H_
+#define IREE_HAL_VULKAN_INTERNAL_VK_MEM_ALLOC_H_
+
+#include "iree/hal/vulkan/vulkan_headers.h"
+
+// Force all Vulkan calls to go through an indirect pVulkanFunctions interface.
+// https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/configuration.html
+#define VMA_STATIC_VULKAN_FUNCTIONS 0
+
+// Prevent VMA from querying for dynamic functions we may not have provided.
+// We want to be able to print nice errors or decide whether something is ok
+// to be omitted and not have VMA poking around where it shouldn't.
+#define VMA_DYNAMIC_VULKAN_FUNCTIONS 0
+
+#include <vk_mem_alloc.h>  // IWYU pragma: export
+
+#endif  // IREE_HAL_VULKAN_INTERNAL_VK_MEM_ALLOC_H_
diff --git a/runtime/src/iree/hal/vulkan/native_descriptor_set.cc b/runtime/src/iree/hal/vulkan/native_descriptor_set.cc
new file mode 100644
index 0000000..e4ab3ef
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_descriptor_set.cc
@@ -0,0 +1,92 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/native_descriptor_set.h"
+
+#include <cstddef>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+using namespace iree::hal::vulkan;
+
+// C-style HAL resource wrapping a raw VkDescriptorSet handle. The handle is
+// not owned here: destroy() below frees only this wrapper (pool reset is
+// handled by the descriptor cache; see the TODO in destroy).
+typedef struct iree_hal_vulkan_native_descriptor_set_t {
+  iree_hal_resource_t resource;
+  VkDeviceHandle* logical_device;
+  VkDescriptorSet handle;
+} iree_hal_vulkan_native_descriptor_set_t;
+
+namespace {
+extern const iree_hal_descriptor_set_vtable_t
+    iree_hal_vulkan_native_descriptor_set_vtable;
+}  // namespace
+
+// Downcasts a base descriptor set to the Vulkan implementation, asserting the
+// vtable matches in debug builds.
+static iree_hal_vulkan_native_descriptor_set_t*
+iree_hal_vulkan_native_descriptor_set_cast(
+    iree_hal_descriptor_set_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value,
+                       &iree_hal_vulkan_native_descriptor_set_vtable);
+  return (iree_hal_vulkan_native_descriptor_set_t*)base_value;
+}
+
+// Allocates a wrapper for |handle| using the device's host allocator and
+// returns it as a HAL descriptor set. |handle| remains owned by its pool.
+iree_status_t iree_hal_vulkan_native_descriptor_set_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device, VkDescriptorSet handle,
+    iree_hal_descriptor_set_t** out_descriptor_set) {
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(handle);
+  IREE_ASSERT_ARGUMENT(out_descriptor_set);
+  *out_descriptor_set = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_vulkan_native_descriptor_set_t* descriptor_set = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(logical_device->host_allocator(),
+                            sizeof(*descriptor_set), (void**)&descriptor_set);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_vulkan_native_descriptor_set_vtable,
+                                 &descriptor_set->resource);
+    descriptor_set->logical_device = logical_device;
+    descriptor_set->handle = handle;
+    *out_descriptor_set = (iree_hal_descriptor_set_t*)descriptor_set;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the wrapper only; the VkDescriptorSet itself is reclaimed when its
+// pool is reset (see note below).
+static void iree_hal_vulkan_native_descriptor_set_destroy(
+    iree_hal_descriptor_set_t* base_descriptor_set) {
+  iree_hal_vulkan_native_descriptor_set_t* descriptor_set =
+      iree_hal_vulkan_native_descriptor_set_cast(base_descriptor_set);
+  iree_allocator_t host_allocator =
+      descriptor_set->logical_device->host_allocator();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(benvanik): return to pool. For now we rely on the descriptor cache to
+  // reset entire pools at once via via vkResetDescriptorPool so we don't need
+  // to do anything here (the VkDescriptorSet handle will just be invalidated).
+  // In the future if we want to have generational collection/defragmentation
+  // of the descriptor cache we'll want to allow both pooled and unpooled
+  // descriptors and clean them up here appropriately.
+
+  iree_allocator_free(host_allocator, descriptor_set);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the raw Vulkan handle backing |base_descriptor_set|.
+VkDescriptorSet iree_hal_vulkan_native_descriptor_set_handle(
+    iree_hal_descriptor_set_t* base_descriptor_set) {
+  iree_hal_vulkan_native_descriptor_set_t* descriptor_set =
+      iree_hal_vulkan_native_descriptor_set_cast(base_descriptor_set);
+  return descriptor_set->handle;
+}
+
+namespace {
+const iree_hal_descriptor_set_vtable_t
+    iree_hal_vulkan_native_descriptor_set_vtable = {
+        /*.destroy=*/iree_hal_vulkan_native_descriptor_set_destroy,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/native_descriptor_set.h b/runtime/src/iree/hal/vulkan/native_descriptor_set.h
new file mode 100644
index 0000000..128a1c6
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_descriptor_set.h
@@ -0,0 +1,31 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_NATIVE_DESCRIPTOR_SET_H_
+#define IREE_HAL_VULKAN_NATIVE_DESCRIPTOR_SET_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a native Vulkan VkDescriptorSet object.
+// |handle| must remain valid for the lifetime of the returned descriptor set;
+// the wrapper does not take ownership of the Vulkan handle.
+iree_status_t iree_hal_vulkan_native_descriptor_set_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device, VkDescriptorSet handle,
+    iree_hal_descriptor_set_t** out_descriptor_set);
+
+// Returns the native Vulkan VkDescriptorSet handle.
+VkDescriptorSet iree_hal_vulkan_native_descriptor_set_handle(
+    iree_hal_descriptor_set_t* base_descriptor_set);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_NATIVE_DESCRIPTOR_SET_H_
diff --git a/runtime/src/iree/hal/vulkan/native_descriptor_set_layout.cc b/runtime/src/iree/hal/vulkan/native_descriptor_set_layout.cc
new file mode 100644
index 0000000..87965c4
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_descriptor_set_layout.cc
@@ -0,0 +1,162 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/native_descriptor_set_layout.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/extensibility_util.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+using namespace iree::hal::vulkan;
+
+// HAL resource owning a VkDescriptorSetLayout; the Vulkan handle is destroyed
+// when the resource is destroyed.
+typedef struct iree_hal_vulkan_native_descriptor_set_layout_t {
+  iree_hal_resource_t resource;
+  VkDeviceHandle* logical_device;
+  VkDescriptorSetLayout handle;
+} iree_hal_vulkan_native_descriptor_set_layout_t;
+
+namespace {
+extern const iree_hal_descriptor_set_layout_vtable_t
+    iree_hal_vulkan_native_descriptor_set_layout_vtable;
+}  // namespace
+
+// Downcasts a base layout to the Vulkan implementation, asserting the vtable
+// matches in debug builds.
+static iree_hal_vulkan_native_descriptor_set_layout_t*
+iree_hal_vulkan_native_descriptor_set_layout_cast(
+    iree_hal_descriptor_set_layout_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value,
+                       &iree_hal_vulkan_native_descriptor_set_layout_vtable);
+  return (iree_hal_vulkan_native_descriptor_set_layout_t*)base_value;
+}
+
+// Translates HAL bindings into VkDescriptorSetLayoutBinding entries and calls
+// vkCreateDescriptorSetLayout. Temporary binding storage is heap-allocated
+// from the device's host allocator and freed before returning.
+static iree_status_t iree_hal_vulkan_create_descriptor_set_layout(
+    VkDeviceHandle* logical_device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    VkDescriptorSetLayout* out_handle) {
+  VkDescriptorSetLayoutCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+  create_info.pNext = NULL;
+  create_info.flags = 0;
+  if (usage_type == IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_PUSH_ONLY &&
+      logical_device->enabled_extensions().push_descriptors) {
+    // Note that we can *only* use push descriptor sets if we set this create
+    // flag. If push descriptors aren't supported we emulate them with normal
+    // descriptors so it's fine to have kPushOnly without support.
+    create_info.flags |=
+        VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
+  }
+
+  VkDescriptorSetLayoutBinding* native_bindings = NULL;
+  if (binding_count > 0) {
+    // TODO(benvanik): avoid this allocation if possible (inline_array).
+    IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+        logical_device->host_allocator(),
+        binding_count * sizeof(VkDescriptorSetLayoutBinding),
+        (void**)&native_bindings));
+    for (iree_host_size_t i = 0; i < binding_count; ++i) {
+      VkDescriptorSetLayoutBinding* native_binding = &native_bindings[i];
+      native_binding->binding = bindings[i].binding;
+      native_binding->descriptorType =
+          static_cast<VkDescriptorType>(bindings[i].type);
+      // Each binding holds exactly one descriptor and is only visible to
+      // compute shaders (IREE dispatches are compute-only).
+      native_binding->descriptorCount = 1;
+      native_binding->stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+      native_binding->pImmutableSamplers = NULL;
+    }
+  }
+  create_info.bindingCount = (uint32_t)binding_count;
+  create_info.pBindings = native_bindings;
+
+  iree_status_t status =
+      VK_RESULT_TO_STATUS(logical_device->syms()->vkCreateDescriptorSetLayout(
+                              *logical_device, &create_info,
+                              logical_device->allocator(), out_handle),
+                          "vkCreateDescriptorSetLayout");
+
+  // Freed on both success and failure paths (may be NULL when
+  // binding_count == 0).
+  iree_allocator_free(logical_device->host_allocator(), native_bindings);
+  return status;
+}
+
+// Destroys |handle| via the device's symbols; a VK_NULL_HANDLE is a no-op.
+static void iree_hal_vulkan_destroy_descriptor_set_layout(
+    VkDeviceHandle* logical_device, VkDescriptorSetLayout handle) {
+  if (handle == VK_NULL_HANDLE) return;
+  logical_device->syms()->vkDestroyDescriptorSetLayout(
+      *logical_device, handle, logical_device->allocator());
+}
+
+// Creates the Vulkan layout then wraps it in a HAL resource. On wrapper
+// allocation failure the freshly-created Vulkan handle is destroyed so no
+// handle leaks.
+iree_status_t iree_hal_vulkan_native_descriptor_set_layout_create(
+    VkDeviceHandle* logical_device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(!binding_count || bindings);
+  IREE_ASSERT_ARGUMENT(out_descriptor_set_layout);
+  *out_descriptor_set_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  VkDescriptorSetLayout handle = VK_NULL_HANDLE;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_vulkan_create_descriptor_set_layout(
+              logical_device, usage_type, binding_count, bindings, &handle));
+
+  iree_hal_vulkan_native_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  iree_status_t status = iree_allocator_malloc(logical_device->host_allocator(),
+                                               sizeof(*descriptor_set_layout),
+                                               (void**)&descriptor_set_layout);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(
+        &iree_hal_vulkan_native_descriptor_set_layout_vtable,
+        &descriptor_set_layout->resource);
+    descriptor_set_layout->logical_device = logical_device;
+    descriptor_set_layout->handle = handle;
+    *out_descriptor_set_layout =
+        (iree_hal_descriptor_set_layout_t*)descriptor_set_layout;
+  } else {
+    iree_hal_vulkan_destroy_descriptor_set_layout(logical_device, handle);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys both the Vulkan handle and the wrapper allocation.
+static void iree_hal_vulkan_native_descriptor_set_layout_destroy(
+    iree_hal_descriptor_set_layout_t* base_descriptor_set_layout) {
+  iree_hal_vulkan_native_descriptor_set_layout_t* descriptor_set_layout =
+      iree_hal_vulkan_native_descriptor_set_layout_cast(
+          base_descriptor_set_layout);
+  iree_allocator_t host_allocator =
+      descriptor_set_layout->logical_device->host_allocator();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_vulkan_destroy_descriptor_set_layout(
+      descriptor_set_layout->logical_device, descriptor_set_layout->handle);
+  iree_allocator_free(host_allocator, descriptor_set_layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the raw Vulkan handle backing |base_descriptor_set_layout|.
+VkDescriptorSetLayout iree_hal_vulkan_native_descriptor_set_layout_handle(
+    iree_hal_descriptor_set_layout_t* base_descriptor_set_layout) {
+  iree_hal_vulkan_native_descriptor_set_layout_t* descriptor_set_layout =
+      iree_hal_vulkan_native_descriptor_set_layout_cast(
+          base_descriptor_set_layout);
+  return descriptor_set_layout->handle;
+}
+
+namespace {
+const iree_hal_descriptor_set_layout_vtable_t
+    iree_hal_vulkan_native_descriptor_set_layout_vtable = {
+        /*.destroy=*/iree_hal_vulkan_native_descriptor_set_layout_destroy,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/native_descriptor_set_layout.h b/runtime/src/iree/hal/vulkan/native_descriptor_set_layout.h
new file mode 100644
index 0000000..0faa9d5
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_descriptor_set_layout.h
@@ -0,0 +1,34 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_NATIVE_DESCRIPTOR_SET_LAYOUT_H_
+#define IREE_HAL_VULKAN_NATIVE_DESCRIPTOR_SET_LAYOUT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a native Vulkan VkDescriptorSetLayout object.
+// The returned HAL resource owns the Vulkan handle and destroys it when the
+// resource is released.
+iree_status_t iree_hal_vulkan_native_descriptor_set_layout_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout);
+
+// Returns the native Vulkan VkDescriptorSetLayout handle.
+VkDescriptorSetLayout iree_hal_vulkan_native_descriptor_set_layout_handle(
+    iree_hal_descriptor_set_layout_t* base_descriptor_set_layout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_NATIVE_DESCRIPTOR_SET_LAYOUT_H_
diff --git a/runtime/src/iree/hal/vulkan/native_event.cc b/runtime/src/iree/hal/vulkan/native_event.cc
new file mode 100644
index 0000000..09dd2be
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_event.cc
@@ -0,0 +1,103 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/native_event.h"
+
+#include <cstddef>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+using namespace iree::hal::vulkan;
+
+// HAL resource owning a VkEvent; the Vulkan handle is destroyed when the
+// resource is destroyed.
+typedef struct iree_hal_vulkan_native_event_t {
+  iree_hal_resource_t resource;
+  VkDeviceHandle* logical_device;
+  VkEvent handle;
+} iree_hal_vulkan_native_event_t;
+
+namespace {
+extern const iree_hal_event_vtable_t iree_hal_vulkan_native_event_vtable;
+}  // namespace
+
+// Downcasts a base event to the Vulkan implementation, asserting the vtable
+// matches in debug builds.
+static iree_hal_vulkan_native_event_t* iree_hal_vulkan_native_event_cast(
+    iree_hal_event_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_native_event_vtable);
+  return (iree_hal_vulkan_native_event_t*)base_value;
+}
+
+// Calls vkCreateEvent with default (no-flag) creation parameters.
+static iree_status_t iree_hal_vulkan_create_event(
+    VkDeviceHandle* logical_device, VkEvent* out_handle) {
+  VkEventCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO;
+  create_info.pNext = NULL;
+  create_info.flags = 0;
+  return VK_RESULT_TO_STATUS(logical_device->syms()->vkCreateEvent(
+                                 *logical_device, &create_info,
+                                 logical_device->allocator(), out_handle),
+                             "vkCreateEvent");
+}
+
+// Destroys |handle| via the device's symbols; a VK_NULL_HANDLE is a no-op.
+static void iree_hal_vulkan_destroy_event(VkDeviceHandle* logical_device,
+                                          VkEvent handle) {
+  if (handle == VK_NULL_HANDLE) return;
+  logical_device->syms()->vkDestroyEvent(*logical_device, handle,
+                                         logical_device->allocator());
+}
+
+// Creates the VkEvent then wraps it in a HAL resource. On wrapper allocation
+// failure the freshly-created Vulkan handle is destroyed so no handle leaks.
+iree_status_t iree_hal_vulkan_native_event_create(
+    VkDeviceHandle* logical_device, iree_hal_event_t** out_event) {
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(out_event);
+  *out_event = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  VkEvent handle = VK_NULL_HANDLE;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_vulkan_create_event(logical_device, &handle));
+
+  iree_hal_vulkan_native_event_t* event = NULL;
+  iree_status_t status = iree_allocator_malloc(logical_device->host_allocator(),
+                                               sizeof(*event), (void**)&event);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_vulkan_native_event_vtable,
+                                 &event->resource);
+    event->logical_device = logical_device;
+    event->handle = handle;
+    *out_event = (iree_hal_event_t*)event;
+  } else {
+    iree_hal_vulkan_destroy_event(logical_device, handle);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys both the Vulkan handle and the wrapper allocation.
+static void iree_hal_vulkan_native_event_destroy(iree_hal_event_t* base_event) {
+  iree_hal_vulkan_native_event_t* event =
+      iree_hal_vulkan_native_event_cast(base_event);
+  iree_allocator_t host_allocator = event->logical_device->host_allocator();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_vulkan_destroy_event(event->logical_device, event->handle);
+  iree_allocator_free(host_allocator, event);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the raw Vulkan handle. NOTE: this casts directly without the
+// vtable assertion used elsewhere (the parameter is const).
+VkEvent iree_hal_vulkan_native_event_handle(
+    const iree_hal_event_t* base_event) {
+  return ((const iree_hal_vulkan_native_event_t*)base_event)->handle;
+}
+
+namespace {
+const iree_hal_event_vtable_t iree_hal_vulkan_native_event_vtable = {
+    /*.destroy=*/iree_hal_vulkan_native_event_destroy,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/native_event.h b/runtime/src/iree/hal/vulkan/native_event.h
new file mode 100644
index 0000000..bb641da
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_event.h
@@ -0,0 +1,30 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_NATIVE_EVENT_H_
+#define IREE_HAL_VULKAN_NATIVE_EVENT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a native Vulkan VkEvent object.
+// The returned HAL resource owns the Vulkan handle and destroys it when the
+// resource is released.
+iree_status_t iree_hal_vulkan_native_event_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree_hal_event_t** out_event);
+
+// Returns Vulkan event handle.
+VkEvent iree_hal_vulkan_native_event_handle(const iree_hal_event_t* event);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_NATIVE_EVENT_H_
diff --git a/runtime/src/iree/hal/vulkan/native_executable.cc b/runtime/src/iree/hal/vulkan/native_executable.cc
new file mode 100644
index 0000000..8d8d35a
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_executable.cc
@@ -0,0 +1,353 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/native_executable.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbol_tables.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/native_executable_layout.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+// flatcc schemas:
+#include "iree/base/internal/flatcc/parsing.h"
+#include "iree/schemas/spirv_executable_def_reader.h"
+#include "iree/schemas/spirv_executable_def_verifier.h"
+
+using namespace iree::hal::vulkan;
+
+// Per-entry-point runtime state: the compiled pipeline and the entry point
+// name (a view into the executable flatbuffer, used for tracing/source info).
+typedef struct iree_hal_vulkan_entry_point_t {
+  VkPipeline pipeline;
+  iree_string_view_t name;
+} iree_hal_vulkan_entry_point_t;
+
+// Creates a VkShaderModule from the raw SPIR-V words in |code|.
+static iree_status_t iree_hal_vulkan_create_shader_module(
+    VkDeviceHandle* logical_device, iree_const_byte_span_t code,
+    VkShaderModule* out_shader_module) {
+  IREE_TRACE_SCOPE();
+  VkShaderModuleCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+  create_info.pNext = NULL;
+  create_info.flags = 0;
+  create_info.codeSize = code.data_length;
+  create_info.pCode = (const uint32_t*)code.data;
+  VK_RETURN_IF_ERROR(logical_device->syms()->vkCreateShaderModule(
+                         *logical_device, &create_info,
+                         logical_device->allocator(), out_shader_module),
+                     "vkCreateShaderModule");
+  return iree_ok_status();
+}
+
+// Destroys |handle| via the device's symbols; a VK_NULL_HANDLE is a no-op.
+static void iree_hal_vulkan_destroy_shader_module(
+    VkDeviceHandle* logical_device, VkShaderModule handle) {
+  if (handle == VK_NULL_HANDLE) return;
+  logical_device->syms()->vkDestroyShaderModule(*logical_device, handle,
+                                                logical_device->allocator());
+}
+
+// Creates one compute pipeline per entry point in a single batched
+// vkCreateComputePipelines call. All pipelines share |shader_module| and the
+// same specialization constants; on success the pipeline handles are written
+// into |out_entry_points| (names are filled in by the caller).
+static iree_status_t iree_hal_vulkan_create_pipelines(
+    VkDeviceHandle* logical_device, VkPipelineCache pipeline_cache,
+    const iree_hal_executable_params_t* executable_params,
+    iree_SpirVExecutableDef_table_t executable_def,
+    VkShaderModule shader_module, iree_host_size_t pipeline_count,
+    iree_hal_vulkan_entry_point_t* out_entry_points) {
+  IREE_TRACE_SCOPE();
+  // Single scratch allocation laid out as:
+  //   [pipeline_count x VkComputePipelineCreateInfo]
+  //   [constant_count x VkSpecializationMapEntry]
+  uint8_t* scratch_memory = NULL;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      logical_device->host_allocator(),
+      pipeline_count * sizeof(VkComputePipelineCreateInfo) +
+          executable_params->constant_count * sizeof(VkSpecializationMapEntry),
+      (void**)&scratch_memory));
+  VkComputePipelineCreateInfo* create_infos =
+      (VkComputePipelineCreateInfo*)scratch_memory;
+  VkSpecializationMapEntry* spec_map_entries =
+      (VkSpecializationMapEntry*)(scratch_memory +
+                                  pipeline_count *
+                                      sizeof(VkComputePipelineCreateInfo));
+
+  // One specialization info shared by every pipeline stage below: constant i
+  // maps to the i-th uint32 in executable_params->constants.
+  VkSpecializationInfo spec_info;
+  memset(&spec_info, 0, sizeof(spec_info));
+  spec_info.mapEntryCount = executable_params->constant_count;
+  spec_info.pMapEntries = spec_map_entries;
+  spec_info.dataSize = executable_params->constant_count * sizeof(uint32_t);
+  spec_info.pData = executable_params->constants;
+  for (iree_host_size_t i = 0; i < executable_params->constant_count; ++i) {
+    spec_map_entries[i].constantID = i;
+    spec_map_entries[i].offset = i * sizeof(uint32_t);
+    spec_map_entries[i].size = sizeof(uint32_t);
+  }
+
+  flatbuffers_string_vec_t entry_points_vec =
+      iree_SpirVExecutableDef_entry_points_get(executable_def);
+  for (iree_host_size_t entry_ordinal = 0; entry_ordinal < pipeline_count;
+       ++entry_ordinal) {
+    VkComputePipelineCreateInfo* create_info = &create_infos[entry_ordinal];
+    create_info->sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    create_info->pNext = NULL;
+    create_info->flags = 0;
+    if (!iree_all_bits_set(
+            executable_params->caching_mode,
+            IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION)) {
+      create_info->flags |= VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT;
+    }
+    // Pipeline 0 is the derivation parent; all later pipelines derive from it
+    // (basePipelineIndex 0 within this same batched create call).
+    if (entry_ordinal == 0) {
+      create_info->flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT;
+    } else {
+      create_info->flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT;
+    }
+    create_info->layout = iree_hal_vulkan_native_executable_layout_handle(
+        executable_params->executable_layouts[entry_ordinal]);
+    create_info->basePipelineHandle = VK_NULL_HANDLE;
+    create_info->basePipelineIndex = 0;
+
+    VkPipelineShaderStageCreateInfo* stage_create_info = &create_info->stage;
+    stage_create_info->sType =
+        VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stage_create_info->pNext = NULL;
+    stage_create_info->flags = 0;
+    stage_create_info->stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    stage_create_info->module = shader_module;
+    stage_create_info->pName =
+        flatbuffers_string_vec_at(entry_points_vec, entry_ordinal);
+    stage_create_info->pSpecializationInfo = &spec_info;
+  }
+
+  // NOTE: stack allocation sized by pipeline_count; assumes entry point counts
+  // stay small enough for the stack.
+  VkPipeline* pipelines =
+      (VkPipeline*)iree_alloca(pipeline_count * sizeof(VkPipeline));
+  iree_status_t status = VK_RESULT_TO_STATUS(
+      logical_device->syms()->vkCreateComputePipelines(
+          *logical_device, pipeline_cache, (uint32_t)pipeline_count,
+          create_infos, logical_device->allocator(), pipelines),
+      "vkCreateComputePipelines");
+  if (iree_status_is_ok(status)) {
+    for (iree_host_size_t i = 0; i < pipeline_count; ++i) {
+      out_entry_points[i].pipeline = pipelines[i];
+    }
+  }
+
+  iree_allocator_free(logical_device->host_allocator(), scratch_memory);
+  return status;
+}
+
+// Destroys |handle| via the device's symbols; a VK_NULL_HANDLE is a no-op.
+static void iree_hal_vulkan_destroy_pipeline(VkDeviceHandle* logical_device,
+                                             VkPipeline handle) {
+  IREE_TRACE_SCOPE();
+  if (handle == VK_NULL_HANDLE) return;
+  logical_device->syms()->vkDestroyPipeline(*logical_device, handle,
+                                            logical_device->allocator());
+}
+
+// Verifies the structure of the flatbuffer so that we can avoid doing so during
+// runtime. There are still some conditions we must be aware of (such as omitted
+// names on functions with internal linkage), however we shouldn't need to
+// bounds check anything within the flatbuffer after this succeeds.
+static iree_status_t iree_hal_spirv_executable_flatbuffer_verify(
+    iree_const_byte_span_t flatbuffer_data,
+    iree_host_size_t expected_entry_point_count) {
+  if (!flatbuffer_data.data || flatbuffer_data.data_length < 16) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "flatbuffer data is not present or less than 16 bytes (%zu total)",
+        flatbuffer_data.data_length);
+  }
+
+  // Run flatcc generated verification. This ensures all pointers are in-bounds
+  // and that we can safely walk the file, but not that the actual contents of
+  // the flatbuffer meet our expectations.
+  int verify_ret = iree_SpirVExecutableDef_verify_as_root(
+      flatbuffer_data.data, flatbuffer_data.data_length);
+  if (verify_ret != flatcc_verify_ok) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "flatbuffer verification failed: %s",
+                            flatcc_verify_error_string(verify_ret));
+  }
+
+  iree_SpirVExecutableDef_table_t executable_def =
+      iree_SpirVExecutableDef_as_root(flatbuffer_data.data);
+
+  // Entry point count must match the layout count the caller supplied so the
+  // per-ordinal indexing during pipeline creation is in bounds.
+  flatbuffers_string_vec_t entry_points_vec =
+      iree_SpirVExecutableDef_entry_points_get(executable_def);
+  size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec);
+  if (entry_point_count != expected_entry_point_count) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "executable provides %zu entry points but caller "
+                            "provided %zu; must match",
+                            entry_point_count, expected_entry_point_count);
+  }
+
+  // Every entry point must have a non-empty name (used as the shader stage
+  // pName during pipeline creation).
+  for (size_t i = 0; i < entry_point_count; ++i) {
+    if (!flatbuffers_string_len(
+            flatbuffers_string_vec_at(entry_points_vec, i))) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "executable entry point %zu has no name", i);
+    }
+  }
+
+  if (flatbuffers_uint32_vec_len(
+          iree_SpirVExecutableDef_code_get(executable_def)) == 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "executable SPIR-V code is missing/empty");
+  }
+
+  return iree_ok_status();
+}
+
+// HAL executable: one shader module compiled into one pipeline per entry
+// point. entry_points is a C flexible array member allocated inline with the
+// struct.
+typedef struct iree_hal_vulkan_native_executable_t {
+  iree_hal_resource_t resource;
+  VkDeviceHandle* logical_device;
+  iree_host_size_t entry_point_count;
+  iree_hal_vulkan_entry_point_t entry_points[];
+} iree_hal_vulkan_native_executable_t;
+
+namespace {
+extern const iree_hal_executable_vtable_t
+    iree_hal_vulkan_native_executable_vtable;
+}  // namespace
+
+// Downcasts a base executable to the Vulkan implementation, asserting the
+// vtable matches in debug builds.
+static iree_hal_vulkan_native_executable_t*
+iree_hal_vulkan_native_executable_cast(iree_hal_executable_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_native_executable_vtable);
+  return (iree_hal_vulkan_native_executable_t*)base_value;
+}
+
+// Verifies the flatbuffer, creates the (temporary) shader module, builds all
+// pipelines, and records entry point names. The shader module is destroyed
+// before returning regardless of outcome — only the pipelines keep it alive.
+iree_status_t iree_hal_vulkan_native_executable_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    VkPipelineCache pipeline_cache,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(executable_params);
+  IREE_ASSERT_ARGUMENT(out_executable);
+  *out_executable = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Verify and fetch the executable flatbuffer wrapper.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_spirv_executable_flatbuffer_verify(
+              executable_params->executable_data,
+              executable_params->executable_layout_count));
+  iree_SpirVExecutableDef_table_t executable_def =
+      iree_SpirVExecutableDef_as_root(executable_params->executable_data.data);
+
+  // Create the shader module.
+  flatbuffers_uint32_vec_t code_vec =
+      iree_SpirVExecutableDef_code_get(executable_def);
+  VkShaderModule shader_module = VK_NULL_HANDLE;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_vulkan_create_shader_module(
+              logical_device,
+              iree_make_const_byte_span(
+                  code_vec,
+                  flatbuffers_uint32_vec_len(code_vec) * sizeof(uint32_t)),
+              &shader_module));
+
+  // Create pipelines for each entry point.
+  flatbuffers_string_vec_t entry_points_vec =
+      iree_SpirVExecutableDef_entry_points_get(executable_def);
+  iree_host_size_t entry_point_count =
+      flatbuffers_string_vec_len(entry_points_vec);
+
+  // Single allocation for the executable plus its inline entry point array.
+  iree_hal_vulkan_native_executable_t* executable = NULL;
+  iree_host_size_t total_size =
+      sizeof(*executable) +
+      entry_point_count * sizeof(*executable->entry_points);
+  iree_status_t status = iree_allocator_malloc(logical_device->host_allocator(),
+                                               total_size, (void**)&executable);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_vulkan_native_executable_vtable,
+                                 &executable->resource);
+    executable->logical_device = logical_device;
+    executable->entry_point_count = entry_point_count;
+    memset(executable->entry_points, 0,
+           entry_point_count * sizeof(*executable->entry_points));
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_create_pipelines(
+        logical_device, pipeline_cache, executable_params, executable_def,
+        shader_module, executable->entry_point_count, executable->entry_points);
+  }
+  // The shader module is only needed during pipeline creation.
+  iree_hal_vulkan_destroy_shader_module(logical_device, shader_module);
+
+  if (iree_status_is_ok(status)) {
+    flatbuffers_string_vec_t entry_points_vec =
+        iree_SpirVExecutableDef_entry_points_get(executable_def);
+    for (iree_host_size_t i = 0; i < entry_point_count; ++i) {
+      flatbuffers_string_t name =
+          flatbuffers_string_vec_at(entry_points_vec, i);
+      executable->entry_points[i].name =
+          iree_make_string_view(name, flatbuffers_string_len(name));
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, name);
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_executable = (iree_hal_executable_t*)executable;
+  } else {
+    // NOTE(review): |executable| is NULL here when the allocation above
+    // failed; assumes iree_hal_executable_destroy tolerates NULL — confirm.
+    iree_hal_executable_destroy((iree_hal_executable_t*)executable);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys every entry point pipeline and the executable allocation itself.
+static void iree_hal_vulkan_native_executable_destroy(
+    iree_hal_executable_t* base_executable) {
+  iree_hal_vulkan_native_executable_t* executable =
+      iree_hal_vulkan_native_executable_cast(base_executable);
+  iree_allocator_t host_allocator =
+      executable->logical_device->host_allocator();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < executable->entry_point_count; ++i) {
+    iree_hal_vulkan_destroy_pipeline(executable->logical_device,
+                                     executable->entry_points[i].pipeline);
+  }
+  iree_allocator_free(host_allocator, executable);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Fills |out_source_location| for the entry point; zeroed (empty) when
+// |entry_ordinal| is out of range.
+void iree_hal_vulkan_native_executable_entry_point_source_location(
+    iree_hal_executable_t* base_executable, iree_host_size_t entry_ordinal,
+    iree_hal_vulkan_source_location_t* out_source_location) {
+  iree_hal_vulkan_native_executable_t* executable =
+      iree_hal_vulkan_native_executable_cast(base_executable);
+  memset(out_source_location, 0, sizeof(*out_source_location));
+  if (entry_ordinal >= executable->entry_point_count) {
+    return;
+  }
+  out_source_location->func_name = executable->entry_points[entry_ordinal].name;
+
+  // TODO(benvanik): plumb through file name/line for the MLIR function.
+  out_source_location->file_name = out_source_location->func_name;
+  out_source_location->line = 0;
+}
+
+// Looks up the pipeline for |entry_ordinal|, failing with OUT_OF_RANGE on an
+// invalid ordinal.
+iree_status_t iree_hal_vulkan_native_executable_pipeline_for_entry_point(
+    iree_hal_executable_t* base_executable, iree_host_size_t entry_ordinal,
+    VkPipeline* out_pipeline_handle) {
+  iree_hal_vulkan_native_executable_t* executable =
+      iree_hal_vulkan_native_executable_cast(base_executable);
+  if (entry_ordinal >= executable->entry_point_count) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "invalid entry point ordinal %zu", entry_ordinal);
+  }
+  *out_pipeline_handle = executable->entry_points[entry_ordinal].pipeline;
+  return iree_ok_status();
+}
+
+namespace {
+const iree_hal_executable_vtable_t iree_hal_vulkan_native_executable_vtable = {
+    /*.destroy=*/iree_hal_vulkan_native_executable_destroy,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/native_executable.h b/runtime/src/iree/hal/vulkan/native_executable.h
new file mode 100644
index 0000000..e7f3c98
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_executable.h
@@ -0,0 +1,52 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_NATIVE_EXECUTABLE_H_
+#define IREE_HAL_VULKAN_NATIVE_EXECUTABLE_H_
+
+// clang-format off: must be included before all other headers.
+#include "iree/hal/vulkan/vulkan_headers.h"
+// clang-format on
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Debug source location of a single executable entry point.
+// Views are unowned and remain valid only while the executable is live.
+typedef struct iree_hal_vulkan_source_location_t {
+  iree_string_view_t file_name;  // may be empty if unavailable
+  int line;                      // 0 when unknown
+  iree_string_view_t func_name;  // entry point function name
+} iree_hal_vulkan_source_location_t;
+
+// Creates a wrapper for one or more VkPipelines that are sourced from the same
+// IREE executable. Each of the pipelines will share the same shader module
+// and just differs by the entry point into the shader module they reference.
+iree_status_t iree_hal_vulkan_native_executable_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    VkPipelineCache pipeline_cache,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable);
+
+// Returns the source location for the given entry point. May be empty if not
+// available.
+void iree_hal_vulkan_native_executable_entry_point_source_location(
+    iree_hal_executable_t* executable, iree_host_size_t entry_ordinal,
+    iree_hal_vulkan_source_location_t* out_source_location);
+
+// Returns the cached VkPipeline for the given executable |entry_ordinal|.
+iree_status_t iree_hal_vulkan_native_executable_pipeline_for_entry_point(
+    iree_hal_executable_t* executable, iree_host_size_t entry_ordinal,
+    VkPipeline* out_pipeline_handle);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_NATIVE_EXECUTABLE_H_
diff --git a/runtime/src/iree/hal/vulkan/native_executable_layout.cc b/runtime/src/iree/hal/vulkan/native_executable_layout.cc
new file mode 100644
index 0000000..572c8bd
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_executable_layout.cc
@@ -0,0 +1,175 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/native_executable_layout.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbol_tables.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/native_descriptor_set_layout.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+using namespace iree::hal::vulkan;
+
+typedef struct iree_hal_vulkan_native_executable_layout_t {
+  iree_hal_resource_t resource;    // must be first for the vtable type cast
+  VkDeviceHandle* logical_device;  // not retained here — assumed to outlive us
+  VkPipelineLayout handle;
+  iree_host_size_t set_layout_count;
+  // Retained in _create and released in _destroy; flexible array member sized
+  // at allocation time.
+  iree_hal_descriptor_set_layout_t* set_layouts[];
+} iree_hal_vulkan_native_executable_layout_t;
+
+namespace {
+// Forward declaration; defined at the bottom of this file.
+extern const iree_hal_executable_layout_vtable_t
+    iree_hal_vulkan_native_executable_layout_vtable;
+}  // namespace
+
+// Asserts (in debug builds) that |base_value| carries our vtable and
+// downcasts to the concrete type.
+static iree_hal_vulkan_native_executable_layout_t*
+iree_hal_vulkan_native_executable_layout_cast(
+    iree_hal_executable_layout_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value,
+                       &iree_hal_vulkan_native_executable_layout_vtable);
+  return (iree_hal_vulkan_native_executable_layout_t*)base_value;
+}
+
+// Creates a VkPipelineLayout from |set_layouts| plus (optionally) a single
+// compute-stage push constant range of |push_constant_count| uint32 values.
+// On success ownership of |out_handle| transfers to the caller.
+static iree_status_t iree_hal_vulkan_create_pipeline_layout(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree_host_size_t push_constant_count, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    VkPipelineLayout* out_handle) {
+  // Transient handle array only needed for the duration of the create call;
+  // stack allocation avoids touching the host allocator.
+  VkDescriptorSetLayout* set_layout_handles =
+      (VkDescriptorSetLayout*)iree_alloca(set_layout_count *
+                                          sizeof(VkDescriptorSetLayout));
+  for (iree_host_size_t i = 0; i < set_layout_count; ++i) {
+    set_layout_handles[i] =
+        iree_hal_vulkan_native_descriptor_set_layout_handle(set_layouts[i]);
+  }
+
+  VkPushConstantRange push_constant_ranges[1];
+  push_constant_ranges[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+  push_constant_ranges[0].offset = 0;
+  push_constant_ranges[0].size =
+      (uint32_t)(push_constant_count * sizeof(uint32_t));
+
+  VkPipelineLayoutCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+  create_info.pNext = nullptr;
+  create_info.flags = 0;
+  create_info.setLayoutCount = (uint32_t)set_layout_count;
+  create_info.pSetLayouts = set_layout_handles;
+  // Only advertise the push constant range when there are push constants;
+  // a zero-sized range would be invalid usage.
+  create_info.pushConstantRangeCount = push_constant_count > 0 ? 1 : 0;
+  create_info.pPushConstantRanges = push_constant_ranges;
+
+  return VK_RESULT_TO_STATUS(logical_device->syms()->vkCreatePipelineLayout(
+                                 *logical_device, &create_info,
+                                 logical_device->allocator(), out_handle),
+                             "vkCreatePipelineLayout");
+}
+
+// Destroys |handle| if valid; a no-op on VK_NULL_HANDLE so failure paths can
+// call this unconditionally.
+static void iree_hal_vulkan_destroy_pipeline_layout(
+    VkDeviceHandle* logical_device, VkPipelineLayout handle) {
+  if (handle == VK_NULL_HANDLE) return;
+  logical_device->syms()->vkDestroyPipelineLayout(*logical_device, handle,
+                                                  logical_device->allocator());
+}
+
+// Creates the executable layout wrapper: first the Vulkan pipeline layout,
+// then a single host allocation holding the struct plus the trailing
+// set_layouts[] flexible array. Retains each descriptor set layout.
+iree_status_t iree_hal_vulkan_native_executable_layout_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree_host_size_t push_constant_count, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_hal_executable_layout_t** out_executable_layout) {
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(!set_layout_count || set_layouts);
+  IREE_ASSERT_ARGUMENT(out_executable_layout);
+  *out_executable_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  VkPipelineLayout handle = VK_NULL_HANDLE;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_vulkan_create_pipeline_layout(
+              logical_device, push_constant_count, set_layout_count,
+              set_layouts, &handle));
+
+  iree_hal_vulkan_native_executable_layout_t* executable_layout = NULL;
+  // Struct plus inline storage for set_layout_count pointers.
+  iree_host_size_t total_size =
+      sizeof(*executable_layout) +
+      set_layout_count * sizeof(*executable_layout->set_layouts);
+  iree_status_t status = iree_allocator_malloc(
+      logical_device->host_allocator(), total_size, (void**)&executable_layout);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(
+        &iree_hal_vulkan_native_executable_layout_vtable,
+        &executable_layout->resource);
+    executable_layout->logical_device = logical_device;
+    executable_layout->handle = handle;
+    executable_layout->set_layout_count = set_layout_count;
+    for (iree_host_size_t i = 0; i < set_layout_count; ++i) {
+      executable_layout->set_layouts[i] = set_layouts[i];
+      iree_hal_descriptor_set_layout_retain(set_layouts[i]);
+    }
+    *out_executable_layout = (iree_hal_executable_layout_t*)executable_layout;
+  } else {
+    // Allocation failed after the Vulkan object was created: roll it back.
+    iree_hal_vulkan_destroy_pipeline_layout(logical_device, handle);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys the Vulkan pipeline layout, releases the retained descriptor set
+// layouts, and frees the wrapper allocation.
+static void iree_hal_vulkan_native_executable_layout_destroy(
+    iree_hal_executable_layout_t* base_executable_layout) {
+  iree_hal_vulkan_native_executable_layout_t* executable_layout =
+      iree_hal_vulkan_native_executable_layout_cast(base_executable_layout);
+  // Capture the allocator before freeing the struct it came from.
+  iree_allocator_t host_allocator =
+      executable_layout->logical_device->host_allocator();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_vulkan_destroy_pipeline_layout(executable_layout->logical_device,
+                                          executable_layout->handle);
+  for (iree_host_size_t i = 0; i < executable_layout->set_layout_count; ++i) {
+    iree_hal_descriptor_set_layout_release(executable_layout->set_layouts[i]);
+  }
+  iree_allocator_free(host_allocator, executable_layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the native VkPipelineLayout (unowned; valid while the layout lives).
+VkPipelineLayout iree_hal_vulkan_native_executable_layout_handle(
+    iree_hal_executable_layout_t* base_executable_layout) {
+  iree_hal_vulkan_native_executable_layout_t* executable_layout =
+      iree_hal_vulkan_native_executable_layout_cast(base_executable_layout);
+  return executable_layout->handle;
+}
+
+// Returns the number of descriptor set layouts stored in the layout.
+iree_host_size_t iree_hal_vulkan_native_executable_layout_set_count(
+    iree_hal_executable_layout_t* base_executable_layout) {
+  iree_hal_vulkan_native_executable_layout_t* executable_layout =
+      iree_hal_vulkan_native_executable_layout_cast(base_executable_layout);
+  return executable_layout->set_layout_count;
+}
+
+// Returns the descriptor set layout at |set_index| without transferring
+// ownership, or NULL when the index is out of range.
+iree_hal_descriptor_set_layout_t* iree_hal_vulkan_native_executable_layout_set(
+    iree_hal_executable_layout_t* base_executable_layout,
+    iree_host_size_t set_index) {
+  iree_hal_vulkan_native_executable_layout_t* executable_layout =
+      iree_hal_vulkan_native_executable_layout_cast(base_executable_layout);
+  if (IREE_UNLIKELY(set_index >= executable_layout->set_layout_count)) {
+    return NULL;
+  }
+  return executable_layout->set_layouts[set_index];
+}
+
+namespace {
+// Definition for the vtable forward-declared near the top of this file.
+const iree_hal_executable_layout_vtable_t
+    iree_hal_vulkan_native_executable_layout_vtable = {
+        /*.destroy=*/iree_hal_vulkan_native_executable_layout_destroy,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/native_executable_layout.h b/runtime/src/iree/hal/vulkan/native_executable_layout.h
new file mode 100644
index 0000000..7f9e5af
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_executable_layout.h
@@ -0,0 +1,47 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_NATIVE_EXECUTABLE_LAYOUT_H_
+#define IREE_HAL_VULKAN_NATIVE_EXECUTABLE_LAYOUT_H_
+
+// clang-format off: must be included before all other headers.
+#include "iree/hal/vulkan/vulkan_headers.h"
+// clang-format on
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a VkPipelineLayout-based executable layout composed of one or more
+// descriptor set layouts.
+iree_status_t iree_hal_vulkan_native_executable_layout_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree_host_size_t push_constant_count, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_hal_executable_layout_t** out_executable_layout);
+
+// Returns the native VkPipelineLayout handle for the executable layout.
+VkPipelineLayout iree_hal_vulkan_native_executable_layout_handle(
+    iree_hal_executable_layout_t* executable_layout);
+
+// Returns the total number of descriptor sets within the layout.
+iree_host_size_t iree_hal_vulkan_native_executable_layout_set_count(
+    iree_hal_executable_layout_t* executable_layout);
+
+// Returns the descriptor set layout with the given |set_index|.
+iree_hal_descriptor_set_layout_t* iree_hal_vulkan_native_executable_layout_set(
+    iree_hal_executable_layout_t* executable_layout,
+    iree_host_size_t set_index);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_NATIVE_EXECUTABLE_LAYOUT_H_
diff --git a/runtime/src/iree/hal/vulkan/native_semaphore.cc b/runtime/src/iree/hal/vulkan/native_semaphore.cc
new file mode 100644
index 0000000..4ef4f36
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_semaphore.cc
@@ -0,0 +1,279 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/native_semaphore.h"
+
+#include <cstddef>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbol_tables.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+// The maximum valid payload value of an iree_hal_semaphore_t.
+// Payload values larger than this indicate that the semaphore has failed.
+//
+// This originates from Vulkan having a lower-bound of INT_MAX for
+// maxTimelineSemaphoreValueDifference and many Android devices only supporting
+// that lower-bound. At ~100 signals per second it'll take 1.5+ years to
+// saturate. We may increase this value at some point but so long as there are
+// some devices in the wild that may have this limitation we can ensure better
+// consistency across the backends by observing this.
+//
+// The major mitigation here is that in proper usage of IREE there are no
+// semaphores that are implicitly referenced by multiple VMs (each creates their
+// own internally) and in a multitenant system each session should have its own
+// semaphores - so even if the process lives for years it's highly unlikely any
+// particular session does. Whatever, 640K is enough for anyone.
+//
+// See:
+//   https://vulkan.gpuinfo.org/displayextensionproperty.php?name=maxTimelineSemaphoreValueDifference
+#define IREE_HAL_VULKAN_SEMAPHORE_MAX_VALUE (2147483647ull - 1)
+
+using namespace iree::hal::vulkan;
+
+typedef struct iree_hal_vulkan_native_semaphore_t {
+  iree_hal_resource_t resource;    // must be first for the vtable type cast
+  VkDeviceHandle* logical_device;  // not retained here — assumed to outlive us
+  VkSemaphore handle;              // VK_SEMAPHORE_TYPE_TIMELINE semaphore
+  // Sticky first-failure status, stored as an intptr so it can be swapped
+  // atomically; 0 means OK. Freed in _destroy.
+  iree_atomic_intptr_t failure_status;
+} iree_hal_vulkan_native_semaphore_t;
+
+namespace {
+// Forward declaration; defined at the bottom of this file.
+extern const iree_hal_semaphore_vtable_t
+    iree_hal_vulkan_native_semaphore_vtable;
+}  // namespace
+
+// Asserts (in debug builds) that |base_value| carries our vtable and
+// downcasts to the concrete type.
+static iree_hal_vulkan_native_semaphore_t*
+iree_hal_vulkan_native_semaphore_cast(iree_hal_semaphore_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_native_semaphore_vtable);
+  return (iree_hal_vulkan_native_semaphore_t*)base_value;
+}
+
+// Creates a Vulkan timeline semaphore starting at |initial_value| and wraps
+// it in an iree_hal_semaphore_t.
+iree_status_t iree_hal_vulkan_native_semaphore_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore) {
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(out_semaphore);
+  *out_semaphore = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Chained onto create_info.pNext to select the timeline semaphore type.
+  VkSemaphoreTypeCreateInfo timeline_create_info;
+  timeline_create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO;
+  timeline_create_info.pNext = NULL;
+  timeline_create_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE;
+  timeline_create_info.initialValue = initial_value;
+
+  VkSemaphoreCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+  create_info.pNext = &timeline_create_info;
+  create_info.flags = 0;
+  VkSemaphore handle = VK_NULL_HANDLE;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, VK_RESULT_TO_STATUS(logical_device->syms()->vkCreateSemaphore(
+                                  *logical_device, &create_info,
+                                  logical_device->allocator(), &handle),
+                              "vkCreateSemaphore"));
+
+  iree_hal_vulkan_native_semaphore_t* semaphore = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      logical_device->host_allocator(), sizeof(*semaphore), (void**)&semaphore);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_vulkan_native_semaphore_vtable,
+                                 &semaphore->resource);
+    semaphore->logical_device = logical_device;
+    semaphore->handle = handle;
+    // 0 == OK; release ordering publishes the initialized state.
+    iree_atomic_store_intptr(&semaphore->failure_status, 0,
+                             iree_memory_order_release);
+    *out_semaphore = (iree_hal_semaphore_t*)semaphore;
+  } else {
+    // Allocation failed after the Vulkan object was created: roll it back.
+    logical_device->syms()->vkDestroySemaphore(*logical_device, handle,
+                                               logical_device->allocator());
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees any stored failure status, destroys the VkSemaphore, and frees the
+// wrapper allocation.
+static void iree_hal_vulkan_native_semaphore_destroy(
+    iree_hal_semaphore_t* base_semaphore) {
+  iree_hal_vulkan_native_semaphore_t* semaphore =
+      iree_hal_vulkan_native_semaphore_cast(base_semaphore);
+  // Capture the allocator before freeing the struct it came from.
+  iree_allocator_t host_allocator = semaphore->logical_device->host_allocator();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_free((iree_status_t)iree_atomic_load_intptr(
+      &semaphore->failure_status, iree_memory_order_acquire));
+  semaphore->logical_device->syms()->vkDestroySemaphore(
+      *semaphore->logical_device, semaphore->handle,
+      semaphore->logical_device->allocator());
+  iree_allocator_free(host_allocator, semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the native VkSemaphore (unowned; valid while the semaphore lives).
+VkSemaphore iree_hal_vulkan_native_semaphore_handle(
+    iree_hal_semaphore_t* base_semaphore) {
+  iree_hal_vulkan_native_semaphore_t* semaphore =
+      iree_hal_vulkan_native_semaphore_cast(base_semaphore);
+  return semaphore->handle;
+}
+
+// Queries the current payload value. Values beyond the max indicate the
+// semaphore was failed; in that case the stored failure status (or a generic
+// overflow error) is returned instead of a value.
+static iree_status_t iree_hal_vulkan_native_semaphore_query(
+    iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) {
+  iree_hal_vulkan_native_semaphore_t* semaphore =
+      iree_hal_vulkan_native_semaphore_cast(base_semaphore);
+  *out_value = 0;
+
+  uint64_t value = 0;
+  IREE_RETURN_IF_ERROR(VK_RESULT_TO_STATUS(
+      semaphore->logical_device->syms()->vkGetSemaphoreCounterValue(
+          *semaphore->logical_device, semaphore->handle, &value),
+      "vkGetSemaphoreCounterValue"));
+
+  if (value > IREE_HAL_VULKAN_SEMAPHORE_MAX_VALUE) {
+    iree_status_t failure_status = (iree_status_t)iree_atomic_load_intptr(
+        &semaphore->failure_status, iree_memory_order_acquire);
+    if (iree_status_is_ok(failure_status)) {
+      // Payload overflowed without anyone recording an explicit failure.
+      return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                              "overflowed timeline semaphore max value");
+    }
+    // Clone so the stored status remains owned by the semaphore.
+    return iree_status_clone(failure_status);
+  }
+
+  *out_value = value;
+  return iree_ok_status();
+}
+
+// Signals the timeline semaphore to |new_value| from the host.
+static iree_status_t iree_hal_vulkan_native_semaphore_signal(
+    iree_hal_semaphore_t* base_semaphore, uint64_t new_value) {
+  iree_hal_vulkan_native_semaphore_t* semaphore =
+      iree_hal_vulkan_native_semaphore_cast(base_semaphore);
+
+  VkSemaphoreSignalInfo signal_info;
+  signal_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO;
+  signal_info.pNext = NULL;
+  signal_info.semaphore = semaphore->handle;
+  signal_info.value = new_value;
+  return VK_RESULT_TO_STATUS(
+      semaphore->logical_device->syms()->vkSignalSemaphore(
+          *semaphore->logical_device, &signal_info),
+      "vkSignalSemaphore");
+}
+
+// Marks the semaphore as failed with |status| (taking ownership of it) and
+// signals the payload past the max value so waiters observe the failure.
+static void iree_hal_vulkan_native_semaphore_fail(
+    iree_hal_semaphore_t* base_semaphore, iree_status_t status) {
+  iree_hal_vulkan_native_semaphore_t* semaphore =
+      iree_hal_vulkan_native_semaphore_cast(base_semaphore);
+
+  // Try to set our local status - we only preserve the first failure so only
+  // do this if we are going from a valid semaphore to a failed one.
+  iree_status_t old_status = iree_ok_status();
+  if (!iree_atomic_compare_exchange_strong_intptr(
+          &semaphore->failure_status, (intptr_t*)&old_status, (intptr_t)status,
+          iree_memory_order_seq_cst, iree_memory_order_seq_cst)) {
+    // Previous status was not OK; drop our new status.
+    IREE_IGNORE_ERROR(status);
+    return;
+  }
+  // On CAS success ownership of |status| transferred into failure_status;
+  // it is freed later by _destroy.
+
+  VkSemaphoreSignalInfo signal_info;
+  signal_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO;
+  signal_info.pNext = NULL;
+  signal_info.semaphore = semaphore->handle;
+  // Any payload beyond the max value marks the semaphore as failed.
+  signal_info.value = IREE_HAL_VULKAN_SEMAPHORE_MAX_VALUE + 1;
+  // NOTE: we don't care about the result in case of failures as we are
+  // failing and the caller will likely be tearing everything down anyway.
+  semaphore->logical_device->syms()->vkSignalSemaphore(
+      *semaphore->logical_device, &signal_info);
+}
+
+// Waits on all (or, with VK_SEMAPHORE_WAIT_ANY_BIT in |wait_flags|, any) of
+// the semaphores in |semaphore_list| reaching their payload values before the
+// timeout expires.
+iree_status_t iree_hal_vulkan_native_semaphore_multi_wait(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
+    VkSemaphoreWaitFlags wait_flags) {
+  if (semaphore_list->count == 0) return iree_ok_status();
+
+  // Convert the IREE timeout into the relative nanosecond count Vulkan wants.
+  iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
+
+  uint64_t timeout_ns;
+  if (deadline_ns == IREE_TIME_INFINITE_FUTURE) {
+    timeout_ns = UINT64_MAX;
+  } else if (deadline_ns == IREE_TIME_INFINITE_PAST) {
+    timeout_ns = 0;  // poll
+  } else {
+    iree_time_t now_ns = iree_time_now();
+    if (deadline_ns < now_ns) {
+      // Already expired; skip the syscall entirely.
+      return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+    }
+    timeout_ns = (uint64_t)(deadline_ns - now_ns);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Transient handle array scoped to this call only.
+  VkSemaphore* semaphore_handles =
+      (VkSemaphore*)iree_alloca(semaphore_list->count * sizeof(VkSemaphore));
+  for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) {
+    semaphore_handles[i] =
+        iree_hal_vulkan_native_semaphore_handle(semaphore_list->semaphores[i]);
+  }
+
+  VkSemaphoreWaitInfo wait_info;
+  wait_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO;
+  wait_info.pNext = nullptr;
+  wait_info.flags = wait_flags;
+  wait_info.semaphoreCount = semaphore_list->count;
+  wait_info.pSemaphores = semaphore_handles;
+  // The payload value array is passed through without copying; the
+  // static_assert guards the layout assumption that makes this safe.
+  wait_info.pValues = semaphore_list->payload_values;
+  static_assert(
+      sizeof(wait_info.pValues[0]) == sizeof(semaphore_list->payload_values[0]),
+      "payload value type must match vulkan expected size");
+
+  // NOTE: this may fail with a timeout (VK_TIMEOUT) or in the case of a
+  // device loss event may return either VK_SUCCESS *or* VK_ERROR_DEVICE_LOST.
+  // We may want to explicitly query for device loss after a successful wait
+  // to ensure we consistently return errors.
+  VkResult result = logical_device->syms()->vkWaitSemaphores(
+      *logical_device, &wait_info, timeout_ns);
+
+  IREE_TRACE_ZONE_END(z0);
+
+  if (result == VK_SUCCESS) {
+    return iree_ok_status();
+  } else if (result == VK_ERROR_DEVICE_LOST) {
+    // Nothing we do now matters.
+    return VK_RESULT_TO_STATUS(result, "vkWaitSemaphores");
+  } else if (result == VK_TIMEOUT) {
+    return iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+  }
+  return VK_RESULT_TO_STATUS(result, "vkWaitSemaphores");
+}
+
+// Single-semaphore wait implemented as a one-element multi-wait.
+static iree_status_t iree_hal_vulkan_native_semaphore_wait(
+    iree_hal_semaphore_t* base_semaphore, uint64_t value,
+    iree_timeout_t timeout) {
+  iree_hal_vulkan_native_semaphore_t* semaphore =
+      iree_hal_vulkan_native_semaphore_cast(base_semaphore);
+  // Stack-built list referencing the caller's semaphore/value directly.
+  iree_hal_semaphore_list_t semaphore_list = {
+      /*.count=*/1,
+      /*.semaphores=*/&base_semaphore,
+      /*.payload_values=*/&value,
+  };
+  return iree_hal_vulkan_native_semaphore_multi_wait(
+      semaphore->logical_device, &semaphore_list, timeout, 0);
+}
+
+namespace {
+// Definition for the vtable forward-declared near the top of this file.
+const iree_hal_semaphore_vtable_t iree_hal_vulkan_native_semaphore_vtable = {
+    /*.destroy=*/iree_hal_vulkan_native_semaphore_destroy,
+    /*.query=*/iree_hal_vulkan_native_semaphore_query,
+    /*.signal=*/iree_hal_vulkan_native_semaphore_signal,
+    /*.fail=*/iree_hal_vulkan_native_semaphore_fail,
+    /*.wait=*/iree_hal_vulkan_native_semaphore_wait,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/native_semaphore.h b/runtime/src/iree/hal/vulkan/native_semaphore.h
new file mode 100644
index 0000000..91580de
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/native_semaphore.h
@@ -0,0 +1,46 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_NATIVE_SEMAPHORE_H_
+#define IREE_HAL_VULKAN_NATIVE_SEMAPHORE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a timeline semaphore implemented using the native VkSemaphore type.
+// This may require emulation pre-Vulkan 1.2 when timeline semaphores were only
+// an extension.
+iree_status_t iree_hal_vulkan_native_semaphore_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore);
+
+// Returns the Vulkan timeline semaphore handle.
+VkSemaphore iree_hal_vulkan_native_semaphore_handle(
+    iree_hal_semaphore_t* semaphore);
+
+// Performs a multi-wait on one or more semaphores.
+// By default this is an all-wait but |wait_flags| may contain
+// VK_SEMAPHORE_WAIT_ANY_BIT to change to an any-wait.
+//
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if the wait does not complete before
+// |deadline_ns| elapses.
+iree_status_t iree_hal_vulkan_native_semaphore_multi_wait(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
+    VkSemaphoreWaitFlags wait_flags);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_NATIVE_SEMAPHORE_H_
diff --git a/runtime/src/iree/hal/vulkan/nop_executable_cache.cc b/runtime/src/iree/hal/vulkan/nop_executable_cache.cc
new file mode 100644
index 0000000..fdd5348
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/nop_executable_cache.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/nop_executable_cache.h"
+
+#include <cstddef>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbol_tables.h"
+#include "iree/hal/vulkan/native_executable.h"
+
+using namespace iree::hal::vulkan;
+
+// Stateless "cache" that simply forwards preparation requests; holds only the
+// device needed to create executables.
+typedef struct iree_hal_vulkan_nop_executable_cache_t {
+  iree_hal_resource_t resource;    // must be first for the vtable type cast
+  VkDeviceHandle* logical_device;  // not retained here — assumed to outlive us
+} iree_hal_vulkan_nop_executable_cache_t;
+
+namespace {
+// Forward declaration; defined at the bottom of this file.
+extern const iree_hal_executable_cache_vtable_t
+    iree_hal_vulkan_nop_executable_cache_vtable;
+}  // namespace
+
+// Asserts (in debug builds) that |base_value| carries our vtable and
+// downcasts to the concrete type.
+static iree_hal_vulkan_nop_executable_cache_t*
+iree_hal_vulkan_nop_executable_cache_cast(
+    iree_hal_executable_cache_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value,
+                       &iree_hal_vulkan_nop_executable_cache_vtable);
+  return (iree_hal_vulkan_nop_executable_cache_t*)base_value;
+}
+
+// Creates the no-op executable cache wrapper.
+// NOTE(review): |identifier| is currently unused by this implementation.
+iree_status_t iree_hal_vulkan_nop_executable_cache_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree_string_view_t identifier,
+    iree_hal_executable_cache_t** out_executable_cache) {
+  IREE_ASSERT_ARGUMENT(out_executable_cache);
+  *out_executable_cache = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_vulkan_nop_executable_cache_t* executable_cache = NULL;
+  iree_status_t status = iree_allocator_malloc(logical_device->host_allocator(),
+                                               sizeof(*executable_cache),
+                                               (void**)&executable_cache);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_vulkan_nop_executable_cache_vtable,
+                                 &executable_cache->resource);
+    executable_cache->logical_device = logical_device;
+
+    *out_executable_cache = (iree_hal_executable_cache_t*)executable_cache;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the wrapper allocation; there is no cached state to release.
+static void iree_hal_vulkan_nop_executable_cache_destroy(
+    iree_hal_executable_cache_t* base_executable_cache) {
+  iree_hal_vulkan_nop_executable_cache_t* executable_cache =
+      iree_hal_vulkan_nop_executable_cache_cast(base_executable_cache);
+  // Capture the allocator before freeing the struct it came from.
+  iree_allocator_t host_allocator =
+      executable_cache->logical_device->host_allocator();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, executable_cache);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Only SPIR-V executables ("SPVE") are supported; caching mode is ignored
+// since nothing is cached.
+static bool iree_hal_vulkan_nop_executable_cache_can_prepare_format(
+    iree_hal_executable_cache_t* base_executable_cache,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  return iree_string_view_equal(executable_format,
+                                iree_make_cstring_view("SPVE"));
+}
+
+// Compiles every request from scratch by delegating to the native executable
+// with no VkPipelineCache — hence the "nop" in the cache's name.
+static iree_status_t iree_hal_vulkan_nop_executable_cache_prepare_executable(
+    iree_hal_executable_cache_t* base_executable_cache,
+    const iree_hal_executable_params_t* executable_params,
+    iree_hal_executable_t** out_executable) {
+  iree_hal_vulkan_nop_executable_cache_t* executable_cache =
+      iree_hal_vulkan_nop_executable_cache_cast(base_executable_cache);
+  return iree_hal_vulkan_native_executable_create(
+      executable_cache->logical_device,
+      /*pipeline_cache=*/VK_NULL_HANDLE, executable_params, out_executable);
+}
+
+namespace {
+// Definition for the vtable forward-declared near the top of this file.
+const iree_hal_executable_cache_vtable_t
+    iree_hal_vulkan_nop_executable_cache_vtable = {
+        /*.destroy=*/iree_hal_vulkan_nop_executable_cache_destroy,
+        /*.can_prepare_format=*/
+        iree_hal_vulkan_nop_executable_cache_can_prepare_format,
+        /*.prepare_executable=*/
+        iree_hal_vulkan_nop_executable_cache_prepare_executable,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/nop_executable_cache.h b/runtime/src/iree/hal/vulkan/nop_executable_cache.h
new file mode 100644
index 0000000..7a3e10b
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/nop_executable_cache.h
@@ -0,0 +1,30 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_NOP_EXECUTABLE_CACHE_H_
+#define IREE_HAL_VULKAN_NOP_EXECUTABLE_CACHE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a no-op executable cache that does not cache at all.
+// This is useful to isolate pipeline caching behavior and verify compilation
+// behavior.
+iree_status_t iree_hal_vulkan_nop_executable_cache_create(
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree_string_view_t identifier,
+    iree_hal_executable_cache_t** out_executable_cache);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_NOP_EXECUTABLE_CACHE_H_
diff --git a/runtime/src/iree/hal/vulkan/registration/BUILD b/runtime/src/iree/hal/vulkan/registration/BUILD
new file mode 100644
index 0000000..8706fbb
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/registration/BUILD
@@ -0,0 +1,45 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_cmake_extra_content(
+    content = """
+if(${IREE_HAL_DRIVER_VULKAN})
+""",
+    inline = True,
+)
+
+iree_runtime_cc_library(
+    name = "registration",
+    srcs = ["driver_module.cc"],
+    hdrs = ["driver_module.h"],
+    defines = [
+        "IREE_HAL_HAVE_VULKAN_DRIVER_MODULE=1",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal:flags",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/vulkan",
+    ],
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+""",
+    inline = True,
+)
diff --git a/runtime/src/iree/hal/vulkan/registration/CMakeLists.txt b/runtime/src/iree/hal/vulkan/registration/CMakeLists.txt
new file mode 100644
index 0000000..14854c0
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/registration/CMakeLists.txt
@@ -0,0 +1,37 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/vulkan/registration/BUILD                               #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+if(${IREE_HAL_DRIVER_VULKAN})
+
+iree_cc_library(
+  NAME
+    registration
+  HDRS
+    "driver_module.h"
+  SRCS
+    "driver_module.cc"
+  DEPS
+    iree::base
+    iree::base::cc
+    iree::base::core_headers
+    iree::base::internal::flags
+    iree::base::tracing
+    iree::hal
+    iree::hal::vulkan
+  DEFINES
+    "IREE_HAL_HAVE_VULKAN_DRIVER_MODULE=1"
+  PUBLIC
+)
+
+endif()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+
+# NOTE: everything above this marker is regenerated from the sibling BUILD
+# file; make target changes there and rerun bazel_to_cmake.py rather than
+# editing the generated section by hand.
diff --git a/runtime/src/iree/hal/vulkan/registration/driver_module.cc b/runtime/src/iree/hal/vulkan/registration/driver_module.cc
new file mode 100644
index 0000000..2692a31
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/registration/driver_module.cc
@@ -0,0 +1,125 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/registration/driver_module.h"
+
+#include <cinttypes>
+#include <cstddef>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/flags.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/api.h"
+
+// FourCC-style identifier ('VLK1') for the Vulkan 1.x driver exposed by this
+// factory; matched against the driver_id requested in try_create below.
+#define IREE_HAL_VULKAN_1_X_DRIVER_ID 0x564C4B31u  // VLK1
+
+// Command-line flags that seed iree_hal_vulkan_driver_options_t in
+// iree_hal_vulkan_create_driver_with_flags below. Embedders that construct
+// drivers directly (without this registration module) set the options struct
+// themselves and never consult these flags.
+IREE_FLAG(bool, vulkan_validation_layers, true,
+          "Enables standard Vulkan validation layers.");
+IREE_FLAG(bool, vulkan_debug_utils, true,
+          "Enables VK_EXT_debug_utils, records markers, and logs errors.");
+
+IREE_FLAG(int32_t, vulkan_default_index, 0,
+          "Index of the default Vulkan device.");
+
+IREE_FLAG(bool, vulkan_force_timeline_semaphore_emulation, false,
+          "Uses timeline semaphore emulation even if native support exists.");
+
+IREE_FLAG(bool, vulkan_tracing, true,
+          "Enables Vulkan tracing (if IREE tracing is enabled).");
+
+// Creates a Vulkan driver named |identifier| with options seeded from the
+// --vulkan_* flags above. On success the new driver is returned in
+// |out_driver| (allocated from |allocator|); on failure *out_driver is
+// untouched and the loaded Vulkan symbols are released.
+static iree_status_t iree_hal_vulkan_create_driver_with_flags(
+    iree_string_view_t identifier, iree_allocator_t allocator,
+    iree_hal_driver_t** out_driver) {
+  IREE_TRACE_SCOPE();
+
+  // Setup driver options from flags. We do this here as we want to enable other
+  // consumers that may not be using modules/command line flags to be able to
+  // set their options however they want.
+  iree_hal_vulkan_driver_options_t driver_options;
+  iree_hal_vulkan_driver_options_initialize(&driver_options);
+
+// TODO(benvanik): make this a flag - it's useful for testing the same binary
+// against multiple versions of Vulkan.
+#if defined(IREE_PLATFORM_ANDROID)
+  // TODO(#4494): let's see when we can always enable timeline semaphores.
+  driver_options.api_version = VK_API_VERSION_1_1;
+#else
+  driver_options.api_version = VK_API_VERSION_1_2;
+#endif  // IREE_PLATFORM_ANDROID
+
+  if (FLAG_vulkan_validation_layers) {
+    driver_options.requested_features |=
+        IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS;
+  }
+  if (FLAG_vulkan_debug_utils) {
+    driver_options.requested_features |=
+        IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS;
+  }
+  if (FLAG_vulkan_tracing) {
+    driver_options.requested_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING;
+  }
+
+  driver_options.default_device_index = FLAG_vulkan_default_index;
+
+  if (FLAG_vulkan_force_timeline_semaphore_emulation) {
+    driver_options.device_options.flags |=
+        IREE_HAL_VULKAN_DEVICE_FORCE_TIMELINE_SEMAPHORE_EMULATION;
+  }
+
+  // Load the Vulkan library. This will fail if the library cannot be found or
+  // does not have the expected functions.
+  iree_hal_vulkan_syms_t* syms = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_vulkan_syms_create_from_system_loader(allocator, &syms));
+
+  iree_status_t status = iree_hal_vulkan_driver_create(
+      identifier, &driver_options, syms, allocator, out_driver);
+
+  // Release our creation reference unconditionally; on success the driver is
+  // expected to retain its own reference to the symbols (NOTE(review):
+  // confirm against iree_hal_vulkan_driver_create).
+  iree_hal_vulkan_syms_release(syms);
+  return status;
+}
+
+// Enumerates the drivers this factory can create: a single Vulkan 1.x entry.
+// The info table is static so the returned pointer stays valid after return.
+static iree_status_t iree_hal_vulkan_driver_factory_enumerate(
+    void* self, const iree_hal_driver_info_t** out_driver_infos,
+    iree_host_size_t* out_driver_info_count) {
+  // NOTE: we could query supported vulkan versions or featuresets here.
+  static const iree_hal_driver_info_t driver_infos[1] = {{
+      /*driver_id=*/IREE_HAL_VULKAN_1_X_DRIVER_ID,
+      /*driver_name=*/iree_make_cstring_view("vulkan"),
+      /*full_name=*/iree_make_cstring_view("Vulkan 1.x (dynamic)"),
+  }};
+  *out_driver_info_count = IREE_ARRAYSIZE(driver_infos);
+  *out_driver_infos = driver_infos;
+  return iree_ok_status();
+}
+
+// Creates the driver for |driver_id|, which must be the single Vulkan 1.x
+// driver this factory advertises via enumerate above.
+static iree_status_t iree_hal_vulkan_driver_factory_try_create(
+    void* self, iree_hal_driver_id_t driver_id, iree_allocator_t allocator,
+    iree_hal_driver_t** out_driver) {
+  if (driver_id != IREE_HAL_VULKAN_1_X_DRIVER_ID) {
+    // Driver IDs are FourCC-style constants (0x564C4B31u = 'VLK1'), so print
+    // in hex: %016 + PRIx64 renders the full 16 nibbles of the 64-bit ID.
+    // The previous PRIu64 zero-padded a *decimal* rendering to 16 digits,
+    // which is unreadable and doesn't match how the IDs are declared.
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "no driver with ID %016" PRIx64
+                            " is provided by this factory",
+                            driver_id);
+  }
+
+  // When we expose more than one driver (different vulkan versions, etc) we
+  // can name them here:
+  iree_string_view_t identifier = iree_make_cstring_view("vulkan");
+
+  return iree_hal_vulkan_create_driver_with_flags(identifier, allocator,
+                                                  out_driver);
+}
+
+// Registers the Vulkan driver factory with |registry|. The factory struct is
+// static (with self=NULL) so the registry may hold the pointer for its entire
+// lifetime without ownership transfer.
+IREE_API_EXPORT iree_status_t
+iree_hal_vulkan_driver_module_register(iree_hal_driver_registry_t* registry) {
+  static const iree_hal_driver_factory_t factory = {
+      /*self=*/NULL,
+      iree_hal_vulkan_driver_factory_enumerate,
+      iree_hal_vulkan_driver_factory_try_create,
+  };
+  return iree_hal_driver_registry_register_factory(registry, &factory);
+}
diff --git a/runtime/src/iree/hal/vulkan/registration/driver_module.h b/runtime/src/iree/hal/vulkan/registration/driver_module.h
new file mode 100644
index 0000000..e6c3cf7
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/registration/driver_module.h
@@ -0,0 +1,24 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_REGISTRATION_DRIVER_MODULE_H_
+#define IREE_HAL_VULKAN_REGISTRATION_DRIVER_MODULE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Registers the Vulkan HAL driver factory with |registry| so "vulkan" drivers
+// can be created through the HAL driver registry. Linking this library also
+// defines IREE_HAL_HAVE_VULKAN_DRIVER_MODULE=1 (see the BUILD rule) so
+// callers can conditionally compile the registration call.
+IREE_API_EXPORT iree_status_t
+iree_hal_vulkan_driver_module_register(iree_hal_driver_registry_t* registry);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_REGISTRATION_DRIVER_MODULE_H_
diff --git a/runtime/src/iree/hal/vulkan/serializing_command_queue.cc b/runtime/src/iree/hal/vulkan/serializing_command_queue.cc
new file mode 100644
index 0000000..92e0a64
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/serializing_command_queue.cc
@@ -0,0 +1,428 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/serializing_command_queue.h"
+
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/direct_command_buffer.h"
+#include "iree/hal/vulkan/emulated_semaphore.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/tracing.h"
+#include "iree/hal/vulkan/util/arena.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+// Tries to prepare all necessary binary `VkSemaphore`s for emulating the time
+// points as specified in the given submission |batch_wait_semaphores| and
+// |batch_signal_semaphores|, then sets |out_ready_to_submit| if possible so
+// that the batch is ready to be submitted to GPU.
+// |wait_semaphores| and |signal_semaphores| will be filled with the binary
+// `VkSemaphores` on success.
+iree_status_t TryToPrepareSemaphores(
+    const std::vector<SemaphoreValue>& batch_wait_semaphores,
+    const std::vector<SemaphoreValue>& batch_signal_semaphores,
+    const ref_ptr<TimePointFence>& batch_fence,
+    std::vector<VkSemaphore>* wait_semaphores,
+    std::vector<VkSemaphore>* signal_semaphores, bool* out_ready_to_submit) {
+  IREE_TRACE_SCOPE0("TryToPrepareSemaphores");
+  *out_ready_to_submit = false;
+
+  wait_semaphores->clear();
+  // Tracks which timeline each handle in |wait_semaphores| was acquired from
+  // so that a failed acquisition can cancel exactly the handles taken so far.
+  // NOTE: indexing back into |batch_wait_semaphores| would be wrong here —
+  // already-signaled timelines are skipped below and produce no handle, so
+  // the two vectors do not line up index-for-index (the previous code did
+  // exactly that and could both mispair handles and run off the end of
+  // |wait_semaphores| via at()).
+  std::vector<iree_hal_semaphore_t*> acquired_timelines;
+  for (const auto& timeline_semaphore : batch_wait_semaphores) {
+    // Query first to progress this timeline semaphore to the furthest.
+    uint64_t signaled_value = 0;
+    IREE_RETURN_IF_ERROR(
+        iree_hal_semaphore_query(timeline_semaphore.first, &signaled_value));
+
+    // If it's already signaled to a value greater than we require here,
+    // we can just ignore this semaphore now.
+    if (signaled_value >= timeline_semaphore.second) {
+      continue;
+    }
+
+    // Otherwise try to get a binary semaphore for this time point so that
+    // we can wait on.
+    // TODO(antiagainst): if this fails we need to cancel.
+    VkSemaphore wait_semaphore = VK_NULL_HANDLE;
+    IREE_RETURN_IF_ERROR(iree_hal_vulkan_emulated_semaphore_acquire_wait_handle(
+        timeline_semaphore.first, timeline_semaphore.second, batch_fence,
+        &wait_semaphore));
+
+    if (wait_semaphore == VK_NULL_HANDLE) {
+      // We cannot wait on this time point yet: there are no previous semaphores
+      // submitted to the GPU that can signal a value greater than what's
+      // desired here.
+
+      // Cancel the waits acquired so far so others may make progress. Every
+      // handle already in |wait_semaphores| is non-null (we return on the
+      // first null), so all of them must be canceled.
+      // TODO(antiagainst): if any of these fail we need to cancel.
+      for (iree_host_size_t i = 0; i < wait_semaphores->size(); ++i) {
+        IREE_RETURN_IF_ERROR(
+            iree_hal_vulkan_emulated_semaphore_cancel_wait_handle(
+                acquired_timelines[i], (*wait_semaphores)[i]));
+      }
+
+      // This batch cannot be submitted to GPU yet.
+      return iree_ok_status();
+    }
+
+    wait_semaphores->push_back(wait_semaphore);
+    acquired_timelines.push_back(timeline_semaphore.first);
+  }
+
+  // We've collected all necessary binary semaphores for each timeline we need
+  // to wait on. Now prepare binary semaphores for signaling.
+  signal_semaphores->clear();
+  for (const auto& timeline_semaphore : batch_signal_semaphores) {
+    // SerializingCommandQueue only works with EmulatedTimelineSemaphore.
+    VkSemaphore signal_semaphore = VK_NULL_HANDLE;
+    IREE_RETURN_IF_ERROR(
+        iree_hal_vulkan_emulated_semaphore_acquire_signal_handle(
+            timeline_semaphore.first, timeline_semaphore.second, batch_fence,
+            &signal_semaphore));
+    signal_semaphores->push_back(signal_semaphore);
+  }
+
+  // Good to submit!
+  *out_ready_to_submit = true;
+  return iree_ok_status();
+}
+
+// Prepares `VkSubmitInfo` to submit the given list of |command_buffer_handles|
+// that wait on |wait_semaphore_handles| and signal |signal_semaphore_handles|.
+// Necessary arrays are allocated from |arena| and the result is written to
+// |submit_info|. The arena-backed arrays must stay alive until the submit is
+// made; the caller keeps |arena| alive through vkQueueSubmit.
+void PrepareSubmitInfo(
+    const std::vector<VkSemaphore>& wait_semaphore_handles,
+    const std::vector<VkCommandBuffer>& command_buffer_handles,
+    const std::vector<VkSemaphore>& signal_semaphore_handles,
+    VkSubmitInfo* submit_info, Arena* arena) {
+  // TODO(benvanik): see if we can go to finer-grained stages.
+  // For example, if this was just queue ownership transfers then we can use
+  // the pseudo-stage of VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT.
+  auto wait_dst_stage_masks =
+      arena->AllocateSpan<VkPipelineStageFlags>(wait_semaphore_handles.size());
+  for (size_t i = 0, e = wait_semaphore_handles.size(); i < e; ++i) {
+    wait_dst_stage_masks[i] =
+        VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+  }
+
+  // NOTE: this code does some very weird things - the handles we take in as
+  // args are mutated in-place after this function is called so we can't
+  // reference them here. If we were going to preserve this code post-Vulkan 1.2
+  // then we'd really want to rework all of this to properly use the arena from
+  // the start instead of all this span tomfoolery.
+  auto wait_semaphores =
+      arena->AllocateSpan<VkSemaphore>(wait_semaphore_handles.size());
+  for (size_t i = 0, e = wait_semaphore_handles.size(); i < e; ++i) {
+    wait_semaphores[i] = wait_semaphore_handles[i];
+  }
+  auto command_buffers =
+      arena->AllocateSpan<VkCommandBuffer>(command_buffer_handles.size());
+  for (size_t i = 0, e = command_buffer_handles.size(); i < e; ++i) {
+    command_buffers[i] = command_buffer_handles[i];
+  }
+  auto signal_semaphores =
+      arena->AllocateSpan<VkSemaphore>(signal_semaphore_handles.size());
+  for (size_t i = 0, e = signal_semaphore_handles.size(); i < e; ++i) {
+    signal_semaphores[i] = signal_semaphore_handles[i];
+  }
+
+  submit_info->sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  submit_info->pNext = nullptr;
+  submit_info->waitSemaphoreCount =
+      static_cast<uint32_t>(wait_semaphores.size());
+  submit_info->pWaitSemaphores = wait_semaphores.data();
+  submit_info->pWaitDstStageMask = wait_dst_stage_masks.data();
+  submit_info->commandBufferCount =
+      static_cast<uint32_t>(command_buffers.size());
+  submit_info->pCommandBuffers = command_buffers.data();
+  submit_info->signalSemaphoreCount =
+      static_cast<uint32_t>(signal_semaphores.size());
+  submit_info->pSignalSemaphores = signal_semaphores.data();
+}
+
+}  // namespace
+
+// Retains |fence_pool| as a raw pointer; the pool is expected to outlive the
+// queue. Fences acquired from it per submission are returned automatically
+// when their refcount reaches zero (see AbortQueueSubmission).
+SerializingCommandQueue::SerializingCommandQueue(
+    VkDeviceHandle* logical_device,
+    iree_hal_command_category_t supported_categories, VkQueue queue,
+    TimePointFencePool* fence_pool)
+    : CommandQueue(logical_device, supported_categories, queue),
+      fence_pool_(fence_pool) {}
+
+SerializingCommandQueue::~SerializingCommandQueue() = default;
+
+// Enqueues |batches| for submission. Each batch is snapshotted into a
+// FencedSubmission (with a fence acquired from the pool up front, later used
+// to poll emulated-timeline progress) and merged into the deferred list; any
+// batches whose waits are already satisfiable are flushed to the GPU
+// immediately via ProcessDeferredSubmissions.
+iree_status_t SerializingCommandQueue::Submit(
+    iree_host_size_t batch_count, const iree_hal_submission_batch_t* batches) {
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::Submit");
+
+  IntrusiveList<std::unique_ptr<FencedSubmission>> new_submissions;
+  for (iree_host_size_t i = 0; i < batch_count; ++i) {
+    const iree_hal_submission_batch_t* batch = &batches[i];
+
+    // Grab a fence for this submission first. This will be used to check the
+    // progress of emulated timeline semaphores later.
+    auto submission = std::make_unique<FencedSubmission>();
+    IREE_RETURN_IF_ERROR(fence_pool_->Acquire(&submission->fence));
+
+    submission->wait_semaphores.resize(batch->wait_semaphores.count);
+    for (iree_host_size_t j = 0; j < batch->wait_semaphores.count; ++j) {
+      submission->wait_semaphores[j] = {
+          batch->wait_semaphores.semaphores[j],
+          batch->wait_semaphores.payload_values[j]};
+    }
+
+    submission->command_buffers.resize(batch->command_buffer_count);
+    for (iree_host_size_t j = 0; j < batch->command_buffer_count; ++j) {
+      submission->command_buffers[j] =
+          iree_hal_vulkan_direct_command_buffer_handle(
+              batch->command_buffers[j]);
+    }
+
+    submission->signal_semaphores.resize(batch->signal_semaphores.count);
+    for (iree_host_size_t j = 0; j < batch->signal_semaphores.count; ++j) {
+      submission->signal_semaphores[j] = {
+          batch->signal_semaphores.semaphores[j],
+          batch->signal_semaphores.payload_values[j]};
+    }
+
+    new_submissions.push_back(std::move(submission));
+  }
+
+  iree_slim_mutex_lock(&queue_mutex_);
+  deferred_submissions_.merge_from(&new_submissions);
+  iree_status_t status = ProcessDeferredSubmissions();
+  iree_slim_mutex_unlock(&queue_mutex_);
+  return status;
+}
+
+// Flushes as many deferred submissions as possible to the GPU, re-enqueueing
+// any that still cannot be submitted. If |out_work_submitted| is non-NULL it
+// receives whether any new work reached the GPU. Callers must hold
+// queue_mutex_ (every call site in this file locks it first).
+iree_status_t SerializingCommandQueue::ProcessDeferredSubmissions(
+    bool* out_work_submitted) {
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::ProcessDeferredSubmissions");
+
+  // Try to process the submissions and if we hit a stopping point during the
+  // process where we need to yield we take the remaining submissions and
+  // re-enqueue them.
+  IntrusiveList<std::unique_ptr<FencedSubmission>> remaining_submissions;
+  iree_status_t status =
+      TryProcessDeferredSubmissions(remaining_submissions, out_work_submitted);
+  while (!remaining_submissions.empty()) {
+    deferred_submissions_.push_back(
+        remaining_submissions.take(remaining_submissions.front()));
+  }
+
+  return status;
+}
+
+// Single pass over deferred_submissions_: batches whose emulated-timeline
+// waits are all satisfiable now are submitted to the GPU; the rest are moved
+// to |remaining_submissions| for the caller to re-enqueue. Callers must hold
+// queue_mutex_.
+iree_status_t SerializingCommandQueue::TryProcessDeferredSubmissions(
+    IntrusiveList<std::unique_ptr<FencedSubmission>>& remaining_submissions,
+    bool* out_work_submitted) {
+  if (out_work_submitted) *out_work_submitted = false;
+
+  // Arena backs the VkSubmitInfo arrays built by PrepareSubmitInfo; it must
+  // stay alive until after the vkQueueSubmit calls below.
+  Arena arena(4 * 1024);
+  std::vector<VkSubmitInfo> submit_infos;
+  std::vector<VkFence> submit_fences;
+  while (!deferred_submissions_.empty()) {
+    FencedSubmission* submission = deferred_submissions_.front();
+    ref_ptr<TimePointFence>& fence = submission->fence;
+
+    std::vector<VkSemaphore> wait_semaphores;
+    std::vector<VkSemaphore> signal_semaphores;
+    bool ready_to_submit = false;
+    IREE_RETURN_IF_ERROR(TryToPrepareSemaphores(
+        submission->wait_semaphores, submission->signal_semaphores, fence,
+        &wait_semaphores, &signal_semaphores, &ready_to_submit));
+    if (ready_to_submit) {
+      submit_infos.emplace_back();
+      PrepareSubmitInfo(wait_semaphores, submission->command_buffers,
+                        signal_semaphores, &submit_infos.back(), &arena);
+
+      submit_fences.push_back(fence->value());
+      pending_fences_.emplace_back(std::move(fence));
+      deferred_submissions_.pop_front();
+    } else {
+      // We need to defer the submission until later.
+      remaining_submissions.push_back(deferred_submissions_.take(submission));
+    }
+  }
+  if (submit_infos.empty()) {
+    if (out_work_submitted) *out_work_submitted = false;
+    return iree_ok_status();
+  }
+
+  // Note: We might be able to batch the submission but it involves non-trivial
+  // fence handling. We can handle that if really needed.
+  // NOTE(review): if vkQueueSubmit fails partway through, batches later in
+  // submit_infos were already removed from deferred_submissions_ (and their
+  // fences recorded in pending_fences_) but never reach the GPU — confirm
+  // callers treat any submit error as fatal for the queue.
+  for (size_t i = 0, e = submit_infos.size(); i < e; ++i) {
+    VK_RETURN_IF_ERROR(
+        syms()->vkQueueSubmit(queue_, /*submitCount=*/1, &submit_infos[i],
+                              submit_fences[i]),
+        "vkQueueSubmit");
+  }
+
+  if (out_work_submitted) *out_work_submitted = true;
+  return iree_ok_status();
+}
+
+// Waits until the queue is idle: all deferred submissions have been flushed
+// to the GPU and all submitted work has completed, or |timeout| is reached.
+iree_status_t SerializingCommandQueue::WaitIdle(iree_timeout_t timeout) {
+  iree_status_t status = iree_ok_status();
+
+  iree_time_t deadline_ns = iree_timeout_as_deadline_ns(timeout);
+
+  if (deadline_ns == IREE_TIME_INFINITE_FUTURE) {
+    IREE_TRACE_SCOPE0("SerializingCommandQueue::WaitIdle#vkQueueWaitIdle");
+    // Fast path for using vkQueueWaitIdle, which is usually cheaper (as it
+    // requires fewer calls into the driver).
+
+    iree_slim_mutex_lock(&queue_mutex_);
+
+    // Complete all pending work on the queue.
+    status =
+        VK_RESULT_TO_STATUS(syms()->vkQueueWaitIdle(queue_), "vkQueueWaitIdle");
+    if (!iree_status_is_ok(status)) {
+      iree_slim_mutex_unlock(&queue_mutex_);
+      return status;
+    }
+    pending_fences_.clear();
+
+    // Submit and complete all deferred work.
+    while (!deferred_submissions_.empty()) {
+      bool work_submitted = false;
+      status = ProcessDeferredSubmissions(&work_submitted);
+      if (!iree_status_is_ok(status)) break;
+      if (work_submitted) {
+        status = VK_RESULT_TO_STATUS(syms()->vkQueueWaitIdle(queue_),
+                                     "vkQueueWaitIdle");
+        if (!iree_status_is_ok(status)) break;
+        pending_fences_.clear();
+      }
+    }
+
+    iree_slim_mutex_unlock(&queue_mutex_);
+
+    iree_hal_vulkan_tracing_context_collect(tracing_context(), VK_NULL_HANDLE);
+    return status;
+  }
+
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::WaitIdle#Fence");
+
+  // Keep trying to submit more workload to the GPU until reaching the deadline.
+  iree_slim_mutex_lock(&queue_mutex_);
+  do {
+    status = ProcessDeferredSubmissions();
+    bool has_deferred_submissions = !deferred_submissions_.empty();
+    std::vector<VkFence> fence_handles(pending_fences_.size());
+    for (size_t i = 0; i < pending_fences_.size(); ++i) {
+      fence_handles[i] = pending_fences_[i]->value();
+    }
+    if (!iree_status_is_ok(status)) {
+      break;  // unable to process submissions
+    } else if (!has_deferred_submissions && fence_handles.empty()) {
+      break;  // no more work - idle achieved
+    }
+    // NOTE(review): if deferred work exists but no fences are pending,
+    // vkWaitForFences below is called with fenceCount == 0, which the Vulkan
+    // spec forbids — confirm this state is unreachable or guard the call.
+
+    uint64_t timeout_ns;
+    if (deadline_ns == IREE_TIME_INFINITE_FUTURE) {
+      timeout_ns = UINT64_MAX;
+    } else if (deadline_ns == IREE_TIME_INFINITE_PAST) {
+      timeout_ns = 0;
+    } else {
+      // Convert to relative time in nanoseconds.
+      // The implementation may not wait with this granularity (like by 10000x).
+      iree_time_t now_ns = iree_time_now();
+      if (deadline_ns < now_ns) {
+        // Must fall out through the common unlock below: the original code
+        // returned directly here while still holding queue_mutex_, leaking
+        // the lock and deadlocking every subsequent queue operation.
+        status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+        break;
+      }
+      timeout_ns = (uint64_t)(deadline_ns - now_ns);
+    }
+    VkResult result = syms()->vkWaitForFences(
+        *logical_device_, static_cast<uint32_t>(fence_handles.size()),
+        fence_handles.data(),
+        /*waitAll=*/VK_TRUE, timeout_ns);
+
+    switch (result) {
+      case VK_SUCCESS:
+        pending_fences_.clear();
+        break;
+      case VK_TIMEOUT:
+        status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+        break;
+      default:
+        status = VK_RESULT_TO_STATUS(result, "vkWaitForFences");
+        break;
+    }
+    // As long as there is submitted or deferred work still pending.
+  } while (iree_status_is_ok(status));
+  iree_slim_mutex_unlock(&queue_mutex_);
+  return status;
+}
+
+// Flushes any deferred submissions that have become ready to the GPU.
+iree_status_t SerializingCommandQueue::AdvanceQueueSubmission() {
+  // ProcessDeferredSubmissions can report whether newly ready submissions
+  // were submitted to the GPU. Other callers might be interested in that
+  // information but this API only needs to advance the queue submission if
+  // possible, so the report is ignored (default NULL out-arg).
+  iree_slim_mutex_lock(&queue_mutex_);
+  iree_status_t status = ProcessDeferredSubmissions();
+  iree_slim_mutex_unlock(&queue_mutex_);
+  return status;
+}
+
+// Drops all deferred (never-submitted) work and blocks until everything
+// already submitted to the GPU has completed.
+void SerializingCommandQueue::AbortQueueSubmission() {
+  iree_slim_mutex_lock(&queue_mutex_);
+
+  // We have fences in deferred_submissions_ but they are not submitted to GPU
+  // yet so we don't need to reset.
+  deferred_submissions_.clear();
+
+  std::vector<VkFence> fence_handles(pending_fences_.size());
+  for (size_t i = 0; i < pending_fences_.size(); ++i) {
+    fence_handles[i] = pending_fences_[i]->value();
+  }
+
+  // The Vulkan spec requires fenceCount > 0 for vkWaitForFences
+  // (VUID-vkWaitForFences-fenceCount-arraylength); skip the call when nothing
+  // is pending rather than passing an empty array as the code previously did.
+  if (!fence_handles.empty()) {
+    syms()->vkWaitForFences(*logical_device_,
+                            static_cast<uint32_t>(fence_handles.size()),
+                            fence_handles.data(),
+                            /*waitAll=*/VK_TRUE, /*timeout=*/UINT64_MAX);
+  }
+
+  // Clear the list. Fences will be automatically returned back to the queue
+  // after refcount reaches 0.
+  pending_fences_.clear();
+
+  iree_slim_mutex_unlock(&queue_mutex_);
+}
+
+// Drops any pending fences that the caller knows to have signaled.
+void SerializingCommandQueue::SignalFences(const std::vector<VkFence>& fences) {
+  // Capture by reference: the lambda only lives for the duration of this call
+  // and the previous by-value capture (`[fences]`) copied the whole vector on
+  // every invocation for no benefit.
+  const auto span_contains = [&fences](VkFence fence) {
+    for (VkFence f : fences) {
+      if (f == fence) return true;
+    }
+    return false;
+  };
+
+  iree_slim_mutex_lock(&queue_mutex_);
+  auto it = pending_fences_.begin();
+  while (it != pending_fences_.end()) {
+    if (span_contains((*it)->value())) {
+      it = pending_fences_.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  iree_slim_mutex_unlock(&queue_mutex_);
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/serializing_command_queue.h b/runtime/src/iree/hal/vulkan/serializing_command_queue.h
new file mode 100644
index 0000000..949deca
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/serializing_command_queue.h
@@ -0,0 +1,103 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
+#define IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
+
+// clang-format off: must be included before all other headers.
+#include "iree/hal/vulkan/vulkan_headers.h"
+// clang-format on
+
+#include <stddef.h>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/command_queue.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/timepoint_util.h"
+#include "iree/hal/vulkan/util/intrusive_list.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// A (timeline semaphore, payload value) pair as used in submission batches.
+using SemaphoreValue = std::pair<iree_hal_semaphore_t*, uint64_t>;
+
+// A command queue that potentially defers and serializes command buffer
+// submission to the GPU.
+//
+// This command queue is designed to be used together with emulated timeline
+// semaphores. Timeline semaphores can follow wait-before-signal submission
+// order but binary `VkSemaphore` cannot. So when emulating timeline semaphores
+// with binary `VkSemaphore`s and `VkFence`s, we need to make sure no
+// wait-before-signal submission order occur for binary `VkSemaphore`s. The way
+// to enforce that is to defer the submission until we can be certain that the
+// `VkSemaphore`s emulating time points in the timeline are all *submitted* to
+// the GPU.
+class SerializingCommandQueue final : public CommandQueue {
+ public:
+  // |fence_pool| must remain valid for the lifetime of the queue.
+  SerializingCommandQueue(VkDeviceHandle* logical_device,
+                          iree_hal_command_category_t supported_categories,
+                          VkQueue queue, TimePointFencePool* fence_pool);
+  ~SerializingCommandQueue() override;
+
+  const ref_ptr<DynamicSymbols>& syms() const {
+    return logical_device_->syms();
+  }
+
+  iree_status_t Submit(iree_host_size_t batch_count,
+                       const iree_hal_submission_batch_t* batches) override;
+
+  iree_status_t WaitIdle(iree_timeout_t timeout) override;
+
+  // Releases all deferred submissions ready to submit to the GPU.
+  iree_status_t AdvanceQueueSubmission();
+
+  // Aborts all deferred submissions and waits for submitted work to complete.
+  void AbortQueueSubmission();
+
+  // Informs this queue that the given |fences| are known to have signaled.
+  void SignalFences(const std::vector<VkFence>& fences);
+
+ private:
+  // A submission batch together with the fence to signal its status.
+  struct FencedSubmission : public IntrusiveLinkBase<void> {
+    std::vector<SemaphoreValue> wait_semaphores;
+    std::vector<VkCommandBuffer> command_buffers;
+    std::vector<SemaphoreValue> signal_semaphores;
+    ref_ptr<TimePointFence> fence;
+  };
+
+  // Processes deferred submissions in this queue and returns whether there are
+  // new workload submitted to the GPU if no errors happen.
+  iree_status_t ProcessDeferredSubmissions(bool* out_work_submitted = NULL);
+  iree_status_t TryProcessDeferredSubmissions(
+      IntrusiveList<std::unique_ptr<FencedSubmission>>& remaining_submissions,
+      bool* out_work_submitted);
+
+  TimePointFencePool* fence_pool_;
+
+  // Both lists below are guarded by queue_mutex_ — the mutex the .cc actually
+  // locks around every access (presumably declared on the CommandQueue base).
+  // The annotations previously named `mutex_`, which does not match the lock
+  // used by the implementation and defeats clang's thread safety analysis.
+
+  // A list of fences that are submitted to GPU.
+  std::vector<ref_ptr<TimePointFence>> pending_fences_
+      IREE_GUARDED_BY(queue_mutex_);
+  // A list of deferred submissions that haven't been submitted to GPU.
+  IntrusiveList<std::unique_ptr<FencedSubmission>> deferred_submissions_
+      IREE_GUARDED_BY(queue_mutex_);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
diff --git a/runtime/src/iree/hal/vulkan/status_util.c b/runtime/src/iree/hal/vulkan/status_util.c
new file mode 100644
index 0000000..e61008c
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/status_util.c
@@ -0,0 +1,260 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/status_util.h"
+
+// Maps a VkResult to the closest iree_status_t code, annotating the returned
+// status with the caller-provided |file| and |line| for error reporting.
+iree_status_t iree_hal_vulkan_result_to_status(VkResult result,
+                                               const char* file,
+                                               uint32_t line) {
+  switch (result) {
+    // Success codes.
+    case VK_SUCCESS:
+      // Command successfully completed.
+      return iree_ok_status();
+    case VK_NOT_READY:
+      // A fence or query has not yet completed.
+      return iree_ok_status();
+    case VK_TIMEOUT:
+      // A wait operation has not completed in the specified time.
+      return iree_ok_status();
+    case VK_EVENT_SET:
+      // An event is signaled.
+      return iree_ok_status();
+    case VK_EVENT_RESET:
+      // An event is unsignaled.
+      return iree_ok_status();
+    case VK_INCOMPLETE:
+      // A return array was too small for the result.
+      return iree_ok_status();
+    case VK_SUBOPTIMAL_KHR:
+      // A swapchain no longer matches the surface properties exactly, but can
+      // still be used to present to the surface successfully.
+      return iree_ok_status();
+
+    // Error codes.
+    case VK_ERROR_OUT_OF_HOST_MEMORY:
+      // A host memory allocation has failed.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_RESOURCE_EXHAUSTED,
+                                            "VK_ERROR_OUT_OF_HOST_MEMORY");
+    case VK_ERROR_OUT_OF_DEVICE_MEMORY:
+      // A device memory allocation has failed.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_RESOURCE_EXHAUSTED,
+                                            "VK_ERROR_OUT_OF_DEVICE_MEMORY");
+    case VK_ERROR_INITIALIZATION_FAILED:
+      // Initialization of an object could not be completed for
+      // implementation-specific reasons.
+      return iree_make_status_with_location(file, line, IREE_STATUS_UNAVAILABLE,
+                                            "VK_ERROR_INITIALIZATION_FAILED");
+    case VK_ERROR_DEVICE_LOST:
+      // The logical or physical device has been lost.
+      //
+      // A logical device may become lost for a number of
+      // implementation-specific reasons, indicating that pending and future
+      // command execution may fail and cause resources and backing memory to
+      // become undefined.
+      //
+      // Typical reasons for device loss will include things like execution
+      // timing out (to prevent denial of service), power management events,
+      // platform resource management, or implementation errors.
+      //
+      // When this happens, certain commands will return
+      // VK_ERROR_DEVICE_LOST (see Error Codes for a list of such
+      // commands). After any such event, the logical device is considered lost.
+      // It is not possible to reset the logical device to a non-lost state,
+      // however the lost state is specific to a logical device (VkDevice), and
+      // the corresponding physical device (VkPhysicalDevice) may be otherwise
+      // unaffected.
+      //
+      // In some cases, the physical device may also be lost, and attempting to
+      // create a new logical device will fail, returning VK_ERROR_DEVICE_LOST.
+      // This is usually indicative of a problem with the underlying
+      // implementation, or its connection to the host. If the physical device
+      // has not been lost, and a new logical device is successfully created
+      // from that physical device, it must be in the non-lost state.
+      //
+      // Whilst logical device loss may be recoverable, in the case of physical
+      // device loss, it is unlikely that an application will be able to recover
+      // unless additional, unaffected physical devices exist on the system. The
+      // error is largely informational and intended only to inform the user
+      // that a platform issue has occurred, and should be investigated further.
+      // For example, underlying hardware may have developed a fault or become
+      // physically disconnected from the rest of the system. In many cases,
+      // physical device loss may cause other more serious issues such as the
+      // operating system crashing; in which case it may not be reported via the
+      // Vulkan API.
+      //
+      // Undefined behavior caused by an application error may cause a device to
+      // become lost. However, such undefined behavior may also cause
+      // unrecoverable damage to the process, and it is then not guaranteed that
+      // the API objects, including the VkPhysicalDevice or the VkInstance are
+      // still valid or that the error is recoverable.
+      //
+      // When a device is lost, its child objects are not implicitly destroyed
+      // and their handles are still valid. Those objects must still be
+      // destroyed before their parents or the device can be destroyed (see the
+      // Object Lifetime section). The host address space corresponding to
+      // device memory mapped using vkMapMemory is still valid, and host memory
+      // accesses to these mapped regions are still valid, but the contents are
+      // undefined. It is still legal to call any API command on the device and
+      // child objects.
+      //
+      // Once a device is lost, command execution may fail, and commands that
+      // return a VkResult may return VK_ERROR_DEVICE_LOST.
+      // Commands that do not allow run-time errors must still operate correctly
+      // for valid usage and, if applicable, return valid data.
+      //
+      // Commands that wait indefinitely for device execution (namely
+      // vkDeviceWaitIdle, vkQueueWaitIdle, vkWaitForFences with a maximum
+      // timeout, and vkGetQueryPoolResults with the VK_QUERY_RESULT_WAIT_BIT
+      // bit set in flags) must return in finite time even in the case
+      // of a lost device, and return either VK_SUCCESS or
+      // VK_ERROR_DEVICE_LOST. For any command that may return
+      // VK_ERROR_DEVICE_LOST, for the purpose of determining whether a
+      // command buffer is in the pending state, or whether resources are
+      // considered in-use by the device, a return value of
+      // VK_ERROR_DEVICE_LOST is equivalent to VK_SUCCESS.
+      return iree_make_status_with_location(file, line, IREE_STATUS_INTERNAL,
+                                            "VK_ERROR_DEVICE_LOST");
+    case VK_ERROR_MEMORY_MAP_FAILED:
+      // Mapping of a memory object has failed.
+      return iree_make_status_with_location(file, line, IREE_STATUS_INTERNAL,
+                                            "VK_ERROR_MEMORY_MAP_FAILED");
+    case VK_ERROR_LAYER_NOT_PRESENT:
+      // A requested layer is not present or could not be loaded.
+      return iree_make_status_with_location(
+          file, line, IREE_STATUS_UNIMPLEMENTED, "VK_ERROR_LAYER_NOT_PRESENT");
+    case VK_ERROR_EXTENSION_NOT_PRESENT:
+      // A requested extension is not supported.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_UNIMPLEMENTED,
+                                            "VK_ERROR_EXTENSION_NOT_PRESENT");
+    case VK_ERROR_FEATURE_NOT_PRESENT:
+      // A requested feature is not supported.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_UNIMPLEMENTED,
+                                            "VK_ERROR_FEATURE_NOT_PRESENT");
+    case VK_ERROR_INCOMPATIBLE_DRIVER:
+      // The requested version of Vulkan is not supported by the driver or is
+      // otherwise incompatible for implementation-specific reasons.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_FAILED_PRECONDITION,
+                                            "VK_ERROR_INCOMPATIBLE_DRIVER");
+    case VK_ERROR_TOO_MANY_OBJECTS:
+      // Too many objects of the type have already been created.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_RESOURCE_EXHAUSTED,
+                                            "VK_ERROR_TOO_MANY_OBJECTS");
+    case VK_ERROR_FORMAT_NOT_SUPPORTED:
+      // A requested format is not supported on this device.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_UNIMPLEMENTED,
+                                            "VK_ERROR_FORMAT_NOT_SUPPORTED");
+    case VK_ERROR_FRAGMENTED_POOL:
+      // A pool allocation has failed due to fragmentation of the pool’s
+      // memory. This must only be returned if no attempt to allocate host
+      // or device memory was made to accommodate the new allocation.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_RESOURCE_EXHAUSTED,
+                                            "VK_ERROR_FRAGMENTED_POOL");
+    case VK_ERROR_OUT_OF_POOL_MEMORY:
+      // A pool memory allocation has failed. This must only be returned if no
+      // attempt to allocate host or device memory was made to accommodate the
+      // new allocation. If the failure was definitely due to fragmentation of
+      // the pool, VK_ERROR_FRAGMENTED_POOL should be returned instead.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_RESOURCE_EXHAUSTED,
+                                            "VK_ERROR_OUT_OF_POOL_MEMORY");
+    case VK_ERROR_INVALID_EXTERNAL_HANDLE:
+      // An external handle is not a valid handle of the specified type.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_INVALID_ARGUMENT,
+                                            "VK_ERROR_INVALID_EXTERNAL_HANDLE");
+    case VK_ERROR_SURFACE_LOST_KHR:
+      // A surface is no longer available.
+      return iree_make_status_with_location(file, line, IREE_STATUS_UNAVAILABLE,
+                                            "VK_ERROR_SURFACE_LOST_KHR");
+    case VK_ERROR_NATIVE_WINDOW_IN_USE_KHR:
+      // The requested window is already in use by Vulkan or another API in a
+      // manner which prevents it from being used again.
+      return iree_make_status_with_location(
+          file, line, IREE_STATUS_INVALID_ARGUMENT,
+          "VK_ERROR_NATIVE_WINDOW_IN_USE_KHR");
+    case VK_ERROR_OUT_OF_DATE_KHR:
+      // A surface has changed in such a way that it is no longer compatible
+      // with the swapchain, and further presentation requests using the
+      // swapchain will fail. Applications must query the new surface properties
+      // and recreate their swapchain if they wish to continue presenting to the
+      // surface.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_FAILED_PRECONDITION,
+                                            "VK_ERROR_OUT_OF_DATE_KHR");
+    case VK_ERROR_INCOMPATIBLE_DISPLAY_KHR:
+      // The display used by a swapchain does not use the same presentable image
+      // layout, or is incompatible in a way that prevents sharing an image.
+      return iree_make_status_with_location(
+          file, line, IREE_STATUS_INVALID_ARGUMENT,
+          "VK_ERROR_INCOMPATIBLE_DISPLAY_KHR");
+    case VK_ERROR_VALIDATION_FAILED_EXT:
+      // Validation layer testing failed. It is not expected that an
+      // application would see this error code during normal use of the
+      // validation layers.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_INVALID_ARGUMENT,
+                                            "VK_ERROR_VALIDATION_FAILED_EXT");
+    case VK_ERROR_INVALID_SHADER_NV:
+      // One or more shaders failed to compile or link. More details are
+      // reported back to the application when the validation layer is enabled
+      // using the extension VK_EXT_debug_report.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_INVALID_ARGUMENT,
+                                            "VK_ERROR_INVALID_SHADER_NV");
+    case VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT:
+      // When creating an image with
+      // VkImageDrmFormatModifierExplicitCreateInfoEXT, it is the application’s
+      // responsibility to satisfy all Valid Usage requirements. However, the
+      // implementation must validate that the provided pPlaneLayouts, when
+      // combined with the provided drmFormatModifier and other creation
+      // parameters in VkImageCreateInfo and its pNext chain, produce a valid
+      // image. (This validation is necessarily implementation-dependent and
+      // outside the scope of Vulkan, and therefore not described by Valid Usage
+      // requirements). If this validation fails, then vkCreateImage returns
+      // VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT.
+      return iree_make_status_with_location(
+          file, line, IREE_STATUS_INVALID_ARGUMENT,
+          "VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT");
+    case VK_ERROR_FRAGMENTATION_EXT:
+      // A descriptor pool creation has failed due to fragmentation.
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_RESOURCE_EXHAUSTED,
+                                            "VK_ERROR_FRAGMENTATION_EXT");
+    case VK_ERROR_NOT_PERMITTED_EXT:
+      // When creating a queue, the caller does not have sufficient privileges
+      // to request to acquire a priority above the default priority
+      // (VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT).
+      return iree_make_status_with_location(file, line,
+                                            IREE_STATUS_PERMISSION_DENIED,
+                                            "VK_ERROR_NOT_PERMITTED_EXT");
+    case VK_ERROR_INVALID_DEVICE_ADDRESS_EXT:
+      // A buffer creation failed because the requested address is not
+      // available.
+      return iree_make_status_with_location(
+          file, line, IREE_STATUS_OUT_OF_RANGE,
+          "VK_ERROR_INVALID_DEVICE_ADDRESS_EXT");
+    case VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT:
+      // An operation on a swapchain created with
+      // VK_FULL_SCREEN_EXCLUSIVE_APPLICATION_CONTROLLED_EXT failed as it did
+      // not have exclusive full-screen access. This may occur due to
+      // implementation-dependent reasons, outside of the application’s control.
+      return iree_make_status_with_location(
+          file, line, IREE_STATUS_UNAVAILABLE,
+          "VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT");
+    default:
+      // Unhandled result code: VkResult error values are negative, so format
+      // as a signed integer (the previous %u/(uint32_t) cast printed huge
+      // wrapped values for error codes).
+      return iree_make_status_with_location(file, line, IREE_STATUS_UNKNOWN,
+                                            "VkResult=%d", (int)result);
+  }
+}
diff --git a/runtime/src/iree/hal/vulkan/status_util.h b/runtime/src/iree/hal/vulkan/status_util.h
new file mode 100644
index 0000000..3e22946
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/status_util.h
@@ -0,0 +1,93 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_STATUS_UTIL_H_
+#define IREE_HAL_VULKAN_STATUS_UTIL_H_
+
+// clang-format off: must be included before all other headers.
+#include "iree/hal/vulkan/vulkan_headers.h"
+// clang-format on
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Converts a VkResult to an iree_status_t.
+//
+// Usage:
+//   iree_status_t status = VK_RESULT_TO_STATUS(vkDoThing(...));
+#define VK_RESULT_TO_STATUS(expr, ...) \
+  iree_hal_vulkan_result_to_status((expr), __FILE__, __LINE__)
+
+// IREE_RETURN_IF_ERROR but implicitly converts the VkResult return value to
+// a Status.
+//
+// Usage:
+//   VK_RETURN_IF_ERROR(vkDoThing(...), "message");
+#define VK_RETURN_IF_ERROR(expr, ...) \
+  IREE_RETURN_IF_ERROR(               \
+      iree_hal_vulkan_result_to_status(expr, __FILE__, __LINE__), __VA_ARGS__)
+
+// IREE_CHECK_OK but implicitly converts the VkResult return value to a
+// Status and checks that it is OkStatus.
+//
+// Usage:
+//   VK_CHECK_OK(vkDoThing(...));
+#define VK_CHECK_OK(expr) \
+  IREE_CHECK_OK(iree_hal_vulkan_result_to_status(expr, __FILE__, __LINE__))
+
+// Converts a VkResult to an iree_status_t annotated with |file| and |line|.
+//
+// Vulkan considers the following as "success codes" and users should ensure
+// they first check the result prior to converting; all of them map to an OK
+// status:
+//
+// - VK_SUCCESS, VK_NOT_READY, VK_TIMEOUT, VK_EVENT_SET, VK_EVENT_RESET,
+//   VK_INCOMPLETE, VK_SUBOPTIMAL_KHR
+//
+// The rest are considered as "error codes" and map to the status codes below
+// (kept in sync with the switch in status_util.c):
+//
+// - VK_ERROR_OUT_OF_HOST_MEMORY          -> RESOURCE_EXHAUSTED
+// - VK_ERROR_OUT_OF_DEVICE_MEMORY        -> RESOURCE_EXHAUSTED
+// - VK_ERROR_INITIALIZATION_FAILED       -> UNAVAILABLE
+// - VK_ERROR_DEVICE_LOST                 -> INTERNAL
+// - VK_ERROR_MEMORY_MAP_FAILED           -> INTERNAL
+// - VK_ERROR_LAYER_NOT_PRESENT           -> UNIMPLEMENTED
+// - VK_ERROR_EXTENSION_NOT_PRESENT       -> UNIMPLEMENTED
+// - VK_ERROR_FEATURE_NOT_PRESENT         -> UNIMPLEMENTED
+// - VK_ERROR_INCOMPATIBLE_DRIVER         -> FAILED_PRECONDITION
+// - VK_ERROR_TOO_MANY_OBJECTS            -> RESOURCE_EXHAUSTED
+// - VK_ERROR_FORMAT_NOT_SUPPORTED        -> UNIMPLEMENTED
+// - VK_ERROR_FRAGMENTED_POOL             -> RESOURCE_EXHAUSTED
+// - VK_ERROR_OUT_OF_POOL_MEMORY          -> RESOURCE_EXHAUSTED
+// - VK_ERROR_INVALID_EXTERNAL_HANDLE     -> INVALID_ARGUMENT
+// - VK_ERROR_SURFACE_LOST_KHR            -> UNAVAILABLE
+// - VK_ERROR_NATIVE_WINDOW_IN_USE_KHR    -> INVALID_ARGUMENT
+// - VK_ERROR_OUT_OF_DATE_KHR             -> FAILED_PRECONDITION
+// - VK_ERROR_INCOMPATIBLE_DISPLAY_KHR    -> INVALID_ARGUMENT
+// - VK_ERROR_VALIDATION_FAILED_EXT       -> INVALID_ARGUMENT
+// - VK_ERROR_INVALID_SHADER_NV           -> INVALID_ARGUMENT
+// - VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT -> INVALID_ARGUMENT
+// - VK_ERROR_FRAGMENTATION_EXT           -> RESOURCE_EXHAUSTED
+// - VK_ERROR_NOT_PERMITTED_EXT           -> PERMISSION_DENIED
+// - VK_ERROR_INVALID_DEVICE_ADDRESS_EXT  -> OUT_OF_RANGE
+// - VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT -> UNAVAILABLE
+// - anything else                        -> UNKNOWN ("VkResult=<n>")
+iree_status_t iree_hal_vulkan_result_to_status(VkResult result,
+                                               const char* file, uint32_t line);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_STATUS_UTIL_H_
diff --git a/runtime/src/iree/hal/vulkan/timepoint_util.cc b/runtime/src/iree/hal/vulkan/timepoint_util.cc
new file mode 100644
index 0000000..531d897
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/timepoint_util.cc
@@ -0,0 +1,262 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/timepoint_util.h"
+
+#include <memory>
+
+#include "iree/base/logging.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+// Scoped guard for an iree_slim_mutex_t: locks on construction, unlocks on
+// destruction. Thread-safety analysis is suppressed because the annotations
+// cannot see through the C mutex API.
+class RaiiLocker {
+ public:
+  explicit RaiiLocker(iree_slim_mutex_t* mutex)
+      IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis)
+      : mutex_(mutex) {
+    iree_slim_mutex_lock(mutex_);
+  }
+  ~RaiiLocker() IREE_THREAD_ANNOTATION_ATTRIBUTE(no_thread_safety_analysis) {
+    iree_slim_mutex_unlock(mutex_);
+  }
+
+ private:
+  iree_slim_mutex_t* mutex_;
+};
+
+}  // namespace
+
+// static
+void TimePointFence::Delete(TimePointFence* ptr) {
+  // Invoked when the ref-count reaches zero: clear the cached status and
+  // recycle the fence back into its owning pool instead of freeing it.
+  ptr->ResetStatus();
+  ptr->pool()->ReleaseResolved(ptr);
+}
+
+VkResult TimePointFence::GetStatus() {
+  RaiiLocker locker(&status_mutex_);
+  // Lazily query the fence state from Vulkan and cache it; once the status
+  // leaves VK_NOT_READY the cached value is returned without another
+  // vkGetFenceStatus call until ResetStatus() is invoked.
+  if (status_ == VK_NOT_READY) {
+    const auto& device = pool()->logical_device();
+    status_ = device->syms()->vkGetFenceStatus(*device, fence_);
+  }
+  return status_;
+}
+
+void TimePointFence::ResetStatus() {
+  RaiiLocker locker(&status_mutex_);
+  // Drop any cached result so the next GetStatus() queries Vulkan again.
+  status_ = VK_NOT_READY;
+}
+
+// static
+iree_status_t TimePointFencePool::Create(VkDeviceHandle* logical_device,
+                                         TimePointFencePool** out_pool) {
+  IREE_TRACE_SCOPE0("TimePointFencePool::Create");
+  ref_ptr<TimePointFencePool> pool(new TimePointFencePool(logical_device));
+  iree_slim_mutex_initialize(&(pool->mutex_));
+  // Fill the free list up front so Acquire() never has to create fences.
+  // On failure |pool| is released by the ref_ptr going out of scope.
+  IREE_RETURN_IF_ERROR(pool->PreallocateFences());
+  *out_pool = pool.release();  // transfers ownership to the caller.
+  return iree_ok_status();
+}
+
+TimePointFencePool::~TimePointFencePool() {
+  IREE_TRACE_SCOPE0("TimePointFencePool::dtor");
+
+  iree_slim_mutex_lock(&mutex_);
+
+  // All fences must have been returned to the pool by now; an outstanding
+  // reference would recycle into a destroyed pool. Checking the list size
+  // directly matches ~TimePointSemaphorePool (the previous manual counter
+  // duplicated what size() already provides).
+  IREE_DCHECK_EQ(free_fences_.size(), kMaxInFlightFenceCount);
+  for (auto* fence : free_fences_) {
+    syms()->vkDestroyFence(*logical_device_, fence->value(),
+                           logical_device_->allocator());
+  }
+  free_fences_.clear();
+
+  iree_slim_mutex_unlock(&mutex_);
+  iree_slim_mutex_deinitialize(&mutex_);
+}
+
+iree_status_t TimePointFencePool::Acquire(ref_ptr<TimePointFence>* out_fence) {
+  IREE_TRACE_SCOPE0("TimePointFencePool::Acquire");
+
+  RaiiLocker locker(&mutex_);
+  // The pool never grows beyond the preallocated kMaxInFlightFenceCount
+  // fences; callers must release acquired fences for others to make progress.
+  if (free_fences_.empty()) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "fence pool out of free fences");
+  }
+
+  // To acquire from the pool, we:
+  //   1) Pop from the front of the queue (reference count of 0);
+  //   2) Release the unique_ptr, since lifetime will be managed by ref counts;
+  //   3) Return as a raw RefObject with a reference count of 1;
+  // When the reference count goes back to 0, it will be returned to the pool,
+  // wrapped with unique_ptr.
+  // When the pool is destroyed, all free fences are freed by unique_ptr
+  // automatically.
+  std::unique_ptr<TimePointFence> fence =
+      free_fences_.take(free_fences_.front());
+  *out_fence = add_ref(fence.release());
+  return iree_ok_status();
+}
+
+void TimePointFencePool::ReleaseResolved(TimePointFence* fence) {
+  IREE_TRACE_SCOPE0("TimePointFencePool::ReleaseResolved");
+  // Reset the VkFence to the unsignaled state before recycling. Note that the
+  // Vulkan call is made outside of mutex_ (the lock only guards the list).
+  VkFence f = fence->value();
+  syms()->vkResetFences(*logical_device_, 1, &f);
+  RaiiLocker locker(&mutex_);
+  // Ownership returns to the pool (ref count is 0 again at this point).
+  free_fences_.push_back(std::unique_ptr<TimePointFence>(fence));
+}
+
+// Note: mutex_ initialization and fence preallocation happen in Create().
+TimePointFencePool::TimePointFencePool(VkDeviceHandle* logical_device)
+    : logical_device_(logical_device) {}
+
+// Returns the dynamically-resolved Vulkan API symbols of the owning device.
+const ref_ptr<DynamicSymbols>& TimePointFencePool::syms() const {
+  return logical_device_->syms();
+}
+
+// Creates kMaxInFlightFenceCount fences and seeds the free list with them.
+iree_status_t TimePointFencePool::PreallocateFences() {
+  IREE_TRACE_SCOPE0("TimePointFencePool::PreallocateFences");
+
+  VkFenceCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+  create_info.pNext = nullptr;
+  create_info.flags = 0;
+
+  std::array<std::unique_ptr<TimePointFence>, kMaxInFlightFenceCount> fences;
+  {
+    RaiiLocker locker(&mutex_);
+    // size_t index avoids a signed/unsigned comparison with array::size().
+    for (size_t i = 0; i < fences.size(); ++i) {
+      VkFence fence = VK_NULL_HANDLE;
+      VK_RETURN_IF_ERROR(
+          syms()->vkCreateFence(*logical_device_, &create_info,
+                                logical_device_->allocator(), &fence),
+          "vkCreateFence");
+      fences[i] = std::make_unique<TimePointFence>(this, fence);
+    }
+  }
+
+  for (auto& fence : fences) {
+    // Each `TimePointFence` was created with an initial ref-count of one.
+    // Decrease explicitly to zero so that later we can rely on the ref-count
+    // reaching zero to auto-release the `TimePointFence` back to the free
+    // list. As a nice side effect, this will also initialize the free list
+    // with all newly created fences.
+    // TODO: Might want to avoid acquiring and releasing the mutex for each
+    // fence.
+    fence.release()->ReleaseReference();
+  }
+
+  return iree_ok_status();
+}
+
+// static
+iree_status_t TimePointSemaphorePool::Create(
+    VkDeviceHandle* logical_device, TimePointSemaphorePool** out_pool) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::Create");
+  ref_ptr<TimePointSemaphorePool> pool(
+      new TimePointSemaphorePool(logical_device));
+  iree_slim_mutex_initialize(&(pool->mutex_));
+  // Fill the free list up front so Acquire() never has to create semaphores.
+  // On failure |pool| is released by the ref_ptr going out of scope.
+  IREE_RETURN_IF_ERROR(pool->PreallocateSemaphores());
+  *out_pool = pool.release();  // transfers ownership to the caller.
+  return iree_ok_status();
+}
+
+TimePointSemaphorePool::~TimePointSemaphorePool() {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::dtor");
+
+  iree_slim_mutex_lock(&mutex_);
+
+  // All semaphores must have been recycled by now; outstanding references
+  // would dangle once the pool (and its inline storage_) is destroyed.
+  IREE_DCHECK_EQ(free_semaphores_.size(), kMaxInFlightSemaphoreCount);
+  free_semaphores_.clear();
+
+  // Destroy the VkSemaphore handles owned by the inline storage.
+  for (auto& semaphore : storage_) {
+    syms()->vkDestroySemaphore(*logical_device_, semaphore.semaphore,
+                               logical_device_->allocator());
+  }
+
+  iree_slim_mutex_unlock(&mutex_);
+  iree_slim_mutex_deinitialize(&mutex_);
+}
+
+iree_status_t TimePointSemaphorePool::Acquire(
+    TimePointSemaphore** out_semaphore) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::Acquire");
+
+  RaiiLocker locker(&mutex_);
+  // The pool is fixed-size; fail if all preallocated semaphores are in use.
+  if (free_semaphores_.empty()) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "semaphore pool out of free semaphores");
+  }
+
+  // Hand out the semaphore at the head of the free list.
+  TimePointSemaphore* acquired = free_semaphores_.front();
+  free_semaphores_.pop_front();
+  *out_semaphore = acquired;
+  return iree_ok_status();
+}
+
+void TimePointSemaphorePool::ReleaseResolved(
+    IntrusiveList<TimePointSemaphore>* semaphores) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::ReleaseResolved");
+
+  // Resolved semaphores must no longer be attached to any fence; clear each
+  // payload value before the semaphore is recycled.
+  for (auto* sem : *semaphores) {
+    IREE_DCHECK(!sem->signal_fence && !sem->wait_fence);
+    sem->value = UINT64_MAX;
+  }
+
+  RaiiLocker locker(&mutex_);
+  // Splice the whole list back onto the free list in one shot.
+  free_semaphores_.merge_from(semaphores);
+}
+
+void TimePointSemaphorePool::ReleaseUnresolved(
+    IntrusiveList<TimePointSemaphore>* semaphores) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::ReleaseUnresolved");
+
+  // Unlike ReleaseResolved, these may still reference fences: detach both
+  // fences and clear the payload value before recycling.
+  for (auto* sem : *semaphores) {
+    sem->wait_fence = nullptr;
+    sem->signal_fence = nullptr;
+    sem->value = UINT64_MAX;
+  }
+
+  RaiiLocker locker(&mutex_);
+  // Splice the whole list back onto the free list in one shot.
+  free_semaphores_.merge_from(semaphores);
+}
+
+// Note: mutex_ initialization and semaphore preallocation happen in Create().
+TimePointSemaphorePool::TimePointSemaphorePool(VkDeviceHandle* logical_device)
+    : logical_device_(logical_device) {}
+
+// Returns the dynamically-resolved Vulkan API symbols of the owning device.
+const ref_ptr<DynamicSymbols>& TimePointSemaphorePool::syms() const {
+  return logical_device_->syms();
+}
+
+// Creates kMaxInFlightSemaphoreCount semaphores in the inline storage and
+// seeds the free list with them.
+iree_status_t TimePointSemaphorePool::PreallocateSemaphores() {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::PreallocateSemaphores");
+
+  // Default semaphore creation parameters.
+  VkSemaphoreCreateInfo create_info = {};
+  create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+  create_info.pNext = nullptr;
+  create_info.flags = 0;
+
+  RaiiLocker locker(&mutex_);
+  for (int idx = 0; idx < kMaxInFlightSemaphoreCount; ++idx) {
+    TimePointSemaphore* sem = &storage_[idx];
+    VK_RETURN_IF_ERROR(syms()->vkCreateSemaphore(*logical_device_, &create_info,
+                                                 logical_device_->allocator(),
+                                                 &sem->semaphore),
+                       "vkCreateSemaphore");
+    free_semaphores_.push_back(sem);
+  }
+
+  return iree_ok_status();
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/timepoint_util.h b/runtime/src/iree/hal/vulkan/timepoint_util.h
new file mode 100644
index 0000000..6eea90b
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/timepoint_util.h
@@ -0,0 +1,214 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
+#define IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
+
+// clang-format off: must be included before all other headers.
+#include "iree/hal/vulkan/vulkan_headers.h"
+// clang-format on
+
+#include <stdint.h>
+
+#include <array>
+#include <memory>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/status_cc.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/util/intrusive_list.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+class TimePointFencePool;
+class TimePointSemaphorePool;
+
+// A fence used for tracking progress of timeline semaphores.
+//
+// Each queue submission gets a new `VkFence` associated with it so that we can
+// later query the `VkFence` on CPU to know what time points were signaled for
+// timeline semaphores.
+//
+// Ref-counting allows the fence to be associated with multiple time points from
+// different timelines without worrying about ownership complexity.
+//
+// This is expected to be used together with `TimePointFencePool` and must be
+// externally synchronized via `TimePointFencePool`'s mutex.
+class TimePointFence final : public RefObject<TimePointFence>,
+                             public IntrusiveLinkBase<void> {
+ public:
+  TimePointFence(TimePointFencePool* pool, VkFence fence)
+      : pool_(pool), fence_(fence) {
+    iree_slim_mutex_initialize(&status_mutex_);
+  }
+
+  ~TimePointFence() { iree_slim_mutex_deinitialize(&status_mutex_); }
+
+  // Not copyable or movable: instances are ref-counted and linked into
+  // intrusive lists, so object identity matters.
+  TimePointFence(TimePointFence&& that) = delete;
+  TimePointFence& operator=(TimePointFence&&) = delete;
+
+  TimePointFence(const TimePointFence&) = delete;
+  TimePointFence& operator=(const TimePointFence&) = delete;
+
+  // Returns this fence to the pool on destruction.
+  static void Delete(TimePointFence* ptr);
+
+  VkFence value() const noexcept { return fence_; }
+  operator VkFence() const noexcept { return fence_; }
+
+  // Gets the status of this fence object. This might issue a Vulkan API call
+  // under the hood.
+  VkResult GetStatus();
+
+  // Resets the status to unsignaled (VK_NOT_READY).
+  void ResetStatus();
+
+  // Returns the pool from which this fence comes.
+  TimePointFencePool* pool() const { return pool_; }
+
+ private:
+  // The pool from which this fence comes.
+  TimePointFencePool* pool_;
+
+  // Allocated fence that is associated with a bunch of time point(s) of
+  // timeline(s). This is passed to queue submission so that we can track the
+  // timeline(s) progress on CPU and schedule work.
+  VkFence fence_;
+
+  // The fence's status, cached on the CPU side and guarded by its own mutex.
+  iree_slim_mutex_t status_mutex_;
+  VkResult status_ IREE_GUARDED_BY(status_mutex_) = VK_NOT_READY;
+};
+
+// A semaphore used for emulating a specific time point of timeline semaphores.
+//
+// Each signaled time point in a timeline semaphore is emulated with a new
+// binary `VkSemaphore` associated with queue submission. These time point
+// semaphores are stored in `EmulatedTimelineSemaphore` to quickly scan and
+// process signaled values.
+//
+// This is expected to be used together with `TimePointSemaphorePool` and
+// `EmulatedTimelineSemaphore` and must be externally synchronized via their
+// mutexes.
+struct TimePointSemaphore final : public IntrusiveLinkBase<void> {
+  // Allocated binary semaphore that represents a time point in the timeline.
+  // This is passed to queue submission.
+  VkSemaphore semaphore = VK_NULL_HANDLE;
+
+  // Value the timeline should be at when the binary semaphore is signaled.
+  uint64_t value = UINT64_MAX;
+
+  // The fence associated with the queue submission signaling this semaphore.
+  // nullptr means this binary semaphore has not been submitted to GPU.
+  ref_ptr<TimePointFence> signal_fence = nullptr;
+
+  // The fence associated with the queue submission waiting this semaphore.
+  // nullptr means this binary semaphore has not been waited by any queue
+  // submission.
+  ref_ptr<TimePointFence> wait_fence = nullptr;
+};
+
+// A pool of `VkFence`s that can be used by `EmulatedTimelineSemaphore` to track
+// timeline progress on CPU. Each `VkFence` can be used to query the status of
+// all the semaphores in the same submission to a `VkQueue`.
+class TimePointFencePool final : public RefObject<TimePointFencePool> {
+ public:
+  static constexpr int kMaxInFlightFenceCount = 64;
+
+  // Creates a new pool and pre-allocates `kMaxInFlightFenceCount` fences.
+  static iree_status_t Create(VkDeviceHandle* logical_device,
+                              TimePointFencePool** out_pool);
+
+  ~TimePointFencePool();
+
+  // Acquires a fence from the pool for use by the caller. The fence is
+  // guaranteed to be in unsignaled state and not in-flight on GPU.
+  //
+  // Returns RESOURCE_EXHAUSTED if the pool has no more available fences.
+  // Callers are expected to handle this by waiting on previous fences or for
+  // complete device idle. Yes, that's as bad as it sounds, and if we start
+  // seeing that we should bump up the max count.
+  iree_status_t Acquire(ref_ptr<TimePointFence>* out_fence);
+
+  // Releases one fence back to the pool. The fence must either be signaled or
+  // not be in flight on GPU.
+  void ReleaseResolved(TimePointFence* fence);
+
+  VkDeviceHandle* logical_device() const { return logical_device_; }
+
+ private:
+  explicit TimePointFencePool(VkDeviceHandle* logical_device);
+
+  // Convenience accessor for the device's dynamically resolved Vulkan symbols.
+  const ref_ptr<DynamicSymbols>& syms() const;
+
+  // Pre-creates the `kMaxInFlightFenceCount` fences backing the pool.
+  iree_status_t PreallocateFences();
+
+  // Unowned raw pointer to the device the fences are created from.
+  VkDeviceHandle* logical_device_;
+
+  // Guards the free list below.
+  iree_slim_mutex_t mutex_;
+
+  // Track via unique_ptr, since IntrusiveList doesn't manage memory itself.
+  IntrusiveList<std::unique_ptr<TimePointFence>> free_fences_
+      IREE_GUARDED_BY(mutex_);
+};
+
+// A pool of `VkSemaphore`s that can be used by `EmulatedTimelineSemaphore` to
+// simulate individual timeline value signaling.
+class TimePointSemaphorePool final : public RefObject<TimePointSemaphorePool> {
+ public:
+  static constexpr int kMaxInFlightSemaphoreCount = 64;
+
+  // Creates a new pool and pre-allocates `kMaxInFlightSemaphoreCount` binary
+  // semaphores.
+  static iree_status_t Create(VkDeviceHandle* logical_device,
+                              TimePointSemaphorePool** out_pool);
+
+  ~TimePointSemaphorePool();
+
+  // Acquires a binary semaphore from the pool for use by the caller. The
+  // semaphore is guaranteed to be in unsignaled state and not in-flight on GPU.
+  //
+  // Returns RESOURCE_EXHAUSTED if the pool has no more available semaphores.
+  // Callers are expected to handle this by waiting on previous fences or for
+  // complete device idle. Yes, that's as bad as it sounds, and if we start
+  // seeing that we should bump up the max count.
+  iree_status_t Acquire(TimePointSemaphore** out_semaphore);
+
+  // Releases one or more semaphores back to the pool. The binary semaphore must
+  // be unsignaled and not in flight on GPU.
+  void ReleaseResolved(IntrusiveList<TimePointSemaphore>* semaphores);
+
+  // Releases one or more semaphores back to the pool. These may be in any state
+  // and will be assumed as untouchable; the pool will unconditionally recycle
+  // them.
+  void ReleaseUnresolved(IntrusiveList<TimePointSemaphore>* semaphores);
+
+ private:
+  explicit TimePointSemaphorePool(VkDeviceHandle* logical_device);
+
+  // Convenience accessor for the device's dynamically resolved Vulkan symbols.
+  const ref_ptr<DynamicSymbols>& syms() const;
+
+  // Pre-creates the `kMaxInFlightSemaphoreCount` semaphores backing the pool.
+  iree_status_t PreallocateSemaphores();
+
+  // Unowned raw pointer to the device the semaphores are created from.
+  VkDeviceHandle* logical_device_;
+
+  // Guards the storage and free list below.
+  iree_slim_mutex_t mutex_;
+
+  // Fixed inline storage owning all pooled semaphores; free_semaphores_ only
+  // links into this storage and never owns memory.
+  std::array<TimePointSemaphore, kMaxInFlightSemaphoreCount> storage_
+      IREE_GUARDED_BY(mutex_);
+  IntrusiveList<TimePointSemaphore> free_semaphores_ IREE_GUARDED_BY(mutex_);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
diff --git a/runtime/src/iree/hal/vulkan/tracing.cc b/runtime/src/iree/hal/vulkan/tracing.cc
new file mode 100644
index 0000000..ead88e5
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/tracing.cc
@@ -0,0 +1,667 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/tracing.h"
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "third_party/tracy/Tracy.hpp"
+#include "third_party/tracy/client/TracyProfiler.hpp"
+#include "third_party/tracy/common/TracyAlloc.hpp"
+
+// Total number of queries the per-queue query pool will contain. This
+// translates to the maximum number of outstanding queries before collection is
+// required.
+#define IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY (32 * 1024)
+
+// Total number of queries that can be read back from the API in a single
+// collection.
+#define IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY (8 * 1024)
+
+// Number of times we will query the max_deviation from calibrated timestamps.
+// The more we do the better confidence we have in a lower-bound.
+#define IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT 32
+
+// Layout of one readback entry as produced by vkGetQueryPoolResults with
+// VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT: the 64-bit
+// timestamp followed by its 64-bit availability word.
+typedef struct iree_hal_vulkan_timestamp_query_t {
+  uint64_t timestamp;
+  uint64_t availability;  // non-zero if available
+} iree_hal_vulkan_timestamp_query_t;
+
+// Per-queue tracing state: owns the timestamp query pool (used as a
+// ringbuffer), the readback scratch storage, and the CPU/GPU clock calibration
+// data used to correlate timestamps for tracy.
+struct iree_hal_vulkan_tracing_context_t {
+  // Device and queue the context represents.
+  iree::hal::vulkan::VkDeviceHandle* logical_device;
+  VkQueue queue;
+  iree_allocator_t host_allocator;
+
+  // Maintenance queue that supports dispatch commands and can be used to reset
+  // queries.
+  VkQueue maintenance_dispatch_queue;
+  // Command pool that serves command buffers compatible with the
+  // |maintenance_dispatch_queue|.
+  iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool;
+
+  // A unique GPU zone ID allocated from Tracy.
+  // There is a global limit of 255 GPU zones (ID 255 is special).
+  uint8_t id;
+
+  // Defines how the timestamps are interpreted (device-specific, posix, QPC).
+  // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkTimeDomainEXT.html
+  VkTimeDomainEXT time_domain;
+
+  // Maximum expected deviation between CPU and GPU timestamps based on an
+  // average computed at startup. Calibration events that exceed this value are
+  // discarded.
+  uint64_t max_expected_deviation;
+
+  // Vulkan-reported CPU timestamp of the last calibration.
+  // Used to detect when drift occurs and we need to notify tracy.
+  uint64_t previous_cpu_time;
+
+  // Pool of query instances that we treat as a backing store for a ringbuffer.
+  VkQueryPool query_pool;
+
+  // Indices into |query_pool| defining a ringbuffer.
+  uint32_t query_head;
+  uint32_t query_tail;
+  uint32_t query_capacity;
+
+  // Readback storage; large enough to get a decent chunk of queries back from
+  // the API in one shot.
+  //
+  // Data is stored as [[timestamp, availability], ...].
+  // Availability will be non-zero if the timestamp is valid. Since we put all
+  // timestamps in order once we reach an unavailable timestamp we can bail
+  // and leave that for future collections.
+  iree_hal_vulkan_timestamp_query_t
+      readback_buffer[IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY];
+};
+
+// Allocates and begins a command buffer and returns its handle.
+// Returns VK_NULL_HANDLE if allocation fails.
+static VkCommandBuffer iree_hal_vulkan_tracing_begin_command_buffer(
+    iree_hal_vulkan_tracing_context_t* context) {
+  const auto& syms = context->logical_device->syms();
+
+  // One-shot primary command buffer from the maintenance pool.
+  VkCommandBufferAllocateInfo command_buffer_info;
+  memset(&command_buffer_info, 0, sizeof(command_buffer_info));
+  command_buffer_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+  command_buffer_info.commandPool = *context->maintenance_command_pool;
+  command_buffer_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+  command_buffer_info.commandBufferCount = 1;
+  VkCommandBuffer command_buffer = VK_NULL_HANDLE;
+  IREE_IGNORE_ERROR(context->maintenance_command_pool->Allocate(
+      &command_buffer_info, &command_buffer));
+  if (!command_buffer) return VK_NULL_HANDLE;
+
+  VkCommandBufferBeginInfo begin_info;
+  memset(&begin_info, 0, sizeof(begin_info));
+  begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+  begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+  // NOTE(review): the VkResult of vkBeginCommandBuffer is discarded; any
+  // failure would surface later at submission time — confirm intentional.
+  syms->vkBeginCommandBuffer(command_buffer, &begin_info);
+
+  return command_buffer;
+}
+
+// Ends and submits |command_buffer| and waits for it to complete.
+// Synchronous: blocks the calling thread on vkQueueWaitIdle before returning.
+static void iree_hal_vulkan_tracing_submit_command_buffer(
+    iree_hal_vulkan_tracing_context_t* context,
+    VkCommandBuffer command_buffer) {
+  const auto& syms = context->logical_device->syms();
+
+  syms->vkEndCommandBuffer(command_buffer);
+
+  VkSubmitInfo submit_info;
+  memset(&submit_info, 0, sizeof(submit_info));
+  submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  submit_info.commandBufferCount = 1;
+  submit_info.pCommandBuffers = &command_buffer;
+  // NOTE(review): all VkResults here are ignored; presumably tracing
+  // maintenance is best-effort — confirm.
+  syms->vkQueueSubmit(context->maintenance_dispatch_queue, 1, &submit_info,
+                      VK_NULL_HANDLE);
+  syms->vkQueueWaitIdle(context->maintenance_dispatch_queue);
+
+  // Safe to recycle now that the queue has drained.
+  context->maintenance_command_pool->Free(command_buffer);
+}
+
+// Synchronously resets a range of queries in a query pool.
+// This may submit commands to the queue.
+static void iree_hal_vulkan_tracing_reset_query_pool(
+    iree_hal_vulkan_tracing_context_t* context, uint32_t query_index,
+    uint32_t query_count) {
+  const auto& syms = context->logical_device->syms();
+
+  // Fast-path for when host-side vkResetQueryPool is available.
+  // This is core in Vulkan 1.2.
+  if (context->logical_device->enabled_extensions().host_query_reset) {
+    // Prefer the core entry point; fall back to the EXT alias if only the
+    // extension symbol was resolved.
+    PFN_vkResetQueryPool vkResetQueryPool_fn = syms->vkResetQueryPool
+                                                   ? syms->vkResetQueryPool
+                                                   : syms->vkResetQueryPoolEXT;
+    if (vkResetQueryPool_fn != NULL) {
+      vkResetQueryPool_fn(*context->logical_device, context->query_pool,
+                          query_index, query_count);
+      return;
+    }
+  }
+
+  // Slow-path submitting a command buffer to reset the query pool. It's obvious
+  // why vkResetQueryPool was added :)
+  VkCommandBuffer command_buffer =
+      iree_hal_vulkan_tracing_begin_command_buffer(context);
+  if (command_buffer != VK_NULL_HANDLE) {
+    syms->vkCmdResetQueryPool(command_buffer, context->query_pool, query_index,
+                              query_count);
+    iree_hal_vulkan_tracing_submit_command_buffer(context, command_buffer);
+  }
+}
+
+// Attempts to get a timestamp from both the CPU and GPU that are correlated
+// with each other. Only valid when calibration is supported.
+static void iree_hal_vulkan_tracing_query_calibration_timestamps(
+    iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time,
+    uint64_t* out_gpu_time) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  *out_cpu_time = 0;
+  *out_gpu_time = 0;
+
+  // Request one device-domain timestamp and one in the host time domain
+  // selected at startup, in a single correlated call.
+  VkCalibratedTimestampInfoEXT timestamp_infos[2];
+  timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
+  timestamp_infos[0].pNext = NULL;
+  timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
+  timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
+  timestamp_infos[1].pNext = NULL;
+  timestamp_infos[1].timeDomain = context->time_domain;
+  uint64_t timestamps[2] = {0, 0};
+  uint64_t max_deviation = 0;
+  // Retry until the reported deviation is within the bound established during
+  // initial calibration (max_expected_deviation).
+  // NOTE(review): this loop is unbounded; if the driver never reports a
+  // deviation under the bound it will spin — confirm acceptable.
+  do {
+    context->logical_device->syms()->vkGetCalibratedTimestampsEXT(
+        *context->logical_device, IREE_ARRAYSIZE(timestamps), timestamp_infos,
+        timestamps, &max_deviation);
+  } while (max_deviation > context->max_expected_deviation);
+
+  *out_gpu_time = timestamps[0];
+  *out_cpu_time = timestamps[1];
+  switch (context->time_domain) {
+#if defined(IREE_PLATFORM_WINDOWS)
+    case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT:
+      // Convert QPC ticks to nanoseconds to match tracy's CPU timebase.
+      *out_cpu_time *= (uint64_t)(1000000000.0 / tracy::GetFrequencyQpc());
+      break;
+#else
+    case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
+    case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
+      // TODO(benvanik): posix calibrated timestamps - ignored for now.
+      break;
+#endif  // IREE_PLATFORM_WINDOWS
+    default:
+      break;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Populates |out_cpu_time| and |out_gpu_time| with calibrated timestamps.
+// Depending on whether VK_EXT_calibrated_timestamps is available this may be
+// a guess done by ourselves (with lots of slop) or done by the driver (with
+// less slop).
+static void iree_hal_vulkan_tracing_perform_initial_calibration(
+    iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time,
+    uint64_t* out_gpu_time) {
+  const auto& syms = context->logical_device->syms();
+  *out_cpu_time = 0;
+  *out_gpu_time = 0;
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0,
+                              context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT
+                                  ? "VK_TIME_DOMAIN_DEVICE_EXT"
+                                  : "VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT");
+
+  // Attempt to get a timestamp from both the device and the host at roughly the
+  // same time. There's a gap between when we get control returned to us after
+  // submitting and waiting for idle and that will be the slop we have in the
+  // timings in the tracy UI.
+  if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) {
+    // Submit a device timestamp.
+    VkCommandBuffer command_buffer =
+        iree_hal_vulkan_tracing_begin_command_buffer(context);
+    if (command_buffer != VK_NULL_HANDLE) {
+      syms->vkCmdWriteTimestamp(command_buffer,
+                                VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                                context->query_pool, 0);
+      iree_hal_vulkan_tracing_submit_command_buffer(context, command_buffer);
+    }
+
+    // Query the timestamp from the host and the device.
+    *out_cpu_time = tracy::Profiler::GetTime();
+    syms->vkGetQueryPoolResults(
+        *context->logical_device, context->query_pool, 0, 1,
+        sizeof(*out_gpu_time), out_gpu_time, sizeof(*out_gpu_time),
+        VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
+
+    // Reset the query used.
+    iree_hal_vulkan_tracing_reset_query_pool(context, 0, 1);
+    IREE_TRACE_ZONE_END(z0);
+    return;
+  }
+
+  // From the spec:
+  // The maximum deviation may vary between calls to
+  // vkGetCalibratedTimestampsEXT even for the same set of time domains due to
+  // implementation and platform specific reasons. It is the application’s
+  // responsibility to assess whether the returned maximum deviation makes the
+  // timestamp values suitable for any particular purpose and can choose to
+  // re-issue the timestamp calibration call pursuing a lower deviation value.
+  // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/vkGetCalibratedTimestampsEXT.html
+  //
+  // We perform a small number of queries here and find the minimum deviation
+  // across all of them to get an average lower bound on the maximum deviation
+  // from any particular query. We then use that as our baseline (plus some
+  // slop) to see if calibration events in the future are reasonable.
+  VkCalibratedTimestampInfoEXT timestamp_infos[2];
+  timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
+  timestamp_infos[0].pNext = NULL;
+  timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
+  timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
+  timestamp_infos[1].pNext = NULL;
+  timestamp_infos[1].timeDomain = context->time_domain;
+  uint64_t max_deviations[IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT];
+  IREE_TRACE_ZONE_BEGIN_NAMED(z1, "vkGetCalibratedTimestampsEXT");
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(max_deviations); ++i) {
+    uint64_t timestamps[2] = {0, 0};
+    syms->vkGetCalibratedTimestampsEXT(
+        *context->logical_device, IREE_ARRAYSIZE(timestamps), timestamp_infos,
+        timestamps, &max_deviations[i]);
+  }
+  IREE_TRACE_ZONE_END(z1);
+  uint64_t min_deviation = max_deviations[0];
+  for (iree_host_size_t i = 1; i < IREE_ARRAYSIZE(max_deviations); ++i) {
+    min_deviation = iree_min(min_deviation, max_deviations[i]);
+  }
+  // Allow 50% slop over the lowest deviation observed during probing.
+  context->max_expected_deviation = min_deviation * 3 / 2;
+
+  iree_hal_vulkan_tracing_query_calibration_timestamps(
+      context, &context->previous_cpu_time, out_gpu_time);
+  *out_cpu_time = tracy::Profiler::GetTime();
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Performs a periodic calibration (if supported) and sends the data to tracy.
+// Over time the host and device clocks may drift (especially with power events)
+// and by frequently performing this we ensure that the samples we are sending
+// to tracy are able to be correlated.
+void iree_hal_vulkan_tracing_perform_calibration(
+    iree_hal_vulkan_tracing_context_t* context) {
+  // A device-only time domain means the calibrated timestamps extension is
+  // unavailable; there is nothing to recalibrate against.
+  if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  uint64_t cpu_time = 0;
+  uint64_t gpu_time = 0;
+  iree_hal_vulkan_tracing_query_calibration_timestamps(context, &cpu_time,
+                                                       &gpu_time);
+
+  // Only emit a GpuCalibration event when the CPU clock actually advanced
+  // since the last calibration.
+  uint64_t tracy_time = tracy::Profiler::GetTime();
+  if (cpu_time > context->previous_cpu_time) {
+    uint64_t cpu_delta = cpu_time - context->previous_cpu_time;
+    context->previous_cpu_time = cpu_time;
+    auto* item = tracy::Profiler::QueueSerial();
+    tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuCalibration);
+    tracy::MemWrite(&item->gpuCalibration.gpuTime, gpu_time);
+    tracy::MemWrite(&item->gpuCalibration.cpuTime, tracy_time);
+    tracy::MemWrite(&item->gpuCalibration.cpuDelta, cpu_delta);
+    tracy::MemWrite(&item->gpuCalibration.context, context->id);
+    tracy::Profiler::QueueSerialFinish();
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Prepares the VkQueryPool backing storage for our query ringbuffer.
+static void iree_hal_vulkan_tracing_prepare_query_pool(
+    iree_hal_vulkan_tracing_context_t* context) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Create a query pool with the largest query capacity it can provide,
+  // halving the requested size until creation succeeds.
+  // NOTE(review): on repeated failure queryCount trends toward zero and a
+  // queryCount of 0 would loop forever — confirm some size always succeeds.
+  VkQueryPoolCreateInfo pool_info;
+  memset(&pool_info, 0, sizeof(pool_info));
+  pool_info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
+  pool_info.queryCount = IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY;
+  pool_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, pool_info.queryCount);
+  while (context->logical_device->syms()->vkCreateQueryPool(
+             *context->logical_device, &pool_info,
+             context->logical_device->allocator(),
+             &context->query_pool) != VK_SUCCESS) {
+    pool_info.queryCount /= 2;
+    IREE_TRACE_ZONE_APPEND_VALUE(z0, pool_info.queryCount);
+  }
+  context->query_capacity = pool_info.queryCount;
+
+  // Perform initial reset of the query pool. All queries must be reset upon
+  // creation before first use.
+  iree_hal_vulkan_tracing_reset_query_pool(context, 0, context->query_capacity);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Prepares the Tracy-related GPU context that events are fed into. Each context
+// will appear as a unique plot in the tracy UI with the given |queue_name|.
+static void iree_hal_vulkan_tracing_prepare_gpu_context(
+    iree_hal_vulkan_tracing_context_t* context,
+    VkPhysicalDevice physical_device, iree_string_view_t queue_name) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate the process-unique GPU context ID. There's a max of 255 available;
+  // if we are recreating devices a lot we may exceed that. Don't do that, or
+  // wrap around and get weird (but probably still usable) numbers.
+  context->id =
+      tracy::GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed);
+  if (context->id >= 255) {
+    context->id %= 255;
+  }
+
+  // The number of nanoseconds required for a timestamp query to be incremented
+  // by 1.
+  VkPhysicalDeviceProperties device_properties;
+  context->logical_device->syms()->vkGetPhysicalDeviceProperties(
+      physical_device, &device_properties);
+  float timestamp_period = device_properties.limits.timestampPeriod;
+
+  // Perform initial calibration for tracy to be able to correlate timestamps
+  // between CPU and GPU.
+  uint64_t cpu_time = 0;
+  uint64_t gpu_time = 0;
+  iree_hal_vulkan_tracing_perform_initial_calibration(context, &cpu_time,
+                                                      &gpu_time);
+
+  // Context flags tell tracy how to interpret the timestamps we feed it.
+  uint8_t context_flags = 0;
+  if (context->time_domain != VK_TIME_DOMAIN_DEVICE_EXT) {
+    // Tell tracy we'll be passing calibrated timestamps and not to mess with
+    // the times. We'll periodically send GpuCalibration events in case the
+    // times drift.
+    context_flags |= tracy::GpuContextCalibration;
+  }
+  {
+    auto* item = tracy::Profiler::QueueSerial();
+    tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuNewContext);
+    tracy::MemWrite(&item->gpuNewContext.cpuTime, cpu_time);
+    tracy::MemWrite(&item->gpuNewContext.gpuTime, gpu_time);
+    memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
+    tracy::MemWrite(&item->gpuNewContext.period, timestamp_period);
+    tracy::MemWrite(&item->gpuNewContext.context, context->id);
+    tracy::MemWrite(&item->gpuNewContext.flags, context_flags);
+    tracy::MemWrite(&item->gpuNewContext.type, tracy::GpuContextType::Vulkan);
+    tracy::Profiler::QueueSerialFinish();
+  }
+
+  // Send the name of the context along.
+  // NOTE: Tracy will unconditionally free the name so we must clone it here.
+  // Since internally Tracy will use its own rpmalloc implementation we must
+  // make sure we allocate from the same source.
+  char* cloned_name = (char*)tracy::tracy_malloc(queue_name.size);
+  memcpy(cloned_name, queue_name.data, queue_name.size);
+  {
+    auto* item = tracy::Profiler::QueueSerial();
+    tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuContextName);
+    tracy::MemWrite(&item->gpuContextNameFat.context, context->id);
+    tracy::MemWrite(&item->gpuContextNameFat.ptr, (uint64_t)cloned_name);
+    tracy::MemWrite(&item->gpuContextNameFat.size, queue_name.size);
+    tracy::Profiler::QueueSerialFinish();
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the best possible platform-supported time domain, falling back to
+// VK_TIME_DOMAIN_DEVICE_EXT. By default it is one that is only usable for
+// device-relative calculations and that we need to perform our own hacky
+// calibration on.
+static VkTimeDomainEXT iree_hal_vulkan_tracing_query_time_domain(
+    VkPhysicalDevice physical_device,
+    iree::hal::vulkan::VkDeviceHandle* logical_device) {
+  if (!logical_device->enabled_extensions().calibrated_timestamps) {
+    // Calibrated timestamps extension is not available; we'll only have the
+    // device domain.
+    return VK_TIME_DOMAIN_DEVICE_EXT;
+  }
+
+  // Standard two-call enumeration: size query then fill.
+  uint32_t time_domain_count = 0;
+  if (logical_device->syms()->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT(
+          physical_device, &time_domain_count, NULL) != VK_SUCCESS) {
+    return VK_TIME_DOMAIN_DEVICE_EXT;
+  }
+  // Stack allocation: the set of calibrateable time domains is tiny.
+  VkTimeDomainEXT* time_domains = (VkTimeDomainEXT*)iree_alloca(
+      time_domain_count * sizeof(VkTimeDomainEXT));
+  if (logical_device->syms()->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT(
+          physical_device, &time_domain_count, time_domains) != VK_SUCCESS) {
+    return VK_TIME_DOMAIN_DEVICE_EXT;
+  }
+
+  for (uint32_t i = 0; i < time_domain_count; i++) {
+    switch (time_domains[i]) {
+#if defined(IREE_PLATFORM_WINDOWS)
+      case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT:
+        return time_domains[i];
+#else
+      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
+      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
+        // TODO(benvanik): support posix clock domains with some kind of math.
+        // return time_domains[i];  -- ignored
+#endif  // IREE_PLATFORM_WINDOWS
+      default:
+        continue;
+    }
+  }
+  return VK_TIME_DOMAIN_DEVICE_EXT;
+}
+
+// Allocates and fully initializes a tracing context for |queue|: picks the
+// time domain, builds the query ringbuffer, performs initial calibration, and
+// registers the Tracy GPU context.
+iree_status_t iree_hal_vulkan_tracing_context_allocate(
+    VkPhysicalDevice physical_device,
+    iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue,
+    iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue,
+    iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool,
+    iree_allocator_t host_allocator,
+    iree_hal_vulkan_tracing_context_t** out_context) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(out_context);
+  *out_context = NULL;
+
+  // NOTE: the prepare_* helpers below return void, so |status| only reflects
+  // the context allocation itself.
+  iree_hal_vulkan_tracing_context_t* context = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context);
+  if (iree_status_is_ok(status)) {
+    context->logical_device = logical_device;
+    context->queue = queue;
+    context->host_allocator = host_allocator;
+    context->time_domain = iree_hal_vulkan_tracing_query_time_domain(
+        physical_device, logical_device);
+    context->maintenance_dispatch_queue = maintenance_dispatch_queue;
+    context->maintenance_command_pool = maintenance_command_pool;
+
+    // Prepare the query pool and perform the initial calibration.
+    iree_hal_vulkan_tracing_prepare_query_pool(context);
+
+    // Prepare the Tracy GPU context.
+    iree_hal_vulkan_tracing_prepare_gpu_context(context, physical_device,
+                                                queue_name);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_context = context;
+  } else {
+    iree_hal_vulkan_tracing_context_free(context);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Tears down |context|: flushes any outstanding timestamp queries to tracy,
+// destroys the query pool, and frees the context storage. Safe on NULL.
+void iree_hal_vulkan_tracing_context_free(
+    iree_hal_vulkan_tracing_context_t* context) {
+  if (!context) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  if (context->query_pool != VK_NULL_HANDLE) {
+    // Always perform a collection on shutdown.
+    iree_hal_vulkan_tracing_context_collect(context, VK_NULL_HANDLE);
+
+    auto* logical_device = context->logical_device;
+    logical_device->syms()->vkDestroyQueryPool(
+        *logical_device, context->query_pool, logical_device->allocator());
+  }
+
+  // Capture the allocator before freeing the struct that stores it.
+  iree_allocator_t host_allocator = context->host_allocator;
+  iree_allocator_free(host_allocator, context);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+uint32_t iree_hal_vulkan_tracing_context_acquire_query_id(
+    iree_hal_vulkan_tracing_context_t* context) {
+  // Reserve the current head slot and advance the ringbuffer head with wrap.
+  const uint32_t reserved_id = context->query_head;
+  context->query_head = (reserved_id + 1) % context->query_capacity;
+  // Debug-only guard: head meeting tail means the ringbuffer is full and
+  // uncollected queries would be overwritten.
+  assert(context->query_head != context->query_tail);
+  return reserved_id;
+}
+
+// Drains available timestamp queries from the ringbuffer, feeds them to tracy,
+// resets the consumed query range (via |command_buffer| when provided,
+// otherwise synchronously), and runs periodic clock calibration. Safe on NULL.
+void iree_hal_vulkan_tracing_context_collect(
+    iree_hal_vulkan_tracing_context_t* context,
+    VkCommandBuffer command_buffer) {
+  if (!context) return;
+  if (context->query_tail == context->query_head) {
+    // No outstanding queries.
+    return;
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+  const auto& syms = context->logical_device->syms();
+
+  while (context->query_tail != context->query_head) {
+    // Compute the contiguous range of queries ready to be read.
+    // If the ringbuffer wraps around we'll handle that in the next loop.
+    uint32_t try_query_count =
+        context->query_head < context->query_tail
+            ? context->query_capacity - context->query_tail
+            : context->query_head - context->query_tail;
+    try_query_count = iree_min(try_query_count,
+                               IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY);
+
+    // Read back all of the queries. Note that we also are reading back the
+    // availability such that we can handle partial readiness of the outstanding
+    // range of queries.
+    uint32_t query_base = context->query_tail;
+    if (syms->vkGetQueryPoolResults(
+            *context->logical_device, context->query_pool, query_base,
+            try_query_count, sizeof(context->readback_buffer),
+            context->readback_buffer, sizeof(iree_hal_vulkan_timestamp_query_t),
+            VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) !=
+        VK_SUCCESS) {
+      break;
+    }
+
+    // Scan and feed the times to tracy, stopping when we hit the first
+    // unavailable query.
+    uint32_t read_query_count = 0;
+    for (uint32_t i = 0; i < try_query_count; ++i) {
+      if (context->readback_buffer[i].availability == 0) break;
+      read_query_count = i + 1;
+      auto* item = tracy::Profiler::QueueSerial();
+      tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuTime);
+      tracy::MemWrite(&item->gpuTime.gpuTime,
+                      context->readback_buffer[i].timestamp);
+      tracy::MemWrite(&item->gpuTime.queryId, (uint16_t)(query_base + i));
+      tracy::MemWrite(&item->gpuTime.context, context->id);
+      tracy::Profiler::QueueSerialFinish();
+    }
+
+    // BUGFIX: if the very first outstanding query is not yet available we made
+    // no progress this iteration; without bailing out the tail would never
+    // advance and this loop would spin forever (vkGetQueryPoolResults returns
+    // VK_SUCCESS when VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is used even if no
+    // timestamps are ready). Leave the rest for a future collection.
+    if (read_query_count == 0) break;
+
+    // Reset the range of queries read back.
+    if (command_buffer != VK_NULL_HANDLE) {
+      syms->vkCmdResetQueryPool(command_buffer, context->query_pool, query_base,
+                                read_query_count);
+    } else {
+      iree_hal_vulkan_tracing_reset_query_pool(context, query_base,
+                                               read_query_count);
+    }
+
+    context->query_tail += read_query_count;
+    if (context->query_tail >= context->query_capacity) {
+      context->query_tail = 0;
+    }
+  }
+
+  // Run calibration - we could do this less frequently in cases where collect
+  // is called every submission, however it's relatively cheap compared to all
+  // this other tracing overhead.
+  iree_hal_vulkan_tracing_perform_calibration(context);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_hal_vulkan_tracing_zone_begin_impl(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer,
+    const iree_tracing_location_t* src_loc) {
+  if (!context) return;  // Tracing disabled.
+
+  uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context);
+  context->logical_device->syms()->vkCmdWriteTimestamp(
+      command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool,
+      query_id);
+
+  auto* item = tracy::Profiler::QueueSerial();
+  tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuZoneBeginSerial);
+  tracy::MemWrite(&item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime());
+  tracy::MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)src_loc);  // |src_loc| is stored by pointer, not copied; callers pass a static (see the BEGIN macro).
+  tracy::MemWrite(&item->gpuZoneBegin.thread, tracy::GetThreadHandle());
+  tracy::MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)query_id);
+  tracy::MemWrite(&item->gpuZoneBegin.context, context->id);
+  tracy::Profiler::QueueSerialFinish();
+}
+
+void iree_hal_vulkan_tracing_zone_begin_external_impl(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer,
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length) {
+  if (!context) return;  // Tracing disabled.
+
+  uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context);
+  context->logical_device->syms()->vkCmdWriteTimestamp(
+      command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool,
+      query_id);
+
+  const auto src_loc = tracy::Profiler::AllocSourceLocation(
+      line, file_name, file_name_length, function_name, function_name_length,
+      name, name_length);  // Copies the strings into tracy; callers need not keep them live.
+  auto* item = tracy::Profiler::QueueSerial();
+  tracy::MemWrite(&item->hdr.type,
+                  tracy::QueueType::GpuZoneBeginAllocSrcLocSerial);
+  tracy::MemWrite(&item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime());
+  tracy::MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)src_loc);
+  tracy::MemWrite(&item->gpuZoneBegin.thread, tracy::GetThreadHandle());
+  tracy::MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)query_id);
+  tracy::MemWrite(&item->gpuZoneBegin.context, context->id);
+  tracy::Profiler::QueueSerialFinish();
+}
+
+void iree_hal_vulkan_tracing_zone_end_impl(
+    iree_hal_vulkan_tracing_context_t* context,
+    VkCommandBuffer command_buffer) {
+  if (!context) return;  // Tracing disabled.
+
+  uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context);
+  context->logical_device->syms()->vkCmdWriteTimestamp(
+      command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool,
+      query_id);  // End zones consume a query slot too (N*2 total; see the TODO in tracing.h).
+
+  auto* item = tracy::Profiler::QueueSerial();
+  tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuZoneEndSerial);
+  tracy::MemWrite(&item->gpuZoneEnd.cpuTime, tracy::Profiler::GetTime());
+  tracy::MemWrite(&item->gpuZoneEnd.thread, tracy::GetThreadHandle());
+  tracy::MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)query_id);
+  tracy::MemWrite(&item->gpuZoneEnd.context, context->id);
+  tracy::Profiler::QueueSerialFinish();
+}
+
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
diff --git a/runtime/src/iree/hal/vulkan/tracing.h b/runtime/src/iree/hal/vulkan/tracing.h
new file mode 100644
index 0000000..f43f80e
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/tracing.h
@@ -0,0 +1,174 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_TRACING_H_
+#define IREE_HAL_VULKAN_TRACING_H_
+
+// clang-format off: must be included before all other headers.
+#include "iree/hal/vulkan/vulkan_headers.h"
+// clang-format on
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Per-queue Vulkan tracing context.
+// No-op if IREE tracing is not enabled.
+//
+// Use the IREE_VULKAN_TRACE_* macros to trace a contiguous set of command
+// buffer operations. Unlike the normal tracy macros there are no zone IDs and
+// instead each queue gets an ID allocated once and passed to all tracing
+// macros.
+//
+// Usage:
+//   IREE_VULKAN_TRACE_ZONE_BEGIN(device->tracing_context, command_buffer);
+//   vkCmdDispatch(command_buffer, ...);
+//   IREE_VULKAN_TRACE_ZONE_END(queue->tracing_context, command_buffer);
+//   ...
+//   iree_hal_vulkan_tracing_context_collect(queue->tracing_context,
+//                                           command_buffer);
+//   vkQueueSubmit(...command_buffer...);
+//
+// NOTE: timestamps have non-trivial side-effecting behavior on the device:
+// inserting a timestamp is in the worst (and average) case just as bad as
+// inserting a full global execution barrier. If two command buffer operations
+// that could overlap (no barrier between them) have tracing zones placed around
+// them they will execute sequentially.
+//
+// TODO(benvanik):
+//   Each queue needs a context and maintains its own query pool. In the future
+//   this should be changed to have a single query pool per device to reduce
+//   bookkeeping overhead.
+//
+// TODO(benvanik):
+//   Both a zone begin and zone end always insert timestamps leading to N*2
+//   total queries, however within command buffers the end of one zone and the
+//   begin of another share the same point in time. By inserting the timestamps
+//   at barriers in the command buffer the query count can be reduced to N+1.
+//
+// TODO(benvanik):
+//   vkCmdCopyQueryPoolResults is really what we should be using to do this -
+//   that inserts a device-side transfer to a buffer (conceptually) that is
+//   in-stream with all submissions to a queue. This changes things to a push
+//   model vs. the pull one in _collect and allows us to pipeline the readbacks.
+//   Instead of being limited to the query pool slots we'd only be limited by
+//   the size of the buffer the copy targets allowing us to perform collection
+//   much more infrequently.
+//
+// Thread-compatible: external synchronization is required if using from
+// multiple threads (same as with VkQueue itself).
+typedef struct iree_hal_vulkan_tracing_context_t
+    iree_hal_vulkan_tracing_context_t;
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// Allocates a tracing context for the given Vulkan queue.
+// Each context must only be used with the queue it was created with.
+//
+// |maintenance_dispatch_queue| may be used to perform query pool maintenance
+// tasks and must support graphics or compute commands.
+iree_status_t iree_hal_vulkan_tracing_context_allocate(
+    VkPhysicalDevice physical_device,
+    iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue,
+    iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue,
+    iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool,
+    iree_allocator_t host_allocator,
+    iree_hal_vulkan_tracing_context_t** out_context);
+
+// Frees a tracing context and all associated Vulkan resources.
+// All submissions using the resources must be completed prior to calling.
+void iree_hal_vulkan_tracing_context_free(
+    iree_hal_vulkan_tracing_context_t* context);
+
+// Collects in-flight timestamp queries from the queue and feeds them to tracy.
+// Must be called frequently (every submission, etc) to drain the backlog;
+// tracing may start failing if the internal ringbuffer is exceeded.
+//
+// The provided |command_buffer| may receive additional bookkeeping commands
+// that should have no impact on correctness or behavior. If VK_NULL_HANDLE is
+// provided then collection will occur synchronously.
+void iree_hal_vulkan_tracing_context_collect(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer);
+
+// Begins a normal zone derived on the calling |src_loc|.
+// Must be perfectly nested and paired with a corresponding zone end.
+void iree_hal_vulkan_tracing_zone_begin_impl(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer,
+    const iree_tracing_location_t* src_loc);
+
+// Begins an external zone using the given source information.
+// The provided strings will be copied into the tracy buffer.
+void iree_hal_vulkan_tracing_zone_begin_external_impl(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer,
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length);
+
+void iree_hal_vulkan_tracing_zone_end_impl(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer);
+
+// Begins a new zone named after the calling function (no explicit name; the first iree_tracing_location_t field is NULL as in the CPU IREE_TRACE_ZONE_BEGIN macro).
+#define IREE_VULKAN_TRACE_ZONE_BEGIN(context, command_buffer)                 \
+  static const iree_tracing_location_t TracyConcat(                           \
+      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__,               \
+                                            __FILE__, (uint32_t)__LINE__, 0}; \
+  iree_hal_vulkan_tracing_zone_begin_impl(                                    \
+      context, command_buffer,                                                \
+      &TracyConcat(__tracy_source_location, __LINE__));
+
+// Begins an externally defined zone with a dynamic source location.
+// The |file_name|, |function_name|, and optional |name| strings will be copied
+// into the trace buffer and do not need to persist.
+#define IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL(                                 \
+    context, command_buffer, file_name, file_name_length, line, function_name, \
+    function_name_length, name, name_length)                                   \
+  iree_hal_vulkan_tracing_zone_begin_external_impl(                            \
+      context, command_buffer, file_name, file_name_length, line,              \
+      function_name, function_name_length, name, name_length)
+
+// Ends the current zone; must be perfectly nested with a preceding IREE_VULKAN_TRACE_ZONE_BEGIN*. (Unlike the CPU tracing macros there is no |zone_id| here.)
+#define IREE_VULKAN_TRACE_ZONE_END(context, command_buffer) \
+  iree_hal_vulkan_tracing_zone_end_impl(context, command_buffer)
+
+#else
+
+inline iree_status_t iree_hal_vulkan_tracing_context_allocate(
+    VkPhysicalDevice physical_device,
+    iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue,
+    iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue,
+    iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool,
+    iree_allocator_t host_allocator,
+    iree_hal_vulkan_tracing_context_t** out_context) {
+  *out_context = NULL;  // A NULL context makes every other entry point a no-op.
+  return iree_ok_status();
+}
+
+inline void iree_hal_vulkan_tracing_context_free(
+    iree_hal_vulkan_tracing_context_t* context) {}
+
+inline void iree_hal_vulkan_tracing_context_collect(
+    iree_hal_vulkan_tracing_context_t* context,
+    VkCommandBuffer command_buffer) {}
+
+#define IREE_VULKAN_TRACE_ZONE_BEGIN(context, command_buffer)
+#define IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL(                                 \
+    context, command_buffer, file_name, file_name_length, line, function_name, \
+    function_name_length, name, name_length)
+#define IREE_VULKAN_TRACE_ZONE_END(context, command_buffer)
+
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_TRACING_H_
diff --git a/runtime/src/iree/hal/vulkan/util/BUILD b/runtime/src/iree/hal/vulkan/util/BUILD
new file mode 100644
index 0000000..c57de01
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/BUILD
@@ -0,0 +1,78 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library", "iree_runtime_cc_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_runtime_cc_library(
+    name = "arena",  # Block-based arena allocator (see arena.h).
+    srcs = ["arena.cc"],
+    hdrs = ["arena.h"],
+    deps = [
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:logging",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "arena_test",
+    srcs = ["arena_test.cc"],
+    deps = [
+        ":arena",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "intrusive_list",  # Header-only doubly linked list with interior link storage.
+    hdrs = [
+        "intrusive_list.h",
+        "intrusive_list_unique_ptr.inc",
+    ],
+    deps = [
+        "//runtime/src/iree/base:logging",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "intrusive_list_test",
+    srcs = [
+        "intrusive_list_test.cc",
+        "intrusive_list_unique_ptr_test.cc",
+    ],
+    deps = [
+        ":intrusive_list",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "ref_ptr",  # Header-only (no srcs); see ref_ptr.h.
+    hdrs = ["ref_ptr.h"],
+    deps = [
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:logging",
+        "//runtime/src/iree/base/internal",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "ref_ptr_test",
+    size = "small",
+    srcs = ["ref_ptr_test.cc"],
+    deps = [
+        ":ref_ptr",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
diff --git a/runtime/src/iree/hal/vulkan/util/CMakeLists.txt b/runtime/src/iree/hal/vulkan/util/CMakeLists.txt
new file mode 100644
index 0000000..1f7c7c4
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/CMakeLists.txt
@@ -0,0 +1,83 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/hal/vulkan/util/BUILD                                       #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    arena
+  HDRS
+    "arena.h"
+  SRCS
+    "arena.cc"
+  DEPS
+    iree::base::core_headers
+    iree::base::logging
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    arena_test
+  SRCS
+    "arena_test.cc"
+  DEPS
+    ::arena
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    intrusive_list
+  HDRS
+    "intrusive_list.h"
+    "intrusive_list_unique_ptr.inc"
+  DEPS
+    iree::base::logging
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    intrusive_list_test
+  SRCS
+    "intrusive_list_test.cc"
+    "intrusive_list_unique_ptr_test.cc"
+  DEPS
+    ::intrusive_list
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    ref_ptr
+  HDRS
+    "ref_ptr.h"
+  DEPS
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::logging
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    ref_ptr_test
+  SRCS
+    "ref_ptr_test.cc"
+  DEPS
+    ::ref_ptr
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/vulkan/util/arena.cc b/runtime/src/iree/hal/vulkan/util/arena.cc
new file mode 100644
index 0000000..187d234
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/arena.cc
@@ -0,0 +1,117 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/util/arena.h"
+
+#include <cstdlib>
+
+#include "iree/base/attributes.h"
+#include "iree/base/logging.h"
+
+namespace iree {
+
+namespace {
+
+// Rounds |value| up to the next multiple of |alignment| (any non-zero alignment, not just powers of two); returns |value| unchanged if already aligned.
+template <typename T>
+IREE_ATTRIBUTE_ALWAYS_INLINE constexpr T RoundToAlignment(
+    T value, T alignment) noexcept {
+  return ((value + alignment - 1) / alignment) * alignment;
+}
+
+}  // namespace
+
+Arena::Arena(size_t block_size) : block_size_(block_size) {}  // |block_size| is usable bytes per block, excluding the BlockHeader.
+
+Arena::~Arena() { Clear(); }  // Frees all blocks; element destructors are never run.
+
+void Arena::Clear() {
+  // Deallocate all memory.
+  auto block_header = block_list_head_;
+  while (block_header) {
+    auto next_block = block_header->next_block;  // Save before freeing the node that stores it.
+    std::free(block_header);
+    block_header = next_block;
+  }
+  block_list_head_ = nullptr;
+  block_header = unused_block_list_head_;  // Also free blocks parked by Reset().
+  while (block_header) {
+    auto next_block = block_header->next_block;
+    std::free(block_header);
+    block_header = next_block;
+  }
+  unused_block_list_head_ = nullptr;
+
+  bytes_allocated_ = 0;
+  block_bytes_allocated_ = 0;
+}
+
+void Arena::Reset() {
+  // Move all blocks to the unused list and reset allocation count only.
+  auto block_header = block_list_head_;
+  while (block_header) {
+    auto next_block = block_header->next_block;
+    block_header->bytes_allocated = 0;
+    block_header->next_block = unused_block_list_head_;  // Push onto the free list for reuse by AllocateBytes.
+    unused_block_list_head_ = block_header;
+    block_header = next_block;
+  }
+  block_list_head_ = nullptr;
+
+  bytes_allocated_ = 0;  // block_bytes_allocated_ intentionally kept: the blocks still exist.
+}
+
+uint8_t* Arena::AllocateBytes(size_t length) {
+  if (!length) {
+    // Guarantee zero-length allocations return nullptr.
+    return nullptr;
+  }
+
+  // Pad length allocated so we are machine word aligned.
+  // This ensures the next allocation starts at the right boundary.
+  size_t aligned_length = RoundToAlignment(length, sizeof(uintptr_t));
+
+  if (aligned_length > block_size_) {
+    // This allocation is larger than an entire block. That's bad.
+    // We could allocate this with malloc (and then keep track of those to free
+    // things), but for now let's just die.
+    IREE_CHECK(false);
+    return nullptr;
+  }
+
+  if (!block_list_head_ ||
+      block_list_head_->bytes_allocated + aligned_length > block_size_) {
+    // Check to see if we have an existing unused block we can use.
+    if (unused_block_list_head_) {
+      // Move block from unused list to main list.
+      auto block_header = unused_block_list_head_;
+      unused_block_list_head_ = block_header->next_block;
+      block_header->next_block = block_list_head_;
+      block_header->bytes_allocated = 0;
+      block_list_head_ = block_header;
+    } else {
+      // Allocate a new block.
+      auto block_ptr = reinterpret_cast<uint8_t*>(
+          std::malloc(sizeof(BlockHeader) + block_size_));  // NOTE(review): malloc result unchecked; OOM would fault on the header writes below.
+      auto block_header = reinterpret_cast<BlockHeader*>(block_ptr);
+      block_header->next_block = block_list_head_;
+      block_header->bytes_allocated = 0;
+      block_list_head_ = block_header;
+      block_bytes_allocated_ += sizeof(BlockHeader) + block_size_;
+    }
+  }
+
+  BlockHeader* target_block = block_list_head_;
+  auto data_ptr = reinterpret_cast<uint8_t*>(target_block) +
+                  sizeof(BlockHeader) + target_block->bytes_allocated;
+  target_block->bytes_allocated += aligned_length;
+
+  bytes_allocated_ += length;  // Tracks requested bytes; alignment padding is excluded.
+
+  return data_ptr;
+}
+
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/util/arena.h b/runtime/src/iree/hal/vulkan/util/arena.h
new file mode 100644
index 0000000..b891c2d
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/arena.h
@@ -0,0 +1,129 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_UTIL_ARENA_H_
+#define IREE_HAL_VULKAN_UTIL_ARENA_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+
+namespace iree {
+
+template <typename T>
+class Span {  // Minimal non-owning view, standing in for C++20 std::span.
+ public:
+  Span(T* data, size_t size) noexcept : data_(data), size_(size) {}
+
+  T* data() const noexcept { return data_; }
+  size_t size() const noexcept { return size_; }
+  bool empty() const noexcept { return size() == 0; }
+
+  T& operator[](size_t i) noexcept { return *(data() + i); }  // No bounds checking.
+
+ private:
+  T* data_;
+  size_t size_;
+};
+
+// Arena allocator.
+// Allocates memory from a cached block list grown at specified intervals.
+// Individual allocations cannot be freed.
+// Default constructors will be called when allocating but no destructors will
+// ever be called.
+//
+// This should be used in places where extreme dynamic memory growth is required
+// to ensure that the allocations stay close to each other in memory, are easy
+// to account for, and can be released together. For example, proto or file
+// parsing, per-batch write-once/read-once data buffers, etc.
+//
+// Usage:
+//   Arena arena;
+//   auto t0 = arena.Allocate<MyType>();
+class Arena {
+ public:
+  static constexpr size_t kDefaultBlockSize = 32 * 1024;
+  static constexpr size_t kBlockOverhead = sizeof(void*) + sizeof(size_t);  // Must equal sizeof(BlockHeader); checked by the static_assert below.
+
+  Arena() : Arena(kDefaultBlockSize) {}
+  explicit Arena(size_t block_size);
+  ~Arena();
+
+  // Clears all data in the arena and deallocates blocks.
+  // Use Reset to avoid reallocation.
+  void Clear();
+
+  // Resets data in the arena but does not deallocate blocks.
+  // Use Clear to reclaim memory.
+  void Reset();
+
+  // Block size, excluding the block header.
+  // This is the largest size of any allocation that can be made of the arena.
+  size_t block_size() const { return block_size_; }
+
+  // Total number of bytes that have been allocated, excluding wasted space.
+  size_t bytes_allocated() const { return bytes_allocated_; }
+  // Total number of bytes as blocks allocated, including wasted space.
+  // If this number is much higher than bytes_allocated the block size requires
+  // tuning.
+  size_t block_bytes_allocated() const { return block_bytes_allocated_; }
+
+  // Allocates an instance of the given type and calls its constructor.
+  template <typename T>
+  T* Allocate() {
+    void* storage = AllocateBytes(sizeof(T));
+    return new (storage) T();  // Placement-new; the destructor will never run.
+  }
+
+  // Allocates an instance of the given type and calls its constructor with
+  // arguments.
+  template <typename T, typename... Args>
+  T* Allocate(Args&&... args) {
+    void* storage = AllocateBytes(sizeof(T));
+    return new (storage) T(std::forward<Args>(args)...);
+  }
+
+  // Allocates raw storage for |count| items and returns a span over it. NOTE(review): unlike Allocate<T>, elements are NOT constructed here - only use with trivially-constructible T.
+  template <typename T>
+  Span<T> AllocateSpan(size_t count) {
+    void* storage = AllocateBytes(count * sizeof(T));
+    return Span<T>(reinterpret_cast<T*>(storage), count);
+  }
+
+  // Allocates a block of raw bytes from the arena.
+  // Zero-byte allocations will return nullptr.
+  uint8_t* AllocateBytes(size_t length);
+
+ private:
+  // block_size_ is the usable payload size per block; each heap block is
+  // actually sizeof(BlockHeader) + block_size_ bytes (see Arena::AllocateBytes).
+  size_t block_size_ = kDefaultBlockSize;
+  size_t bytes_allocated_ = 0;
+  size_t block_bytes_allocated_ = 0;
+
+  // Each block in the arena contains a prefixed header that lets us link the
+  // blocks together (to make freeing easier) as well as tracking current byte
+  // count to let us fill gaps.
+  // Immediately following the header is the actual arena data, up until the
+  // block size is reached.
+  struct BlockHeader {
+    BlockHeader* next_block;
+    size_t bytes_allocated;
+  };
+  static_assert(sizeof(BlockHeader) == kBlockOverhead, "Block header mismatch");
+
+  // Singly-linked list of allocated blocks in reverse allocation order (so
+  // the most recently allocated block is first).
+  BlockHeader* block_list_head_ = nullptr;
+
+  // Allocated but unused blocks.
+  BlockHeader* unused_block_list_head_ = nullptr;
+};
+
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_UTIL_ARENA_H_
diff --git a/runtime/src/iree/hal/vulkan/util/arena_test.cc b/runtime/src/iree/hal/vulkan/util/arena_test.cc
new file mode 100644
index 0000000..0baedf9
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/arena_test.cc
@@ -0,0 +1,139 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/util/arena.h"
+
+#include "iree/testing/gtest.h"
+
+namespace iree {
+namespace {
+
+// Tests basic block allocations.
+TEST(ArenaTest, BasicAllocation) {
+  Arena arena(64);
+  EXPECT_EQ(64, arena.block_size());
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(0, arena.block_bytes_allocated());
+
+  // Zero byte allocations should return nullptr and not allocate bytes.
+  auto zero_ptr = reinterpret_cast<uintptr_t>(arena.AllocateBytes(0));
+  EXPECT_EQ(0, zero_ptr);
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(0, arena.block_bytes_allocated());
+
+  arena.Clear();
+
+  // Allocations must be machine word aligned.
+  auto one_ptr = reinterpret_cast<uintptr_t>(arena.AllocateBytes(1));
+  EXPECT_NE(0, one_ptr);
+  EXPECT_EQ(0, one_ptr % sizeof(uintptr_t));
+  one_ptr = reinterpret_cast<uintptr_t>(arena.AllocateBytes(1));
+  EXPECT_NE(0, one_ptr);
+  EXPECT_EQ(0, one_ptr % sizeof(uintptr_t));
+  EXPECT_EQ(2, arena.bytes_allocated());  // Counts requested bytes, not padded.
+  EXPECT_LT(2, arena.block_bytes_allocated());
+
+  arena.Clear();
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(0, arena.block_bytes_allocated());
+}
+
+// Tests typed allocations.
+TEST(ArenaTest, TypedAllocations) {
+  Arena arena(64);
+
+  EXPECT_NE(nullptr, arena.Allocate<int>());
+  EXPECT_EQ(4, arena.bytes_allocated());  // Assumes sizeof(int) == 4 on all test targets.
+  EXPECT_EQ(64 + Arena::kBlockOverhead, arena.block_bytes_allocated());
+  arena.Clear();
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(0, arena.block_bytes_allocated());
+
+  struct MyType {
+    MyType() {}
+    explicit MyType(int initial_value) : value(initial_value) {}
+
+    int value = 5;
+  };
+  auto my_type_ptr = arena.Allocate<MyType>();
+  EXPECT_NE(nullptr, my_type_ptr);
+  EXPECT_EQ(sizeof(MyType), arena.bytes_allocated());
+  EXPECT_EQ(5, my_type_ptr->value);  // Default ctor must be called.
+  arena.Clear();
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(0, arena.block_bytes_allocated());
+
+  my_type_ptr = arena.Allocate<MyType>(10);
+  EXPECT_NE(nullptr, my_type_ptr);
+  EXPECT_EQ(sizeof(MyType), arena.bytes_allocated());
+  EXPECT_EQ(10, my_type_ptr->value);  // Ctor should have been called.
+  arena.Clear();
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(0, arena.block_bytes_allocated());
+}
+
+// Tests multiple blocks.
+TEST(ArenaTest, MultipleBlocks) {
+  Arena arena(16);  // Tiny block size so each 16-byte allocation fills a block.
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(0, arena.block_bytes_allocated());
+
+  // Allocate one entire block.
+  EXPECT_NE(nullptr, arena.AllocateBytes(16));
+  EXPECT_EQ(16, arena.bytes_allocated());
+  EXPECT_EQ(16 + Arena::kBlockOverhead, arena.block_bytes_allocated());
+
+  // Allocate into the next block.
+  EXPECT_NE(nullptr, arena.AllocateBytes(16));
+  EXPECT_EQ(32, arena.bytes_allocated());
+  EXPECT_EQ(32 + 2 * Arena::kBlockOverhead, arena.block_bytes_allocated());
+
+  // Clear.
+  arena.Clear();
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(0, arena.block_bytes_allocated());
+
+  // Allocate again.
+  EXPECT_NE(nullptr, arena.AllocateBytes(16));
+  EXPECT_EQ(16, arena.bytes_allocated());
+  EXPECT_EQ(16 + Arena::kBlockOverhead, arena.block_bytes_allocated());
+  EXPECT_NE(nullptr, arena.AllocateBytes(16));
+  EXPECT_EQ(32, arena.bytes_allocated());
+  EXPECT_EQ(32 + 2 * Arena::kBlockOverhead, arena.block_bytes_allocated());
+}
+
+// Tests fast reset.
+TEST(ArenaTest, FastReset) {
+  Arena arena(16);
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(0, arena.block_bytes_allocated());
+
+  // Allocate one entire block.
+  EXPECT_NE(nullptr, arena.AllocateBytes(16));
+  EXPECT_EQ(16, arena.bytes_allocated());
+  EXPECT_EQ(16 + Arena::kBlockOverhead, arena.block_bytes_allocated());
+
+  // Allocate into the next block.
+  EXPECT_NE(nullptr, arena.AllocateBytes(16));
+  EXPECT_EQ(32, arena.bytes_allocated());
+  EXPECT_EQ(32 + 2 * Arena::kBlockOverhead, arena.block_bytes_allocated());
+
+  // Reset (without deallocating).
+  arena.Reset();
+  EXPECT_EQ(0, arena.bytes_allocated());
+  EXPECT_EQ(32 + 2 * Arena::kBlockOverhead, arena.block_bytes_allocated());
+
+  // Allocate again.
+  EXPECT_NE(nullptr, arena.AllocateBytes(16));
+  EXPECT_EQ(16, arena.bytes_allocated());
+  EXPECT_EQ(32 + 2 * Arena::kBlockOverhead, arena.block_bytes_allocated());  // Blocks reused: block byte count does not grow.
+  EXPECT_NE(nullptr, arena.AllocateBytes(16));
+  EXPECT_EQ(32, arena.bytes_allocated());
+  EXPECT_EQ(32 + 2 * Arena::kBlockOverhead, arena.block_bytes_allocated());
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/util/intrusive_list.h b/runtime/src/iree/hal/vulkan/util/intrusive_list.h
new file mode 100644
index 0000000..ff5d5fe
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/intrusive_list.h
@@ -0,0 +1,750 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Doubly linked list using element interior storage.
+// This has the performance of std::list (that means O(1) on insert and remove)
+// but performs no allocations and has better caching behavior.
+//
+// Elements are maintained in lists by way of IntrusiveListLinks, with each link
+// allowing the element to exist in one list simultaneously. In the most simple
+// case subclassing IntrusiveLinkBase will let the type be added to a list with
+// little boilerplate. If an element must be in more than one list
+// simultaneously IntrusiveListLinks can be added as members.
+//
+// Usage (simple):
+//   class MySimpleElement : public IntrusiveLinkBase {};
+//   IntrusiveList<MySimpleElement> list;
+//   list.push_back(new MySimpleElement());
+//   for (auto element : list) { ... }
+//
+// Usage (multiple lists):
+//   class MultiElement {
+//    public:
+//     IntrusiveListLink list_link_a;
+//     IntrusiveListLink list_link_b;
+//   };
+//   IntrusiveList<MultiElement, offsetof(MultiElement, list_link_a)> list_a;
+//   IntrusiveList<MultiElement, offsetof(MultiElement, list_link_b)> list_b;
+//
+// By default elements in the list are not retained and must be kept alive
+// externally. For automatic memory management there are specializations for
+// std::unique_ptr.
+//
+// Usage (unique_ptr):
+//   IntrusiveList<std::unique_ptr<MyElement>> list;
+//   list.push_back(std::make_unique<MyElement>());
+//   std::unique_ptr<MyElement> elm = list.take(list.front());
+//
+// This type is thread-unsafe.
+
+#ifndef IREE_HAL_VULKAN_UTIL_INTRUSIVE_LIST_H_
+#define IREE_HAL_VULKAN_UTIL_INTRUSIVE_LIST_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <utility>
+
+#include "iree/base/logging.h"
+
+namespace iree {
+
+// Define to enable extensive checks after each mutation of the intrusive list.
+// #define IREE_PARANOID_INTRUSIVE_LIST
+
+// Storage for the doubly-linked list.
+// This is embedded within all elements in an intrusive list.
+struct IntrusiveListLink {
+  // Both pointers are null when the link is not in any list.
+  IntrusiveListLink* prev = nullptr;
+  IntrusiveListLink* next = nullptr;
+
+  IntrusiveListLink() = default;
+
+  // Prevent copies; copying a link would leave a list pointing at the
+  // original storage while the copy believes it is linked.
+  IntrusiveListLink(const IntrusiveListLink&) = delete;
+  IntrusiveListLink& operator=(const IntrusiveListLink&) = delete;
+};
+
+// Mixin providing a default `link` member on top of an existing base type T.
+template <class T>
+struct IntrusiveLinkBase : public T {
+ public:
+  IntrusiveListLink link;
+};
+
+// Standalone base for types with no other base class; provides the default
+// `link` member used by IntrusiveList<T> (kUseDefaultLinkOffset).
+template <>
+struct IntrusiveLinkBase<void> {
+ public:
+  IntrusiveListLink link;
+};
+
+// Base type for intrusive lists.
+// This is either used directly when the list is on naked pointers or
+// specialized to std::unique_ptr.
+// Template parameters:
+//   T: element type stored in the list.
+//   IteratorT/ReverseIteratorT: iterator types returned by begin()/rbegin().
+//   kOffset: byte offset of the IntrusiveListLink within T.
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+class IntrusiveListBase {
+ public:
+  using self_type = IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>;
+
+  IntrusiveListBase() = default;
+  // Destruction detaches all elements (and, in owning specializations,
+  // deallocates them via OnDeallocate).
+  virtual ~IntrusiveListBase() { clear(); }
+
+  // Prevent copies.
+  IntrusiveListBase(const IntrusiveListBase&) = delete;
+  IntrusiveListBase& operator=(const IntrusiveListBase&) = delete;
+
+  // Returns true if the list is empty.
+  // Performance: O(1)
+  constexpr bool empty() const { return head_ == nullptr; }
+
+  // Returns the total number of items in the list.
+  // Performance: O(1)
+  constexpr size_t size() const { return count_; }
+
+  // Returns true if the given item is contained within the list.
+  // Performance: O(n)
+  bool contains(T* value) const;
+
+  // Appends the contents of the given list to this one.
+  // The |other_list| is cleared.
+  // Performance: O(1)
+  void merge_from(self_type* other_list);
+
+  // Removes all items from the list.
+  // Performance: O(n)
+  void clear();
+
+  IteratorT begin() const { return IteratorT(head_); }
+  IteratorT end() const { return IteratorT(nullptr); }
+  ReverseIteratorT rbegin() const { return ReverseIteratorT(tail_); }
+  ReverseIteratorT rend() const { return ReverseIteratorT(nullptr); }
+
+  // Returns the next item in the list relative to the given item.
+  // |value| must exist in the list.
+  // Performance: O(1)
+  T* next(T* value) const;
+
+  // Returns the previous item in the list relative to the given item.
+  // |value| must exist in the list.
+  // Performance: O(1)
+  T* previous(T* value) const;
+
+  // Returns the item at the front of the list, if any.
+  // Performance: O(1)
+  T* front() const;
+
+  // Inserts an item at the front of the list.
+  // Performance: O(1)
+  void push_front(T* value);
+
+  // Removes the item at the front of the list.
+  // Performance: O(1)
+  void pop_front();
+
+  // Returns the item at the back of the list, if any.
+  // Performance: O(1)
+  T* back() const;
+
+  // Inserts an item at the back of the list.
+  // Performance: O(1)
+  void push_back(T* value);
+
+  // Removes the item at the back of the list.
+  // Performance: O(1)
+  void pop_back();
+
+  // Inserts an item into the list before the given iterator.
+  // Performance: O(1)
+  void insert(const IteratorT& it, T* value) { return insert(*it, value); }
+  void insert(T* position, T* value);
+
+  // Erases the given item from the list.
+  // Returns the item following the erased item, if any.
+  // Performance: O(1)
+  T* erase(T* value);
+
+  // Erases the item from the list at the given iterator.
+  // Performance: O(1)
+  IteratorT erase(const IteratorT& it);
+  ReverseIteratorT erase(const ReverseIteratorT& it);
+
+  // Replaces the item with a new item at the same position.
+  // |new_value| must not be contained in any list.
+  // Performance: O(1)
+  void replace(T* old_value, T* new_value);
+
+  // Sorts the list with the given comparison function.
+  // The sort function is the same as used by std::sort.
+  //
+  // Uses merge sort O(N log N) using the algorithm described here:
+  // http://www.chiark.greenend.org.uk/~sgtatham/algorithms/listsort.html
+  void sort(bool (*compare_fn)(T* a, T* b));
+
+ protected:
+  // Called when an item is added to the list.
+  virtual void OnAdd(T* value) {}
+  // Called when an item is removed from the list.
+  virtual void OnRemove(T* value) {}
+  // Called when an item is removed and deallocated.
+  virtual void OnDeallocate(T* value) {}
+
+  // Performs expensive correctness checks on the list structure. It's too slow
+  // to use in normal builds (even dbg), so it should only be used when there's
+  // a suspected issue with an intrusive list. Define
+  // IREE_PARANOID_INTRUSIVE_LIST to enable.
+  void CheckCorrectness() const;
+
+  // Links (not elements); map to elements via impl::LinkToT.
+  IntrusiveListLink* head_ = nullptr;
+  IntrusiveListLink* tail_ = nullptr;
+  size_t count_ = 0;
+};
+
+// Basic input iterator for an IntrusiveList.
+// kForward selects traversal direction: true walks `next`, false walks `prev`.
+template <typename T, size_t kOffset, bool kForward>
+class IntrusiveListIterator {
+ public:
+  using self_type = IntrusiveListIterator<T, kOffset, kForward>;
+
+  // Iterator traits declared directly: inheriting from std::iterator is
+  // deprecated in C++17. These match what
+  // std::iterator<std::input_iterator_tag, int> would have provided.
+  using iterator_category = std::input_iterator_tag;
+  using value_type = int;
+  using difference_type = std::ptrdiff_t;
+  using pointer = int*;
+  using reference = int&;
+
+  // |current| may be nullptr, representing end()/rend().
+  explicit IntrusiveListIterator(IntrusiveListLink* current)
+      : current_(current) {}
+  IntrusiveListIterator& operator++();
+  self_type operator++(int);
+  self_type& operator--();
+  self_type operator--(int);
+  bool operator==(const self_type& rhs) const;
+  bool operator!=(const self_type& rhs) const;
+  T* operator*() const;
+
+ protected:
+  IntrusiveListLink* current_;
+};
+
+// Specialized IntrusiveListBase used for unreferenced naked pointers.
+// This very thinly wraps the base type and does no special memory management.
+template <typename T, size_t kOffset>
+class IntrusiveListUnrefBase
+    : public IntrusiveListBase<T, IntrusiveListIterator<T, kOffset, true>,
+                               IntrusiveListIterator<T, kOffset, false>,
+                               kOffset> {
+ public:
+  using IteratorT = IntrusiveListIterator<T, kOffset, true>;
+  using ReverseIteratorT = IntrusiveListIterator<T, kOffset, false>;
+  using base_list = IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>;
+
+  using base_list::clear;
+
+  // Removes all items from the list and calls the given deleter function for
+  // each of them. The built-in OnDeallocate will not be used.
+  // Performance: O(n)
+  void clear(const std::function<void(T*)>& deleter);
+
+ private:
+  // Bring the base's storage members into scope for the out-of-line
+  // clear(deleter) definition below.
+  using base_list::count_;
+  using base_list::head_;
+  using base_list::tail_;
+};
+
+// Sentinel offset meaning "use T's default `link` member" (see the
+// specialization below).
+constexpr size_t kUseDefaultLinkOffset = std::numeric_limits<size_t>::max();
+
+// IntrusiveList for raw pointers with a specified offset.
+// Use this if there are multiple links within a type.
+//
+// Usage:
+//  struct MyType {
+//   IntrusiveListLink link_a;
+//   IntrusiveListLink link_b;
+//  };
+//  IntrusiveList<MyType, offsetof(MyType, link_a)> list_a;
+//  IntrusiveList<MyType, offsetof(MyType, link_b)> list_b;
+template <typename T, size_t kOffset = kUseDefaultLinkOffset>
+class IntrusiveList : public IntrusiveListUnrefBase<T, kOffset> {};
+
+// IntrusiveList for raw pointers.
+// Items added to the list will not be owned by the list and must be freed by
+// the caller.
+//
+// Usage:
+//  struct MyType : public IntrusiveLinkBase<void> {};
+//  IntrusiveList<MyType> list;
+//  auto* p = new MyType();
+//  list.push_back(p);  // p is not retained and won't be freed!
+//  delete p;
+template <typename T>
+class IntrusiveList<T, kUseDefaultLinkOffset>
+    : public IntrusiveListUnrefBase<T, offsetof(T, link)> {};
+
+// -- implementation --
+
+namespace impl {
+
+// Maps an IntrusiveListLink to its containing type T.
+// A null link maps to a null element.
+template <typename T, size_t kOffset>
+static inline T* LinkToT(IntrusiveListLink* link) {
+  if (!link) return nullptr;
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(link) - kOffset);
+}
+
+// Maps a containing type T to its IntrusiveListLink.
+// A null element maps to a null link.
+template <typename T, size_t kOffset>
+static inline IntrusiveListLink* TToLink(T* value) {
+  if (!value) return nullptr;
+  return reinterpret_cast<IntrusiveListLink*>(
+      reinterpret_cast<uintptr_t>(value) + kOffset);
+}
+
+}  // namespace impl
+
+template <typename T, size_t kOffset, bool kForward>
+IntrusiveListIterator<T, kOffset, kForward>&
+IntrusiveListIterator<T, kOffset, kForward>::operator++() {
+  // Advancing the end iterator (null current_) is a no-op.
+  if (current_ != nullptr) {
+    current_ = kForward ? current_->next : current_->prev;
+  }
+  return *this;
+}
+
+template <typename T, size_t kOffset, bool kForward>
+IntrusiveListIterator<T, kOffset, kForward>
+IntrusiveListIterator<T, kOffset, kForward>::operator++(int) {
+  // Snapshot, advance, return the pre-increment value.
+  self_type snapshot(current_);
+  ++(*this);
+  return snapshot;
+}
+
+template <typename T, size_t kOffset, bool kForward>
+IntrusiveListIterator<T, kOffset, kForward>&
+IntrusiveListIterator<T, kOffset, kForward>::operator--() {
+  // Decrementing the end iterator (null current_) is a no-op.
+  if (current_ != nullptr) {
+    current_ = kForward ? current_->prev : current_->next;
+  }
+  return *this;
+}
+
+template <typename T, size_t kOffset, bool kForward>
+IntrusiveListIterator<T, kOffset, kForward>
+IntrusiveListIterator<T, kOffset, kForward>::operator--(int) {
+  // Snapshot, step back, return the pre-decrement value.
+  self_type snapshot(current_);
+  --(*this);
+  return snapshot;
+}
+
+template <typename T, size_t kOffset, bool kForward>
+bool IntrusiveListIterator<T, kOffset, kForward>::operator==(
+    const self_type& rhs) const {
+  return current_ == rhs.current_;
+}
+
+template <typename T, size_t kOffset, bool kForward>
+bool IntrusiveListIterator<T, kOffset, kForward>::operator!=(
+    const self_type& rhs) const {
+  return !(*this == rhs);
+}
+
+template <typename T, size_t kOffset, bool kForward>
+T* IntrusiveListIterator<T, kOffset, kForward>::operator*() const {
+  // Dereferencing end() yields nullptr (LinkToT maps null to null).
+  return impl::LinkToT<T, kOffset>(current_);
+}
+
+template <typename T, size_t kOffset>
+void IntrusiveListUnrefBase<T, kOffset>::clear(
+    const std::function<void(T*)>& deleter) {
+  // Walk the list, detaching each link before handing the element to the
+  // caller-provided deleter (which may free it).
+  for (auto* link = head_; link != nullptr;) {
+    auto* following = link->next;
+    link->prev = nullptr;
+    link->next = nullptr;
+    deleter(impl::LinkToT<T, kOffset>(link));
+    link = following;
+  }
+  head_ = nullptr;
+  tail_ = nullptr;
+  count_ = 0;
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT,
+                       kOffset>::CheckCorrectness() const {
+#if defined(IREE_PARANOID_INTRUSIVE_LIST)
+  // Walk forward from head_, verifying that each link's prev pointer agrees
+  // with the previous node visited, that the only prev-less node is head_,
+  // the only next-less node is tail_, and that count_ matches the walk.
+  auto* link = head_;
+  IntrusiveListLink* previous = nullptr;
+  size_t actual_count = 0;
+  while (link) {
+    ++actual_count;
+    if (!link->prev) {
+      IREE_DCHECK_EQ(link, head_);
+    }
+    if (!link->next) {
+      IREE_DCHECK_EQ(link, tail_);
+    }
+    IREE_DCHECK_EQ(link->prev, previous);
+    previous = link;
+    link = link->next;
+  }
+  IREE_DCHECK_EQ(actual_count, count_);
+#endif  // IREE_PARANOID_INTRUSIVE_LIST
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+bool IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::contains(
+    T* value) const {
+  if (!value) return false;
+  // TODO(benvanik): faster way of checking? requires list ptr in link?
+  // Linear scan comparing link addresses (identity, not equality).
+  auto* needle = impl::TToLink<T, kOffset>(value);
+  for (auto* link = head_; link != nullptr; link = link->next) {
+    if (link == needle) return true;
+  }
+  return false;
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::merge_from(
+    self_type* other_list) {
+  // Fast path: nothing to merge. This also fixes a bug where merging an
+  // empty list would unconditionally overwrite tail_ with the source's null
+  // tail, corrupting a non-empty destination (back()/push_back would then
+  // misbehave even though head_-based iteration still looked correct).
+  if (!other_list->head_) return;
+
+  // Splice the source chain onto our tail (or adopt it wholesale if empty).
+  if (tail_) {
+    tail_->next = other_list->head_;
+  }
+  other_list->head_->prev = tail_;
+  if (!head_) {
+    head_ = other_list->head_;
+  }
+  tail_ = other_list->tail_;
+
+  // The source list is left empty.
+  other_list->head_ = nullptr;
+  other_list->tail_ = nullptr;
+
+  count_ += other_list->count_;
+  other_list->count_ = 0;
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::clear() {
+  // Detach every link before notifying OnDeallocate so each element leaves
+  // the list fully unlinked (and may be freed by the hook).
+  for (auto* link = head_; link != nullptr;) {
+    auto* following = link->next;
+    link->prev = nullptr;
+    link->next = nullptr;
+    OnDeallocate(impl::LinkToT<T, kOffset>(link));
+    link = following;
+  }
+  head_ = nullptr;
+  tail_ = nullptr;
+  count_ = 0;
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+inline T* IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::next(
+    T* value) const {
+  // next(nullptr) is nullptr; LinkToT likewise maps a null link to nullptr,
+  // so the last element's successor is reported as nullptr.
+  if (value == nullptr) return nullptr;
+  return impl::LinkToT<T, kOffset>(impl::TToLink<T, kOffset>(value)->next);
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+inline T* IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::previous(
+    T* value) const {
+  // previous(nullptr) is nullptr; the first element's predecessor is also
+  // reported as nullptr via LinkToT's null mapping.
+  if (value == nullptr) return nullptr;
+  return impl::LinkToT<T, kOffset>(impl::TToLink<T, kOffset>(value)->prev);
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+inline T* IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::front()
+    const {
+  // Empty list: head_ is null and LinkToT returns nullptr.
+  return impl::LinkToT<T, kOffset>(head_);
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::push_front(
+    T* value) {
+  IREE_DCHECK(value);
+  auto* link = impl::TToLink<T, kOffset>(value);
+  // The element must not already be linked into a list (via this link).
+  IREE_DCHECK(!link->next);
+  IREE_DCHECK(!link->prev);
+  // Splice in as the new head.
+  link->next = head_;
+  link->prev = nullptr;
+  head_ = link;
+  if (link->next) {
+    link->next->prev = link;
+  }
+  // First element of an empty list is also the tail.
+  if (!tail_) {
+    tail_ = link;
+  }
+  ++count_;
+  OnAdd(value);
+  CheckCorrectness();
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::pop_front() {
+  IREE_DCHECK(head_);
+  auto* link = head_;
+  if (link) {
+    head_ = head_->next;
+    // Detach the removed link so it can be reinserted elsewhere.
+    link->next = link->prev = nullptr;
+    if (head_) {
+      head_->prev = nullptr;
+    }
+    // Removing the only element empties the list.
+    if (link == tail_) {
+      tail_ = nullptr;
+    }
+    --count_;
+    // NOTE: OnDeallocate may free the element; do not touch |link| after.
+    OnDeallocate(impl::LinkToT<T, kOffset>(link));
+  }
+  CheckCorrectness();
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+inline T* IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::back()
+    const {
+  // Empty list: tail_ is null and LinkToT returns nullptr.
+  return impl::LinkToT<T, kOffset>(tail_);
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::push_back(
+    T* value) {
+  IREE_DCHECK(value);
+  auto* link = impl::TToLink<T, kOffset>(value);
+  // The element must not already be linked into a list (via this link).
+  IREE_DCHECK(!link->next);
+  IREE_DCHECK(!link->prev);
+  // Splice in as the new tail.
+  link->prev = tail_;
+  link->next = nullptr;
+  tail_ = link;
+  if (link->prev) {
+    link->prev->next = link;
+  }
+  // First element of an empty list is also the head.
+  if (!head_) {
+    head_ = link;
+  }
+  ++count_;
+  OnAdd(value);
+  CheckCorrectness();
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::pop_back() {
+  IREE_DCHECK(tail_);
+  auto* link = tail_;
+  if (link) {
+    tail_ = tail_->prev;
+    // Detach the removed link so it can be reinserted elsewhere.
+    link->next = link->prev = nullptr;
+    if (tail_) {
+      tail_->next = nullptr;
+    }
+    // Removing the only element empties the list.
+    if (link == head_) {
+      head_ = nullptr;
+    }
+    --count_;
+    // NOTE: OnDeallocate may free the element; do not touch |link| after.
+    OnDeallocate(impl::LinkToT<T, kOffset>(link));
+  }
+  CheckCorrectness();
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::insert(
+    T* position, T* value) {
+  IREE_DCHECK(value);
+  auto* link = impl::TToLink<T, kOffset>(value);
+  auto* position_link = impl::TToLink<T, kOffset>(position);
+  // The element must not already be linked into a list (via this link).
+  IREE_DCHECK(!link->next);
+  IREE_DCHECK(!link->prev);
+
+  if (position_link == head_) {
+    // Inserting before the head (or into an empty list).
+    push_front(value);
+  } else if (position_link == nullptr) {
+    // Inserting before end() appends.
+    push_back(value);
+  } else {
+    // Splice |link| directly before |position_link| (interior position, so
+    // position_link->prev is non-null here).
+    link->next = position_link;
+    link->prev = position_link->prev;
+    position_link->prev->next = link;
+    position_link->prev = link;
+    ++count_;
+    OnAdd(value);
+  }
+  CheckCorrectness();
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+T* IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::erase(T* value) {
+  // erase(nullptr) is a no-op returning nullptr.
+  if (!value) {
+    return nullptr;
+  }
+  auto* link = impl::TToLink<T, kOffset>(value);
+  // Unlink from the predecessor, or advance head_ if erasing the front.
+  if (link->prev) {
+    IREE_DCHECK_NE(link, head_);
+    link->prev->next = link->next;
+  } else {
+    IREE_DCHECK_EQ(link, head_);
+    head_ = link->next;
+  }
+  // Unlink from the successor, or retreat tail_ if erasing the back.
+  if (link->next) {
+    IREE_DCHECK_NE(link, tail_);
+    link->next->prev = link->prev;
+  } else {
+    IREE_DCHECK_EQ(link, tail_);
+    tail_ = link->prev;
+  }
+  // Capture the successor before fully detaching the link.
+  auto* next = link->next;
+  link->next = link->prev = nullptr;
+  --count_;
+  // NOTE: OnDeallocate may free |value|.
+  OnDeallocate(value);
+  CheckCorrectness();
+  return impl::LinkToT<T, kOffset>(next);
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+IteratorT IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::erase(
+    const IteratorT& it) {
+  // erase(T*) returns the following element; rewrap it as an iterator
+  // (erasing end() is a no-op yielding end()).
+  return IteratorT(impl::TToLink<T, kOffset>(erase(*it)));
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+ReverseIteratorT IntrusiveListBase<T, IteratorT, ReverseIteratorT,
+                                   kOffset>::erase(const ReverseIteratorT& it) {
+  // Same as the forward overload but rewrapped as a reverse iterator.
+  return ReverseIteratorT(impl::TToLink<T, kOffset>(erase(*it)));
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::replace(
+    T* old_value, T* new_value) {
+  IREE_DCHECK(old_value);
+  IREE_DCHECK(new_value);
+  IREE_DCHECK_NE(old_value, new_value);
+  auto* old_link = impl::TToLink<T, kOffset>(old_value);
+  auto* new_link = impl::TToLink<T, kOffset>(new_value);
+  // The new link inherits the old link's neighbors...
+  new_link->next = old_link->next;
+  new_link->prev = old_link->prev;
+  // ...and the neighbors (or head_/tail_ at the ends) are repointed at it.
+  if (new_link->prev) {
+    new_link->prev->next = new_link;
+  } else {
+    head_ = new_link;
+  }
+  if (new_link->next) {
+    new_link->next->prev = new_link;
+  } else {
+    tail_ = new_link;
+  }
+  old_link->next = old_link->prev = nullptr;
+  // NOTE: OnDeallocate may free |old_value|; count_ is unchanged.
+  OnAdd(new_value);
+  OnDeallocate(old_value);
+  CheckCorrectness();
+}
+
+template <typename T, typename IteratorT, typename ReverseIteratorT,
+          size_t kOffset>
+void IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>::sort(
+    bool (*compare_fn)(T* a, T* b)) {
+  if (empty()) {
+    // Empty list no-op.
+    return;
+  }
+  // Bottom-up merge sort: each pass merges adjacent runs of length in_size,
+  // doubling in_size until a pass performs at most one merge (sorted).
+  // Repeatedly run until the list is sorted.
+  int in_size = 1;
+  while (true) {
+    IntrusiveListLink* p = head_;
+    IntrusiveListLink* q = nullptr;
+    IntrusiveListLink* e = nullptr;
+    IntrusiveListLink* tail = nullptr;
+    head_ = nullptr;
+    tail_ = nullptr;
+    // Repeatedly merge sublists.
+    int merge_count = 0;
+    do {
+      ++merge_count;
+      q = p;
+      // Determine the size of the first part and find the second.
+      int p_size = 0;
+      for (int i = 0; i < in_size; ++i) {
+        ++p_size;
+        q = q->next;
+        if (!q) {
+          break;
+        }
+      }
+      // Merge the two lists (if we have two).
+      int q_size = in_size;
+      while (p_size > 0 || (q_size > 0 && q)) {
+        if (p_size == 0) {
+          // p is empty; e must come from q.
+          e = q;
+          q = q->next;
+          --q_size;
+        } else if (q_size == 0 || !q) {
+          // q is empty; e must come from p.
+          e = p;
+          p = p->next;
+          --p_size;
+        } else if (compare_fn(impl::LinkToT<T, kOffset>(p),
+                              impl::LinkToT<T, kOffset>(q))) {
+          // p <= q; e must come from p (stable: ties favor p).
+          e = p;
+          p = p->next;
+          --p_size;
+        } else {
+          // q < p; e must come from q.
+          e = q;
+          q = q->next;
+          --q_size;
+        }
+        // Append e to the merged list (prev fixed here, next fixed on the
+        // following append or by the terminating store below).
+        if (tail) {
+          tail->next = e;
+        } else {
+          head_ = e;
+        }
+        e->prev = tail;
+        tail = e;
+      }
+      p = q;
+    } while (p);
+    // Terminate the rebuilt chain.
+    tail->next = nullptr;
+    if (merge_count <= 1) {
+      // List is now sorted; stash and return.
+      tail_ = tail;
+      CheckCorrectness();
+      return;
+    }
+    // Run merge again with larger lists.
+    in_size *= 2;
+  }
+}
+
+}  // namespace iree
+
+// Specializations:
+#include "iree/hal/vulkan/util/intrusive_list_unique_ptr.inc"
+
+#endif  // IREE_HAL_VULKAN_UTIL_INTRUSIVE_LIST_H_
diff --git a/runtime/src/iree/hal/vulkan/util/intrusive_list_test.cc b/runtime/src/iree/hal/vulkan/util/intrusive_list_test.cc
new file mode 100644
index 0000000..ad5dee5
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/intrusive_list_test.cc
@@ -0,0 +1,537 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/util/intrusive_list.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "iree/testing/gtest.h"
+
+namespace iree {
+namespace {
+
+using ::testing::ElementsAre;
+
+// Test element carrying two links (list_a/list_b) so one item can live in
+// two lists at once, with guard tokens surrounding the links to detect
+// out-of-bounds writes by the list implementation.
+struct Item {
+  size_t some_data_0;  // guard
+  IntrusiveListLink list_a;
+  size_t some_data_1;  // guard
+  IntrusiveListLink list_b;
+  size_t some_data_2;  // guard
+  int value;
+
+  static constexpr size_t kToken = 0xDEADBEEF;
+  explicit Item(int value)
+      : some_data_0(kToken),
+        some_data_1(kToken),
+        some_data_2(kToken),
+        value(value) {}
+  // True if all guard tokens are intact (no memory corruption observed).
+  bool is_valid() {
+    return some_data_0 == kToken && some_data_1 == kToken &&
+           some_data_2 == kToken;
+  }
+};
+
+// Collects the element pointers of |list| in iteration order.
+template <typename T, size_t V>
+std::vector<T*> ExtractItems(const IntrusiveList<T, V>& list) {
+  std::vector<T*> items;
+  items.reserve(list.size());
+  for (auto it = list.begin(); it != list.end(); ++it) {
+    items.push_back(*it);
+  }
+  return items;
+}
+
+// Collects item->value for each element of |list| in iteration order.
+template <typename T, size_t V>
+std::vector<int> ExtractValues(const IntrusiveList<T, V>& list) {
+  std::vector<int> values;
+  values.reserve(list.size());
+  for (auto it = list.begin(); it != list.end(); ++it) {
+    values.push_back((*it)->value);
+  }
+  return values;
+}
+
+// Same contract as ExtractValues; kept separate so tests can exercise
+// iteration through a distinct code path.
+template <typename T, size_t V>
+std::vector<int> ExtractValuesMutable(const IntrusiveList<T, V>& list) {
+  std::vector<int> out;
+  for (auto* entry : list) {
+    out.push_back(entry->value);
+  }
+  return out;
+}
+
+// Exercises push_front/push_back/pop_front/pop_back along with
+// empty/size/front/back bookkeeping, and verifies the elements' guard
+// tokens survive all mutations.
+TEST(IntrusiveListTest, PushPopItems) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+  EXPECT_TRUE(items.empty());
+  EXPECT_EQ(items.size(), 0u);
+  EXPECT_EQ(items.front(), nullptr);
+  EXPECT_EQ(items.back(), nullptr);
+  EXPECT_TRUE(items.begin() == items.end());
+  items.push_front(&item1);
+  EXPECT_FALSE(items.empty());
+  EXPECT_EQ(items.size(), 1u);
+  EXPECT_EQ(items.front(), &item1);
+  EXPECT_EQ(items.back(), &item1);
+  EXPECT_FALSE(items.begin() == items.end());
+  items.push_front(&item2);
+  EXPECT_EQ(items.size(), 2u);
+  EXPECT_EQ(items.front(), &item2);
+  EXPECT_EQ(items.back(), &item1);
+  items.push_front(&item3);
+  EXPECT_EQ(items.size(), 3u);
+  EXPECT_EQ(items.front(), &item3);
+  EXPECT_EQ(items.back(), &item1);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(3, 2, 1));
+
+  items.push_back(&item4);
+  EXPECT_EQ(items.size(), 4u);
+  EXPECT_EQ(items.front(), &item3);
+  EXPECT_EQ(items.back(), &item4);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(3, 2, 1, 4));
+
+  items.pop_front();
+  EXPECT_EQ(items.size(), 3u);
+  EXPECT_EQ(items.front(), &item2);
+  EXPECT_EQ(items.back(), &item4);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(2, 1, 4));
+
+  items.pop_back();
+  EXPECT_EQ(items.size(), 2u);
+  EXPECT_EQ(items.front(), &item2);
+  EXPECT_EQ(items.back(), &item1);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(2, 1));
+
+  // Drain to empty; accessors return to their empty-list values.
+  items.pop_back();
+  items.pop_front();
+  EXPECT_TRUE(items.empty());
+  EXPECT_EQ(items.size(), 0u);
+  EXPECT_EQ(items.front(), nullptr);
+  EXPECT_EQ(items.back(), nullptr);
+  EXPECT_TRUE(items.begin() == items.end());
+
+  // No mutation should have written outside the link members.
+  EXPECT_TRUE(item1.is_valid());
+  EXPECT_TRUE(item2.is_valid());
+  EXPECT_TRUE(item3.is_valid());
+  EXPECT_TRUE(item4.is_valid());
+}
+
+// contains() finds inserted elements; never-inserted items and nullptr are
+// reported absent.
+TEST(IntrusiveListTest, Contains) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+  items.push_back(&item1);
+  items.push_back(&item2);
+  items.push_back(&item3);
+  // item4 omitted.
+
+  EXPECT_TRUE(items.contains(&item1));
+  EXPECT_TRUE(items.contains(&item2));
+  EXPECT_TRUE(items.contains(&item3));
+  EXPECT_FALSE(items.contains(&item4));
+
+  EXPECT_FALSE(items.contains(nullptr));
+}
+
+// merge_from() appends the source's elements in order and empties the source.
+TEST(IntrusiveListTest, MergeFrom) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items0;
+  items0.push_back(&item1);
+  items0.push_back(&item2);
+  items0.push_back(&item3);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items1;
+  items1.push_back(&item4);
+
+  items0.merge_from(&items1);
+  EXPECT_THAT(ExtractValues(items0), ElementsAre(1, 2, 3, 4));
+  EXPECT_TRUE(items1.empty());
+}
+
+// Merging one empty list into another must not crash.
+TEST(IntrusiveListTest, MergeFromEmpty) {
+  IntrusiveList<Item, offsetof(Item, list_a)> items0;
+  IntrusiveList<Item, offsetof(Item, list_a)> items1;
+  items0.merge_from(&items1);
+}
+
+TEST(IntrusiveListTest, MergeFromAll) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+  IntrusiveList<Item, offsetof(Item, list_a)> items0;
+  items0.push_back(&item1);
+  items0.push_back(&item2);
+  items0.push_back(&item3);
+  items0.push_back(&item4);
+  IntrusiveList<Item, offsetof(Item, list_a)> items1;
+
+  // Merge all items from items1 into items0. Shouldn't change anything.
+  // NOTE(review): merge_from assigns tail_ from the (empty) source
+  // unconditionally; these checks only follow head_, so they would pass even
+  // if tail_ were nulled. Worth also asserting items0.back() here — confirm.
+  items0.merge_from(&items1);
+  EXPECT_THAT(ExtractValues(items0), ElementsAre(1, 2, 3, 4));
+  EXPECT_TRUE(items1.empty());
+
+  // Merge all items from items0 into items1. Should move everything.
+  items1.merge_from(&items0);
+  EXPECT_TRUE(items0.empty());
+  EXPECT_THAT(ExtractValues(items1), ElementsAre(1, 2, 3, 4));
+}
+
+// Covers erase by pointer (middle/front/back/last) and erase by iterator,
+// including the no-op erase of end() and the returned successor iterator.
+TEST(IntrusiveListTest, Erase) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+  items.push_back(&item1);
+  items.push_back(&item2);
+  items.push_back(&item3);
+  items.push_back(&item4);
+
+  EXPECT_THAT(ExtractValues(items), ElementsAre(1, 2, 3, 4));
+  items.erase(&item3);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(1, 2, 4));
+  items.erase(&item1);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(2, 4));
+  items.erase(&item4);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(2));
+  items.erase(&item2);
+  EXPECT_TRUE(items.empty());
+
+  // Refill (elements were fully detached by erase, so reinsert is legal).
+  items.push_back(&item1);
+  items.push_back(&item2);
+  items.push_back(&item3);
+  items.push_back(&item4);
+
+  EXPECT_THAT(ExtractValues(items), ElementsAre(1, 2, 3, 4));
+  auto it = items.begin();
+  items.erase(it);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(2, 3, 4));
+  // Erasing end() is a no-op.
+  it = items.end();
+  items.erase(it);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(2, 3, 4));
+  it = items.begin();
+  ++it;
+  items.erase(it);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(2, 4));
+
+  // erase(iterator) returns an iterator to the following element.
+  it = items.begin();
+  it = items.erase(it);
+  EXPECT_EQ(4, (*it)->value);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(4));
+  it = items.erase(it);
+  EXPECT_TRUE(items.empty());
+  EXPECT_EQ(items.end(), it);
+}
+
+// One element can be in two lists at once via two links; mutations through
+// one list must not disturb the other.
+TEST(IntrusiveListTest, MultipleLists) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items_a;
+  IntrusiveList<Item, offsetof(Item, list_b)> items_b;
+  items_a.push_back(&item1);
+  items_a.push_back(&item2);
+  items_a.push_back(&item3);
+  items_a.push_back(&item4);
+  items_b.push_front(&item1);
+  items_b.push_front(&item2);
+  items_b.push_front(&item3);
+  items_b.push_front(&item4);
+  EXPECT_THAT(ExtractValues(items_a), ElementsAre(1, 2, 3, 4));
+  EXPECT_THAT(ExtractValues(items_b), ElementsAre(4, 3, 2, 1));
+  items_b.erase(&item3);
+  EXPECT_THAT(ExtractValues(items_a), ElementsAre(1, 2, 3, 4));
+  EXPECT_THAT(ExtractValues(items_b), ElementsAre(4, 2, 1));
+  items_a.pop_back();
+  EXPECT_THAT(ExtractValues(items_a), ElementsAre(1, 2, 3));
+  EXPECT_THAT(ExtractValues(items_b), ElementsAre(4, 2, 1));
+}
+
+// Iteration order reflects mixed push_back/push_front insertion.
+TEST(IntrusiveListTest, MutableIterator) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+  items.push_back(&item4);
+  items.push_front(&item1);
+  items.push_front(&item2);
+  items.push_front(&item3);
+
+  EXPECT_THAT(ExtractValuesMutable(items), ElementsAre(3, 2, 1, 4));
+}
+
+// Base/derived pair with a link at each level: derived instances can be
+// tracked in a list of the base type and a list of the derived type.
+struct BaseType {
+  explicit BaseType(int value) : value(value) {}
+  int value;
+  IntrusiveListLink base_link;
+};
+struct SubType : public BaseType {
+  explicit SubType(int value) : BaseType(value) {}
+  IntrusiveListLink sub_link;
+};
+TEST(IntrusiveListTest, SimpleType) {
+  SubType item1(1);
+  SubType item2(2);
+  SubType item3(3);
+  SubType item4(4);
+
+  // Same elements viewed through the base-class link...
+  IntrusiveList<BaseType, offsetof(BaseType, base_link)> items_a;
+  items_a.push_front(&item1);
+  items_a.push_front(&item2);
+  items_a.push_front(&item3);
+  items_a.push_front(&item4);
+  EXPECT_THAT(ExtractValues(items_a), ElementsAre(4, 3, 2, 1));
+
+  // ...and through the derived-class link, independently ordered.
+  IntrusiveList<SubType, offsetof(SubType, sub_link)> items_b;
+  items_b.push_back(&item1);
+  items_b.push_back(&item2);
+  items_b.push_back(&item3);
+  items_b.push_back(&item4);
+  EXPECT_THAT(ExtractValues(items_b), ElementsAre(1, 2, 3, 4));
+}
+
+// Same shape as SimpleType but with a polymorphic (abstract) base to show
+// the list works with virtual types.
+struct AbstractType {
+  explicit AbstractType(int value) : value(value) {}
+  virtual ~AbstractType() = default;
+  virtual int DoSomething() = 0;
+  int value;
+  IntrusiveListLink base_link;
+};
+struct ImplType : public AbstractType {
+  explicit ImplType(int value) : AbstractType(value) {}
+  int DoSomething() override { return value; }
+  IntrusiveListLink sub_link;
+};
+
+TEST(IntrusiveListTest, ComplexType) {
+  ImplType item1(1);
+  ImplType item2(2);
+  ImplType item3(3);
+  ImplType item4(4);
+
+  // List keyed on the abstract base's link.
+  IntrusiveList<AbstractType, offsetof(AbstractType, base_link)> items_a;
+  items_a.push_front(&item1);
+  items_a.push_front(&item2);
+  items_a.push_front(&item3);
+  items_a.push_front(&item4);
+  EXPECT_THAT(ExtractValues(items_a), ElementsAre(4, 3, 2, 1));
+
+  // Independent list keyed on the concrete type's link.
+  IntrusiveList<ImplType, offsetof(ImplType, sub_link)> items_b;
+  items_b.push_back(&item1);
+  items_b.push_back(&item2);
+  items_b.push_back(&item3);
+  items_b.push_back(&item4);
+  EXPECT_THAT(ExtractValues(items_b), ElementsAre(1, 2, 3, 4));
+}
+
+bool Comparison(Item* a, Item* b) { return a->value < b->value; }
+
+TEST(IntrusiveListTest, Inserting) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+  items.insert(items.end(), &item3);
+  items.insert(items.begin(), &item1);
+  items.insert(items.end(), &item4);
+
+  auto pos = std::upper_bound(items.begin(), items.end(), &item2, Comparison);
+  items.insert(pos, &item2);
+
+  EXPECT_THAT(ExtractValues(items), ElementsAre(1, 2, 3, 4));
+}
+
+TEST(IntrusiveListTest, Iteration) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+  items.push_back(&item1);
+  items.push_back(&item2);
+  items.push_back(&item3);
+  items.push_back(&item4);
+
+  std::vector<int> regular;
+  for (auto it = items.begin(); it != items.end(); ++it) {
+    regular.push_back((*it)->value);
+  }
+  EXPECT_THAT(regular, ElementsAre(1, 2, 3, 4));
+
+  std::vector<int> reverse;
+  for (auto rit = items.rbegin(); rit != items.rend(); ++rit) {
+    reverse.push_back((*rit)->value);
+  }
+  EXPECT_THAT(reverse, ElementsAre(4, 3, 2, 1));
+}
+
+TEST(IntrusiveListTest, NextPrevious) {
+  Item item1(1);
+  Item item2(2);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+  EXPECT_EQ(nullptr, items.previous(nullptr));
+  EXPECT_EQ(nullptr, items.next(nullptr));
+
+  items.push_back(&item1);
+  EXPECT_EQ(nullptr, items.previous(&item1));
+  EXPECT_EQ(nullptr, items.next(&item1));
+
+  items.push_back(&item2);
+  EXPECT_EQ(nullptr, items.previous(&item1));
+  EXPECT_EQ(&item2, items.next(&item1));
+  EXPECT_EQ(&item1, items.previous(&item2));
+  EXPECT_EQ(nullptr, items.next(&item2));
+}
+
+TEST(IntrusiveListTest, Clear) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+
+  // Empty clear.
+  items.clear();
+  EXPECT_TRUE(items.empty());
+
+  // 1 item clear.
+  items.push_back(&item1);
+  items.clear();
+  EXPECT_TRUE(items.empty());
+
+  // Multi-item clear.
+  items.push_back(&item1);
+  items.push_back(&item2);
+  items.push_back(&item3);
+  items.push_back(&item4);
+  items.clear();
+  EXPECT_TRUE(items.empty());
+}
+
+TEST(IntrusiveListTest, ClearDeleter) {
+  Item item1(1);
+  Item item2(2);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+
+  // No-op first.
+  int delete_count = 0;
+  items.clear([&](Item* item) { ++delete_count; });
+  EXPECT_EQ(0, delete_count);
+
+  // Now with items.
+  items.push_back(&item1);
+  items.push_back(&item2);
+  items.clear([&](Item* item) { ++delete_count; });
+  EXPECT_EQ(2, delete_count);
+  EXPECT_TRUE(items.empty());
+}
+
+TEST(IntrusiveListTest, Replace) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+  items.push_back(&item1);
+  items.push_back(&item2);
+
+  items.replace(&item1, &item3);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(3, 2));
+  EXPECT_FALSE(items.contains(&item1));
+  items.replace(&item2, &item1);
+  EXPECT_THAT(ExtractValues(items), ElementsAre(3, 1));
+  EXPECT_FALSE(items.contains(&item2));
+}
+
+TEST(IntrusiveListTest, Sort) {
+  Item item1(1);
+  Item item2(2);
+  Item item3(3);
+  Item item4(4);
+
+  IntrusiveList<Item, offsetof(Item, list_a)> items;
+
+  // Empty sort.
+  items.sort([](Item* a, Item* b) { return a->value < b->value; });
+
+  // Single item sort.
+  items.clear();
+  items.push_back(&item1);
+  items.sort([](Item* a, Item* b) { return a->value < b->value; });
+  EXPECT_THAT(ExtractValues(items), ElementsAre(1));
+
+  // Already sorted.
+  items.clear();
+  items.push_back(&item1);
+  items.push_back(&item2);
+  items.push_back(&item3);
+  items.push_back(&item4);
+  items.sort([](Item* a, Item* b) { return a->value < b->value; });
+  EXPECT_THAT(ExtractValues(items), ElementsAre(1, 2, 3, 4));
+
+  // Reverse.
+  items.clear();
+  items.push_back(&item4);
+  items.push_back(&item3);
+  items.push_back(&item2);
+  items.push_back(&item1);
+  items.sort([](Item* a, Item* b) { return a->value < b->value; });
+  EXPECT_THAT(ExtractValues(items), ElementsAre(1, 2, 3, 4));
+
+  // Random.
+  items.clear();
+  items.push_back(&item2);
+  items.push_back(&item4);
+  items.push_back(&item1);
+  items.push_back(&item3);
+  items.sort([](Item* a, Item* b) { return a->value < b->value; });
+  EXPECT_THAT(ExtractValues(items), ElementsAre(1, 2, 3, 4));
+
+  // Stability.
+  Item item1a(1);
+  Item item2a(2);
+  items.clear();
+  items.push_back(&item2);
+  items.push_back(&item4);
+  items.push_back(&item1);
+  items.push_back(&item3);
+  items.push_back(&item1a);
+  items.push_back(&item2a);
+  items.sort([](Item* a, Item* b) { return a->value <= b->value; });
+  EXPECT_THAT(ExtractValues(items), ElementsAre(1, 1, 2, 2, 3, 4));
+  auto items_vector = ExtractItems(items);
+  EXPECT_EQ(&item1, items_vector[0]);
+  EXPECT_EQ(&item1a, items_vector[1]);
+  EXPECT_EQ(&item2, items_vector[2]);
+  EXPECT_EQ(&item2a, items_vector[3]);
+  items.clear();
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/util/intrusive_list_unique_ptr.inc b/runtime/src/iree/hal/vulkan/util/intrusive_list_unique_ptr.inc
new file mode 100644
index 0000000..c0011fa
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/intrusive_list_unique_ptr.inc
@@ -0,0 +1,137 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// IWYU pragma: private, include "iree/hal/vulkan/util/intrusive_list.h"
+
+#ifndef IREE_HAL_VULKAN_UTIL_INTRUSIVE_LIST_UNIQUE_PTR_H_
+#define IREE_HAL_VULKAN_UTIL_INTRUSIVE_LIST_UNIQUE_PTR_H_
+
+#include <cstddef>
+#include <memory>
+
+#include "iree/base/logging.h"
+#include "iree/hal/vulkan/util/intrusive_list.h"
+
+namespace iree {
+
+// Specialized IntrusiveListBase for std::unique_ptr types.
+// This makes the list methods accept std::unique_ptrs and contains a special
+// take() method that takes ownership of a list item.
+template <typename T, size_t kOffset>
+class IntrusiveListUniquePtrBase
+    : private IntrusiveListBase<T, IntrusiveListIterator<T, kOffset, true>,
+                                IntrusiveListIterator<T, kOffset, false>,
+                                kOffset> {
+ public:
+  using IteratorT = IntrusiveListIterator<T, kOffset, true>;
+  using ReverseIteratorT = IntrusiveListIterator<T, kOffset, false>;
+  using base_list = IntrusiveListBase<T, IteratorT, ReverseIteratorT, kOffset>;
+  using self_type = IntrusiveListUniquePtrBase<T, kOffset>;
+
+  IntrusiveListUniquePtrBase() = default;
+
+  using base_list::empty;
+  using base_list::size;
+
+  using base_list::contains;
+
+  inline void merge_from(self_type* other_list) {
+    return base_list::merge_from(static_cast<base_list*>(other_list));
+  }
+
+  using base_list::clear;
+
+  using base_list::begin;
+  using base_list::end;
+  using base_list::rbegin;
+  using base_list::rend;
+
+  using base_list::next;
+
+  using base_list::previous;
+
+  using base_list::front;
+
+  void push_front(std::unique_ptr<T> value) {
+    base_list::push_front(value.release());
+  }
+
+  using base_list::pop_front;
+
+  using base_list::back;
+
+  void push_back(std::unique_ptr<T> value) {
+    base_list::push_back(value.release());
+  }
+
+  using base_list::pop_back;
+
+  void insert(const IteratorT& it, std::unique_ptr<T> value) {
+    base_list::insert(it, value.release());
+  }
+
+  using base_list::erase;
+
+  // Removes an item from the list at the given iterator and transfers ownership
+  // to the caller.
+  // Performance: O(1)
+  std::unique_ptr<T> take(IteratorT& it) {  // NOLINT(runtime/references)
+    return take(*it);
+  }
+
+  // Removes an item from the list and transfers ownership to the caller.
+  // Performance: O(1)
+  std::unique_ptr<T> take(T* value) {
+    if (!value) {
+      return {nullptr};
+    }
+    auto* link = impl::TToLink<T, kOffset>(value);
+    if (link->prev) {
+      IREE_DCHECK_NE(link, head_);
+      link->prev->next = link->next;
+    } else {
+      IREE_DCHECK_EQ(link, head_);
+      head_ = link->next;
+    }
+    if (link->next) {
+      IREE_DCHECK_NE(link, tail_);
+      link->next->prev = link->prev;
+    } else {
+      IREE_DCHECK_EQ(link, tail_);
+      tail_ = link->prev;
+    }
+    link->next = link->prev = nullptr;
+    --count_;
+    base_list::OnRemove(value);
+    base_list::CheckCorrectness();
+    return std::unique_ptr<T>(value);
+  }
+
+  void replace(T* old_value, std::unique_ptr<T> new_value) {
+    base_list::replace(old_value, new_value.release());
+  }
+
+  using base_list::sort;
+
+ private:
+  void OnDeallocate(T* value) override { delete value; }
+
+  using base_list::count_;
+  using base_list::head_;
+  using base_list::tail_;
+};
+
+template <typename U, size_t kOffset>
+class IntrusiveList<std::unique_ptr<U>, kOffset>
+    : public IntrusiveListUniquePtrBase<U, kOffset> {};
+
+template <typename U>
+class IntrusiveList<std::unique_ptr<U>, kUseDefaultLinkOffset>
+    : public IntrusiveListUniquePtrBase<U, offsetof(U, link)> {};
+
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_UTIL_INTRUSIVE_LIST_UNIQUE_PTR_H_
diff --git a/runtime/src/iree/hal/vulkan/util/intrusive_list_unique_ptr_test.cc b/runtime/src/iree/hal/vulkan/util/intrusive_list_unique_ptr_test.cc
new file mode 100644
index 0000000..9596368
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/intrusive_list_unique_ptr_test.cc
@@ -0,0 +1,77 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <memory>
+
+#include "iree/hal/vulkan/util/intrusive_list.h"
+#include "iree/testing/gtest.h"
+
+namespace iree {
+namespace {
+
+struct AllocatedType : public IntrusiveLinkBase<void> {
+  AllocatedType() { ++alloc_count; }
+  ~AllocatedType() { --alloc_count; }
+  static int alloc_count;
+};
+int AllocatedType::alloc_count = 0;
+
+TEST(IntrusiveListUniquePtrTest, UniquePtr) {
+  AllocatedType::alloc_count = 0;
+
+  // Push/clear.
+  IntrusiveList<std::unique_ptr<AllocatedType>> list;
+  EXPECT_EQ(0, AllocatedType::alloc_count);
+  list.push_back(std::make_unique<AllocatedType>());
+  EXPECT_EQ(1, AllocatedType::alloc_count);
+  EXPECT_NE(nullptr, list.front());
+  list.clear();
+  EXPECT_EQ(0, AllocatedType::alloc_count);
+
+  // Push/pop.
+  list.push_back(std::make_unique<AllocatedType>());
+  EXPECT_EQ(1, AllocatedType::alloc_count);
+  EXPECT_NE(nullptr, list.front());
+  for (auto item : list) {
+    EXPECT_EQ(item, list.front());
+  }
+  list.pop_back();
+  EXPECT_EQ(0, AllocatedType::alloc_count);
+
+  // Push/take.
+  list.push_back(std::make_unique<AllocatedType>());
+  EXPECT_EQ(1, AllocatedType::alloc_count);
+  EXPECT_NE(nullptr, list.front());
+  auto item = list.take(list.front());
+  EXPECT_TRUE(list.empty());
+  EXPECT_NE(nullptr, item.get());
+  EXPECT_EQ(1, AllocatedType::alloc_count);
+  item.reset();
+  EXPECT_EQ(0, AllocatedType::alloc_count);
+
+  // Push/replace.
+  list.push_back(std::make_unique<AllocatedType>());
+  EXPECT_EQ(1, AllocatedType::alloc_count);
+  list.replace(list.front(), std::make_unique<AllocatedType>());
+  EXPECT_EQ(1, AllocatedType::alloc_count);
+  list.clear();
+  EXPECT_EQ(0, AllocatedType::alloc_count);
+
+  // Iteration.
+  list.push_back(std::make_unique<AllocatedType>());
+  list.push_back(std::make_unique<AllocatedType>());
+  list.push_back(std::make_unique<AllocatedType>());
+  EXPECT_EQ(3, AllocatedType::alloc_count);
+  for (auto item : list) {
+    AllocatedType* item_ptr = item;
+    EXPECT_NE(nullptr, item_ptr);
+  }
+  list.clear();
+  EXPECT_EQ(0, AllocatedType::alloc_count);
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/util/ref_ptr.h b/runtime/src/iree/hal/vulkan/util/ref_ptr.h
new file mode 100644
index 0000000..5bde1c9
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/ref_ptr.h
@@ -0,0 +1,383 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_UTIL_REF_PTR_H_
+#define IREE_HAL_VULKAN_UTIL_REF_PTR_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+
+#include "iree/base/attributes.h"
+#include "iree/base/logging.h"
+
+namespace iree {
+
+// Use this to get really verbose refptr logging:
+// #define IREE_VERBOSE_REF_PTR
+
+template <class T>
+class ref_ptr;
+
+// Allocates a new ref_ptr type.
+// Like make_unique, but for ref_ptr.
+//
+// Usage:
+//  ref_ptr<MyType> p = make_ref<MyType>(1, 2, 3);
+template <typename T, typename... Args>
+ref_ptr<T> make_ref(Args&&... args) {
+  return ref_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+// Assigns a raw pointer to a ref_ptr without adding a reference.
+//
+// Usage:
+//  ref_ptr<MyType> p = assign_ref(new MyType());
+template <typename T>
+inline ref_ptr<T> assign_ref(T* value) {
+  return ref_ptr<T>(value);
+}
+
+// Adds a reference to the given raw pointer.
+//
+// Usage:
+//  MyType* raw_ptr = AcquirePointerFromSomewhere();
+//  ref_ptr<MyType> p = add_ref(raw_ptr);
+template <typename T>
+inline ref_ptr<T> add_ref(T* value) {
+  if (value) ref_ptr_add_ref(value);
+  return ref_ptr<T>(value);
+}
+
+// Adds a reference to the given ref_ptr.
+//
+// Usage:
+//  ref_ptr<MyType> a = make_ref<MyType>();
+//  ref_ptr<MyType> p = add_ref(a);
+template <typename T>
+inline ref_ptr<T> add_ref(const ref_ptr<T>& value) {
+  if (value.get()) ref_ptr_add_ref(value.get());
+  return ref_ptr<T>(value.get());
+}
+
+// Reference counted pointer container.
+// This is modeled on boost::intrusive_ptr in that it requires no
+// extra storage over the pointer type and should compile to almost
+// no additional code. It also allows us to round-trip object pointers
+// through regular pointers, which is critical when having to round-trip
+// them through JNI/etc where we can't use things like unique_ptr/shared_ptr.
+//
+//   ref_ptr<Foo> p1(new Foo());    // ref count 1
+//   ref_ptr<Foo> p2(p1);           // ref count 2
+//   p1.reset();                    // ref count 1
+//   p2.reset();                    // ref count 0, deleted
+//
+// When round-tripping the pointer through external APIs, use release():
+//   ref_ptr<Foo> p1(new Foo());    // ref count 1
+//   Foo* raw_p = p1.release();     // ref count 1
+//   // pass to API
+//   ref_ptr<Foo> p2(raw_p);        // ref count 1 (don't add ref)
+//   p2.reset();                    // ref count 0, deleted
+//
+// See the boost intrusive_ptr docs for details of behavior:
+// http://www.boost.org/doc/libs/1_55_0/libs/smart_ptr/intrusive_ptr.html
+//
+// ref_ptr manages the target objects in a thread-safe way, though you'll want
+// to take care with objects that may have pinned threads for deallocation. If
+// you release the last reference to an object on a thread other than what it
+// was expecting you're gonna have a bad time.
+//
+// Compatible only with types that subclass RefObject or implement the following
+// methods:
+//   ref_ptr_add_ref
+//   ref_ptr_release_ref
+template <class T>
+class ref_ptr {
+ private:
+  typedef ref_ptr this_type;
+  typedef T* this_type::*unspecified_bool_type;
+
+ public:
+  // Initializes with nullptr.
+  IREE_ATTRIBUTE_ALWAYS_INLINE ref_ptr() noexcept = default;
+
+  // Initializes with nullptr so that there is no way to create an
+  // uninitialized ref_ptr.
+  IREE_ATTRIBUTE_ALWAYS_INLINE ref_ptr(std::nullptr_t) noexcept {}  // NOLINT
+
+  // Initializes the pointer to the given value.
+  // The value will not have its reference count incremented (as it is with
+  // unique_ptr). Use Retain to add to the reference count.
+  IREE_ATTRIBUTE_ALWAYS_INLINE explicit ref_ptr(T* p) noexcept : px_(p) {}
+
+  // Decrements the reference count of the owned pointer.
+  IREE_ATTRIBUTE_ALWAYS_INLINE ~ref_ptr() noexcept {
+    if (px_) ref_ptr_release_ref(px_);
+  }
+
+  // No implicit ref_ptr copying allowed; use add_ref instead.
+  ref_ptr(const ref_ptr&) noexcept = delete;
+  ref_ptr& operator=(const ref_ptr&) noexcept = delete;
+
+  // Move support to transfer ownership from one ref_ptr to another.
+  ref_ptr(ref_ptr&& rhs) noexcept : px_(rhs.release()) {}
+  ref_ptr& operator=(ref_ptr&& rhs) noexcept {
+    if (px_ != rhs.px_) {
+      if (px_) ref_ptr_release_ref(px_);
+      px_ = rhs.release();
+    }
+    return *this;
+  }
+
+  // Move support from another compatible type.
+  template <typename U>
+  ref_ptr(ref_ptr<U>&& rhs) noexcept : px_(rhs.release()) {}  // NOLINT
+  template <typename U>
+  ref_ptr& operator=(ref_ptr<U>&& rhs) noexcept {
+    if (px_ != rhs.get()) {
+      if (px_) ref_ptr_release_ref(px_);
+      px_ = rhs.release();
+    }
+    return *this;
+  }
+
+  // Resets the object to nullptr and decrements the reference count, possibly
+  // deleting it.
+  void reset() noexcept {
+    if (px_) {
+      ref_ptr_release_ref(px_);
+      px_ = nullptr;
+    }
+  }
+
+  // Releases a pointer.
+  // Returns the current pointer held by this object without having
+  // its reference count decremented and resets the ref_ptr to empty.
+  // Returns nullptr if the ref_ptr holds no value.
+  // To re-wrap in a ref_ptr use either ref_ptr<T>(value) or assign().
+  IREE_ATTRIBUTE_ALWAYS_INLINE T* release() noexcept {
+    T* p = px_;
+    px_ = nullptr;
+    return p;
+  }
+
+  // Assigns a pointer.
+  // The pointer will be accepted by the ref_ptr and its reference count will
+  // not be incremented.
+  IREE_ATTRIBUTE_ALWAYS_INLINE void assign(T* value) noexcept {
+    reset();
+    px_ = value;
+  }
+
+  // Gets the pointer referenced by this instance.
+  // operator* and operator-> will assert() if there is no current object.
+  constexpr T* get() const noexcept { return px_; }
+  constexpr T& operator*() const noexcept { return *px_; }
+  constexpr T* operator->() const noexcept { return px_; }
+
+  // Support boolean expression evaluation ala unique_ptr/shared_ptr:
+  // https://en.cppreference.com/w/cpp/memory/shared_ptr/operator_bool
+  constexpr operator unspecified_bool_type() const noexcept {
+    return px_ ? &this_type::px_ : nullptr;
+  }
+  // Supports unary expression evaluation.
+  constexpr bool operator!() const noexcept { return !px_; }
+
+  // Swap support.
+  void swap(ref_ptr& rhs) { std::swap(px_, rhs.px_); }
+
+ private:
+  T* px_ = nullptr;
+};
+
+// Base class for reference counted objects.
+// Reference counted objects should be used with the ref_ptr pointer type.
+// As reference counting can be tricky always prefer to use unique_ptr and
+// avoid this type. Only use this when unique_ptr is not possible, such as
+// when round-tripping objects through marshaling boundaries (v8/Java) or
+// any objects that may have their lifetime tied to a garbage collected
+// object.
+//
+// Subclasses should protect their dtor so that reference counting must
+// be used.
+//
+// This is designed to avoid the need for extra vtable space or for adding
+// methods to the vtable of subclasses. This differs from the boost Pointable
+// version of this object.
+// Inspiration for this comes from Peter Weinert's Dr. Dobb's article:
+// http://www.drdobbs.com/cpp/a-base-class-for-intrusively-reference-c/229218807
+//
+// RefObjects are thread safe and may be used with ref_ptrs from multiple
+// threads.
+//
+// Subclasses may implement a custom Delete operator to handle their
+// deallocation. It should be thread safe as it may be called from any thread.
+//
+// Usage:
+//   class MyRefObject : public RefObject<MyRefObject> {
+//    public:
+//     MyRefObject() = default;
+//     // Optional; can be used to return to pool/etc - must be public:
+//     static void Delete(MyRefObject* ptr) {
+//       ::operator delete(ptr);
+//     }
+//   };
+template <class T>
+class RefObject {
+  static_assert(!std::is_array<T>::value, "T must not be an array");
+
+  // value is true if a static Delete(T*) function is present.
+  struct has_custom_deleter {
+    template <typename C>
+    static auto Test(C* p) -> decltype(C::Delete(nullptr), std::true_type());
+    template <typename>
+    static std::false_type Test(...);
+    static constexpr bool value =
+        std::is_same<std::true_type, decltype(Test<T>(nullptr))>::value;
+  };
+
+  template <typename V, bool has_custom_deleter>
+  struct delete_thunk {
+    static void Delete(V* p) {
+      auto ref_obj = static_cast<RefObject<V>*>(p);
+      int previous_count = ref_obj->counter_.fetch_sub(1);
+#ifdef IREE_VERBOSE_REF_PTR
+      IREE_LOG(INFO) << "ro-- " << typeid(V).name() << " " << p << " now "
+                     << previous_count - 1
+                     << (previous_count == 1 ? " DEAD (CUSTOM)" : "");
+#endif  // IREE_VERBOSE_REF_PTR
+      if (previous_count == 1) {
+        // We delete type T pointer here to avoid the need for a virtual dtor.
+        V::Delete(p);
+      }
+    }
+    static void Destroy(V* p) { V::Delete(p); }
+  };
+
+  template <typename V>
+  struct delete_thunk<V, false> {
+    static void Delete(V* p) {
+      auto ref_obj = static_cast<RefObject<V>*>(p);
+      int previous_count = ref_obj->counter_.fetch_sub(1);
+#ifdef IREE_VERBOSE_REF_PTR
+      IREE_LOG(INFO) << "ro-- " << typeid(V).name() << " " << p << " now "
+                     << previous_count - 1
+                     << (previous_count == 1 ? " DEAD" : "");
+#endif  // IREE_VERBOSE_REF_PTR
+      if (previous_count == 1) {
+        // We delete type T pointer here to avoid the need for a virtual dtor.
+        delete p;
+      }
+    }
+    static void Destroy(V* p) { delete p; }
+  };
+
+ public:
+  // Adds a reference; used by ref_ptr.
+  friend void ref_ptr_add_ref(T* p) {
+    auto ref_obj = static_cast<RefObject*>(p);
+    ++ref_obj->counter_;
+
+#ifdef IREE_VERBOSE_REF_PTR
+    IREE_LOG(INFO) << "ro++ " << typeid(T).name() << " " << p << " now "
+                   << ref_obj->counter_;
+#endif  // IREE_VERBOSE_REF_PTR
+  }
+
+  // Releases a reference, potentially deleting the object; used by ref_ptr.
+  friend void ref_ptr_release_ref(T* p) {
+    delete_thunk<T, has_custom_deleter::value>::Delete(p);
+  }
+
+  // Deletes the object (precondition: ref count is zero).
+  friend void ref_ptr_destroy_ref(T* p) {
+    delete_thunk<T, has_custom_deleter::value>::Destroy(p);
+  }
+
+  // Deletes the object (precondition: ref count is zero).
+  static void DirectDestroy(void* p) {
+    ref_ptr_destroy_ref(reinterpret_cast<T*>(p));
+  }
+
+  // Adds a reference.
+  // ref_ptr should be used instead of this in most cases. This is required
+  // for when interoperating with marshaling APIs.
+  void AddReference() { ref_ptr_add_ref(static_cast<T*>(this)); }
+
+  // Releases a reference, potentially deleting the object.
+  // ref_ptr should be used instead of this in most cases. This is required
+  // for when interoperating with marshaling APIs.
+  void ReleaseReference() { ref_ptr_release_ref(static_cast<T*>(this)); }
+
+  // Returns the offset of the reference counter field from the start of the
+  // type T.
+  //
+  // This is generally unsafe to use and is here for support of the
+  // iree_vm_ref_t glue that allows RefObject-derived types to be round-tripped
+  // through the VM.
+  //
+  // For simple POD types or non-virtual classes we expect this to return 0.
+  // If the type has virtual methods (dtors/etc) then it should be 4 or 8
+  // (depending on pointer width). It may be other things, and instead of too
+  // much crazy magic we just rely on offsetof doing the right thing here.
+  static constexpr size_t offsetof_counter() { return offsetof(T, counter_); }
+
+ protected:
+  RefObject() { ref_ptr_add_ref(static_cast<T*>(this)); }
+  RefObject(const RefObject&) = default;
+  RefObject& operator=(const RefObject&) { return *this; }
+
+  std::atomic<int32_t> counter_{0};
+};
+
+// Various comparison operator overloads.
+
+template <class T, class U>
+inline bool operator==(ref_ptr<T> const& a, ref_ptr<U> const& b) {
+  return a.get() == b.get();
+}
+
+template <class T, class U>
+inline bool operator!=(ref_ptr<T> const& a, ref_ptr<U> const& b) {
+  return a.get() != b.get();
+}
+
+template <class T, class U>
+inline bool operator==(ref_ptr<T> const& a, U* b) {
+  return a.get() == b;
+}
+
+template <class T, class U>
+inline bool operator!=(ref_ptr<T> const& a, U* b) {
+  return a.get() != b;
+}
+
+template <class T, class U>
+inline bool operator==(T* a, ref_ptr<U> const& b) {
+  return a == b.get();
+}
+
+template <class T, class U>
+inline bool operator!=(T* a, ref_ptr<U> const& b) {
+  return a != b.get();
+}
+
+template <class T>
+inline bool operator<(ref_ptr<T> const& a, ref_ptr<T> const& b) {
+  return a.get() < b.get();
+}
+
+// Swaps the pointers of two ref_ptrs.
+template <class T>
+void swap(ref_ptr<T>& lhs, ref_ptr<T>& rhs) {
+  lhs.swap(rhs);
+}
+
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_UTIL_REF_PTR_H_
diff --git a/runtime/src/iree/hal/vulkan/util/ref_ptr_test.cc b/runtime/src/iree/hal/vulkan/util/ref_ptr_test.cc
new file mode 100644
index 0000000..532931c
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/util/ref_ptr_test.cc
@@ -0,0 +1,324 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+#include "iree/testing/gtest.h"
+
+namespace iree {
+namespace {
+
+class MyType : public RefObject<MyType> {
+ public:
+  int x = 5;
+
+  using RefObject<MyType>::counter_;  // Expose for testing.
+};
+
+TEST(RefPtrTest, Construction) {
+  // Empty.
+  ref_ptr<MyType> n1;
+  EXPECT_EQ(nullptr, n1.get());
+  ref_ptr<MyType> n2(nullptr);
+  EXPECT_EQ(nullptr, n2.get());
+
+  // Assign a new ptr and add ref.
+  MyType* a_ptr = new MyType();
+  EXPECT_EQ(1, a_ptr->counter_);
+  ref_ptr<MyType> a(a_ptr);
+  EXPECT_EQ(1, a->counter_);
+
+  // Assign existing ptr without adding a ref.
+  ref_ptr<MyType> b(a_ptr);
+  EXPECT_EQ(1, b->counter_);
+
+  // Add a new ref.
+  ref_ptr<MyType> c = add_ref(b);
+  EXPECT_EQ(2, c->counter_);
+
+  b.release();
+}
+
+TEST(RefPtrTest, Assign) {
+  // Ok to assign nothing.
+  ref_ptr<MyType> n1 = assign_ref<MyType>(nullptr);
+  EXPECT_EQ(nullptr, n1.get());
+
+  ref_ptr<MyType> mt = make_ref<MyType>();
+  EXPECT_EQ(1, mt->counter_);
+  ref_ptr<MyType> n2 = assign_ref(mt.get());
+  EXPECT_EQ(1, mt->counter_);
+  mt.release();  // must release, as we assigned to n2.
+  EXPECT_EQ(1, n2->counter_);
+  n2.reset();
+}
+
+TEST(RefPtrTest, Retain) {
+  // Ok to retain nothing.
+  ref_ptr<MyType> n1 = add_ref<MyType>(nullptr);
+  EXPECT_EQ(nullptr, n1.get());
+
+  ref_ptr<MyType> mt = make_ref<MyType>();
+  EXPECT_EQ(1, mt->counter_);
+  ref_ptr<MyType> n2 = add_ref(mt.get());
+  EXPECT_EQ(2, mt->counter_);
+  mt.reset();
+  EXPECT_EQ(1, n2->counter_);
+  n2.reset();
+}
+
+TEST(RefPtrTest, Reset) {
+  ref_ptr<MyType> a(new MyType());
+  ref_ptr<MyType> b(new MyType());
+
+  // Reset to drop reference.
+  ref_ptr<MyType> a_copy = add_ref(a);
+  EXPECT_EQ(2, a_copy->counter_);
+  a.reset();
+  EXPECT_EQ(1, a_copy->counter_);
+
+  // Reset via = operator.
+  a = nullptr;
+  EXPECT_EQ(1, a_copy->counter_);
+  a = add_ref(a_copy);
+  EXPECT_EQ(2, a_copy->counter_);
+
+  // No-op on empty ptrs.
+  ref_ptr<MyType> n;
+  n.reset();
+  n.assign(nullptr);
+}
+
+TEST(RefPtrTest, ReleaseAssign) {
+  ref_ptr<MyType> a(new MyType());
+
+  // Release a's pointer.
+  MyType* a_raw_ptr = a.get();
+  MyType* a_ptr = a.release();
+  EXPECT_EQ(a_raw_ptr, a_ptr);
+  EXPECT_EQ(nullptr, a.get());
+  EXPECT_EQ(1, a_ptr->counter_);
+
+  // Re-wrap in a ref_ptr.
+  a.assign(a_ptr);
+  EXPECT_EQ(1, a->counter_);
+
+  // No-op on empty ptrs.
+  ref_ptr<MyType> n;
+  EXPECT_EQ(nullptr, n.release());
+}
+
+TEST(RefPtrTest, Accessors) {
+  ref_ptr<MyType> a(new MyType());
+  EXPECT_EQ(5, a->x);
+  a->x = 100;
+  EXPECT_EQ(100, a->x);
+
+  MyType& ra = *a;
+  ra.x = 200;
+  EXPECT_EQ(200, ra.x);
+
+  const MyType& cra = *a;
+  EXPECT_EQ(200, cra.x);
+}
+
+TEST(RefPtrTest, BooleanExpressions) {
+  ref_ptr<MyType> a(new MyType());
+  ref_ptr<MyType> n;
+
+  EXPECT_NE(nullptr, a.get());
+  EXPECT_TRUE(a);
+  EXPECT_FALSE(!a);
+  EXPECT_EQ(true, static_cast<bool>(a));
+
+  EXPECT_EQ(nullptr, n.get());
+  EXPECT_FALSE(n);
+  EXPECT_TRUE(!n);
+  EXPECT_EQ(false, static_cast<bool>(n));
+}
+
+TEST(RefPtrTest, Comparisons) {
+  ref_ptr<MyType> a(new MyType());
+  ref_ptr<MyType> b(new MyType());
+  ref_ptr<MyType> n;
+
+  EXPECT_TRUE(a == a);
+  EXPECT_TRUE(a == a.get());
+  EXPECT_TRUE(a.get() == a);
+  EXPECT_FALSE(a != a);
+  EXPECT_FALSE(a != a.get());
+  EXPECT_FALSE(a.get() != a);
+
+  EXPECT_FALSE(a == b);
+  EXPECT_FALSE(a == b.get());
+  EXPECT_FALSE(a.get() == b);
+  EXPECT_TRUE(a != b);
+  EXPECT_TRUE(a != b.get());
+  EXPECT_TRUE(a.get() != b);
+
+  EXPECT_TRUE(n == n);
+  EXPECT_TRUE(n == n.get());
+  EXPECT_TRUE(n.get() == n);
+  EXPECT_FALSE(n != n);
+  EXPECT_FALSE(n != n.get());
+  EXPECT_FALSE(n.get() != n);
+
+  EXPECT_FALSE(a < a);
+  EXPECT_TRUE(n < a);
+}
+
+TEST(RefPtrTest, Swap) {
+  ref_ptr<MyType> a(new MyType());
+  ref_ptr<MyType> b(new MyType());
+  MyType* a_ptr = a.get();
+  MyType* b_ptr = b.get();
+
+  swap(a, a);
+  EXPECT_EQ(a_ptr, a);
+
+  swap(a, b);
+  EXPECT_EQ(a_ptr, b.get());
+  EXPECT_EQ(b_ptr, a.get());
+
+  swap(a, b);
+  EXPECT_EQ(a_ptr, a.get());
+  EXPECT_EQ(b_ptr, b.get());
+
+  ref_ptr<MyType> c;
+  swap(a, c);
+  EXPECT_EQ(a_ptr, c.get());
+  EXPECT_EQ(nullptr, a.get());
+}
+
+TEST(RefPtrTest, Move) {
+  auto a = make_ref<MyType>();
+  auto b = make_ref<MyType>();
+  ref_ptr<MyType> c;
+  EXPECT_EQ(nullptr, c.get());
+
+  c = std::move(a);
+  EXPECT_NE(nullptr, c.get());
+
+  b = std::move(c);
+  EXPECT_NE(nullptr, b.get());
+}
+
+TEST(RefPtrTest, MoveCompatible) {
+  struct MyBaseType : public RefObject<MyBaseType> {
+    int x = 5;
+    using RefObject<MyBaseType>::counter_;  // Expose for testing.
+
+    virtual ~MyBaseType() = default;
+  };
+  struct MyTypeA : public MyBaseType {
+    int a = 6;
+  };
+  struct MyTypeB : public MyBaseType {
+    int b = 7;
+  };
+
+  ref_ptr<MyTypeA> a = make_ref<MyTypeA>();
+  EXPECT_EQ(1, a->counter_);
+  ref_ptr<MyBaseType> base = add_ref(a);
+  EXPECT_EQ(a.get(), base.get());
+  EXPECT_EQ(2, a->counter_);
+
+  base = make_ref<MyTypeB>();
+  EXPECT_EQ(1, a->counter_);
+  EXPECT_EQ(1, base->counter_);
+}
+
+TEST(RefPtrTest, StackAllocation) {
+  static int alloc_count = 0;
+  class StackAllocationType : public RefObject<StackAllocationType> {
+   public:
+    StackAllocationType() { ++alloc_count; }
+    ~StackAllocationType() { --alloc_count; }
+  };
+  {
+    StackAllocationType a;
+    EXPECT_EQ(1, alloc_count);
+  }
+  EXPECT_EQ(0, alloc_count);
+}
+
+TEST(RefPtrTest, DefaultDeleter) {
+  static int alloc_count = 0;
+  class DefaultDeleterType : public RefObject<DefaultDeleterType> {
+   public:
+    DefaultDeleterType() { ++alloc_count; }
+    ~DefaultDeleterType() { --alloc_count; }
+  };
+
+  // Empty is ok.
+  ref_ptr<DefaultDeleterType> n;
+  n.reset();
+
+  // Lifecycle.
+  EXPECT_EQ(0, alloc_count);
+  ref_ptr<DefaultDeleterType> a = make_ref<DefaultDeleterType>();
+  EXPECT_EQ(1, alloc_count);
+  a.reset();
+  EXPECT_EQ(0, alloc_count);
+}
+
+TEST(RefPtrTest, InlineDeallocator) {
+  static int alloc_count = 0;
+  class CustomDeleterType : public RefObject<CustomDeleterType> {
+   public:
+    CustomDeleterType() { ++alloc_count; }
+    static void Delete(CustomDeleterType* ptr) {
+      --alloc_count;
+      ::operator delete(ptr);
+    }
+  };
+
+  // Empty is ok.
+  ref_ptr<CustomDeleterType> n;
+  n.reset();
+
+  // Lifecycle.
+  EXPECT_EQ(0, alloc_count);
+  auto a = make_ref<CustomDeleterType>();
+  EXPECT_EQ(1, alloc_count);
+  a.reset();
+  EXPECT_EQ(0, alloc_count);
+}
+
+class VirtualDtorTypeA : public RefObject<VirtualDtorTypeA> {
+ public:
+  VirtualDtorTypeA() { ++alloc_count_a; }
+  virtual ~VirtualDtorTypeA() { --alloc_count_a; }
+  static int alloc_count_a;
+};
+int VirtualDtorTypeA::alloc_count_a = 0;
+
+class VirtualDtorTypeB : public VirtualDtorTypeA {
+ public:
+  VirtualDtorTypeB() { ++alloc_count_b; }
+  ~VirtualDtorTypeB() override { --alloc_count_b; }
+  static int alloc_count_b;
+};
+int VirtualDtorTypeB::alloc_count_b = 0;
+
+TEST(RefPtrTest, VirtualDestructor) {
+  // Empty is ok.
+  ref_ptr<VirtualDtorTypeB> n;
+  n.reset();
+
+  // Lifecycle.
+  EXPECT_EQ(0, VirtualDtorTypeA::alloc_count_a);
+  EXPECT_EQ(0, VirtualDtorTypeB::alloc_count_b);
+  ref_ptr<VirtualDtorTypeA> a = make_ref<VirtualDtorTypeB>();
+  EXPECT_EQ(1, VirtualDtorTypeA::alloc_count_a);
+  EXPECT_EQ(1, VirtualDtorTypeB::alloc_count_b);
+  a.reset();
+  EXPECT_EQ(0, VirtualDtorTypeA::alloc_count_a);
+  EXPECT_EQ(0, VirtualDtorTypeB::alloc_count_b);
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/hal/vulkan/vma_allocator.cc b/runtime/src/iree/hal/vulkan/vma_allocator.cc
new file mode 100644
index 0000000..9cc167d
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/vma_allocator.cc
@@ -0,0 +1,406 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/vma_allocator.h"
+
+#include <cstddef>
+#include <cstring>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+#include "iree/hal/vulkan/vma_buffer.h"
+
+using namespace iree::hal::vulkan;
+
+// VMA-backed implementation of iree_hal_allocator_t.
+typedef struct iree_hal_vulkan_vma_allocator_t {
+  iree_hal_resource_t resource;
+  iree_hal_device_t* device;  // unretained to avoid cycles
+  iree_allocator_t host_allocator;
+  VmaAllocator vma;
+
+  // Cached at creation time; used to map VMA memory type ordinals back to
+  // HAL memory types when recording statistics.
+  IREE_STATISTICS(VkPhysicalDeviceMemoryProperties memory_props;)
+  IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
+} iree_hal_vulkan_vma_allocator_t;
+
+namespace {
+extern const iree_hal_allocator_vtable_t iree_hal_vulkan_vma_allocator_vtable;
+}  // namespace
+
+// Downcasts the opaque base allocator to the VMA implementation, asserting
+// vtable identity in debug builds.
+static iree_hal_vulkan_vma_allocator_t* iree_hal_vulkan_vma_allocator_cast(
+    iree_hal_allocator_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_vma_allocator_vtable);
+  return (iree_hal_vulkan_vma_allocator_t*)base_value;
+}
+
+#if IREE_STATISTICS_ENABLE
+
+// Maps a Vulkan memory type ordinal to the coarse HAL memory type used for
+// statistics bucketing (device-local vs host-local only).
+static iree_hal_memory_type_t iree_hal_vulkan_vma_allocator_lookup_memory_type(
+    iree_hal_vulkan_vma_allocator_t* allocator, uint32_t memory_type_ordinal) {
+  // We could better map the types however today we only use the
+  // device/host-local bits.
+  VkMemoryPropertyFlags flags =
+      allocator->memory_props.memoryTypes[memory_type_ordinal].propertyFlags;
+  if (iree_all_bits_set(flags, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)) {
+    return IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL;
+  } else {
+    return IREE_HAL_MEMORY_TYPE_HOST_LOCAL;
+  }
+}
+
+// Callback function called before vkAllocateMemory.
+static void VKAPI_PTR iree_hal_vulkan_vma_allocate_callback(
+    VmaAllocator VMA_NOT_NULL vma, uint32_t memoryType,
+    VkDeviceMemory VMA_NOT_NULL_NON_DISPATCHABLE memory, VkDeviceSize size,
+    void* VMA_NULLABLE pUserData) {
+  iree_hal_vulkan_vma_allocator_t* allocator =
+      (iree_hal_vulkan_vma_allocator_t*)pUserData;
+  iree_hal_allocator_statistics_record_alloc(
+      &allocator->statistics,
+      iree_hal_vulkan_vma_allocator_lookup_memory_type(allocator, memoryType),
+      (iree_device_size_t)size);
+}
+
+// Callback function called before vkFreeMemory.
+static void VKAPI_PTR iree_hal_vulkan_vma_free_callback(
+    VmaAllocator VMA_NOT_NULL vma, uint32_t memoryType,
+    VkDeviceMemory VMA_NOT_NULL_NON_DISPATCHABLE memory, VkDeviceSize size,
+    void* VMA_NULLABLE pUserData) {
+  iree_hal_vulkan_vma_allocator_t* allocator =
+      (iree_hal_vulkan_vma_allocator_t*)pUserData;
+  iree_hal_allocator_statistics_record_free(
+      &allocator->statistics,
+      iree_hal_vulkan_vma_allocator_lookup_memory_type(allocator, memoryType),
+      (iree_device_size_t)size);
+}
+
+#endif  // IREE_STATISTICS_ENABLE
+
+// Creates a VMA-based iree_hal_allocator_t bound to |logical_device|.
+// |device| is kept unretained to avoid a reference cycle; the caller must
+// guarantee it outlives the allocator. On success ownership of the new
+// allocator transfers to |out_allocator|; on failure all intermediate
+// resources (the VMA instance and the host allocation) are released.
+iree_status_t iree_hal_vulkan_vma_allocator_create(
+    VkInstance instance, VkPhysicalDevice physical_device,
+    VkDeviceHandle* logical_device, iree_hal_device_t* device,
+    iree_hal_allocator_t** out_allocator) {
+  IREE_ASSERT_ARGUMENT(instance);
+  IREE_ASSERT_ARGUMENT(physical_device);
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(out_allocator);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_t host_allocator = logical_device->host_allocator();
+  iree_hal_vulkan_vma_allocator_t* allocator = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, sizeof(*allocator),
+                                (void**)&allocator));
+  iree_hal_resource_initialize(&iree_hal_vulkan_vma_allocator_vtable,
+                               &allocator->resource);
+  allocator->host_allocator = host_allocator;
+  allocator->device = device;
+
+  // Route all of VMA's Vulkan calls through our dynamically-loaded symbols so
+  // it works with the same instance/device dispatch we use elsewhere.
+  const auto& syms = logical_device->syms();
+  VmaVulkanFunctions vulkan_fns;
+  memset(&vulkan_fns, 0, sizeof(vulkan_fns));
+  vulkan_fns.vkGetPhysicalDeviceProperties =
+      syms->vkGetPhysicalDeviceProperties;
+  vulkan_fns.vkGetPhysicalDeviceMemoryProperties =
+      syms->vkGetPhysicalDeviceMemoryProperties;
+  vulkan_fns.vkAllocateMemory = syms->vkAllocateMemory;
+  vulkan_fns.vkFreeMemory = syms->vkFreeMemory;
+  vulkan_fns.vkMapMemory = syms->vkMapMemory;
+  vulkan_fns.vkUnmapMemory = syms->vkUnmapMemory;
+  vulkan_fns.vkFlushMappedMemoryRanges = syms->vkFlushMappedMemoryRanges;
+  vulkan_fns.vkInvalidateMappedMemoryRanges =
+      syms->vkInvalidateMappedMemoryRanges;
+  vulkan_fns.vkBindBufferMemory = syms->vkBindBufferMemory;
+  vulkan_fns.vkBindImageMemory = syms->vkBindImageMemory;
+  vulkan_fns.vkGetBufferMemoryRequirements =
+      syms->vkGetBufferMemoryRequirements;
+  vulkan_fns.vkGetImageMemoryRequirements = syms->vkGetImageMemoryRequirements;
+  vulkan_fns.vkCreateBuffer = syms->vkCreateBuffer;
+  vulkan_fns.vkDestroyBuffer = syms->vkDestroyBuffer;
+  vulkan_fns.vkCreateImage = syms->vkCreateImage;
+  vulkan_fns.vkDestroyImage = syms->vkDestroyImage;
+  vulkan_fns.vkCmdCopyBuffer = syms->vkCmdCopyBuffer;
+
+  // Statistics-only: hook device memory alloc/free so we can account for the
+  // real VkDeviceMemory blocks VMA requests on our behalf.
+  VmaDeviceMemoryCallbacks device_memory_callbacks;
+  memset(&device_memory_callbacks, 0, sizeof(device_memory_callbacks));
+  IREE_STATISTICS({
+    device_memory_callbacks.pfnAllocate = iree_hal_vulkan_vma_allocate_callback;
+    device_memory_callbacks.pfnFree = iree_hal_vulkan_vma_free_callback;
+    device_memory_callbacks.pUserData = allocator;
+  });
+
+  VmaAllocatorCreateInfo create_info;
+  memset(&create_info, 0, sizeof(create_info));
+  create_info.flags = 0;
+  create_info.physicalDevice = physical_device;
+  create_info.device = *logical_device;
+  create_info.instance = instance;
+  create_info.preferredLargeHeapBlockSize = 64 * 1024 * 1024;
+  create_info.pAllocationCallbacks = logical_device->allocator();
+  create_info.pDeviceMemoryCallbacks = &device_memory_callbacks;
+  create_info.pHeapSizeLimit = NULL;
+  create_info.pVulkanFunctions = &vulkan_fns;
+  VmaAllocator vma = VK_NULL_HANDLE;
+  iree_status_t status = VK_RESULT_TO_STATUS(
+      vmaCreateAllocator(&create_info, &vma), "vmaCreateAllocator");
+
+  if (iree_status_is_ok(status)) {
+    allocator->vma = vma;
+
+    IREE_STATISTICS({
+      const VkPhysicalDeviceMemoryProperties* memory_props = NULL;
+      vmaGetMemoryProperties(allocator->vma, &memory_props);
+      memcpy(&allocator->memory_props, memory_props,
+             sizeof(allocator->memory_props));
+    });
+
+    *out_allocator = (iree_hal_allocator_t*)allocator;
+  } else {
+    vmaDestroyAllocator(vma);
+    // Fix: also release the allocator struct allocated above; previously the
+    // failure path leaked the host allocation.
+    iree_allocator_free(host_allocator, allocator);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Destroys the allocator: tears down the VMA instance and then frees the
+// struct itself. The host allocator is captured first since it lives inside
+// the memory being freed.
+static void iree_hal_vulkan_vma_allocator_destroy(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  iree_hal_vulkan_vma_allocator_t* allocator =
+      iree_hal_vulkan_vma_allocator_cast(base_allocator);
+  iree_allocator_t host_allocator = allocator->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  vmaDestroyAllocator(allocator->vma);
+  iree_allocator_free(host_allocator, allocator);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the host allocator used for allocator/buffer metadata allocations.
+static iree_allocator_t iree_hal_vulkan_vma_allocator_host_allocator(
+    const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  iree_hal_vulkan_vma_allocator_t* allocator =
+      (iree_hal_vulkan_vma_allocator_t*)base_allocator;
+  return allocator->host_allocator;
+}
+
+// No-op trim: VMA manages its own pooling internally so there is nothing to
+// explicitly release here.
+static iree_status_t iree_hal_vulkan_vma_allocator_trim(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  return iree_ok_status();
+}
+
+// Copies out the accumulated allocation statistics. Compiles to a no-op when
+// IREE_STATISTICS_ENABLE is off (out_statistics is then left untouched).
+static void iree_hal_vulkan_vma_allocator_query_statistics(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
+  IREE_STATISTICS({
+    iree_hal_vulkan_vma_allocator_t* allocator =
+        iree_hal_vulkan_vma_allocator_cast(base_allocator);
+    memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
+  });
+}
+
+// Reports which operations a buffer with |params| would support if allocated
+// from this allocator; used by callers to decide between direct allocation
+// and staging.
+static iree_hal_buffer_compatibility_t
+iree_hal_vulkan_vma_allocator_query_compatibility(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size) {
+  // TODO(benvanik): check to ensure the allocator can serve the memory type.
+
+  // All buffers can be allocated on the heap.
+  iree_hal_buffer_compatibility_t compatibility =
+      IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE;
+
+  // All buffers can be used as transfer source/dest.
+  if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+    compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
+  }
+
+  // Buffers can only be used on the queue if they are device visible.
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+    if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_DISPATCH)) {
+      compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH;
+    }
+  }
+
+  return compatibility;
+}
+
+// Creates a VkBuffer backed by fresh VMA-managed memory, wraps it in a HAL
+// buffer, and (optionally) fills it with |initial_data| via a device
+// transfer. Translates HAL memory-type/usage params into Vulkan buffer usage
+// bits and VMA allocation preferences. On any failure all intermediate
+// resources are released before returning.
+static iree_status_t iree_hal_vulkan_vma_allocator_allocate_internal(
+    iree_hal_vulkan_vma_allocator_t* IREE_RESTRICT allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
+    VmaAllocationCreateFlags flags,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  // Guard against the corner case where the requested buffer size is 0. The
+  // application is unlikely to do anything when requesting a 0-byte buffer; but
+  // it can happen in real world use cases. So we should at least not crash.
+  if (allocation_size == 0) allocation_size = 4;
+  // Align allocation sizes to 4 bytes so shaders operating on 32 bit types can
+  // act safely even on buffer ranges that are not naturally aligned.
+  allocation_size = iree_host_align(allocation_size, 4);
+
+  // Map HAL usage bits onto Vulkan buffer usage flags.
+  VkBufferCreateInfo buffer_create_info;
+  buffer_create_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+  buffer_create_info.pNext = NULL;
+  buffer_create_info.flags = 0;
+  buffer_create_info.size = allocation_size;
+  buffer_create_info.usage = 0;
+  if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+    buffer_create_info.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+    buffer_create_info.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+  }
+  if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_DISPATCH)) {
+    buffer_create_info.usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+    buffer_create_info.usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+    buffer_create_info.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
+  }
+  buffer_create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+  buffer_create_info.queueFamilyIndexCount = 0;
+  buffer_create_info.pQueueFamilyIndices = NULL;
+
+  // Map HAL memory type bits onto VMA usage/required/preferred flags.
+  VmaAllocationCreateInfo allocation_create_info;
+  allocation_create_info.flags = flags;
+  allocation_create_info.usage = VMA_MEMORY_USAGE_UNKNOWN;
+  allocation_create_info.requiredFlags = 0;
+  allocation_create_info.preferredFlags = 0;
+  allocation_create_info.memoryTypeBits = 0;  // Automatic selection.
+  allocation_create_info.pool = VK_NULL_HANDLE;
+  allocation_create_info.pUserData = NULL;
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) {
+    if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+      // Device-local, host-visible.
+      allocation_create_info.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+      allocation_create_info.preferredFlags |=
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+    } else {
+      // Device-local only.
+      allocation_create_info.usage = VMA_MEMORY_USAGE_GPU_ONLY;
+      allocation_create_info.requiredFlags |=
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+    }
+  } else {
+    if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+      // Host-local, device-visible.
+      allocation_create_info.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
+    } else {
+      // Host-local only.
+      allocation_create_info.usage = VMA_MEMORY_USAGE_CPU_ONLY;
+    }
+  }
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_HOST_CACHED)) {
+    allocation_create_info.requiredFlags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+  }
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_HOST_COHERENT)) {
+    allocation_create_info.requiredFlags |=
+        VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+  }
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_TRANSIENT)) {
+    allocation_create_info.preferredFlags |=
+        VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT;
+  }
+  if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_MAPPING)) {
+    allocation_create_info.requiredFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+  }
+
+  // TODO(benvanik): if on a unified memory system and initial data is present
+  // we could set the mapping bit and ensure a much more efficient upload.
+
+  VkBuffer handle = VK_NULL_HANDLE;
+  VmaAllocation allocation = VK_NULL_HANDLE;
+  VmaAllocationInfo allocation_info;
+  VK_RETURN_IF_ERROR(vmaCreateBuffer(allocator->vma, &buffer_create_info,
+                                     &allocation_create_info, &handle,
+                                     &allocation, &allocation_info),
+                     "vmaCreateBuffer");
+
+  // Wrap in a HAL buffer; on failure we still own the VkBuffer/allocation
+  // pair and must destroy it ourselves.
+  iree_hal_buffer_t* buffer = NULL;
+  iree_status_t status = iree_hal_vulkan_vma_buffer_wrap(
+      (iree_hal_allocator_t*)allocator, params->type, params->access,
+      params->usage, allocation_size,
+      /*byte_offset=*/0,
+      /*byte_length=*/allocation_size, allocator->vma, handle, allocation,
+      allocation_info, &buffer);
+  if (!iree_status_is_ok(status)) {
+    vmaDestroyBuffer(allocator->vma, handle, allocation);
+    return status;
+  }
+
+  // Copy the initial contents into the buffer. This may require staging.
+  if (iree_status_is_ok(status) &&
+      !iree_const_byte_span_is_empty(initial_data)) {
+    status = iree_hal_device_transfer_range(
+        allocator->device,
+        iree_hal_make_host_transfer_buffer_span((void*)initial_data.data,
+                                                initial_data.data_length),
+        0, iree_hal_make_device_transfer_buffer(buffer), 0,
+        initial_data.data_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+        iree_infinite_timeout());
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_buffer = buffer;
+  } else {
+    // Releasing the wrapped buffer also returns the VMA allocation.
+    iree_hal_buffer_release(buffer);
+  }
+  return status;
+}
+
+// Public allocate entry point: thin wrapper over allocate_internal with no
+// extra VMA allocation flags.
+static iree_status_t iree_hal_vulkan_vma_allocator_allocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  iree_hal_vulkan_vma_allocator_t* allocator =
+      iree_hal_vulkan_vma_allocator_cast(base_allocator);
+  return iree_hal_vulkan_vma_allocator_allocate_internal(
+      allocator, params, allocation_size, initial_data,
+      /*flags=*/0, out_buffer);
+}
+
+static void iree_hal_vulkan_vma_allocator_deallocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
+  // VMA does the pooling for us so we don't need anything special.
+  iree_hal_buffer_destroy(base_buffer);
+}
+
+// External buffer import is not implemented for the VMA allocator yet.
+static iree_status_t iree_hal_vulkan_vma_allocator_import_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
+    iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  // TODO(#7242): use VK_EXT_external_memory_host to import memory.
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "importing from external buffers not supported");
+}
+
+// External buffer export is not implemented for the VMA allocator yet.
+static iree_status_t iree_hal_vulkan_vma_allocator_export_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_t* IREE_RESTRICT buffer,
+    iree_hal_external_buffer_type_t requested_type,
+    iree_hal_external_buffer_flags_t requested_flags,
+    iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "exporting to external buffers not supported");
+}
+
+// HAL allocator vtable binding the C interface to the static functions above.
+namespace {
+const iree_hal_allocator_vtable_t iree_hal_vulkan_vma_allocator_vtable = {
+    /*.destroy=*/iree_hal_vulkan_vma_allocator_destroy,
+    /*.host_allocator=*/iree_hal_vulkan_vma_allocator_host_allocator,
+    /*.trim=*/iree_hal_vulkan_vma_allocator_trim,
+    /*.query_statistics=*/iree_hal_vulkan_vma_allocator_query_statistics,
+    /*.query_compatibility=*/
+    iree_hal_vulkan_vma_allocator_query_compatibility,
+    /*.allocate_buffer=*/iree_hal_vulkan_vma_allocator_allocate_buffer,
+    /*.deallocate_buffer=*/iree_hal_vulkan_vma_allocator_deallocate_buffer,
+    /*.import_buffer=*/iree_hal_vulkan_vma_allocator_import_buffer,
+    /*.export_buffer=*/iree_hal_vulkan_vma_allocator_export_buffer,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/vma_allocator.h b/runtime/src/iree/hal/vulkan/vma_allocator.h
new file mode 100644
index 0000000..86d892e
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/vma_allocator.h
@@ -0,0 +1,45 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_VMA_ALLOCATOR_H_
+#define IREE_HAL_VULKAN_VMA_ALLOCATOR_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/internal_vk_mem_alloc.h"  // IWYU pragma: export
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a VMA-based allocator that performs internal suballocation and a
+// bunch of other fancy things.
+//
+// This uses the Vulkan Memory Allocator (VMA) to manage memory.
+// VMA (//third_party/vulkan_memory_allocator) provides dlmalloc-like behavior
+// with suballocations made with various policies (best fit, first fit, etc).
+// This reduces the number of allocations we need from the Vulkan implementation
+// (which can sometimes be limited to as little as 4096 total allowed) and
+// manages higher level allocation semantics like slab allocation and
+// defragmentation.
+//
+// VMA is internally synchronized and the functionality exposed on the HAL
+// interface is thread-safe.
+//
+// More information:
+//   https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator
+//   https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/
+iree_status_t iree_hal_vulkan_vma_allocator_create(
+    VkInstance instance, VkPhysicalDevice physical_device,
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree_hal_device_t* device, iree_hal_allocator_t** out_allocator);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_VMA_ALLOCATOR_H_
diff --git a/runtime/src/iree/hal/vulkan/vma_buffer.cc b/runtime/src/iree/hal/vulkan/vma_buffer.cc
new file mode 100644
index 0000000..765ad17
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/vma_buffer.cc
@@ -0,0 +1,179 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/vma_buffer.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/status_util.h"
+
+// iree_hal_buffer_t implementation backed by a single VMA allocation.
+typedef struct iree_hal_vulkan_vma_buffer_t {
+  iree_hal_buffer_t base;
+
+  VmaAllocator vma;
+  VkBuffer handle;
+  VmaAllocation allocation;
+  VmaAllocationInfo allocation_info;
+} iree_hal_vulkan_vma_buffer_t;
+
+namespace {
+extern const iree_hal_buffer_vtable_t iree_hal_vulkan_vma_buffer_vtable;
+}  // namespace
+
+// Downcasts the opaque base buffer to the VMA implementation, asserting
+// vtable identity in debug builds.
+static iree_hal_vulkan_vma_buffer_t* iree_hal_vulkan_vma_buffer_cast(
+    iree_hal_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_vma_buffer_vtable);
+  return (iree_hal_vulkan_vma_buffer_t*)base_value;
+}
+
+// Wraps an existing VkBuffer/VmaAllocation pair in an iree_hal_buffer_t.
+// Ownership of the pair always transfers: on success it is released when the
+// returned buffer is destroyed; on failure it is destroyed here.
+iree_status_t iree_hal_vulkan_vma_buffer_wrap(
+    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    VmaAllocator vma, VkBuffer handle, VmaAllocation allocation,
+    VmaAllocationInfo allocation_info, iree_hal_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(vma);
+  IREE_ASSERT_ARGUMENT(handle);
+  IREE_ASSERT_ARGUMENT(allocation);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_t host_allocator =
+      iree_hal_allocator_host_allocator(allocator);
+  iree_hal_vulkan_vma_buffer_t* buffer = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_buffer_initialize(
+        host_allocator, allocator, &buffer->base, allocation_size, byte_offset,
+        byte_length, memory_type, allowed_access, allowed_usage,
+        &iree_hal_vulkan_vma_buffer_vtable, &buffer->base);
+    buffer->vma = vma;
+    buffer->handle = handle;
+    buffer->allocation = allocation;
+    buffer->allocation_info = allocation_info;
+
+    // TODO(benvanik): set debug name instead and use the
+    //     VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT flag.
+    vmaSetAllocationUserData(buffer->vma, buffer->allocation, buffer);
+
+    // TODO(benvanik): figure out why this is not working - has unbalanced
+    // allocs in the tracy UI even though they are definitely balanced here.
+    // IREE_TRACE_ALLOC_NAMED("VMA", (void*)buffer->handle, byte_length);
+
+    *out_buffer = &buffer->base;
+  } else {
+    vmaDestroyBuffer(vma, handle, allocation);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  // Fix: propagate |status|; previously this returned iree_ok_status()
+  // unconditionally, reporting success even when the host allocation failed
+  // and *out_buffer was left unset.
+  return status;
+}
+
+// Returns the VkBuffer and its memory to VMA and frees the wrapper struct.
+static void iree_hal_vulkan_vma_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  iree_hal_vulkan_vma_buffer_t* buffer =
+      iree_hal_vulkan_vma_buffer_cast(base_buffer);
+  iree_allocator_t host_allocator = base_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // IREE_TRACE_FREE_NAMED("VMA", (void*)buffer->handle);
+
+  vmaDestroyBuffer(buffer->vma, buffer->handle, buffer->allocation);
+  iree_allocator_free(host_allocator, buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the backing VkBuffer handle (the whole allocated buffer; callers
+// must apply the HAL buffer's byte_offset/byte_length themselves).
+VkBuffer iree_hal_vulkan_vma_buffer_handle(iree_hal_buffer_t* base_buffer) {
+  iree_hal_vulkan_vma_buffer_t* buffer =
+      iree_hal_vulkan_vma_buffer_cast(base_buffer);
+  return buffer->handle;
+}
+
+// Maps a host-visible range of the buffer via vmaMapMemory and points the
+// mapping at the requested sub-range. Requires HOST_VISIBLE memory and
+// MAPPING usage; otherwise fails validation.
+static iree_status_t iree_hal_vulkan_vma_buffer_map_range(
+    iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping) {
+  iree_hal_vulkan_vma_buffer_t* buffer =
+      iree_hal_vulkan_vma_buffer_cast(base_buffer);
+
+  // TODO(benvanik): add upload/download for unmapped buffers.
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type(
+      iree_hal_buffer_memory_type(base_buffer),
+      IREE_HAL_MEMORY_TYPE_HOST_VISIBLE));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_validate_usage(iree_hal_buffer_allowed_usage(base_buffer),
+                                     IREE_HAL_BUFFER_USAGE_MAPPING));
+
+  // vmaMapMemory returns a pointer to the start of the allocation; offset it
+  // to the requested local range.
+  uint8_t* data_ptr = nullptr;
+  VK_RETURN_IF_ERROR(
+      vmaMapMemory(buffer->vma, buffer->allocation, (void**)&data_ptr),
+      "vmaMapMemory");
+  mapping->contents =
+      iree_make_byte_span(data_ptr + local_byte_offset, local_byte_length);
+
+  // If we mapped for discard scribble over the bytes. This is not a mandated
+  // behavior but it will make debugging issues easier. Alternatively for
+  // heap buffers we could reallocate them such that ASAN yells, but that
+  // would only work if the entire buffer was discarded.
+#ifndef NDEBUG
+  if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) {
+    memset(mapping->contents.data, 0xCD, local_byte_length);
+  }
+#endif  // !NDEBUG
+
+  return iree_ok_status();
+}
+
+// Unmaps a range previously mapped by iree_hal_vulkan_vma_buffer_map_range.
+// The offset/length arguments are unused; VMA unmaps the whole allocation.
+static iree_status_t iree_hal_vulkan_vma_buffer_unmap_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) {
+  iree_hal_vulkan_vma_buffer_t* buffer =
+      iree_hal_vulkan_vma_buffer_cast(base_buffer);
+  vmaUnmapMemory(buffer->vma, buffer->allocation);
+  return iree_ok_status();
+}
+
+// Invalidates a mapped range so host reads observe device writes (needed for
+// non-coherent memory).
+static iree_status_t iree_hal_vulkan_vma_buffer_invalidate_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  iree_hal_vulkan_vma_buffer_t* buffer =
+      iree_hal_vulkan_vma_buffer_cast(base_buffer);
+  VK_RETURN_IF_ERROR(
+      vmaInvalidateAllocation(buffer->vma, buffer->allocation,
+                              local_byte_offset, local_byte_length),
+      "vmaInvalidateAllocation");
+  return iree_ok_status();
+}
+
+// Flushes a mapped range so device reads observe host writes (needed for
+// non-coherent memory).
+static iree_status_t iree_hal_vulkan_vma_buffer_flush_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  iree_hal_vulkan_vma_buffer_t* buffer =
+      iree_hal_vulkan_vma_buffer_cast(base_buffer);
+  VK_RETURN_IF_ERROR(vmaFlushAllocation(buffer->vma, buffer->allocation,
+                                        local_byte_offset, local_byte_length),
+                     "vmaFlushAllocation");
+  return iree_ok_status();
+}
+
+// HAL buffer vtable binding the C interface to the static functions above.
+namespace {
+const iree_hal_buffer_vtable_t iree_hal_vulkan_vma_buffer_vtable = {
+    /*.recycle=*/iree_hal_buffer_recycle,
+    /*.destroy=*/iree_hal_vulkan_vma_buffer_destroy,
+    /*.map_range=*/iree_hal_vulkan_vma_buffer_map_range,
+    /*.unmap_range=*/iree_hal_vulkan_vma_buffer_unmap_range,
+    /*.invalidate_range=*/iree_hal_vulkan_vma_buffer_invalidate_range,
+    /*.flush_range=*/iree_hal_vulkan_vma_buffer_flush_range,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/vma_buffer.h b/runtime/src/iree/hal/vulkan/vma_buffer.h
new file mode 100644
index 0000000..a00adbd
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/vma_buffer.h
@@ -0,0 +1,37 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_VMA_BUFFER_H_
+#define IREE_HAL_VULKAN_VMA_BUFFER_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/internal_vk_mem_alloc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Wraps a VMA allocation in an iree_hal_buffer_t.
+// The allocation will be released back to VMA when the buffer is released.
+iree_status_t iree_hal_vulkan_vma_buffer_wrap(
+    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    VmaAllocator vma, VkBuffer handle, VmaAllocation allocation,
+    VmaAllocationInfo allocation_info, iree_hal_buffer_t** out_buffer);
+
+// Returns the Vulkan handle backing the given |buffer|.
+// This is the entire allocated_buffer and must be offset by the buffer
+// byte_offset and byte_length when used.
+VkBuffer iree_hal_vulkan_vma_buffer_handle(iree_hal_buffer_t* buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_VMA_BUFFER_H_
diff --git a/runtime/src/iree/hal/vulkan/vulkan_device.cc b/runtime/src/iree/hal/vulkan/vulkan_device.cc
new file mode 100644
index 0000000..c7c3ecc
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/vulkan_device.cc
@@ -0,0 +1,1169 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/vulkan_device.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <vector>
+
+#include "iree/base/internal/arena.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/utils/buffer_transfer.h"
+#include "iree/hal/vulkan/api.h"
+#include "iree/hal/vulkan/builtin_executables.h"
+#include "iree/hal/vulkan/command_queue.h"
+#include "iree/hal/vulkan/descriptor_pool_cache.h"
+#include "iree/hal/vulkan/direct_command_buffer.h"
+#include "iree/hal/vulkan/direct_command_queue.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/emulated_semaphore.h"
+#include "iree/hal/vulkan/extensibility_util.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/native_descriptor_set.h"
+#include "iree/hal/vulkan/native_descriptor_set_layout.h"
+#include "iree/hal/vulkan/native_event.h"
+#include "iree/hal/vulkan/native_executable_layout.h"
+#include "iree/hal/vulkan/native_semaphore.h"
+#include "iree/hal/vulkan/nop_executable_cache.h"
+#include "iree/hal/vulkan/serializing_command_queue.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/timepoint_util.h"
+#include "iree/hal/vulkan/tracing.h"
+#include "iree/hal/vulkan/util/arena.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+#include "iree/hal/vulkan/vma_allocator.h"
+
+using namespace iree::hal::vulkan;
+
+//===----------------------------------------------------------------------===//
+// iree_hal_vulkan_device_t extensibility util
+//===----------------------------------------------------------------------===//
+
+// Appends to |out_string_values| the layer/extension names that belong to
+// |set| for the given |requested_features| and writes the total required count
+// to |out_string_count|. Supports the standard two-call sizing pattern: call
+// with out_string_values=NULL to query the count, then call again with a
+// buffer of at least that many entries. Returns OUT_OF_RANGE if a non-NULL
+// buffer is too small (the required count is still written out).
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_query_extensibility_set(
+    iree_hal_vulkan_features_t requested_features,
+    iree_hal_vulkan_extensibility_set_t set, iree_host_size_t string_capacity,
+    const char** out_string_values, iree_host_size_t* out_string_count) {
+  *out_string_count = 0;
+
+  iree_status_t status = iree_ok_status();
+  iree_host_size_t string_count = 0;
+// Appends |name_literal| when |set| matches |target_set|. Counting continues
+// even when the output buffer is absent or full so callers can size buffers.
+#define ADD_EXT(target_set, name_literal)                       \
+  if (iree_status_is_ok(status) && set == (target_set)) {       \
+    if (string_count >= string_capacity && out_string_values) { \
+      status = iree_status_from_code(IREE_STATUS_OUT_OF_RANGE); \
+    } else if (out_string_values) {                             \
+      out_string_values[string_count] = (name_literal);         \
+    }                                                           \
+    ++string_count;                                             \
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Baseline IREE requirements
+  //===--------------------------------------------------------------------===//
+  // Using IREE at all requires these extensions unconditionally. Adding things
+  // here changes our minimum requirements and should be done carefully.
+  // Optional extensions here are feature detected by the runtime.
+
+#ifdef IREE_PLATFORM_APPLE
+  // VK_KHR_portability_subset:
+  // For Apple platforms, Vulkan is layered on top of Metal via MoltenVK.
+  // It exposes this extension to allow a non-conformant Vulkan implementation
+  // to be built on top of another non-Vulkan graphics API. This extension must
+  // be enabled if exists.
+  ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
+          VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME);
+#endif
+
+  // VK_KHR_storage_buffer_storage_class:
+  // Our generated SPIR-V kernels use storage buffers for all their data access.
+  ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
+          VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME);
+
+  // VK_KHR_get_physical_device_properties2:
+  // Multiple extensions depend on VK_KHR_get_physical_device_properties2.
+  // This extension was deprecated in Vulkan 1.1 as its functionality was
+  // promoted to core so we list it as optional even though we require it.
+  ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
+          VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
+
+  // VK_KHR_push_descriptor:
+  // We can avoid a lot of additional Vulkan descriptor set manipulation
+  // overhead when this extension is present. Android is a holdout, though, and
+  // we have a fallback for when it's not available.
+  ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
+          VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME);
+
+  //===--------------------------------------------------------------------===//
+  // Vulkan forward-compatibility shims
+  //===--------------------------------------------------------------------===//
+  // These are shims or extensions that are made core later in the spec and can
+  // be removed once we require the core version that contains them.
+
+  // VK_KHR_timeline_semaphore:
+  // timeline semaphore support is optional and will be emulated if necessary.
+  ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
+          VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
+
+  // VK_LAYER_KHRONOS_timeline_semaphore:
+  // polyfill layer - enable if present instead of our custom emulation. Ignored
+  // if timeline semaphores are supported natively (Vulkan 1.2+).
+  ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL,
+          "VK_LAYER_KHRONOS_timeline_semaphore");
+
+  //===--------------------------------------------------------------------===//
+  // Optional debugging features
+  //===--------------------------------------------------------------------===//
+  // Used only when explicitly requested as they drastically change the
+  // performance behavior of Vulkan.
+
+  // VK_LAYER_KHRONOS_validation:
+  // only enabled if validation is desired. Since validation in Vulkan is just
+  // an API correctness check it can't be used as a security mechanism and is
+  // fine to ignore.
+  if (iree_all_bits_set(requested_features,
+                        IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS)) {
+    ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL,
+            "VK_LAYER_KHRONOS_validation");
+  }
+
+  // VK_EXT_debug_utils:
+  // only enabled if debugging is desired to route Vulkan debug messages through
+  // our logging sinks. Note that this adds a non-trivial runtime overhead and
+  // we may want to disable it even in debug builds.
+  if (iree_all_bits_set(requested_features,
+                        IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS)) {
+    ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
+            VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
+  }
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  if (iree_all_bits_set(requested_features,
+                        IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) {
+    // VK_EXT_host_query_reset:
+    // optionally allows for vkResetQueryPool to be used to reset query pools
+    // from the host without needing to do an expensive vkCmdResetQueryPool
+    // submission.
+    ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
+            VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME);
+
+    // VK_EXT_calibrated_timestamps:
+    // optionally provides more accurate timestamps that correspond to the
+    // system time. If this is not present then tracy will attempt calibration
+    // itself and have some per-run variance in the skew (up to many
+    // milliseconds).
+    ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
+            VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME);
+  }
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+  *out_string_count = string_count;
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// Queue selection
+//===----------------------------------------------------------------------===//
+
+#define IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX (-1)
+
+// Queue family selection results: which families to use and how many queues
+// to create in each. The dispatch and transfer families may be the same when
+// no dedicated transfer family is available.
+typedef struct iree_hal_vulkan_queue_family_info_t {
+  // Family used for compute dispatch (may also serve transfer).
+  uint32_t dispatch_index;
+  // Number of queues to create from the dispatch family.
+  iree_host_size_t dispatch_queue_count;
+  // Family used for dedicated transfer operations (may equal dispatch_index).
+  uint32_t transfer_index;
+  // Number of queues to create from the transfer family (may be 0).
+  iree_host_size_t transfer_queue_count;
+} iree_hal_vulkan_queue_family_info_t;
+
+// Finds the first queue in the listing (which is usually the
+// driver-preferred) that has all of the |required_queue_flags| and none of
+// the |excluded_queue_flags|.
+// Returns IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX if no matching queue is
+// found.
+static uint32_t iree_hal_vulkan_find_first_queue_family_with_flags(
+    uint32_t queue_family_count,
+    const VkQueueFamilyProperties* queue_family_properties,
+    VkQueueFlags required_queue_flags, VkQueueFlags excluded_queue_flags) {
+  for (uint32_t queue_family_index = 0; queue_family_index < queue_family_count;
+       ++queue_family_index) {
+    const VkQueueFamilyProperties* properties =
+        &queue_family_properties[queue_family_index];
+    // All required bits must be present and no excluded bit may be present.
+    if (iree_all_bits_set(properties->queueFlags, required_queue_flags) &&
+        !iree_any_bit_set(properties->queueFlags, excluded_queue_flags)) {
+      return queue_family_index;
+    }
+  }
+  return IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX;
+}
+
+// Selects queue family indices for compute and transfer queues.
+// Note that both queue families may be the same if there is only one family
+// available.
+static iree_status_t iree_hal_vulkan_select_queue_families(
+    VkPhysicalDevice physical_device, iree::hal::vulkan::DynamicSymbols* syms,
+    iree_hal_vulkan_queue_family_info_t* out_family_info) {
+  // Enumerate queue families available on the device.
+  uint32_t queue_family_count = 0;
+  syms->vkGetPhysicalDeviceQueueFamilyProperties(physical_device,
+                                                 &queue_family_count, NULL);
+  // Stack allocation: family counts are driver-defined and small in practice.
+  VkQueueFamilyProperties* queue_family_properties =
+      (VkQueueFamilyProperties*)iree_alloca(queue_family_count *
+                                            sizeof(VkQueueFamilyProperties));
+  syms->vkGetPhysicalDeviceQueueFamilyProperties(
+      physical_device, &queue_family_count, queue_family_properties);
+
+  memset(out_family_info, 0, sizeof(*out_family_info));
+  out_family_info->dispatch_index = IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX;
+  out_family_info->dispatch_queue_count = 0;
+  out_family_info->transfer_index = IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX;
+  out_family_info->transfer_queue_count = 0;
+
+  // Try to find a dedicated compute queue (no graphics caps).
+  // Some may support both transfer and compute. If that fails then fallback
+  // to any queue that supports compute.
+  out_family_info->dispatch_index =
+      iree_hal_vulkan_find_first_queue_family_with_flags(
+          queue_family_count, queue_family_properties, VK_QUEUE_COMPUTE_BIT,
+          VK_QUEUE_GRAPHICS_BIT);
+  if (out_family_info->dispatch_index ==
+      IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
+    out_family_info->dispatch_index =
+        iree_hal_vulkan_find_first_queue_family_with_flags(
+            queue_family_count, queue_family_properties, VK_QUEUE_COMPUTE_BIT,
+            0);
+  }
+  if (out_family_info->dispatch_index ==
+      IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
+    return iree_make_status(
+        IREE_STATUS_NOT_FOUND,
+        "unable to find any queue family support compute operations");
+  }
+  out_family_info->dispatch_queue_count =
+      queue_family_properties[out_family_info->dispatch_index].queueCount;
+
+  // Try to find a dedicated transfer queue (no compute or graphics caps).
+  // Not all devices have one, and some have only a queue family for
+  // everything and possibly a queue family just for compute/etc. If that
+  // fails then fallback to any queue that supports transfer. Finally, if
+  // /that/ fails then we just won't create a transfer queue and instead use
+  // the compute queue for all operations.
+  out_family_info->transfer_index =
+      iree_hal_vulkan_find_first_queue_family_with_flags(
+          queue_family_count, queue_family_properties, VK_QUEUE_TRANSFER_BIT,
+          VK_QUEUE_COMPUTE_BIT | VK_QUEUE_GRAPHICS_BIT);
+  if (out_family_info->transfer_index ==
+      IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
+    out_family_info->transfer_index =
+        iree_hal_vulkan_find_first_queue_family_with_flags(
+            queue_family_count, queue_family_properties, VK_QUEUE_TRANSFER_BIT,
+            VK_QUEUE_GRAPHICS_BIT);
+  }
+  if (out_family_info->transfer_index ==
+      IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
+    out_family_info->transfer_index =
+        iree_hal_vulkan_find_first_queue_family_with_flags(
+            queue_family_count, queue_family_properties, VK_QUEUE_TRANSFER_BIT,
+            0);
+  }
+  if (out_family_info->transfer_index !=
+      IREE_HAL_VULKAN_INVALID_QUEUE_FAMILY_INDEX) {
+    out_family_info->transfer_queue_count =
+        queue_family_properties[out_family_info->transfer_index].queueCount;
+  }
+
+  // Ensure that we don't share the dispatch queues with transfer queues if
+  // that would put us over the queue count.
+  if (out_family_info->dispatch_index == out_family_info->transfer_index) {
+    out_family_info->transfer_queue_count = iree_min(
+        queue_family_properties[out_family_info->dispatch_index].queueCount -
+            out_family_info->dispatch_queue_count,
+        out_family_info->transfer_queue_count);
+  }
+
+  // NOTE(review): dispatch_queue_count still holds the family's full
+  // queueCount here (it is only clamped below), so when both selections share
+  // a family the subtraction above always yields zero transfer queues even if
+  // the clamp would leave some spare -- confirm whether that is intended.
+
+  // Limit the number of queues we create (for now).
+  // We may want to allow this to grow, but each queue adds overhead and we
+  // need to measure to make sure we can effectively use them all.
+  out_family_info->dispatch_queue_count =
+      iree_min(2u, out_family_info->dispatch_queue_count);
+  out_family_info->transfer_queue_count =
+      iree_min(1u, out_family_info->transfer_queue_count);
+
+  return iree_ok_status();
+}
+
+// Builds a set of compute and transfer queues based on the queues available on
+// the device and some magic heuristical goo.
+// |out_compute_queue_set| and |out_transfer_queue_set| receive dense low-bit
+// masks of queue indices within their respective families.
+static iree_status_t iree_hal_vulkan_build_queue_sets(
+    VkPhysicalDevice physical_device, iree::hal::vulkan::DynamicSymbols* syms,
+    iree_hal_vulkan_queue_set_t* out_compute_queue_set,
+    iree_hal_vulkan_queue_set_t* out_transfer_queue_set) {
+  // Select which queues to use (and fail if the implementation can't handle
+  // them).
+  iree_hal_vulkan_queue_family_info_t queue_family_info;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_select_queue_families(
+      physical_device, syms, &queue_family_info));
+
+  // Build queue indices for the selected queue families.
+  memset(out_compute_queue_set, 0, sizeof(*out_compute_queue_set));
+  out_compute_queue_set->queue_family_index = queue_family_info.dispatch_index;
+  for (iree_host_size_t i = 0; i < queue_family_info.dispatch_queue_count;
+       ++i) {
+    out_compute_queue_set->queue_indices |= 1ull << i;
+  }
+
+  memset(out_transfer_queue_set, 0, sizeof(*out_transfer_queue_set));
+  out_transfer_queue_set->queue_family_index = queue_family_info.transfer_index;
+  uint32_t base_queue_index = 0;
+  if (queue_family_info.dispatch_index == queue_family_info.transfer_index) {
+    // Sharing a family, so transfer queues follow compute queues: offset by
+    // the number of compute queues taken from the family. NOTE: this was
+    // previously the family index (dispatch_index), which is unrelated to
+    // queue indices within the family and could alias the compute queues.
+    base_queue_index = (uint32_t)queue_family_info.dispatch_queue_count;
+  }
+  for (iree_host_size_t i = 0; i < queue_family_info.transfer_queue_count;
+       ++i) {
+    out_transfer_queue_set->queue_indices |= 1ull << (i + base_queue_index);
+  }
+
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_vulkan_device_t
+//===----------------------------------------------------------------------===//
+
+// HAL device implementation wrapping a Vulkan logical device and its queues.
+// Allocated as a single block: the struct, the identifier string storage, and
+// the queue pointer arrays all live in one host allocation.
+typedef struct iree_hal_vulkan_device_t {
+  iree_hal_resource_t resource;
+  // Device identifier; the string storage points into this allocation's
+  // trailing bytes.
+  iree_string_view_t identifier;
+
+  // Optional driver that owns the instance. We retain it for our lifetime to
+  // ensure the instance remains valid.
+  iree_hal_driver_t* driver;
+
+  // Flags overriding default device behavior.
+  iree_hal_vulkan_device_flags_t flags;
+  // Which optional extensions are active and available on the device.
+  iree_hal_vulkan_device_extensions_t device_extensions;
+
+  VkInstance instance;
+  VkPhysicalDevice physical_device;
+  // Ref-counted logical device handle; released on destroy.
+  VkDeviceHandle* logical_device;
+
+  iree_allocator_t host_allocator;
+  iree_hal_allocator_t* device_allocator;
+
+  // All queues available on the device; the device owns these.
+  iree_host_size_t queue_count;
+  CommandQueue** queues;
+  // The subset of queues that support dispatch operations. May overlap with
+  // transfer_queues.
+  iree_host_size_t dispatch_queue_count;
+  CommandQueue** dispatch_queues;
+  // The subset of queues that support transfer operations. May overlap with
+  // dispatch_queues.
+  iree_host_size_t transfer_queue_count;
+  CommandQueue** transfer_queues;
+
+  // |queue_count| tracing contexts, if tracing is enabled.
+  iree_hal_vulkan_tracing_context_t** queue_tracing_contexts;
+
+  DescriptorPoolCache* descriptor_pool_cache;
+
+  VkCommandPoolHandle* dispatch_command_pool;
+  // May be NULL when no dedicated transfer queue family exists.
+  VkCommandPoolHandle* transfer_command_pool;
+
+  // Block pool used for command buffers with a larger block size (as command
+  // buffers can contain inlined data uploads).
+  iree_arena_block_pool_t block_pool;
+
+  // Used only for emulated timeline semaphores; NULL when the native
+  // timeline semaphore path is available and emulation is not forced.
+  TimePointSemaphorePool* semaphore_pool;
+  TimePointFencePool* fence_pool;
+
+  BuiltinExecutables* builtin_executables;
+} iree_hal_vulkan_device_t;
+
+namespace {
+// Forward declaration; the vtable instance is defined later in this file once
+// all of the member functions are visible.
+extern const iree_hal_device_vtable_t iree_hal_vulkan_device_vtable;
+}  // namespace
+
+// Casts the opaque HAL device pointer to the concrete Vulkan device type,
+// asserting (in debug builds) that the vtable matches.
+static iree_hal_vulkan_device_t* iree_hal_vulkan_device_cast(
+    iree_hal_device_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_device_vtable);
+  return (iree_hal_vulkan_device_t*)base_value;
+}
+
+// Initializes |out_options| to default values (all flags cleared).
+IREE_API_EXPORT void iree_hal_vulkan_device_options_initialize(
+    iree_hal_vulkan_device_options_t* out_options) {
+  memset(out_options, 0, sizeof(*out_options));
+  out_options->flags = 0;  // Redundant with the memset; kept as documentation.
+}
+
+// Creates a transient command pool for the given queue family.
+// Command buffers allocated from the pool must only be issued on queues
+// belonging to the specified family.
+// On success ownership of the pool transfers to |out_handle| (freed with
+// delete); on failure the partially-constructed handle is deleted here.
+static iree_status_t iree_hal_vulkan_create_transient_command_pool(
+    VkDeviceHandle* logical_device, uint32_t queue_family_index,
+    VkCommandPoolHandle** out_handle) {
+  VkCommandPoolCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
+  create_info.pNext = NULL;
+  // TRANSIENT hints the driver that command buffers are short-lived.
+  create_info.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
+  create_info.queueFamilyIndex = queue_family_index;
+  VkCommandPoolHandle* command_pool = new VkCommandPoolHandle(logical_device);
+  iree_status_t status = VK_RESULT_TO_STATUS(
+      logical_device->syms()->vkCreateCommandPool(
+          *logical_device, &create_info, logical_device->allocator(),
+          command_pool->mutable_value()),
+      "vkCreateCommandPool");
+  if (iree_status_is_ok(status)) {
+    *out_handle = command_pool;
+  } else {
+    delete command_pool;
+  }
+  return status;
+}
+
+// Creates a command queue of the given queue family.
+// Returns a heap-allocated queue owned by the caller (freed with delete).
+// |fence_pool| being non-NULL selects the serializing queue used when
+// timeline semaphores are emulated.
+static CommandQueue* iree_hal_vulkan_device_create_queue(
+    VkDeviceHandle* logical_device,
+    iree_hal_command_category_t command_category, uint32_t queue_family_index,
+    uint32_t queue_index, TimePointFencePool* fence_pool) {
+  VkQueue queue = VK_NULL_HANDLE;
+  logical_device->syms()->vkGetDeviceQueue(*logical_device, queue_family_index,
+                                           queue_index, &queue);
+
+  // When emulating timeline semaphores we use a special queue that allows us to
+  // sequence the semaphores correctly.
+  if (fence_pool != NULL) {
+    return new SerializingCommandQueue(logical_device, command_category, queue,
+                                       fence_pool);
+  }
+
+  return new DirectCommandQueue(logical_device, command_category, queue);
+}
+
+// Creates command queues for the given sets of queues and populates the
+// device queue lists.
+// NOTE(review): |queue_prefix| is currently unused; queue names are
+// synthesized as "Vulkan[D:n]"/"Vulkan[T:n]" below -- confirm whether it was
+// intended to prefix them.
+static iree_status_t iree_hal_vulkan_device_initialize_command_queues(
+    iree_hal_vulkan_device_t* device,
+    iree_hal_vulkan_features_t enabled_features,
+    iree_string_view_t queue_prefix,
+    const iree_hal_vulkan_queue_set_t* compute_queue_set,
+    const iree_hal_vulkan_queue_set_t* transfer_queue_set) {
+  device->queue_count = 0;
+  device->dispatch_queue_count = 0;
+  device->transfer_queue_count = 0;
+
+  // The first available queue supporting dispatch commands that will be used by
+  // the tracing subsystem for query and cleanup tasks.
+  VkQueue maintenance_dispatch_queue = VK_NULL_HANDLE;
+
+  // NOTE(review): the loops below iterate [0, popcount) and test bit i, which
+  // assumes queue_indices is a dense low-bit mask (as built by
+  // iree_hal_vulkan_build_queue_sets); a sparse mask would have its high bits
+  // silently skipped -- confirm external callers.
+  uint64_t compute_queue_count =
+      iree_math_count_ones_u64(compute_queue_set->queue_indices);
+  uint64_t transfer_queue_count =
+      iree_math_count_ones_u64(transfer_queue_set->queue_indices);
+  for (iree_host_size_t i = 0; i < compute_queue_count; ++i) {
+    if (!(compute_queue_set->queue_indices & (1ull << i))) continue;
+
+    char queue_name_buffer[32];
+    int queue_name_length =
+        snprintf(queue_name_buffer, IREE_ARRAYSIZE(queue_name_buffer),
+                 "Vulkan[%c:%d]", 'D', (int)device->dispatch_queue_count);
+    iree_string_view_t queue_name =
+        iree_make_string_view(queue_name_buffer, queue_name_length);
+
+    CommandQueue* queue = iree_hal_vulkan_device_create_queue(
+        device->logical_device, IREE_HAL_COMMAND_CATEGORY_ANY,
+        compute_queue_set->queue_family_index, i, device->fence_pool);
+
+    // Ownership transfers to the device; destroy() deletes device->queues.
+    iree_host_size_t queue_index = device->queue_count++;
+    device->queues[queue_index] = queue;
+    device->dispatch_queues[device->dispatch_queue_count++] = queue;
+
+    if (!transfer_queue_count) {
+      // If we don't have any dedicated transfer queues then use all dispatch
+      // queues as transfer queues.
+      device->transfer_queues[device->transfer_queue_count++] = queue;
+    }
+
+    if (maintenance_dispatch_queue == VK_NULL_HANDLE) {
+      maintenance_dispatch_queue = queue->handle();
+    }
+
+    if (iree_all_bits_set(enabled_features,
+                          IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) {
+      IREE_RETURN_IF_ERROR(iree_hal_vulkan_tracing_context_allocate(
+          device->physical_device, device->logical_device, queue->handle(),
+          queue_name, maintenance_dispatch_queue, device->dispatch_command_pool,
+          device->host_allocator,
+          &device->queue_tracing_contexts[queue_index]));
+      queue->set_tracing_context(device->queue_tracing_contexts[queue_index]);
+    }
+  }
+  for (iree_host_size_t i = 0; i < transfer_queue_count; ++i) {
+    if (!(transfer_queue_set->queue_indices & (1ull << i))) continue;
+
+    char queue_name_buffer[32];
+    int queue_name_length =
+        snprintf(queue_name_buffer, IREE_ARRAYSIZE(queue_name_buffer),
+                 "Vulkan[%c:%d]", 'T', (int)device->transfer_queue_count);
+    iree_string_view_t queue_name =
+        iree_make_string_view(queue_name_buffer, queue_name_length);
+
+    CommandQueue* queue = iree_hal_vulkan_device_create_queue(
+        device->logical_device, IREE_HAL_COMMAND_CATEGORY_TRANSFER,
+        transfer_queue_set->queue_family_index, i, device->fence_pool);
+
+    iree_host_size_t queue_index = device->queue_count++;
+    device->queues[queue_index] = queue;
+    device->transfer_queues[device->transfer_queue_count++] = queue;
+
+    if (iree_all_bits_set(enabled_features,
+                          IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) {
+      IREE_RETURN_IF_ERROR(iree_hal_vulkan_tracing_context_allocate(
+          device->physical_device, device->logical_device, queue->handle(),
+          queue_name, maintenance_dispatch_queue, device->dispatch_command_pool,
+          device->host_allocator,
+          &device->queue_tracing_contexts[queue_index]));
+      queue->set_tracing_context(device->queue_tracing_contexts[queue_index]);
+    }
+  }
+
+  return iree_ok_status();
+}
+
+// Creates the device struct and all dependent resources.
+// The struct, the identifier string, and the queue pointer arrays are packed
+// into a single host allocation; the queue objects themselves are created by
+// iree_hal_vulkan_device_initialize_command_queues. On failure any partially
+// initialized state is torn down via iree_hal_vulkan_device_destroy.
+static iree_status_t iree_hal_vulkan_device_create_internal(
+    iree_hal_driver_t* driver, iree_string_view_t identifier,
+    iree_hal_vulkan_features_t enabled_features,
+    const iree_hal_vulkan_device_options_t* options, VkInstance instance,
+    VkPhysicalDevice physical_device, VkDeviceHandle* logical_device,
+    const iree_hal_vulkan_device_extensions_t* device_extensions,
+    const iree_hal_vulkan_queue_set_t* compute_queue_set,
+    const iree_hal_vulkan_queue_set_t* transfer_queue_set,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  auto& device_syms = logical_device->syms();
+
+  iree_host_size_t compute_queue_count =
+      iree_math_count_ones_u64(compute_queue_set->queue_indices);
+  iree_host_size_t transfer_queue_count =
+      iree_math_count_ones_u64(transfer_queue_set->queue_indices);
+  iree_host_size_t total_queue_count =
+      compute_queue_count + transfer_queue_count;
+
+  // Single allocation sized for the struct plus trailing identifier storage
+  // and the four queue pointer arrays.
+  iree_hal_vulkan_device_t* device = NULL;
+  iree_host_size_t total_size =
+      sizeof(*device) + identifier.size +
+      total_queue_count * sizeof(device->queues[0]) +
+      total_queue_count * sizeof(device->dispatch_queues[0]) +
+      total_queue_count * sizeof(device->transfer_queues[0]) +
+      total_queue_count * sizeof(device->queue_tracing_contexts[0]);
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(host_allocator, total_size, (void**)&device));
+  memset(device, 0, total_size);
+  iree_hal_resource_initialize(&iree_hal_vulkan_device_vtable,
+                               &device->resource);
+  device->host_allocator = host_allocator;
+  device->driver = driver;
+  iree_hal_driver_retain(device->driver);
+  uint8_t* buffer_ptr = (uint8_t*)device + sizeof(*device);
+  buffer_ptr += iree_string_view_append_to_buffer(
+      identifier, &device->identifier, (char*)buffer_ptr);
+  device->flags = options->flags;
+
+  device->device_extensions = *device_extensions;
+  device->instance = instance;
+  device->physical_device = physical_device;
+  device->logical_device = logical_device;
+  device->logical_device->AddReference();
+
+  iree_arena_block_pool_initialize(32 * 1024, host_allocator,
+                                   &device->block_pool);
+
+  // Point the queue storage into the new device allocation. The queues
+  // themselves are populated later by
+  // iree_hal_vulkan_device_initialize_command_queues.
+  device->queues = (CommandQueue**)buffer_ptr;
+  buffer_ptr += total_queue_count * sizeof(device->queues[0]);
+  device->dispatch_queues = (CommandQueue**)buffer_ptr;
+  buffer_ptr += total_queue_count * sizeof(device->dispatch_queues[0]);
+  device->transfer_queues = (CommandQueue**)buffer_ptr;
+  buffer_ptr += total_queue_count * sizeof(device->transfer_queues[0]);
+  device->queue_tracing_contexts =
+      (iree_hal_vulkan_tracing_context_t**)buffer_ptr;
+  buffer_ptr += total_queue_count * sizeof(device->queue_tracing_contexts[0]);
+
+  device->descriptor_pool_cache =
+      new DescriptorPoolCache(device->logical_device);
+
+  // Create the device memory allocator that will service all buffer
+  // allocation requests.
+  iree_status_t status = iree_hal_vulkan_vma_allocator_create(
+      instance, physical_device, logical_device, (iree_hal_device_t*)device,
+      &device->device_allocator);
+
+  // Create command pools for each queue family. If we don't have a transfer
+  // queue then we'll ignore that one and just use the dispatch pool.
+  // If we wanted to expose the pools through the HAL to allow the VM to more
+  // effectively manage them (pool per fiber, etc) we could, however I doubt
+  // the overhead of locking the pool will be even a blip.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_create_transient_command_pool(
+        device->logical_device, compute_queue_set->queue_family_index,
+        &device->dispatch_command_pool);
+  }
+  if (transfer_queue_set->queue_indices != 0 && iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_create_transient_command_pool(
+        device->logical_device, transfer_queue_set->queue_family_index,
+        &device->transfer_command_pool);
+  }
+
+  // Emulate timeline semaphores when the extension is not available and we are
+  // on Vulkan versions prior to 1.2 when they were made core.
+  bool emulate_timeline_semaphores =
+      device_syms->vkGetSemaphoreCounterValue == NULL ||
+      iree_all_bits_set(
+          options->flags,
+          IREE_HAL_VULKAN_DEVICE_FORCE_TIMELINE_SEMAPHORE_EMULATION);
+  if (emulate_timeline_semaphores && iree_status_is_ok(status)) {
+    status = TimePointSemaphorePool::Create(device->logical_device,
+                                            &device->semaphore_pool);
+  }
+  if (emulate_timeline_semaphores && iree_status_is_ok(status)) {
+    status =
+        TimePointFencePool::Create(device->logical_device, &device->fence_pool);
+  }
+
+  // Initialize queues now that we've completed the rest of the device
+  // initialization; this happens last as the queues require the pools allocated
+  // above.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_device_initialize_command_queues(
+        device, enabled_features, identifier, compute_queue_set,
+        transfer_queue_set);
+  }
+
+  if (iree_status_is_ok(status)) {
+    device->builtin_executables =
+        new BuiltinExecutables(device->logical_device);
+    status = device->builtin_executables->InitializeExecutables();
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_device = (iree_hal_device_t*)device;
+  } else {
+    iree_hal_device_destroy((iree_hal_device_t*)device);
+  }
+  return status;
+}
+
+// Tears down the device in strict reverse-dependency order: queues first
+// (their destructors may wait for the GPU to idle), then the pools/caches
+// that outstanding command buffers referenced, then the allocator and the
+// logical device itself.
+static void iree_hal_vulkan_device_destroy(iree_hal_device_t* base_device) {
+  iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Drop all command queues. These may wait until idle in their destructor.
+  for (iree_host_size_t i = 0; i < device->queue_count; ++i) {
+    delete device->queues[i];
+    iree_hal_vulkan_tracing_context_free(device->queue_tracing_contexts[i]);
+  }
+
+  // Drop command pools now that we know there are no more outstanding command
+  // buffers.
+  delete device->dispatch_command_pool;
+  delete device->transfer_command_pool;
+
+  // Now that no commands are outstanding we can release all resources that may
+  // have been in use.
+  delete device->builtin_executables;
+  delete device->descriptor_pool_cache;
+  delete device->semaphore_pool;
+  delete device->fence_pool;
+
+  // There should be no more buffers live that use the allocator.
+  iree_hal_allocator_release(device->device_allocator);
+
+  // All arena blocks should have been returned.
+  iree_arena_block_pool_deinitialize(&device->block_pool);
+
+  // Finally, destroy the device.
+  device->logical_device->ReleaseReference();
+  iree_hal_driver_release(device->driver);
+
+  iree_allocator_free(host_allocator, device);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Queries the extensibility |set| into |out_string_list| using the standard
+// two-pass pattern (size query, then fill) with storage from |arena|.
+// The returned strings are literals owned by the runtime; only the pointer
+// array lives in the arena.
+static iree_status_t iree_hal_vulkan_device_query_extensibility_set(
+    iree_hal_vulkan_features_t requested_features,
+    iree_hal_vulkan_extensibility_set_t set, iree::Arena* arena,
+    iree_hal_vulkan_string_list_t* out_string_list) {
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_query_extensibility_set(
+      requested_features, set, 0, NULL, &out_string_list->count));
+  out_string_list->values = (const char**)arena->AllocateBytes(
+      out_string_list->count * sizeof(out_string_list->values[0]));
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_query_extensibility_set(
+      requested_features, set, out_string_list->count, out_string_list->values,
+      &out_string_list->count));
+  return iree_ok_status();
+}
+
+// Creates a HAL device that creates and owns its own VkDevice (and queues)
+// from |physical_device|. |driver| may be NULL when there is no parent driver
+// to retain; see the header for ownership notes.
+iree_status_t iree_hal_vulkan_device_create(
+    iree_hal_driver_t* driver, iree_string_view_t identifier,
+    iree_hal_vulkan_features_t enabled_features,
+    const iree_hal_vulkan_device_options_t* options,
+    iree_hal_vulkan_syms_t* opaque_syms, VkInstance instance,
+    VkPhysicalDevice physical_device, iree_allocator_t host_allocator,
+    iree_hal_device_t** out_device) {
+  DynamicSymbols* instance_syms = (DynamicSymbols*)opaque_syms;
+
+  // Find the extensions we need (or want) that are also available
+  // on the device. This will fail when required ones are not present.
+  // TODO(benvanik): replace with a real arena.
+  iree::Arena arena(128 * 1024);
+  iree_hal_vulkan_string_list_t required_extensions;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_device_query_extensibility_set(
+      enabled_features,
+      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED, &arena,
+      &required_extensions));
+  iree_hal_vulkan_string_list_t optional_extensions;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_device_query_extensibility_set(
+      enabled_features,
+      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL, &arena,
+      &optional_extensions));
+  iree_hal_vulkan_string_list_t enabled_extensions;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_match_available_device_extensions(
+      instance_syms, physical_device, &required_extensions,
+      &optional_extensions, &arena, &enabled_extensions));
+  iree_hal_vulkan_device_extensions_t enabled_device_extensions =
+      iree_hal_vulkan_populate_enabled_device_extensions(&enabled_extensions);
+
+  // Find queue families we will expose as HAL queues.
+  iree_hal_vulkan_queue_family_info_t queue_family_info;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_select_queue_families(
+      physical_device, instance_syms, &queue_family_info));
+
+  bool has_dedicated_transfer_queues =
+      queue_family_info.transfer_queue_count > 0;
+
+  // TODO(benvanik): convert to using the arena.
+  // Setup the queue info we'll be using.
+  // Each queue here (created from within a family) will map to a HAL queue.
+  //
+  // Note that we need to handle the case where we have transfer queues that
+  // are of the same queue family as the dispatch queues: Vulkan requires that
+  // all queues created from the same family are done in the same
+  // VkDeviceQueueCreateInfo struct.
+  std::vector<VkDeviceQueueCreateInfo> queue_create_info;
+  // Reserve space for create infos. Note: must be the maximum used, or else
+  // references used below will be invalidated as the vector grows.
+  queue_create_info.reserve(2);
+  // NOTE: resize() below value-initializes, so every queue priority defaults
+  // to 0.0f.
+  std::vector<float> dispatch_queue_priorities;
+  std::vector<float> transfer_queue_priorities;
+  queue_create_info.push_back({});
+  auto& dispatch_queue_info = queue_create_info.back();
+  dispatch_queue_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+  dispatch_queue_info.pNext = NULL;
+  dispatch_queue_info.flags = 0;
+  dispatch_queue_info.queueFamilyIndex = queue_family_info.dispatch_index;
+  dispatch_queue_info.queueCount = queue_family_info.dispatch_queue_count;
+  if (has_dedicated_transfer_queues) {
+    if (queue_family_info.dispatch_index == queue_family_info.transfer_index) {
+      // Same family: fold the transfer queues into the dispatch create info.
+      dispatch_queue_info.queueCount += queue_family_info.transfer_queue_count;
+    } else {
+      queue_create_info.push_back({});
+      auto& transfer_queue_info = queue_create_info.back();
+      transfer_queue_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+      transfer_queue_info.pNext = NULL;
+      transfer_queue_info.queueFamilyIndex = queue_family_info.transfer_index;
+      transfer_queue_info.queueCount = queue_family_info.transfer_queue_count;
+      transfer_queue_info.flags = 0;
+      transfer_queue_priorities.resize(transfer_queue_info.queueCount);
+      transfer_queue_info.pQueuePriorities = transfer_queue_priorities.data();
+    }
+  }
+  dispatch_queue_priorities.resize(dispatch_queue_info.queueCount);
+  dispatch_queue_info.pQueuePriorities = dispatch_queue_priorities.data();
+
+  // Create device and its queues.
+  VkDeviceCreateInfo device_create_info;
+  memset(&device_create_info, 0, sizeof(device_create_info));
+  device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+  device_create_info.enabledLayerCount = 0;
+  device_create_info.ppEnabledLayerNames = NULL;
+  device_create_info.enabledExtensionCount = enabled_extensions.count;
+  device_create_info.ppEnabledExtensionNames = enabled_extensions.values;
+  device_create_info.queueCreateInfoCount = queue_create_info.size();
+  device_create_info.pQueueCreateInfos = queue_create_info.data();
+  device_create_info.pEnabledFeatures = NULL;
+
+  // Feature chain passed to vkCreateDevice; the optional feature structs
+  // below are prepended onto features2.pNext when enabled.
+  VkPhysicalDeviceFeatures2 features2;
+  memset(&features2, 0, sizeof(features2));
+  features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+  device_create_info.pNext = &features2;
+
+  VkPhysicalDeviceTimelineSemaphoreFeatures semaphore_features;
+  bool emulate_timeline_semaphores =
+      !enabled_device_extensions.timeline_semaphore ||
+      iree_all_bits_set(
+          options->flags,
+          IREE_HAL_VULKAN_DEVICE_FORCE_TIMELINE_SEMAPHORE_EMULATION);
+  if (!emulate_timeline_semaphores) {
+    memset(&semaphore_features, 0, sizeof(semaphore_features));
+    semaphore_features.sType =
+        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES;
+    semaphore_features.pNext = features2.pNext;
+    features2.pNext = &semaphore_features;
+    semaphore_features.timelineSemaphore = VK_TRUE;
+  }
+
+  VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset_features;
+  if (enabled_device_extensions.host_query_reset) {
+    memset(&host_query_reset_features, 0, sizeof(host_query_reset_features));
+    host_query_reset_features.sType =
+        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT;
+    host_query_reset_features.pNext = features2.pNext;
+    features2.pNext = &host_query_reset_features;
+    host_query_reset_features.hostQueryReset = VK_TRUE;
+  }
+
+  // /*owns_device=*/true: the handle will vkDestroyDevice on final release.
+  auto logical_device = new VkDeviceHandle(
+      instance_syms, enabled_device_extensions,
+      /*owns_device=*/true, host_allocator, /*allocator=*/NULL);
+
+  iree_status_t status = VK_RESULT_TO_STATUS(
+      instance_syms->vkCreateDevice(physical_device, &device_create_info,
+                                    logical_device->allocator(),
+                                    logical_device->mutable_value()),
+      "vkCreateDevice");
+  if (iree_status_is_ok(status)) {
+    status = logical_device->syms()->LoadFromDevice(instance,
+                                                    logical_device->value());
+  }
+
+  // Select queue indices and create command queues with them.
+  iree_hal_vulkan_queue_set_t compute_queue_set;
+  iree_hal_vulkan_queue_set_t transfer_queue_set;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_build_queue_sets(
+        physical_device, logical_device->syms().get(), &compute_queue_set,
+        &transfer_queue_set);
+  }
+
+  // Allocate and initialize the device.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_device_create_internal(
+        driver, identifier, enabled_features, options, instance,
+        physical_device, logical_device, &enabled_device_extensions,
+        &compute_queue_set, &transfer_queue_set, host_allocator, out_device);
+  }
+
+  // Drop our creation reference: on success the HAL device retained its own
+  // (released again in iree_hal_vulkan_device_destroy); on failure this
+  // destroys the handle and with it the VkDevice.
+  logical_device->ReleaseReference();
+  return status;
+}
+
+// Wraps an externally created VkDevice in a HAL device. The caller retains
+// ownership of |logical_device| (we never destroy it) and must already have
+// enabled any required device extensions; availability can only be inferred
+// from the symbols that successfully load.
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_wrap_device(
+    iree_string_view_t identifier,
+    const iree_hal_vulkan_device_options_t* options,
+    const iree_hal_vulkan_syms_t* instance_syms, VkInstance instance,
+    VkPhysicalDevice physical_device, VkDevice logical_device,
+    const iree_hal_vulkan_queue_set_t* compute_queue_set,
+    const iree_hal_vulkan_queue_set_t* transfer_queue_set,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  IREE_ASSERT_ARGUMENT(instance_syms);
+  IREE_ASSERT_ARGUMENT(instance);
+  IREE_ASSERT_ARGUMENT(physical_device);
+  IREE_ASSERT_ARGUMENT(logical_device);
+  // Fix: the queue sets are dereferenced below so validate them alongside the
+  // other required arguments instead of crashing on NULL.
+  IREE_ASSERT_ARGUMENT(compute_queue_set);
+  IREE_ASSERT_ARGUMENT(transfer_queue_set);
+  IREE_ASSERT_ARGUMENT(out_device);
+
+  if (iree_math_count_ones_u64(compute_queue_set->queue_indices) == 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "at least one compute queue is required");
+  }
+
+  // Grab symbols from the device.
+  auto device_syms = iree::make_ref<DynamicSymbols>();
+  device_syms->vkGetInstanceProcAddr =
+      ((const DynamicSymbols*)instance_syms)->vkGetInstanceProcAddr;
+  IREE_RETURN_IF_ERROR(device_syms->LoadFromDevice(instance, logical_device));
+
+  // Since the device is already created, we can't actually enable any
+  // extensions or query if they are really enabled - we just have to trust
+  // that the caller already enabled them for us or we may fail later. For the
+  // optional extensions we check for the symbols but this is not always
+  // guaranteed to work.
+  iree_hal_vulkan_device_extensions_t enabled_device_extensions =
+      iree_hal_vulkan_infer_enabled_device_extensions(device_syms.get());
+
+  iree_hal_vulkan_features_t enabled_features = 0;
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  enabled_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING;
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+  // Wrap the provided VkDevice with a VkDeviceHandle for use within the HAL.
+  // /*owns_device=*/false ensures we never destroy the caller's handle.
+  auto logical_device_handle = new VkDeviceHandle(
+      device_syms.get(), enabled_device_extensions,
+      /*owns_device=*/false, host_allocator, /*allocator=*/NULL);
+  *logical_device_handle->mutable_value() = logical_device;
+
+  // Allocate and initialize the device.
+  iree_status_t status = iree_hal_vulkan_device_create_internal(
+      /*driver=*/NULL, identifier, enabled_features, options, instance,
+      physical_device, logical_device_handle, &enabled_device_extensions,
+      compute_queue_set, transfer_queue_set, host_allocator, out_device);
+
+  // Drop our creation reference; on success the device retained its own.
+  logical_device_handle->ReleaseReference();
+  return status;
+}
+
+// Returns the identifier assigned to the device at creation time.
+static iree_string_view_t iree_hal_vulkan_device_id(
+    iree_hal_device_t* base_device) {
+  return iree_hal_vulkan_device_cast(base_device)->identifier;
+}
+
+// Returns the host allocator the device was created with.
+static iree_allocator_t iree_hal_vulkan_device_host_allocator(
+    iree_hal_device_t* base_device) {
+  return iree_hal_vulkan_device_cast(base_device)->host_allocator;
+}
+
+// Returns the device memory allocator owned by the device.
+static iree_hal_allocator_t* iree_hal_vulkan_device_allocator(
+    iree_hal_device_t* base_device) {
+  return iree_hal_vulkan_device_cast(base_device)->device_allocator;
+}
+
+// Trims pooled resources: returns unused arena blocks to the system and asks
+// the device allocator to drop whatever it can.
+static iree_status_t iree_hal_vulkan_device_trim(
+    iree_hal_device_t* base_device) {
+  iree_hal_vulkan_device_t* vulkan_device =
+      iree_hal_vulkan_device_cast(base_device);
+  iree_arena_block_pool_trim(&vulkan_device->block_pool);
+  return iree_hal_allocator_trim(vulkan_device->device_allocator);
+}
+
+// Answers simple int32 configuration queries. Currently only the
+// "hal.executable.format" category is recognized; device state is not needed
+// for any supported key.
+static iree_status_t iree_hal_vulkan_device_query_i32(
+    iree_hal_device_t* base_device, iree_string_view_t category,
+    iree_string_view_t key, int32_t* out_value) {
+  *out_value = 0;
+
+  if (iree_string_view_equal(category,
+                             iree_make_cstring_view("hal.executable.format"))) {
+    bool is_supported_format =
+        iree_string_view_equal(key, iree_make_cstring_view("vulkan-spirv-fb"));
+    *out_value = is_supported_format ? 1 : 0;
+    return iree_ok_status();
+  }
+
+  return iree_make_status(
+      IREE_STATUS_NOT_FOUND,
+      "unknown device configuration key value '%.*s :: %.*s'",
+      (int)category.size, category.data, (int)key.size, key.data);
+}
+
+// Returns the queue to submit work to based on the |queue_affinity|.
+static CommandQueue* iree_hal_vulkan_device_select_queue(
+    iree_hal_vulkan_device_t* device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity) {
+  // TODO(scotttodd): revisit queue selection logic and remove this
+  //   * the unaligned buffer fill polyfill and tracing timestamp queries may
+  //     both insert dispatches into command buffers that at compile time are
+  //     expected to only contain transfer commands
+  //   * we could set a bit at recording time if emulation or tracing is used
+  //     and submit to the right queue based on that
+  command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH;
+
+  // TODO(benvanik): meaningful heuristics for affinity. We don't generate
+  // anything from the compiler that uses multiple queues and until we do it's
+  // best not to do anything too clever here.
+  //
+  // NOTE(review): with the dispatch bit forced on above this equality can
+  // never hold, so the dedicated transfer queue path is effectively disabled
+  // until the TODO is resolved.
+  if (command_categories == IREE_HAL_COMMAND_CATEGORY_TRANSFER) {
+    return device
+        ->transfer_queues[queue_affinity % device->transfer_queue_count];
+  }
+  // Affinity wraps modulo the number of available dispatch queues.
+  return device->dispatch_queues[queue_affinity % device->dispatch_queue_count];
+}
+
+// Allocates a direct command buffer recorded against one of the device's
+// command pools, pre-bound to the queue selected for |queue_affinity|.
+static iree_status_t iree_hal_vulkan_device_create_command_buffer(
+    iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
+
+  // TODO(scotttodd): revisit queue selection logic and remove this
+  //   * the unaligned buffer fill polyfill and tracing timestamp queries may
+  //     both insert dispatches into command buffers that at compile time are
+  //     expected to only contain transfer commands
+  //   * we could set a bit at recording time if emulation or tracing is used
+  //     and submit to the right queue based on that
+  command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH;
+
+  // Select the command pool to used based on the types of commands used.
+  // Note that we may not have a dedicated transfer command pool if there are
+  // no dedicated transfer queues.
+  // NOTE(review): because the dispatch bit is forced on above this condition
+  // is currently always false and the dispatch pool is always chosen.
+  VkCommandPoolHandle* command_pool = NULL;
+  if (device->transfer_command_pool &&
+      !iree_all_bits_set(command_categories,
+                         IREE_HAL_COMMAND_CATEGORY_DISPATCH)) {
+    command_pool = device->transfer_command_pool;
+  } else {
+    command_pool = device->dispatch_command_pool;
+  }
+
+  // The tracing context is tied to a particular queue so we must select here
+  // even though ideally we'd do it during submission. This is informational
+  // only and if the user does provide a different queue affinity during
+  // submission it just means the commands will be attributed to the wrong
+  // queue.
+  CommandQueue* queue = iree_hal_vulkan_device_select_queue(
+      device, command_categories, queue_affinity);
+
+  return iree_hal_vulkan_direct_command_buffer_allocate(
+      base_device, device->logical_device, command_pool, mode,
+      command_categories, queue_affinity, queue->tracing_context(),
+      device->descriptor_pool_cache, device->builtin_executables,
+      &device->block_pool, out_command_buffer);
+}
+
+// Non-push descriptor sets are not yet implemented: all arguments are ignored
+// and UNIMPLEMENTED is always returned.
+static iree_status_t iree_hal_vulkan_device_create_descriptor_set(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_t* set_layout,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings,
+    iree_hal_descriptor_set_t** out_descriptor_set) {
+  // TODO(benvanik): rework the create fn to take the bindings.
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "non-push descriptor sets still need work");
+}
+
+// Creates a native descriptor set layout on the device's logical device.
+static iree_status_t iree_hal_vulkan_device_create_descriptor_set_layout(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  iree_hal_vulkan_device_t* vulkan_device =
+      iree_hal_vulkan_device_cast(base_device);
+  return iree_hal_vulkan_native_descriptor_set_layout_create(
+      vulkan_device->logical_device, usage_type, binding_count, bindings,
+      out_descriptor_set_layout);
+}
+
+// Creates a native event on the device's logical device.
+static iree_status_t iree_hal_vulkan_device_create_event(
+    iree_hal_device_t* base_device, iree_hal_event_t** out_event) {
+  return iree_hal_vulkan_native_event_create(
+      iree_hal_vulkan_device_cast(base_device)->logical_device, out_event);
+}
+
+// Creates the no-op executable cache for the device. |loop| is currently
+// unused by this implementation.
+static iree_status_t iree_hal_vulkan_device_create_executable_cache(
+    iree_hal_device_t* base_device, iree_string_view_t identifier,
+    iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) {
+  iree_hal_vulkan_device_t* vulkan_device =
+      iree_hal_vulkan_device_cast(base_device);
+  return iree_hal_vulkan_nop_executable_cache_create(
+      vulkan_device->logical_device, identifier, out_executable_cache);
+}
+
+// Creates a native executable layout wrapping the provided set layouts.
+static iree_status_t iree_hal_vulkan_device_create_executable_layout(
+    iree_hal_device_t* base_device, iree_host_size_t push_constants,
+    iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t** set_layouts,
+    iree_hal_executable_layout_t** out_executable_layout) {
+  iree_hal_vulkan_device_t* vulkan_device =
+      iree_hal_vulkan_device_cast(base_device);
+  return iree_hal_vulkan_native_executable_layout_create(
+      vulkan_device->logical_device, push_constants, set_layout_count,
+      set_layouts, out_executable_layout);
+}
+
+// Creates a timeline semaphore with the given |initial_value|. A non-NULL
+// semaphore pool indicates emulation is in effect (selected at device
+// creation when the extension is missing or emulation was forced).
+static iree_status_t iree_hal_vulkan_device_create_semaphore(
+    iree_hal_device_t* base_device, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore) {
+  iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
+  if (device->semaphore_pool == NULL) {
+    return iree_hal_vulkan_native_semaphore_create(
+        device->logical_device, initial_value, out_semaphore);
+  }
+  return iree_hal_vulkan_emulated_semaphore_create(
+      device->logical_device, device->semaphore_pool, device->queue_count,
+      device->queues, initial_value, out_semaphore);
+}
+
+// Routes the submission batches to a queue matching the requested categories
+// and affinity and hands them off for execution.
+static iree_status_t iree_hal_vulkan_device_queue_submit(
+    iree_hal_device_t* base_device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
+  iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
+  CommandQueue* target_queue = iree_hal_vulkan_device_select_queue(
+      device, command_categories, queue_affinity);
+  return target_queue->Submit(batch_count, batches);
+}
+
+// Synchronous submission helper: enqueues the batches and then blocks until
+// |wait_semaphore| reaches |wait_value| (or |timeout| elapses).
+static iree_status_t iree_hal_vulkan_device_submit_and_wait(
+    iree_hal_device_t* base_device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches,
+    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
+    iree_timeout_t timeout) {
+  iree_status_t submit_status = iree_hal_vulkan_device_queue_submit(
+      base_device, command_categories, queue_affinity, batch_count, batches);
+  if (!iree_status_is_ok(submit_status)) return submit_status;
+  return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
+}
+
+// Waits on a list of semaphores, either until all are reached or (in ANY
+// mode) until the first one is. Dispatches to the emulation layer when the
+// device is using emulated timeline semaphores.
+static iree_status_t iree_hal_vulkan_device_wait_semaphores(
+    iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
+  iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
+  const VkSemaphoreWaitFlags wait_flags =
+      (wait_mode == IREE_HAL_WAIT_MODE_ANY) ? VK_SEMAPHORE_WAIT_ANY_BIT : 0;
+  if (device->semaphore_pool == NULL) {
+    return iree_hal_vulkan_native_semaphore_multi_wait(
+        device->logical_device, semaphore_list, timeout, wait_flags);
+  }
+  return iree_hal_vulkan_emulated_semaphore_multi_wait(
+      device->logical_device, semaphore_list, timeout, wait_flags);
+}
+
+// Blocks until every queue on the device has drained, bailing out on the
+// first failure (including deadline exceeded).
+static iree_status_t iree_hal_vulkan_device_wait_idle(
+    iree_hal_device_t* base_device, iree_timeout_t timeout) {
+  iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
+  for (iree_host_size_t queue_index = 0; queue_index < device->queue_count;
+       ++queue_index) {
+    IREE_RETURN_IF_ERROR(device->queues[queue_index]->WaitIdle(timeout));
+  }
+  return iree_ok_status();
+}
+
+namespace {
+// Dispatch table hooking this implementation into the C HAL device interface.
+// Entry order must match the iree_hal_device_vtable_t declaration (the
+// /*.name=*/ comments document which slot each function fills).
+const iree_hal_device_vtable_t iree_hal_vulkan_device_vtable = {
+    /*.destroy=*/iree_hal_vulkan_device_destroy,
+    /*.id=*/iree_hal_vulkan_device_id,
+    /*.host_allocator=*/iree_hal_vulkan_device_host_allocator,
+    /*.device_allocator=*/iree_hal_vulkan_device_allocator,
+    /*.trim=*/iree_hal_vulkan_device_trim,
+    /*.query_i32=*/iree_hal_vulkan_device_query_i32,
+    /*.create_command_buffer=*/iree_hal_vulkan_device_create_command_buffer,
+    /*.create_descriptor_set=*/iree_hal_vulkan_device_create_descriptor_set,
+    /*.create_descriptor_set_layout=*/
+    iree_hal_vulkan_device_create_descriptor_set_layout,
+    /*.create_event=*/iree_hal_vulkan_device_create_event,
+    /*.create_executable_cache=*/
+    iree_hal_vulkan_device_create_executable_cache,
+    /*.create_executable_layout=*/
+    iree_hal_vulkan_device_create_executable_layout,
+    /*.create_semaphore=*/iree_hal_vulkan_device_create_semaphore,
+    /*.transfer_range=*/iree_hal_device_submit_transfer_range_and_wait,
+    /*.queue_submit=*/iree_hal_vulkan_device_queue_submit,
+    /*.submit_and_wait=*/
+    iree_hal_vulkan_device_submit_and_wait,
+    /*.wait_semaphores=*/iree_hal_vulkan_device_wait_semaphores,
+    /*.wait_idle=*/iree_hal_vulkan_device_wait_idle,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/vulkan_device.h b/runtime/src/iree/hal/vulkan/vulkan_device.h
new file mode 100644
index 0000000..6cf2244
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/vulkan_device.h
@@ -0,0 +1,38 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_VULKAN_DEVICE_H_
+#define IREE_HAL_VULKAN_VULKAN_DEVICE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/api.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/extensibility_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a device that owns and manages its own VkDevice.
+//
+// The |driver| will be retained for as long as the device is live such that if
+// the driver owns the |instance| provided it is ensured to be valid. |driver|
+// may be NULL if there is no parent driver to retain (such as when wrapping
+// existing VkInstances provided by the application).
+iree_status_t iree_hal_vulkan_device_create(
+    iree_hal_driver_t* driver, iree_string_view_t identifier,
+    iree_hal_vulkan_features_t enabled_features,
+    const iree_hal_vulkan_device_options_t* options,
+    iree_hal_vulkan_syms_t* instance_syms, VkInstance instance,
+    VkPhysicalDevice physical_device, iree_allocator_t host_allocator,
+    iree_hal_device_t** out_device);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_VULKAN_VULKAN_DEVICE_H_
diff --git a/runtime/src/iree/hal/vulkan/vulkan_driver.cc b/runtime/src/iree/hal/vulkan/vulkan_driver.cc
new file mode 100644
index 0000000..e58e680
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/vulkan_driver.cc
@@ -0,0 +1,481 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/vulkan_driver.h"
+
+#include <cstdint>
+#include <cstring>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/api.h"
+#include "iree/hal/vulkan/debug_reporter.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/extensibility_util.h"
+#include "iree/hal/vulkan/status_util.h"
+#include "iree/hal/vulkan/util/arena.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+#include "iree/hal/vulkan/vulkan_device.h"
+
+using namespace iree::hal::vulkan;
+
+typedef struct iree_hal_vulkan_driver_t {
+  iree_hal_resource_t resource;
+  // Allocator used for this struct; all frees go back through it.
+  iree_allocator_t host_allocator;
+
+  // Identifier used for the driver in the IREE driver registry.
+  // We allow overriding so that multiple Vulkan versions can be exposed in the
+  // same process.
+  // Character storage is appended to the end of this struct's allocation.
+  iree_string_view_t identifier;
+
+  // Options applied to devices created from this driver.
+  iree_hal_vulkan_device_options_t device_options;
+  int default_device_index;
+
+  // Features requested at driver creation time.
+  iree_hal_vulkan_features_t enabled_features;
+
+  // Which optional extensions are active and available on the instance.
+  iree_hal_vulkan_instance_extensions_t instance_extensions;
+
+  // (Partial) loaded Vulkan symbols. Devices created within the driver may have
+  // different function pointers for device-specific functions that change
+  // behavior with enabled layers/extensions.
+  iree::ref_ptr<DynamicSymbols> syms;
+
+  // The Vulkan instance that all devices created from the driver will share.
+  VkInstance instance;
+  // True if we created |instance| and must destroy it during driver teardown.
+  bool owns_instance;
+
+  // Optional debug reporter: may be disabled or unavailable (no debug layers).
+  iree_hal_vulkan_debug_reporter_t* debug_reporter;
+} iree_hal_vulkan_driver_t;
+
+namespace {
+// Forward declaration so the cast below can reference the vtable defined
+// later in this file.
+extern const iree_hal_driver_vtable_t iree_hal_vulkan_driver_vtable;
+}  // namespace
+
+// Casts the opaque HAL driver handle to our implementation type, with a type
+// check (via the vtable) in debug builds.
+static iree_hal_vulkan_driver_t* iree_hal_vulkan_driver_cast(
+    iree_hal_driver_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_driver_vtable);
+  return (iree_hal_vulkan_driver_t*)base_value;
+}
+
+// Initializes |out_options| to the default driver options: Vulkan 1.2, no
+// requested features, device index 0, and default per-device options.
+IREE_API_EXPORT void iree_hal_vulkan_driver_options_initialize(
+    iree_hal_vulkan_driver_options_t* out_options) {
+  // Zero first so any fields not explicitly set below have a predictable
+  // default.
+  memset(out_options, 0, sizeof(*out_options));
+  out_options->api_version = VK_API_VERSION_1_2;
+  out_options->requested_features = 0;
+  out_options->default_device_index = 0;
+  iree_hal_vulkan_device_options_initialize(&out_options->device_options);
+}
+
+// Returns a VkApplicationInfo struct populated with the default app info.
+// We may allow hosting applications to override this via weak-linkage if it's
+// useful, otherwise this is enough to create the application.
+static void iree_hal_vulkan_driver_populate_default_app_info(
+    const iree_hal_vulkan_driver_options_t* options,
+    VkApplicationInfo* out_app_info) {
+  memset(out_app_info, 0, sizeof(*out_app_info));
+  out_app_info->sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+  out_app_info->pNext = NULL;
+  out_app_info->apiVersion = options->api_version;
+  out_app_info->pApplicationName = "IREE-ML";
+  out_app_info->applicationVersion = 0;
+  out_app_info->pEngineName = "IREE";
+  out_app_info->engineVersion = 0;
+}
+
+// NOTE: takes ownership of |instance|.
+// Allocates and initializes the driver struct, wiring in the (optional) debug
+// reporter. On failure the debug reporter created here is freed.
+// NOTE(review): on allocation failure |instance| itself is not destroyed here
+// despite the ownership note above — presumably the caller cleans it up;
+// verify against iree_hal_vulkan_driver_create's error path.
+static iree_status_t iree_hal_vulkan_driver_create_internal(
+    iree_string_view_t identifier,
+    const iree_hal_vulkan_driver_options_t* options,
+    const iree_hal_vulkan_string_list_t* enabled_extensions,
+    iree_hal_vulkan_syms_t* opaque_syms, VkInstance instance,
+    bool owns_instance, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver) {
+  auto* instance_syms = (DynamicSymbols*)opaque_syms;
+
+  iree_hal_vulkan_instance_extensions_t instance_extensions =
+      iree_hal_vulkan_populate_enabled_instance_extensions(enabled_extensions);
+
+  // The real debug messenger (not just the static one used above) can now be
+  // created as we've loaded all the required symbols.
+  // TODO(benvanik): strip in min-size release builds.
+  iree_hal_vulkan_debug_reporter_t* debug_reporter = NULL;
+  if (instance_extensions.debug_utils) {
+    IREE_RETURN_IF_ERROR(iree_hal_vulkan_debug_reporter_allocate(
+        instance, instance_syms, /*allocation_callbacks=*/NULL, host_allocator,
+        &debug_reporter));
+  }
+
+  // Allocate the struct plus trailing storage for the identifier string.
+  iree_hal_vulkan_driver_t* driver = NULL;
+  iree_host_size_t total_size = sizeof(*driver) + identifier.size;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&driver);
+  if (!iree_status_is_ok(status)) {
+    // Need to clean up if we fail (as we own these).
+    iree_hal_vulkan_debug_reporter_free(debug_reporter);
+    return status;
+  }
+  iree_hal_resource_initialize(&iree_hal_vulkan_driver_vtable,
+                               &driver->resource);
+  driver->host_allocator = host_allocator;
+  // Copy the identifier into the trailing storage past the struct.
+  iree_string_view_append_to_buffer(
+      identifier, &driver->identifier,
+      (char*)driver + total_size - identifier.size);
+  memcpy(&driver->device_options, &options->device_options,
+         sizeof(driver->device_options));
+  driver->default_device_index = options->default_device_index;
+  driver->enabled_features = options->requested_features;
+  driver->syms = iree::add_ref(instance_syms);
+  driver->instance = instance;
+  driver->owns_instance = owns_instance;
+  driver->debug_reporter = debug_reporter;
+  *out_driver = (iree_hal_driver_t*)driver;
+  return status;
+}
+
+// Tears down the driver: debug reporter first (it is registered against the
+// instance), then the instance itself if we own it, then the symbols that
+// were needed to call vkDestroyInstance, and finally the struct allocation.
+static void iree_hal_vulkan_driver_destroy(iree_hal_driver_t* base_driver) {
+  iree_hal_vulkan_driver_t* driver = iree_hal_vulkan_driver_cast(base_driver);
+  // Stash the allocator: |driver| is freed below.
+  iree_allocator_t host_allocator = driver->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_vulkan_debug_reporter_free(driver->debug_reporter);
+  if (driver->owns_instance) {
+    driver->syms->vkDestroyInstance(driver->instance, /*pAllocator=*/NULL);
+  }
+  driver->syms.reset();
+  iree_allocator_free(host_allocator, driver);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Queries the extensibility |set| for |requested_features| and returns the
+// resulting string list in |out_string_list|. Storage for the list is
+// allocated from |arena| and is only valid for the arena's lifetime.
+static iree_status_t iree_hal_vulkan_driver_query_extensibility_set(
+    iree_hal_vulkan_features_t requested_features,
+    iree_hal_vulkan_extensibility_set_t set, iree::Arena* arena,
+    iree_hal_vulkan_string_list_t* out_string_list) {
+  // First pass with no storage to get the required count...
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_query_extensibility_set(
+      requested_features, set, 0, NULL, &out_string_list->count));
+  out_string_list->values = (const char**)arena->AllocateBytes(
+      out_string_list->count * sizeof(out_string_list->values[0]));
+  // ...then a second pass with arena-backed storage for the values.
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_query_extensibility_set(
+      requested_features, set, out_string_list->count, out_string_list->values,
+      &out_string_list->count));
+  return iree_ok_status();
+}
+
+// Computes the full sets of instance layers and extensions to enable for
+// |requested_features|, validating availability against the instance. Fails
+// when a required layer/extension is unavailable. All returned string storage
+// is allocated from |arena| and only valid for the arena's lifetime.
+static iree_status_t iree_hal_vulkan_driver_compute_enabled_extensibility_sets(
+    iree::hal::vulkan::DynamicSymbols* syms,
+    iree_hal_vulkan_features_t requested_features, iree::Arena* arena,
+    iree_hal_vulkan_string_list_t* out_enabled_layers,
+    iree_hal_vulkan_string_list_t* out_enabled_extensions) {
+  // Query our required and optional layers and extensions based on the IREE
+  // features the user requested.
+  iree_hal_vulkan_string_list_t required_layers;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_driver_query_extensibility_set(
+      requested_features,
+      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_REQUIRED, arena,
+      &required_layers));
+  iree_hal_vulkan_string_list_t optional_layers;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_driver_query_extensibility_set(
+      requested_features,
+      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL, arena,
+      &optional_layers));
+  iree_hal_vulkan_string_list_t required_extensions;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_driver_query_extensibility_set(
+      requested_features,
+      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_REQUIRED, arena,
+      &required_extensions));
+  iree_hal_vulkan_string_list_t optional_extensions;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_driver_query_extensibility_set(
+      requested_features,
+      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL, arena,
+      &optional_extensions));
+
+  // Find the layers and extensions we need (or want) that are also available
+  // on the instance. This will fail when required ones are not present.
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_match_available_instance_layers(
+      syms, &required_layers, &optional_layers, arena, out_enabled_layers));
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_match_available_instance_extensions(
+      syms, &required_extensions, &optional_extensions, arena,
+      out_enabled_extensions));
+
+  return iree_ok_status();
+}
+
+// Creates a driver that creates — and owns — its own VkInstance configured
+// with the layers/extensions implied by |options->requested_features|.
+// On any failure after vkCreateInstance succeeds the instance is destroyed
+// before returning (owns_instance=true is passed to the internal creator so
+// the driver also destroys it on teardown).
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_vulkan_driver_options_t* options,
+    iree_hal_vulkan_syms_t* opaque_syms, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver) {
+  IREE_ASSERT_ARGUMENT(options);
+  IREE_ASSERT_ARGUMENT(opaque_syms);
+  IREE_ASSERT_ARGUMENT(out_driver);
+  IREE_TRACE_SCOPE();
+
+  // The opaque C symbols handle is really the C++ DynamicSymbols object.
+  auto* instance_syms = (DynamicSymbols*)opaque_syms;
+
+  // Query required and optional instance layers/extensions for the requested
+  // features.
+  iree::Arena arena;
+  iree_hal_vulkan_string_list_t enabled_layers;
+  iree_hal_vulkan_string_list_t enabled_extensions;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_vulkan_driver_compute_enabled_extensibility_sets(
+          instance_syms, options->requested_features, &arena, &enabled_layers,
+          &enabled_extensions));
+
+  // Create the instance this driver will use for all requests.
+  VkApplicationInfo app_info;
+  iree_hal_vulkan_driver_populate_default_app_info(options, &app_info);
+  VkInstanceCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+  create_info.pNext = NULL;
+  create_info.flags = 0;
+  create_info.pApplicationInfo = &app_info;
+  create_info.enabledLayerCount = enabled_layers.count;
+  create_info.ppEnabledLayerNames = enabled_layers.values;
+  create_info.enabledExtensionCount = enabled_extensions.count;
+  create_info.ppEnabledExtensionNames = enabled_extensions.values;
+
+  VkInstance instance = VK_NULL_HANDLE;
+  VK_RETURN_IF_ERROR(instance_syms->vkCreateInstance(
+                         &create_info, /*pAllocator=*/NULL, &instance),
+                     "vkCreateInstance: invalid instance configuration");
+
+  // Now that the instance has been created we can fetch all of the instance
+  // symbols.
+  iree_status_t status = instance_syms->LoadFromInstance(instance);
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_driver_create_internal(
+        identifier, options, &enabled_extensions, opaque_syms, instance,
+        /*owns_instance=*/true, host_allocator, out_driver);
+  }
+
+  // We created the instance above so we must clean it up on failure here;
+  // after this point the driver owns it.
+  if (!iree_status_is_ok(status)) {
+    instance_syms->vkDestroyInstance(instance, /*pAllocator=*/NULL);
+  }
+  return status;
+}
+
+// Creates a driver wrapping an existing caller-owned |instance|.
+// The caller retains ownership of the VkInstance and must keep it live for
+// the driver's lifetime; the driver never destroys it.
+IREE_API_EXPORT iree_status_t iree_hal_vulkan_driver_create_using_instance(
+    iree_string_view_t identifier,
+    const iree_hal_vulkan_driver_options_t* options,
+    iree_hal_vulkan_syms_t* opaque_syms, VkInstance instance,
+    iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) {
+  IREE_ASSERT_ARGUMENT(options);
+  IREE_ASSERT_ARGUMENT(opaque_syms);
+  IREE_ASSERT_ARGUMENT(out_driver);
+  if (instance == VK_NULL_HANDLE) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "a non-NULL VkInstance must be provided");
+  }
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // May be a no-op but don't rely on that so we can be sure we have the right
+  // function pointers. NOTE: all early-exit paths below must end |z0|.
+  auto* instance_syms = (DynamicSymbols*)opaque_syms;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0,
+                                    instance_syms->LoadFromInstance(instance));
+
+  // Since the instance is already created we can't actually enable any
+  // extensions or even query if they are really enabled - we just have to trust
+  // that the caller already enabled them for us (or we may fail later).
+  iree::Arena arena;
+  iree_hal_vulkan_string_list_t enabled_layers;
+  iree_hal_vulkan_string_list_t enabled_extensions;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_vulkan_driver_compute_enabled_extensibility_sets(
+              instance_syms, options->requested_features, &arena,
+              &enabled_layers, &enabled_extensions));
+
+  // owns_instance=false: the instance was provided by the caller and must not
+  // be destroyed by the driver on teardown (unlike iree_hal_vulkan_driver_create
+  // which creates its own instance).
+  iree_status_t status = iree_hal_vulkan_driver_create_internal(
+      identifier, options, &enabled_extensions, opaque_syms, instance,
+      /*owns_instance=*/false, host_allocator, out_driver);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Enumerates all physical devices on |instance| and returns them as an
+// allocated list in |out_physical_devices|, which must be freed by the caller
+// using |host_allocator|. Outputs are only written on success.
+static iree_status_t iree_hal_vulkan_driver_enumerate_physical_devices(
+    iree::hal::vulkan::DynamicSymbols* instance_syms, VkInstance instance,
+    iree_allocator_t host_allocator, uint32_t* out_physical_device_count,
+    VkPhysicalDevice** out_physical_devices) {
+  // First call queries only the count; second call fills the storage.
+  uint32_t physical_device_count = 0;
+  VK_RETURN_IF_ERROR(instance_syms->vkEnumeratePhysicalDevices(
+                         instance, &physical_device_count, NULL),
+                     "vkEnumeratePhysicalDevices");
+  VkPhysicalDevice* physical_devices = NULL;
+  // NOTE: size by the element, not the pointer variable; sizeof(pointer) only
+  // happened to match because VkPhysicalDevice is a dispatchable handle.
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      host_allocator, physical_device_count * sizeof(physical_devices[0]),
+      (void**)&physical_devices));
+  iree_status_t status = VK_RESULT_TO_STATUS(
+      instance_syms->vkEnumeratePhysicalDevices(
+          instance, &physical_device_count, physical_devices),
+      "vkEnumeratePhysicalDevices");
+  if (iree_status_is_ok(status)) {
+    *out_physical_device_count = physical_device_count;
+    *out_physical_devices = physical_devices;
+  } else {
+    iree_allocator_free(host_allocator, physical_devices);
+  }
+  return status;
+}
+
+// Returns the size, in bytes, of the iree_hal_device_info_t storage required
+// for holding the given |physical_device|.
+// Today this is just the device name length; presumably no NUL terminator is
+// needed because the name is stored as a length-delimited string view by
+// iree_hal_vulkan_populate_device_info — confirm against
+// iree_string_view_append_to_buffer if changing this.
+static iree_host_size_t iree_hal_vulkan_calculate_device_info_size(
+    VkPhysicalDevice physical_device, iree::hal::vulkan::DynamicSymbols* syms) {
+  VkPhysicalDeviceProperties physical_device_properties;
+  syms->vkGetPhysicalDeviceProperties(physical_device,
+                                      &physical_device_properties);
+  return strlen(physical_device_properties.deviceName);
+}
+
+// Populates device information from the given Vulkan physical device handle.
+// |out_device_info| must point to valid memory and additional data will be
+// appended to |buffer_ptr| and the new pointer is returned.
+static uint8_t* iree_hal_vulkan_populate_device_info(
+    VkPhysicalDevice physical_device, DynamicSymbols* syms, uint8_t* buffer_ptr,
+    iree_hal_device_info_t* out_device_info) {
+  memset(out_device_info, 0, sizeof(*out_device_info));
+  // The opaque HAL device id is the VkPhysicalDevice handle itself so it can
+  // be recovered in iree_hal_vulkan_driver_create_device.
+  out_device_info->device_id = (iree_hal_device_id_t)physical_device;
+
+  // Features/properties are queried but currently informational only (see
+  // TODOs); only the device name is consumed below.
+  VkPhysicalDeviceFeatures physical_device_features;
+  syms->vkGetPhysicalDeviceFeatures(physical_device, &physical_device_features);
+  // TODO(benvanik): check and optionally require these features:
+  // - physical_device_features.robustBufferAccess
+  // - physical_device_features.shaderInt16
+  // - physical_device_features.shaderInt64
+  // - physical_device_features.shaderFloat64
+
+  VkPhysicalDeviceProperties physical_device_properties;
+  syms->vkGetPhysicalDeviceProperties(physical_device,
+                                      &physical_device_properties);
+  // TODO(benvanik): check and optionally require reasonable limits.
+
+  // TODO(benvanik): more clever/sanitized device naming.
+  iree_string_view_t device_name =
+      iree_make_string_view(physical_device_properties.deviceName,
+                            strlen(physical_device_properties.deviceName));
+  buffer_ptr += iree_string_view_append_to_buffer(
+      device_name, &out_device_info->name, (char*)buffer_ptr);
+
+  return buffer_ptr;
+}
+
+// iree_hal_driver_vtable_t::query_available_devices implementation.
+// Returns one iree_hal_device_info_t per enumerated physical device in a
+// single allocation: the info array comes first, followed by the variable
+// length name-string storage the info entries point into. The caller frees
+// the whole thing with one iree_allocator_free.
+static iree_status_t iree_hal_vulkan_driver_query_available_devices(
+    iree_hal_driver_t* base_driver, iree_allocator_t host_allocator,
+    iree_hal_device_info_t** out_device_infos,
+    iree_host_size_t* out_device_info_count) {
+  iree_hal_vulkan_driver_t* driver = iree_hal_vulkan_driver_cast(base_driver);
+
+  // Query all devices from the Vulkan instance.
+  uint32_t physical_device_count = 0;
+  VkPhysicalDevice* physical_devices = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_driver_enumerate_physical_devices(
+      driver->syms.get(), driver->instance, host_allocator,
+      &physical_device_count, &physical_devices));
+
+  // Allocate the return infos and populate with the devices.
+  // Total size = fixed info array + per-device string storage.
+  iree_hal_device_info_t* device_infos = NULL;
+  iree_host_size_t total_size =
+      physical_device_count * sizeof(iree_hal_device_info_t);
+  for (uint32_t i = 0; i < physical_device_count; ++i) {
+    total_size += iree_hal_vulkan_calculate_device_info_size(
+        physical_devices[i], driver->syms.get());
+  }
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void**)&device_infos);
+  if (iree_status_is_ok(status)) {
+    // String data starts immediately after the info array.
+    uint8_t* buffer_ptr =
+        (uint8_t*)device_infos +
+        physical_device_count * sizeof(iree_hal_device_info_t);
+    for (uint32_t i = 0; i < physical_device_count; ++i) {
+      buffer_ptr = iree_hal_vulkan_populate_device_info(
+          physical_devices[i], driver->syms.get(), buffer_ptr,
+          &device_infos[i]);
+    }
+    *out_device_info_count = physical_device_count;
+    *out_device_infos = device_infos;
+  }
+
+  // The temporary enumeration list is always freed; infos (if any) are now
+  // owned by the caller.
+  iree_allocator_free(host_allocator, physical_devices);
+  return status;
+}
+
+// Selects the physical device at |default_device_index| from those available
+// on |instance|, failing with NOT_FOUND when the index is out of range (or no
+// devices are present). |host_allocator| is used only for the transient
+// enumeration list.
+static iree_status_t iree_hal_vulkan_driver_select_default_device(
+    iree::hal::vulkan::DynamicSymbols* instance_syms, VkInstance instance,
+    int default_device_index, iree_allocator_t host_allocator,
+    VkPhysicalDevice* out_physical_device) {
+  uint32_t physical_device_count = 0;
+  VkPhysicalDevice* physical_devices = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_driver_enumerate_physical_devices(
+      instance_syms, instance, host_allocator, &physical_device_count,
+      &physical_devices));
+  iree_status_t status = iree_ok_status();
+  // NOTE: a negative index converts to a large unsigned value in this
+  // comparison and is (correctly) rejected as out of range.
+  if (physical_device_count == 0 ||
+      default_device_index >= physical_device_count) {
+    // %u for the uint32_t count; passing it to %d is a varargs type mismatch.
+    status = iree_make_status(IREE_STATUS_NOT_FOUND,
+                              "default device %d not found (of %u enumerated)",
+                              default_device_index, physical_device_count);
+  } else {
+    *out_physical_device = physical_devices[default_device_index];
+  }
+  iree_allocator_free(host_allocator, physical_devices);
+  return status;
+}
+
+// iree_hal_driver_vtable_t::create_device implementation.
+// |device_id| is a VkPhysicalDevice handle as produced by
+// iree_hal_vulkan_populate_device_info; VK_NULL_HANDLE selects the driver's
+// configured default device index instead.
+static iree_status_t iree_hal_vulkan_driver_create_device(
+    iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  iree_hal_vulkan_driver_t* driver = iree_hal_vulkan_driver_cast(base_driver);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Use either the specified device (enumerated earlier) or whatever default
+  // one was specified when the driver was created.
+  VkPhysicalDevice physical_device = (VkPhysicalDevice)device_id;
+  if (physical_device == VK_NULL_HANDLE) {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
+        iree_hal_vulkan_driver_select_default_device(
+            driver->syms.get(), driver->instance, driver->default_device_index,
+            host_allocator, &physical_device));
+  }
+
+  // TODO(benvanik): remove HAL module dependence on the identifier for matching
+  // devices. Today it *must* be vulkan* to work, whereas really that should be
+  // a device type (vs the identifier, which is arbitrary).
+  // Query the device name to use as an identifier.
+  // VkPhysicalDeviceProperties physical_device_properties;
+  // driver->syms->vkGetPhysicalDeviceProperties(physical_device,
+  //                                             &physical_device_properties);
+  // iree_string_view_t device_name =
+  //     iree_make_string_view(physical_device_properties.deviceName,
+  //                           strlen(physical_device_properties.deviceName));
+  iree_string_view_t device_name = iree_make_cstring_view("vulkan");
+
+  // Attempt to create the device.
+  // This may fail if the device was enumerated but is in exclusive use,
+  // disabled by the system, or permission is denied.
+  iree_status_t status = iree_hal_vulkan_device_create(
+      base_driver, device_name, driver->enabled_features,
+      &driver->device_options, (iree_hal_vulkan_syms_t*)driver->syms.get(),
+      driver->instance, physical_device, host_allocator, out_device);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+namespace {
+// C-style function table through which the HAL dispatches driver operations.
+// Entry order must match iree_hal_driver_vtable_t.
+const iree_hal_driver_vtable_t iree_hal_vulkan_driver_vtable = {
+    /*.destroy=*/iree_hal_vulkan_driver_destroy,
+    /*.query_available_devices=*/
+    iree_hal_vulkan_driver_query_available_devices,
+    /*.create_device=*/iree_hal_vulkan_driver_create_device,
+};
+}  // namespace
diff --git a/runtime/src/iree/hal/vulkan/vulkan_driver.h b/runtime/src/iree/hal/vulkan/vulkan_driver.h
new file mode 100644
index 0000000..c41a8d4
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/vulkan_driver.h
@@ -0,0 +1,17 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_VULKAN_DRIVER_H_
+#define IREE_HAL_VULKAN_VULKAN_DRIVER_H_
+
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/api.h"
+
+// NOTE: the driver API calls are defined in api.h.
+// TODO(benvanik): clean that up? api.h is nice because then we only need to
+// deploy a single header file for the backend, but it is a bit tricky.
+
+#endif  // IREE_HAL_VULKAN_VULKAN_DRIVER_H_
diff --git a/runtime/src/iree/hal/vulkan/vulkan_headers.h b/runtime/src/iree/hal/vulkan/vulkan_headers.h
new file mode 100644
index 0000000..6e88b09
--- /dev/null
+++ b/runtime/src/iree/hal/vulkan/vulkan_headers.h
@@ -0,0 +1,42 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_VULKAN_HEADERS_H_
+#define IREE_HAL_VULKAN_VULKAN_HEADERS_H_
+
+// We exclusively use Vulkan via queried function pointers. To ensure that there
+// are no accidental calls to the linker-loaded implicit functions we just
+// compile them all out.
+//
+// Code under iree/hal/vulkan/ *MUST NOT* directly include vulkan.h or any
+// header that includes it without this first being set. This means that this
+// iree/hal/vulkan/vulkan_headers.h file must usually be included first in all
+// files using it.
+//
+// From there, use iree/hal/vulkan/dynamic_symbols.h to plumb the dynamically
+// resolved symbols to any code that may need to make Vulkan calls. See that
+// header for more information: in general we try to keep our required set of
+// symbols minimal to avoid binary size/runtime memory/linker time so symbols
+// are only added as needed.
+//
+// Other non-core code can choose not to disable the prototypes if they want.
+// I don't suggest it though for anything beyond samples.
+//
+// There's a bunch of reasons to dynamically link against Vulkan like supporting
+// platforms without Vulkan or with differing Vulkan versions where all symbols
+// may not be available.
+//
+// See this article for more information:
+// https://djang86.blogspot.com/2019/01/what-is-vknoprototypes.html
+#define VK_NO_PROTOTYPES 1
+
+#include <vulkan/vulkan.h>  // IWYU pragma: export
+
+#ifdef IREE_PLATFORM_APPLE
+#include <vulkan/vulkan_beta.h>  // IWYU pragma: export
+#endif
+
+#endif  // IREE_HAL_VULKAN_VULKAN_HEADERS_H_
diff --git a/runtime/src/iree/modules/BUILD b/runtime/src/iree/modules/BUILD
new file mode 100644
index 0000000..236a474
--- /dev/null
+++ b/runtime/src/iree/modules/BUILD
@@ -0,0 +1,11 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
diff --git a/runtime/src/iree/modules/CMakeLists.txt b/runtime/src/iree/modules/CMakeLists.txt
new file mode 100644
index 0000000..a913b35
--- /dev/null
+++ b/runtime/src/iree/modules/CMakeLists.txt
@@ -0,0 +1,13 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/modules/BUILD                                               #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/modules/check/BUILD b/runtime/src/iree/modules/check/BUILD
new file mode 100644
index 0000000..d80f0de
--- /dev/null
+++ b/runtime/src/iree/modules/check/BUILD
@@ -0,0 +1,50 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library", "iree_runtime_cc_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# Test-only VM module providing the check.* assertion methods used by
+# compiled check tests.
+iree_runtime_cc_library(
+    name = "check",
+    testonly = True,
+    srcs = ["module.cc"],
+    hdrs = ["module.h"],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/modules/hal",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/vm",
+        "//runtime/src/iree/vm:cc",
+    ],
+)
+
+# Exercises the module through the VM using the VMVX driver; the CMake side
+# guards this target on IREE_HAL_DRIVER_VMVX.
+iree_runtime_cc_test(
+    name = "check_test",
+    srcs = ["check_test.cc"],
+    deps = [
+        ":check",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:span",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/vmvx/registration",
+        "//runtime/src/iree/modules/hal",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+        "//runtime/src/iree/vm",
+        "//runtime/src/iree/vm:bytecode_module",
+        "//runtime/src/iree/vm:cc",
+    ],
+)
diff --git a/runtime/src/iree/modules/check/CMakeLists.txt b/runtime/src/iree/modules/check/CMakeLists.txt
new file mode 100644
index 0000000..10ba2ae
--- /dev/null
+++ b/runtime/src/iree/modules/check/CMakeLists.txt
@@ -0,0 +1,51 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    check
+  HDRS
+    "module.h"
+  SRCS
+    "module.cc"
+  DEPS
+    iree::base
+    iree::base::cc
+    iree::base::internal
+    iree::hal
+    iree::modules::hal
+    iree::testing::gtest
+    iree::vm
+    iree::vm::cc
+  TESTONLY
+  PUBLIC
+)
+
+# Doesn't use bazel_to_cmake because IREE_HAL_DRIVER_VMVX filtering is custom logic
+if(${IREE_HAL_DRIVER_VMVX})
+  iree_cc_test(
+    NAME
+      check_test
+    SRCS
+      "check_test.cc"
+    DEPS
+      ::check
+      iree::base
+      iree::base::cc
+      iree::base::internal
+      iree::base::internal::span
+      iree::hal
+      iree::hal::vmvx::registration
+      iree::modules::hal
+      iree::testing::gtest
+      iree::testing::gtest_main
+      iree::vm
+      iree::vm::bytecode_module
+      iree::vm::cc
+  )
+endif()
diff --git a/runtime/src/iree/modules/check/check_test.cc b/runtime/src/iree/modules/check/check_test.cc
new file mode 100644
index 0000000..b3701bf
--- /dev/null
+++ b/runtime/src/iree/modules/check/check_test.cc
@@ -0,0 +1,581 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Tests that our bytecode module can call through into our native module.
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/internal/span.h"
+#include "iree/base/status_cc.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vmvx/registration/driver_module.h"
+#include "iree/modules/check/module.h"
+#include "iree/modules/hal/module.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+#include "iree/vm/api.h"
+#include "iree/vm/ref_cc.h"
+
+namespace iree {
+namespace {
+
+// Test fixture driving the check module through the VM on the VMVX driver.
+// Device/modules are created once per suite; a fresh VM context per test.
+class CheckTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    IREE_CHECK_OK(iree_hal_vmvx_driver_module_register(
+        iree_hal_driver_registry_default()));
+    // TODO(benvanik): move to instance-based registration.
+    IREE_ASSERT_OK(iree_hal_module_register_types());
+
+    iree_hal_driver_t* hal_driver = nullptr;
+    IREE_ASSERT_OK(iree_hal_driver_registry_try_create_by_name(
+        iree_hal_driver_registry_default(), iree_make_cstring_view("vmvx"),
+        iree_allocator_system(), &hal_driver));
+    IREE_ASSERT_OK(iree_hal_driver_create_default_device(
+        hal_driver, iree_allocator_system(), &device_));
+    IREE_ASSERT_OK(
+        iree_hal_module_create(device_, iree_allocator_system(), &hal_module_));
+    iree_hal_driver_release(hal_driver);
+
+    IREE_ASSERT_OK(
+        iree_vm_instance_create(iree_allocator_system(), &instance_));
+
+    IREE_ASSERT_OK(
+        iree_check_module_create(iree_allocator_system(), &check_module_))
+        << "Native module failed to init";
+  }
+
+  static void TearDownTestSuite() {
+    iree_hal_device_release(device_);
+    iree_vm_module_release(check_module_);
+    iree_vm_module_release(hal_module_);
+    iree_vm_instance_release(instance_);
+  }
+
+  void SetUp() override {
+    std::vector<iree_vm_module_t*> modules = {hal_module_, check_module_};
+    IREE_ASSERT_OK(iree_vm_context_create_with_modules(
+        instance_, IREE_VM_CONTEXT_FLAG_NONE, modules.data(), modules.size(),
+        iree_allocator_system(), &context_));
+    allocator_ = iree_hal_device_allocator(device_);
+  }
+
+  void TearDown() override {
+    inputs_.reset();
+    iree_vm_context_release(context_);
+  }
+
+  // Allocates a host-local i32 buffer view of |shape| filled with |contents|.
+  void CreateInt32BufferView(iree::span<const int32_t> contents,
+                             iree::span<const int32_t> shape,
+                             iree_hal_buffer_view_t** out_buffer_view) {
+    size_t num_elements = 1;
+    for (int32_t dim : shape) {
+      num_elements *= dim;
+    }
+    ASSERT_EQ(contents.size(), num_elements);
+    iree_hal_buffer_params_t params = {0};
+    // NOTE: ';' not ',' — the prior comma-operator form happened to behave
+    // identically but diverged from the sibling helpers below.
+    params.type =
+        IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+    params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                   IREE_HAL_BUFFER_USAGE_TRANSFER |
+                   IREE_HAL_BUFFER_USAGE_MAPPING;
+    IREE_ASSERT_OK(iree_hal_buffer_view_allocate_buffer(
+        allocator_, shape.data(), shape.size(), IREE_HAL_ELEMENT_TYPE_INT_32,
+        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, params,
+        iree_make_const_byte_span(contents.data(),
+                                  contents.size() * sizeof(int32_t)),
+        &*out_buffer_view));
+  }
+
+  // Allocates a host-local f16 buffer view (elements given as raw uint16).
+  void CreateFloat16BufferView(iree::span<const uint16_t> contents,
+                               iree::span<const int32_t> shape,
+                               iree_hal_buffer_view_t** out_buffer_view) {
+    size_t num_elements = 1;
+    for (int32_t dim : shape) {
+      num_elements *= dim;
+    }
+    ASSERT_EQ(contents.size(), num_elements);
+    iree_hal_buffer_params_t params = {0};
+    params.type =
+        IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+    params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                   IREE_HAL_BUFFER_USAGE_TRANSFER |
+                   IREE_HAL_BUFFER_USAGE_MAPPING;
+    IREE_ASSERT_OK(iree_hal_buffer_view_allocate_buffer(
+        allocator_, shape.data(), shape.size(), IREE_HAL_ELEMENT_TYPE_FLOAT_16,
+        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, params,
+        iree_make_const_byte_span(contents.data(),
+                                  contents.size() * sizeof(uint16_t)),
+        &*out_buffer_view));
+  }
+
+  // Allocates a host-local f32 buffer view of |shape| filled with |contents|.
+  void CreateFloat32BufferView(iree::span<const float> contents,
+                               iree::span<const int32_t> shape,
+                               iree_hal_buffer_view_t** out_buffer_view) {
+    size_t num_elements = 1;
+    for (int32_t dim : shape) {
+      num_elements *= dim;
+    }
+    ASSERT_EQ(contents.size(), num_elements);
+    iree_hal_buffer_params_t params = {0};
+    params.type =
+        IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+    params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                   IREE_HAL_BUFFER_USAGE_TRANSFER |
+                   IREE_HAL_BUFFER_USAGE_MAPPING;
+    IREE_ASSERT_OK(iree_hal_buffer_view_allocate_buffer(
+        allocator_, shape.data(), shape.size(), IREE_HAL_ELEMENT_TYPE_FLOAT_32,
+        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, params,
+        iree_make_const_byte_span(contents.data(),
+                                  contents.size() * sizeof(float)),
+        &*out_buffer_view));
+  }
+
+  // Allocates a host-local f64 buffer view of |shape| filled with |contents|.
+  void CreateFloat64BufferView(iree::span<const double> contents,
+                               iree::span<const int32_t> shape,
+                               iree_hal_buffer_view_t** out_buffer_view) {
+    size_t num_elements = 1;
+    for (int32_t dim : shape) {
+      num_elements *= dim;
+    }
+    ASSERT_EQ(contents.size(), num_elements);
+    iree_hal_buffer_params_t params = {0};
+    params.type =
+        IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
+    params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                   IREE_HAL_BUFFER_USAGE_TRANSFER |
+                   IREE_HAL_BUFFER_USAGE_MAPPING;
+    IREE_ASSERT_OK(iree_hal_buffer_view_allocate_buffer(
+        allocator_, shape.data(), shape.size(), IREE_HAL_ELEMENT_TYPE_FLOAT_64,
+        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, params,
+        iree_make_const_byte_span(contents.data(),
+                                  contents.size() * sizeof(double)),
+        &*out_buffer_view));
+  }
+
+  // Invokes |function_name| on the check module with whatever is in |inputs_|.
+  iree_status_t Invoke(const char* function_name) {
+    iree_vm_function_t function;
+    IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_name(
+                             check_module_, IREE_VM_FUNCTION_LINKAGE_EXPORT,
+                             iree_make_cstring_view(function_name), &function),
+                         "exported function '%s' not found", function_name);
+    // TODO(#2075): don't directly invoke native functions like this.
+    return iree_vm_invoke(context_, function, IREE_VM_INVOCATION_FLAG_NONE,
+                          /*policy=*/nullptr, inputs_.get(),
+                          /*outputs=*/nullptr, iree_allocator_system());
+  }
+
+  iree_status_t Invoke(const char* function_name,
+                       std::vector<iree_vm_value_t> args) {
+    IREE_RETURN_IF_ERROR(
+        iree_vm_list_create(/*element_type=*/nullptr, args.size(),
+                            iree_allocator_system(), &inputs_));
+    for (auto& arg : args) {
+      IREE_RETURN_IF_ERROR(iree_vm_list_push_value(inputs_.get(), &arg));
+    }
+    return Invoke(function_name);
+  }
+
+  iree_status_t Invoke(const char* function_name,
+                       std::vector<vm::ref<iree_hal_buffer_view_t>> args) {
+    IREE_RETURN_IF_ERROR(
+        iree_vm_list_create(/*element_type=*/nullptr, args.size(),
+                            iree_allocator_system(), &inputs_));
+    for (auto& arg : args) {
+      // retain_ref: |arg| keeps its own reference (released when the vector is
+      // destroyed) while the list takes the retained one via push_ref_move.
+      // Using move_ref here would hand out ownership the vm::ref still holds,
+      // over-releasing the buffer view.
+      iree_vm_ref_t arg_ref = iree_hal_buffer_view_retain_ref(arg.get());
+      IREE_RETURN_IF_ERROR(iree_vm_list_push_ref_move(inputs_.get(), &arg_ref));
+    }
+    return Invoke(function_name);
+  }
+
+ private:
+  // Shared across the whole suite (created in SetUpTestSuite).
+  static iree_hal_device_t* device_;
+  static iree_vm_instance_t* instance_;
+  static iree_vm_module_t* check_module_;
+  static iree_vm_module_t* hal_module_;
+
+  // Per-test state (created in SetUp).
+  iree_vm_context_t* context_ = nullptr;
+  vm::ref<iree_vm_list_t> inputs_;
+  iree_hal_allocator_t* allocator_ = nullptr;
+};
+iree_hal_device_t* CheckTest::device_ = nullptr;
+iree_vm_instance_t* CheckTest::instance_ = nullptr;
+iree_vm_module_t* CheckTest::check_module_ = nullptr;
+iree_vm_module_t* CheckTest::hal_module_ = nullptr;
+
+// expect_true passes for a nonzero scalar.
+TEST_F(CheckTest, ExpectTrueSuccess) {
+  IREE_ASSERT_OK(Invoke("expect_true", {iree_vm_value_make_i32(1)}));
+}
+
+// expect_true on zero reports a nonfatal gtest failure with the value.
+TEST_F(CheckTest, ExpectTrueFailure) {
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_true", {iree_vm_value_make_i32(0)})),
+      "Expected 0 to be nonzero");
+}
+
+// expect_false passes for zero.
+TEST_F(CheckTest, ExpectFalseSuccess) {
+  IREE_ASSERT_OK(Invoke("expect_false", {iree_vm_value_make_i32(0)}));
+}
+
+// expect_false on one reports a nonfatal gtest failure.
+TEST_F(CheckTest, ExpectFalseFailure) {
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_false", {iree_vm_value_make_i32(1)})),
+      "Expected 1 to be zero");
+}
+
+// The failure message echoes the actual nonzero value, not just 0/1.
+TEST_F(CheckTest, ExpectFalseNotOneFailure) {
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_false", {iree_vm_value_make_i32(42)})),
+      "Expected 42 to be zero");
+}
+
+// expect_all_true passes on a single-element nonzero buffer.
+TEST_F(CheckTest, ExpectAllTrueSuccess) {
+  vm::ref<iree_hal_buffer_view_t> input_buffer_view;
+  int32_t contents[] = {1};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateInt32BufferView(contents, shape, &input_buffer_view));
+  IREE_ASSERT_OK(Invoke("expect_all_true", {input_buffer_view}));
+}
+
+// expect_all_true passes on a rank-3 buffer of all-nonzero values.
+TEST_F(CheckTest, ExpectAllTrue3DTrueSuccess) {
+  vm::ref<iree_hal_buffer_view_t> input_buffer_view;
+  int32_t contents[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  int32_t shape[] = {2, 2, 2};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateInt32BufferView(contents, shape, &input_buffer_view));
+  IREE_ASSERT_OK(Invoke("expect_all_true", {input_buffer_view}));
+}
+
+// A zero element fails expect_all_true; the message includes the contents.
+TEST_F(CheckTest, ExpectAllTrueFailure) {
+  vm::ref<iree_hal_buffer_view_t> input_buffer_view;
+  int32_t contents[] = {0};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateInt32BufferView(contents, shape, &input_buffer_view));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_all_true", {input_buffer_view})), "0");
+}
+
+// One zero among nonzero elements fails and the full contents are printed.
+TEST_F(CheckTest, ExpectAllTrueSingleElementFailure) {
+  vm::ref<iree_hal_buffer_view_t> input_buffer_view;
+  int32_t contents[] = {1, 2, 3, 0, 4};
+  int32_t shape[] = {5};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateInt32BufferView(contents, shape, &input_buffer_view));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_all_true", {input_buffer_view})),
+      "1, 2, 3, 0, 4");
+}
+
+// Same as above but for a rank-3 buffer.
+TEST_F(CheckTest, ExpectAllTrue3DSingleElementFailure) {
+  vm::ref<iree_hal_buffer_view_t> input_buffer_view;
+  int32_t contents[] = {1, 2, 3, 4, 5, 6, 0, 8};
+  int32_t shape[] = {2, 2, 2};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateInt32BufferView(contents, shape, &input_buffer_view));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_all_true", {input_buffer_view})),
+      "1, 2, 3, 4, 5, 6, 0, 8");
+}
+
+// expect_eq passes when both operands are the same buffer view.
+TEST_F(CheckTest, ExpectEqSameBufferSuccess) {
+  vm::ref<iree_hal_buffer_view_t> input_buffer_view;
+  int32_t contents[] = {1};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateInt32BufferView(contents, shape, &input_buffer_view));
+  IREE_ASSERT_OK(Invoke("expect_eq", {input_buffer_view, input_buffer_view}));
+}
+
+// expect_eq passes for distinct buffers with identical shape and contents.
+TEST_F(CheckTest, ExpectEqIdenticalBufferSuccess) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  int32_t contents[] = {1};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(contents, shape, &rhs));
+  IREE_ASSERT_OK(Invoke("expect_eq", {lhs, rhs}));
+}
+
+TEST_F(CheckTest, ExpectEqIdentical3DBufferSuccess) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  int32_t contents[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  int32_t shape[] = {2, 2, 2};
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(contents, shape, &rhs));
+  IREE_ASSERT_OK(Invoke("expect_eq", {lhs, rhs}));
+}
+
+TEST_F(CheckTest, ExpectEqDifferentShapeFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  int32_t contents[] = {1, 2, 3, 4};
+  int32_t lhs_shape[] = {2, 2};
+  int32_t rhs_shape[] = {4};
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(contents, lhs_shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(contents, rhs_shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(IREE_ASSERT_OK(Invoke("expect_eq", {lhs, rhs})),
+                          "Shapes do not match");
+}
+
+TEST_F(CheckTest, ExpectEqDifferentElementTypeFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  int32_t lhs_contents[] = {1, 2, 3, 4};
+  float rhs_contents[] = {1, 2, 3, 4};
+  int32_t shape[] = {2, 2};
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(lhs_contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(rhs_contents, shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(IREE_ASSERT_OK(Invoke("expect_eq", {lhs, rhs})),
+                          "Element types do not match");
+}
+
+TEST_F(CheckTest, ExpectEqDifferentContentsFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  int32_t lhs_contents[] = {1};
+  int32_t rhs_contents[] = {2};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(lhs_contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(rhs_contents, shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(IREE_ASSERT_OK(Invoke("expect_eq", {lhs, rhs})),
+                          "Contents does not match");
+}
+
+TEST_F(CheckTest, ExpectEqDifferentEverythingFullMessageFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  int32_t lhs_contents[] = {1, 2, 3, 4, 5, 6};
+  float rhs_contents[] = {1, 2, 3, 42};
+  int32_t lhs_shape[] = {2, 3};
+  int32_t rhs_shape[] = {2, 2};
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(lhs_contents, lhs_shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(
+      CreateFloat32BufferView(rhs_contents, rhs_shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_eq", {lhs, rhs})),
+      "Expected equality of these values. Element types do not match."
+      " Shapes do not match. Contents does not match.\n"
+      "  lhs:\n"
+      "    2x3xi32=[1 2 3][4 5 6]\n"
+      "  rhs:\n"
+      "    2x2xf32=[1 2][3 42]");
+}
+
+TEST_F(CheckTest, ExpectEqDifferentContents3DFullMessageFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  int32_t lhs_contents[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  int32_t rhs_contents[] = {1, 2, 3, 42, 5, 6, 7, 8};
+  int32_t shape[] = {2, 2, 2};
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(lhs_contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateInt32BufferView(rhs_contents, shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_eq", {lhs, rhs})),
+      "Expected equality of these values. Contents does not match.\n"
+      "  lhs:\n"
+      "    2x2x2xi32=[[1 2][3 4]][[5 6][7 8]]\n"
+      "  rhs:\n"
+      "    2x2x2xi32=[[1 2][3 42]][[5 6][7 8]]");
+}
+
+// check.expect_almost_eq trivially succeeds when both operands are the same
+// view.
+TEST_F(CheckTest, ExpectAlmostEqSameBufferSuccess) {
+  vm::ref<iree_hal_buffer_view_t> input_buffer_view;
+  float contents[] = {1};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateFloat32BufferView(contents, shape, &input_buffer_view));
+  IREE_ASSERT_OK(
+      Invoke("expect_almost_eq", {input_buffer_view, input_buffer_view}));
+}
+
+// Distinct but bitwise-identical f32 views compare almost-equal.
+TEST_F(CheckTest, ExpectAlmostEqIdenticalBufferSuccess) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  float contents[] = {1};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(contents, shape, &rhs));
+  IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs}));
+}
+
+// Per-element differences within the f32 tolerance (1e-4 in module.cc) pass.
+TEST_F(CheckTest, ExpectAlmostEqNearIdenticalBufferSuccess) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  float lhs_contents[] = {1.0f, 1.99999f, 0.00001f, 4.0f};
+  float rhs_contents[] = {1.00001f, 2.0f, 0.0f, 4.0f};
+  int32_t shape[] = {4};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(lhs_contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(rhs_contents, shape, &rhs));
+  IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs}));
+}
+
+// Near-equality also holds for higher-rank (3D) identical views.
+TEST_F(CheckTest, ExpectAlmostEqIdentical3DBufferSuccess) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  float contents[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  int32_t shape[] = {2, 2, 2};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(contents, shape, &rhs));
+  IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs}));
+}
+
+// Same bytes, different shapes (2x2 vs 4) must report a shape mismatch.
+TEST_F(CheckTest, ExpectAlmostEqDifferentShapeFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  float contents[] = {1, 2, 3, 4};
+  int32_t lhs_shape[] = {2, 2};
+  int32_t rhs_shape[] = {4};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(contents, lhs_shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(contents, rhs_shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs})),
+      "Shapes do not match");
+}
+
+// Differing element counts (lhs smaller) fail as a shape mismatch rather
+// than reading out of bounds.
+TEST_F(CheckTest, ExpectAlmostEqSmallerLhsElementCountFailure) {
+  vm::ref<iree_hal_buffer_view_t> smaller;
+  vm::ref<iree_hal_buffer_view_t> bigger;
+  float smaller_contents[] = {1, 2};
+  float bigger_contents[] = {1, 2, 3, 4};
+  int32_t smaller_shape[] = {2};
+  int32_t bigger_shape[] = {4};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateFloat32BufferView(smaller_contents, smaller_shape, &smaller));
+  ASSERT_NO_FATAL_FAILURE(
+      CreateFloat32BufferView(bigger_contents, bigger_shape, &bigger));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_almost_eq", {smaller, bigger})),
+      "Shapes do not match");
+}
+
+// Mirror of the above with the operands swapped (rhs smaller).
+TEST_F(CheckTest, ExpectAlmostEqSmallerRhsElementCountFailure) {
+  vm::ref<iree_hal_buffer_view_t> smaller;
+  vm::ref<iree_hal_buffer_view_t> bigger;
+  float smaller_contents[] = {1, 2};
+  float bigger_contents[] = {1, 2, 3, 4};
+  int32_t smaller_shape[] = {2};
+  int32_t bigger_shape[] = {4};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateFloat32BufferView(smaller_contents, smaller_shape, &smaller));
+  ASSERT_NO_FATAL_FAILURE(
+      CreateFloat32BufferView(bigger_contents, bigger_shape, &bigger));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_almost_eq", {bigger, smaller})),
+      "Shapes do not match");
+}
+
+// f64 vs f32 views must report an element type mismatch even with equal
+// logical values.
+TEST_F(CheckTest, ExpectAlmostEqDifferentElementTypeFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  double lhs_contents[] = {1, 2, 3, 4};
+  float rhs_contents[] = {1, 2, 3, 4};
+  int32_t shape[] = {2, 2};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat64BufferView(lhs_contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(rhs_contents, shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs})),
+      "Element types do not match");
+}
+
+// A difference well outside the tolerance must report a contents mismatch.
+TEST_F(CheckTest, ExpectAlmostEqDifferentContentsFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  float lhs_contents[] = {1};
+  float rhs_contents[] = {2};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(lhs_contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(rhs_contents, shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs})),
+      "Contents does not match");
+}
+
+// When type and shape both differ the message lists those causes only;
+// contents cannot be compared across mismatched metadata.
+TEST_F(CheckTest, ExpectAlmostEqDifferentEverythingFullMessageFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  double lhs_contents[] = {1, 2, 3, 4, 5, 6};
+  float rhs_contents[] = {1, 2, 3, 42};
+  int32_t lhs_shape[] = {2, 3};
+  int32_t rhs_shape[] = {2, 2};
+  ASSERT_NO_FATAL_FAILURE(
+      CreateFloat64BufferView(lhs_contents, lhs_shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(
+      CreateFloat32BufferView(rhs_contents, rhs_shape, &rhs));
+  // Note no comment on contents. Cannot compare different shapes and element
+  // types.
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs})),
+      "Expected near equality of these values. Element types do not match."
+      " Shapes do not match.\n"
+      "  lhs:\n"
+      "    2x3xf64=[1 2 3][4 5 6]\n"
+      "  rhs:\n"
+      "    2x2xf32=[1 2][3 42]");
+}
+
+// A single out-of-tolerance element in a 3D view yields the contents-only
+// message with both views pretty-printed.
+TEST_F(CheckTest, ExpectAlmostEqDifferentContents3DFullMessageFailure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  float lhs_contents[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  float rhs_contents[] = {1, 2, 3, 42, 5, 6, 7, 8};
+  int32_t shape[] = {2, 2, 2};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(lhs_contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat32BufferView(rhs_contents, shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs})),
+      "Expected near equality of these values. Contents does not match.\n"
+      "  lhs:\n"
+      "    2x2x2xf32=[[1 2][3 4]][[5 6][7 8]]\n"
+      "  rhs:\n"
+      "    2x2x2xf32=[[1 2][3 42]][[5 6][7 8]]");
+}
+
+// f16 views (stored as uint16_t bit patterns) compare almost-equal when
+// identical.
+TEST_F(CheckTest, ExpectAlmostEqIdenticalBufferF16Success) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  uint16_t contents[] = {iree_math_f32_to_f16(1.f)};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat16BufferView(contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat16BufferView(contents, shape, &rhs));
+  IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs}));
+}
+
+// f16 comparison uses the looser f16 tolerance in module.cc, so values that
+// round to nearby halves still pass.
+TEST_F(CheckTest, ExpectAlmostEqNearIdenticalBufferF16Success) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  uint16_t lhs_contents[] = {
+      iree_math_f32_to_f16(1.0f), iree_math_f32_to_f16(1.99999f),
+      iree_math_f32_to_f16(0.00001f), iree_math_f32_to_f16(4.0f)};
+  uint16_t rhs_contents[] = {
+      iree_math_f32_to_f16(1.00001f), iree_math_f32_to_f16(2.0f),
+      iree_math_f32_to_f16(0.0f), iree_math_f32_to_f16(4.0f)};
+  int32_t shape[] = {4};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat16BufferView(lhs_contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat16BufferView(rhs_contents, shape, &rhs));
+  IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs}));
+}
+
+// Clearly-different f16 values must report a contents mismatch.
+TEST_F(CheckTest, ExpectAlmostEqDifferentContentsF16Failure) {
+  vm::ref<iree_hal_buffer_view_t> lhs;
+  vm::ref<iree_hal_buffer_view_t> rhs;
+  uint16_t lhs_contents[] = {iree_math_f32_to_f16(1.f)};
+  uint16_t rhs_contents[] = {iree_math_f32_to_f16(2.f)};
+  int32_t shape[] = {1};
+  ASSERT_NO_FATAL_FAILURE(CreateFloat16BufferView(lhs_contents, shape, &lhs));
+  ASSERT_NO_FATAL_FAILURE(CreateFloat16BufferView(rhs_contents, shape, &rhs));
+  EXPECT_NONFATAL_FAILURE(
+      IREE_ASSERT_OK(Invoke("expect_almost_eq", {lhs, rhs})),
+      "Contents does not match");
+}
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/modules/check/module.cc b/runtime/src/iree/modules/check/module.cc
new file mode 100644
index 0000000..9996e94
--- /dev/null
+++ b/runtime/src/iree/modules/check/module.cc
@@ -0,0 +1,411 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/modules/check/module.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/status_cc.h"
+#include "iree/hal/api.h"
+#include "iree/modules/hal/module.h"
+#include "iree/testing/gtest.h"
+#include "iree/vm/native_module_cc.h"
+#include "iree/vm/ref_cc.h"
+
+//===----------------------------------------------------------------------===//
+// VM module interface implementation
+//===----------------------------------------------------------------------===//
+
+namespace iree {
+namespace {
+
+using ::testing::Each;
+using ::testing::Not;
+
+// Reinterprets the raw bytes of |bytes| as a read-only span of T.
+// Any trailing bytes beyond a whole number of elements are dropped by the
+// integer division.
+template <typename T>
+iree::span<const T> ToSpan(iree_byte_span_t bytes) {
+  return iree::span<const T>(reinterpret_cast<T*>(bytes.data),
+                             bytes.data_length / sizeof(T));
+}
+
+// Formats |buffer_view| (shape, element type, up to 1024 elements) as a
+// human-readable string. Starts with a 4KB buffer and retries while the
+// formatter reports OUT_OF_RANGE, resizing to |actual_length| each pass —
+// presumably the required length per iree_hal_buffer_view_format's contract.
+StatusOr<std::string> BufferViewToString(iree_hal_buffer_view_t* buffer_view) {
+  std::string result_str(4096, '\0');
+  iree_status_t status;
+  do {
+    iree_host_size_t actual_length = 0;
+    // size() + 1: leaves room for the NUL terminator beyond the string body.
+    status = iree_hal_buffer_view_format(
+        buffer_view, /*max_element_count=*/1024, result_str.size() + 1,
+        &result_str[0], &actual_length);
+    // Trims to the written length on success; grows before retrying on
+    // OUT_OF_RANGE.
+    result_str.resize(actual_length);
+  } while (iree_status_is_out_of_range(status));
+  IREE_RETURN_IF_ERROR(std::move(status));
+  return std::move(result_str);
+}
+
+// Registers a nonfatal gtest failure unless every T-typed element in |bytes|
+// is nonzero. Always returns OK: assertion failures flow through gtest, not
+// through the status.
+template <typename T>
+Status ExpectAllTrue(iree_byte_span_t bytes) {
+  EXPECT_THAT(ToSpan<T>(bytes), Each(Not(T(0))));
+  return OkStatus();
+}
+
+// Exact comparison: spans are equal iff their lengths match and a memcmp
+// over the full length finds no differing byte.
+bool EqByteSpan(iree_byte_span_t lhs_bytes, iree_byte_span_t rhs_bytes) {
+  return lhs_bytes.data_length == rhs_bytes.data_length &&
+         memcmp(lhs_bytes.data, rhs_bytes.data, lhs_bytes.data_length) == 0;
+}
+
+// Absolute-difference tolerance used for f32/f64 fuzzy comparison below.
+static constexpr float kF32PrecisionThreshold = 0.0001f;
+
+// Returns true if every pair of corresponding T-typed elements in the two
+// byte spans differs by at most kF32PrecisionThreshold in absolute value.
+// Both spans must contain the same number of elements (asserted).
+template <typename T>
+bool AlmostEqByteSpan(iree_byte_span_t lhs_bytes, iree_byte_span_t rhs_bytes) {
+  auto lhs_span = ToSpan<T>(lhs_bytes);
+  auto rhs_span = ToSpan<T>(rhs_bytes);
+  assert(lhs_span.size() == rhs_span.size());
+  // size_t index: avoids a signed/unsigned comparison against span::size()
+  // and cannot overflow for very large spans.
+  for (size_t i = 0; i < lhs_span.size(); ++i) {
+    if (fabs(lhs_span[i] - rhs_span[i]) > kF32PrecisionThreshold) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Absolute-difference tolerance for f16 comparison; looser than the f32
+// threshold to account for half precision's coarser representable values.
+static constexpr float kF16PrecisionThreshold = 0.001f;
+
+// Like AlmostEqByteSpan<T> but for f16 values stored as uint16_t bit
+// patterns: each element is widened to f32 before the tolerance comparison.
+// Both spans must contain the same number of elements (asserted).
+bool AlmostEqByteSpanF16(iree_byte_span_t lhs_bytes,
+                         iree_byte_span_t rhs_bytes) {
+  auto lhs_span = ToSpan<uint16_t>(lhs_bytes);
+  auto rhs_span = ToSpan<uint16_t>(rhs_bytes);
+  assert(lhs_span.size() == rhs_span.size());
+  // size_t index: avoids a signed/unsigned comparison against span::size().
+  for (size_t i = 0; i < lhs_span.size(); ++i) {
+    if (fabs(iree_math_f16_to_f32(lhs_span[i]) -
+             iree_math_f16_to_f32(rhs_span[i])) > kF16PrecisionThreshold) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Dispatches fuzzy comparison on |element_type|. Only float types are
+// supported (f32, f64, f16); any other element type yields INVALID_ARGUMENT
+// with the type name formatted into the message.
+StatusOr<bool> AlmostEqByteSpan(iree_byte_span_t lhs_bytes,
+                                iree_byte_span_t rhs_bytes,
+                                iree_hal_element_type_t element_type) {
+  switch (element_type) {
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_32:
+      return AlmostEqByteSpan<float>(lhs_bytes, rhs_bytes);
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_64:
+      return AlmostEqByteSpan<double>(lhs_bytes, rhs_bytes);
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_16:
+      return AlmostEqByteSpanF16(lhs_bytes, rhs_bytes);
+    default:
+      // TODO(gcmn): Consider supporting fuzzy matching for quantized integers.
+      break;
+  }
+  char element_type_str[16];
+  IREE_RETURN_IF_ERROR(iree_hal_format_element_type(
+      element_type, sizeof(element_type_str), element_type_str, nullptr));
+  return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                          "unsupported element type %s", element_type_str);
+}
+
+// Dispatches ExpectAllTrue<T> on |element_type| for all supported integer
+// and float widths; any other element type yields INVALID_ARGUMENT with the
+// type name formatted into the message.
+Status ExpectAllTrue(iree_byte_span_t bytes,
+                     iree_hal_element_type_t element_type) {
+  switch (element_type) {
+    case IREE_HAL_ELEMENT_TYPE_INT_8:
+    case IREE_HAL_ELEMENT_TYPE_SINT_8:
+      return ExpectAllTrue<int8_t>(bytes);
+    case IREE_HAL_ELEMENT_TYPE_UINT_8:
+      return ExpectAllTrue<uint8_t>(bytes);
+    case IREE_HAL_ELEMENT_TYPE_INT_16:
+    case IREE_HAL_ELEMENT_TYPE_SINT_16:
+      return ExpectAllTrue<int16_t>(bytes);
+    case IREE_HAL_ELEMENT_TYPE_UINT_16:
+      return ExpectAllTrue<uint16_t>(bytes);
+    case IREE_HAL_ELEMENT_TYPE_INT_32:
+    case IREE_HAL_ELEMENT_TYPE_SINT_32:
+      return ExpectAllTrue<int32_t>(bytes);
+    case IREE_HAL_ELEMENT_TYPE_UINT_32:
+      return ExpectAllTrue<uint32_t>(bytes);
+    case IREE_HAL_ELEMENT_TYPE_INT_64:
+    case IREE_HAL_ELEMENT_TYPE_SINT_64:
+      return ExpectAllTrue<int64_t>(bytes);
+    case IREE_HAL_ELEMENT_TYPE_UINT_64:
+      return ExpectAllTrue<uint64_t>(bytes);
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_32:
+      return ExpectAllTrue<float>(bytes);
+    case IREE_HAL_ELEMENT_TYPE_FLOAT_64:
+      return ExpectAllTrue<double>(bytes);
+    default:
+      break;
+  }
+  char element_type_str[16];
+  IREE_RETURN_IF_ERROR(iree_hal_format_element_type(
+      element_type, sizeof(element_type_str), element_type_str, nullptr));
+  return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                          "unsupported element type %s", element_type_str);
+}
+
+// Per-context module state.
+// This can contain "globals" and other arbitrary state.
+//
+// Thread-compatible; the runtime will not issue multiple calls at the same
+// time using the same state. If the implementation uses external threads then
+// it must synchronize itself.
+class CheckModuleState final {
+ public:
+  explicit CheckModuleState(iree_allocator_t allocator)
+      : allocator_(allocator) {}
+  ~CheckModuleState() = default;
+
+  // check.expect_true: records a nonfatal gtest failure if |operand| is zero.
+  // Always returns OK; failures surface via gtest, not the status.
+  Status ExpectTrue(int32_t operand) {
+    EXPECT_TRUE(operand) << "Expected " << operand << " to be nonzero.";
+    return OkStatus();
+  }
+
+  // check.expect_false: records a nonfatal gtest failure if |operand| is
+  // nonzero. Always returns OK.
+  Status ExpectFalse(int32_t operand) {
+    EXPECT_FALSE(operand) << "Expected " << operand << " to be zero.";
+    return OkStatus();
+  }
+
+  // check.expect_all_true: maps |operand|'s buffer for host reading and
+  // asserts every element is nonzero. Returns non-OK only for operational
+  // errors (mapping failure, unsupported element type).
+  Status ExpectAllTrue(vm::ref<iree_hal_buffer_view_t> operand) {
+    auto* view = operand.get();
+    iree_hal_element_type_t element_type =
+        iree_hal_buffer_view_element_type(view);
+    iree_hal_buffer_t* buf = iree_hal_buffer_view_buffer(view);
+    iree_device_size_t size = iree_hal_buffer_view_byte_length(view);
+    iree_hal_buffer_mapping_t mapped_memory = {{0}};
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+        buf, IREE_HAL_MAPPING_MODE_SCOPED, IREE_HAL_MEMORY_ACCESS_READ,
+        /*byte_offset=*/0, size, &mapped_memory));
+    // NOTE(review): if this errors the mapping above is never unmapped —
+    // consistent with the leak caveat documented in ExpectEq below.
+    IREE_RETURN_IF_ERROR(
+        ::iree::ExpectAllTrue(mapped_memory.contents, element_type));
+    // Unmap result deliberately ignored; nothing actionable at this point.
+    iree_status_ignore(iree_hal_buffer_unmap_range(&mapped_memory));
+    return OkStatus();
+  }
+
+  // check.expect_eq: exact comparison of two buffer views. Element type,
+  // shape, and byte contents must all match; any mismatch records a single
+  // nonfatal gtest failure listing every differing aspect plus both operands
+  // pretty-printed. Returns non-OK only for operational errors.
+  Status ExpectEq(vm::ref<iree_hal_buffer_view_t> lhs_ref,
+                  vm::ref<iree_hal_buffer_view_t> rhs_ref) {
+    auto* lhs = lhs_ref.get();
+    auto* rhs = rhs_ref.get();
+
+    iree_device_size_t lhs_size = iree_hal_buffer_view_byte_length(lhs);
+    size_t lhs_rank = iree_hal_buffer_view_shape_rank(lhs);
+    std::vector<iree_hal_dim_t> lhs_shape(lhs_rank);
+    if (lhs_rank > 0) {
+      IREE_RETURN_IF_ERROR(
+          iree_hal_buffer_view_shape(lhs, lhs_rank, lhs_shape.data(), nullptr));
+    }
+
+    iree_device_size_t rhs_size = iree_hal_buffer_view_byte_length(rhs);
+    size_t rhs_rank = iree_hal_buffer_view_shape_rank(rhs);
+    std::vector<iree_hal_dim_t> rhs_shape(rhs_rank);
+    if (rhs_rank > 0) {
+      IREE_RETURN_IF_ERROR(
+          iree_hal_buffer_view_shape(rhs, rhs_rank, rhs_shape.data(), nullptr));
+    }
+
+    iree_hal_element_type_t lhs_element_type =
+        iree_hal_buffer_view_element_type(lhs);
+    iree_hal_element_type_t rhs_element_type =
+        iree_hal_buffer_view_element_type(rhs);
+
+    // HACK: this is all broken and will leak. Let's kill this entire module
+    // please.
+
+    iree_hal_buffer_t* lhs_buf = iree_hal_buffer_view_buffer(lhs);
+    iree_hal_buffer_mapping_t lhs_mapped_memory = {{0}};
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+        lhs_buf, IREE_HAL_MAPPING_MODE_SCOPED, IREE_HAL_MEMORY_ACCESS_READ,
+        /*byte_offset=*/0, lhs_size, &lhs_mapped_memory));
+    iree_hal_buffer_t* rhs_buf = iree_hal_buffer_view_buffer(rhs);
+    iree_hal_buffer_mapping_t rhs_mapped_memory = {{0}};
+    // NOTE(review): an error here leaks lhs_mapped_memory (see HACK above).
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+        rhs_buf, IREE_HAL_MAPPING_MODE_SCOPED, IREE_HAL_MEMORY_ACCESS_READ,
+        /*byte_offset=*/0, rhs_size, &rhs_mapped_memory));
+
+    bool element_types_eq = lhs_element_type == rhs_element_type;
+    bool shape_eq = lhs_shape == rhs_shape;
+    bool contents_eq =
+        EqByteSpan(lhs_mapped_memory.contents, rhs_mapped_memory.contents);
+    iree_status_ignore(iree_hal_buffer_unmap_range(&lhs_mapped_memory));
+    iree_status_ignore(iree_hal_buffer_unmap_range(&rhs_mapped_memory));
+
+    if (!element_types_eq || !shape_eq || !contents_eq) {
+      std::ostringstream os;
+      os << "Expected equality of these values.";
+      if (!element_types_eq) {
+        os << " Element types do not match.";
+      }
+      if (!shape_eq) {
+        os << " Shapes do not match.";
+      }
+      if (!contents_eq) {
+        os << " Contents does not match.";
+      }
+      // TODO(b/146898896): Propagate original variable names.
+      os << "\n"
+            "  lhs:\n"
+            "    ";
+      IREE_ASSIGN_OR_RETURN(auto lhs_str, BufferViewToString(lhs));
+      os << lhs_str;
+
+      os << "\n"
+            "  rhs:\n"
+            "    ";
+      IREE_ASSIGN_OR_RETURN(auto rhs_str, BufferViewToString(rhs));
+      os << rhs_str;
+
+      // TODO(b/146898896): Use ADD_FAILURE_AT to propagate source location.
+      ADD_FAILURE() << os.str();
+    }
+
+    return OkStatus();
+  }
+
+  // check.expect_almost_eq: like ExpectEq but element contents are compared
+  // with a per-type absolute tolerance (see AlmostEqByteSpan). Contents are
+  // only compared when element type and shape already match; otherwise the
+  // failure message mentions just the metadata mismatches.
+  Status ExpectAlmostEq(vm::ref<iree_hal_buffer_view_t> lhs_ref,
+                        vm::ref<iree_hal_buffer_view_t> rhs_ref) {
+    auto* lhs = lhs_ref.get();
+    auto* rhs = rhs_ref.get();
+
+    iree_device_size_t lhs_size = iree_hal_buffer_view_byte_length(lhs);
+    size_t lhs_rank = iree_hal_buffer_view_shape_rank(lhs);
+    std::vector<iree_hal_dim_t> lhs_shape(lhs_rank);
+    if (lhs_rank > 0) {
+      IREE_RETURN_IF_ERROR(
+          iree_hal_buffer_view_shape(lhs, lhs_rank, lhs_shape.data(), nullptr));
+    }
+
+    iree_device_size_t rhs_size = iree_hal_buffer_view_byte_length(rhs);
+    size_t rhs_rank = iree_hal_buffer_view_shape_rank(rhs);
+    std::vector<iree_hal_dim_t> rhs_shape(rhs_rank);
+    if (rhs_rank > 0) {
+      IREE_RETURN_IF_ERROR(
+          iree_hal_buffer_view_shape(rhs, rhs_rank, rhs_shape.data(), nullptr));
+    }
+
+    iree_hal_element_type_t lhs_element_type =
+        iree_hal_buffer_view_element_type(lhs);
+    iree_hal_element_type_t rhs_element_type =
+        iree_hal_buffer_view_element_type(rhs);
+
+    iree_hal_buffer_t* lhs_buf = iree_hal_buffer_view_buffer(lhs);
+    iree_hal_buffer_mapping_t lhs_mapped_memory = {{0}};
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+        lhs_buf, IREE_HAL_MAPPING_MODE_SCOPED, IREE_HAL_MEMORY_ACCESS_READ,
+        /*byte_offset=*/0, lhs_size, &lhs_mapped_memory));
+    iree_hal_buffer_t* rhs_buf = iree_hal_buffer_view_buffer(rhs);
+    iree_hal_buffer_mapping_t rhs_mapped_memory = {{0}};
+    // NOTE(review): an error here leaks lhs_mapped_memory, same as ExpectEq.
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
+        rhs_buf, IREE_HAL_MAPPING_MODE_SCOPED, IREE_HAL_MEMORY_ACCESS_READ,
+        /*byte_offset=*/0, rhs_size, &rhs_mapped_memory));
+
+    bool element_types_eq = lhs_element_type == rhs_element_type;
+    bool shape_eq = lhs_shape == rhs_shape;
+    // Only check contents if shape and element type match. Otherwise we can't.
+    bool contents_could_be_almost_eq = true;
+    if (element_types_eq && shape_eq) {
+      IREE_ASSIGN_OR_RETURN(
+          contents_could_be_almost_eq,
+          AlmostEqByteSpan(lhs_mapped_memory.contents,
+                           rhs_mapped_memory.contents, lhs_element_type));
+    }
+    iree_status_ignore(iree_hal_buffer_unmap_range(&lhs_mapped_memory));
+    iree_status_ignore(iree_hal_buffer_unmap_range(&rhs_mapped_memory));
+
+    if (!element_types_eq || !shape_eq || !contents_could_be_almost_eq) {
+      std::ostringstream os;
+      os << "Expected near equality of these values.";
+      if (!element_types_eq) {
+        os << " Element types do not match.";
+      }
+      if (!shape_eq) {
+        os << " Shapes do not match.";
+      }
+      if (!contents_could_be_almost_eq) {
+        os << " Contents does not match.";
+      }
+      // TODO(b/146898896): Propagate original variable names.
+      os << "\n"
+            "  lhs:\n"
+            "    ";
+      IREE_ASSIGN_OR_RETURN(auto lhs_str, BufferViewToString(lhs));
+      os << lhs_str;
+
+      os << "\n"
+            "  rhs:\n"
+            "    ";
+      IREE_ASSIGN_OR_RETURN(auto rhs_str, BufferViewToString(rhs));
+      os << rhs_str;
+
+      // TODO(b/146898896): Use ADD_FAILURE_AT to propagate source location.
+      ADD_FAILURE() << os.str();
+    }
+
+    return OkStatus();
+  }
+
+ private:
+  // Allocator that the caller requested we use for any allocations we need to
+  // perform during operation. Currently stored but unused by the methods
+  // above.
+  iree_allocator_t allocator_ = iree_allocator_system();
+};
+
+// Function table mapping imported function names to their implementation.
+// These are the functions this module exports; compiled programs import them
+// by name. The signature of the target function is expected to match that in
+// the check.imports.mlir file.
+static const vm::NativeFunction<CheckModuleState> kCheckModuleFunctions[] = {
+    vm::MakeNativeFunction("expect_true", &CheckModuleState::ExpectTrue),
+    vm::MakeNativeFunction("expect_false", &CheckModuleState::ExpectFalse),
+    vm::MakeNativeFunction("expect_all_true", &CheckModuleState::ExpectAllTrue),
+    vm::MakeNativeFunction("expect_eq", &CheckModuleState::ExpectEq),
+    vm::MakeNativeFunction("expect_almost_eq",
+                           &CheckModuleState::ExpectAlmostEq),
+};
+
+// The module instance that will be allocated and reused across contexts.
+// Any context-specific state must be stored in a state structure such as
+// CheckModuleState below.
+//
+// Assumed thread-safe (by construction here, as it's immutable), though if more
+// state is stored here it will need to be synchronized by the implementation.
+class CheckModule final : public vm::NativeModule<CheckModuleState> {
+ public:
+  using vm::NativeModule<CheckModuleState>::NativeModule;
+
+  // Creates per-context state when the module is added to a new context.
+  // May be called from any thread.
+  StatusOr<std::unique_ptr<CheckModuleState>> CreateState(
+      iree_allocator_t allocator) override {
+    // State is trivial today: it only captures the allocator for later use.
+    auto state = std::make_unique<CheckModuleState>(allocator);
+    return state;
+  }
+};
+
+}  // namespace
+
+// Note that while we are using C++ bindings internally we still expose the
+// module as a C instance. This hides the details of our implementation.
+//
+// |allocator| is used for the module and its per-context state allocations.
+// On success |*out_module| receives the created module; ownership transfers
+// to the caller (the unique_ptr is released below).
+extern "C" iree_status_t iree_check_module_create(
+    iree_allocator_t allocator, iree_vm_module_t** out_module) {
+  IREE_ASSERT_ARGUMENT(out_module);
+  *out_module = NULL;
+  auto module = std::make_unique<CheckModule>(
+      "check", allocator,
+      iree::span<const vm::NativeFunction<CheckModuleState>>(
+          kCheckModuleFunctions));
+  *out_module = module.release()->interface();
+  return iree_ok_status();
+}
+
+}  // namespace iree
diff --git a/runtime/src/iree/modules/check/module.h b/runtime/src/iree/modules/check/module.h
new file mode 100644
index 0000000..24d29ba
--- /dev/null
+++ b/runtime/src/iree/modules/check/module.h
@@ -0,0 +1,27 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_MODULES_CHECK_MODULE_H_
+#define IREE_MODULES_CHECK_MODULE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/vm/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates the native "check" module exposing the test assertion functions
+// (expect_true, expect_false, expect_all_true, expect_eq, expect_almost_eq)
+// to compiled VM programs. |allocator| is used for module allocations; on
+// success |out_module| receives the module and the caller takes ownership.
+iree_status_t iree_check_module_create(iree_allocator_t allocator,
+                                       iree_vm_module_t** out_module);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_MODULES_CHECK_MODULE_H_
diff --git a/runtime/src/iree/modules/check/test/BUILD b/runtime/src/iree/modules/check/test/BUILD
new file mode 100644
index 0000000..a834f60
--- /dev/null
+++ b/runtime/src/iree/modules/check/test/BUILD
@@ -0,0 +1,48 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/bazel:iree_check_test.bzl", "iree_check_test_suite")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
+load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# Runs each *.mlir file in this package under lit using the repo-level
+# runtime lit config. enforce_glob keeps the explicit srcs list in sync with
+# the files on disk so newly added tests cannot be silently skipped.
+iree_lit_test_suite(
+    name = "lit",
+    srcs = enforce_glob(
+        [
+            "failure.mlir",
+            "success.mlir",
+            "unavailable.mlir",
+        ],
+        include = ["*.mlir"],
+    ),
+    cfg = "//runtime:lit.cfg.py",
+    tags = ["hostonly"],
+    tools = [
+        # NOTE(review): tool targets still live under //iree/tools pending the
+        # staged source-tree migration; update when tools/ moves top-level.
+        "//iree/tools:iree-check-module",
+        "//iree/tools:iree-compile",
+        "//iree/tools:iree-run-module",
+        "@llvm-project//llvm:FileCheck",
+    ],
+)
+
+# Compiles success.mlir (MHLO input) and runs it under iree-check-module;
+# every check.* assertion in the module must pass.
+iree_check_test_suite(
+    name = "check",
+    srcs = ["success.mlir"],
+    compiler_flags = ["-iree-input-type=mhlo"],
+)
+
+# Negative coverage: failure.mlir is expected to trip its checks, so the
+# runner is passed --expect_failure and the suite passes only on failure.
+iree_check_test_suite(
+    name = "check_failure",
+    srcs = ["failure.mlir"],
+    compiler_flags = ["-iree-input-type=mhlo"],
+    runner_args = ["--expect_failure"],
+)
diff --git a/runtime/src/iree/modules/check/test/CMakeLists.txt b/runtime/src/iree/modules/check/test/CMakeLists.txt
new file mode 100644
index 0000000..f41dae2
--- /dev/null
+++ b/runtime/src/iree/modules/check/test/CMakeLists.txt
@@ -0,0 +1,49 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/modules/check/test/BUILD                                    #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_lit_test_suite(
+  NAME
+    lit
+  SRCS
+    "failure.mlir"
+    "success.mlir"
+    "unavailable.mlir"
+  TOOLS
+    FileCheck
+    iree::tools::iree-check-module
+    iree::tools::iree-compile
+    iree::tools::iree-run-module
+  LABELS
+    "hostonly"
+)
+
+iree_check_test_suite(
+  NAME
+    check
+  SRCS
+    "success.mlir"
+  COMPILER_FLAGS
+    "-iree-input-type=mhlo"
+)
+
+iree_check_test_suite(
+  NAME
+    check_failure
+  SRCS
+    "failure.mlir"
+  COMPILER_FLAGS
+    "-iree-input-type=mhlo"
+  RUNNER_ARGS
+    "--expect_failure"
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/modules/check/test/failure.mlir b/runtime/src/iree/modules/check/test/failure.mlir
new file mode 100644
index 0000000..a5c541c
--- /dev/null
+++ b/runtime/src/iree/modules/check/test/failure.mlir
@@ -0,0 +1,13 @@
+// RUN: iree-compile --iree-input-type=mhlo --iree-hal-target-backends=vmvx -iree-mlir-to-vm-bytecode-module %s | iree-check-module --expect_failure - | FileCheck %s
+// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-compile --iree-input-type=mhlo --iree-hal-target-backends=vulkan-spirv -iree-mlir-to-vm-bytecode-module %s | iree-check-module --driver=vulkan --expect_failure - | FileCheck %s)
+
+// CHECK-LABEL: expect_failure.expect_true_of_false
+// CHECK: Expected 0 to be nonzero
+// CHECK: Test failed as expected
+module @expect_failure {
+func.func @expect_true_of_false() {
+  %false = util.unfoldable_constant 0 : i32
+  check.expect_true(%false) : i32
+  return
+}
+}
diff --git a/runtime/src/iree/modules/check/test/success.mlir b/runtime/src/iree/modules/check/test/success.mlir
new file mode 100644
index 0000000..2935131
--- /dev/null
+++ b/runtime/src/iree/modules/check/test/success.mlir
@@ -0,0 +1,78 @@
+// RUN: iree-compile --iree-input-type=mhlo --iree-hal-target-backends=vmvx -iree-mlir-to-vm-bytecode-module %s | iree-check-module --driver=vmvx -
+// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-compile --iree-input-type=mhlo --iree-hal-target-backends=vulkan-spirv -iree-mlir-to-vm-bytecode-module %s | iree-check-module --driver=vulkan -)
+
+func.func @expect_true() {
+  %true = util.unfoldable_constant 1 : i32
+  check.expect_true(%true) : i32
+  return
+}
+
+func.func @expect_false() {
+  %false = util.unfoldable_constant 0 : i32
+  check.expect_false(%false) : i32
+  return
+}
+
+func.func @expect_all_true() {
+  %all_true = util.unfoldable_constant dense<1> : tensor<2x2xi32>
+  %all_true_view = hal.tensor.export %all_true : tensor<2x2xi32> -> !hal.buffer_view
+  check.expect_all_true(%all_true_view) : !hal.buffer_view
+  return
+}
+
+func.func @expect_all_true_tensor() {
+  %all_true = util.unfoldable_constant dense<1> : tensor<2x2xi32>
+  check.expect_all_true(%all_true) : tensor<2x2xi32>
+  return
+}
+
+func.func @expect_eq() {
+  %const0 = util.unfoldable_constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
+  %const1 = util.unfoldable_constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
+  check.expect_eq(%const0, %const1) : tensor<5xi32>
+  return
+}
+
+func.func @expect_eq_const() {
+  %const0 = util.unfoldable_constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
+  check.expect_eq_const(%const0, dense<[1, 2, 3, 4, 5]> : tensor<5xi32>) : tensor<5xi32>
+  return
+}
+
+func.func @expect_almost_eq() {
+  %const0 = util.unfoldable_constant dense<[1.0, 2.0, 3.0, 4.0, 5.0]> : tensor<5xf32>
+  %const1 = util.unfoldable_constant dense<[0.999999, 2.0, 3.0, 4.0, 5.0]> : tensor<5xf32>
+  check.expect_almost_eq(%const0, %const1) : tensor<5xf32>
+  return
+}
+
+func.func @expect_almost_eq_const() {
+  %const0 = util.unfoldable_constant dense<[1.0, 2.0, 3.0, 4.0, 5.0]> : tensor<5xf32>
+  check.expect_almost_eq_const(%const0, dense<[0.999999, 2.0, 3.0, 4.0, 5.0]> : tensor<5xf32>) : tensor<5xf32>
+  return
+}
+
+func.func @add() {
+  %c5 = util.unfoldable_constant dense<5> : tensor<i32>
+  %result = "mhlo.add"(%c5, %c5) : (tensor<i32>, tensor<i32>) -> tensor<i32>
+  %c10 = util.unfoldable_constant dense<10> : tensor<i32>
+  check.expect_eq(%result, %c10) : tensor<i32>
+  return
+}
+
+func.func @floats() {
+  %cp1 = util.unfoldable_constant dense<0.1> : tensor<f32>
+  %c1 = util.unfoldable_constant dense<1.0> : tensor<f32>
+  %p2 = "mhlo.add"(%cp1, %cp1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %p3 = "mhlo.add"(%p2, %cp1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %p4 = "mhlo.add"(%p3, %cp1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %p5 = "mhlo.add"(%p4, %cp1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %p6 = "mhlo.add"(%p5, %cp1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %p7 = "mhlo.add"(%p6, %cp1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %p8 = "mhlo.add"(%p7, %cp1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %p9 = "mhlo.add"(%p8, %cp1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %approximately_1 = "mhlo.add"(%p9, %cp1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+
+  check.expect_almost_eq(%approximately_1, %c1) : tensor<f32>
+  return
+}
diff --git a/runtime/src/iree/modules/check/test/unavailable.mlir b/runtime/src/iree/modules/check/test/unavailable.mlir
new file mode 100644
index 0000000..c8f333c
--- /dev/null
+++ b/runtime/src/iree/modules/check/test/unavailable.mlir
@@ -0,0 +1,15 @@
+// RUN: iree-compile --iree-input-type=mhlo --iree-hal-target-backends=vmvx -iree-mlir-to-vm-bytecode-module %s | iree-run-module --module_file=- --entry_function=expect_true_of_false | FileCheck %s
+
+// Tests that even if the check module is not available (in this case because
+// we are running with iree-run-module instead of iree-check-module) the
+// execution still completes.
+
+// CHECK-LABEL: EXEC @expect_true_of_false
+// CHECK: result[0]: i32=0
+module @expect_failure {
+  func.func @expect_true_of_false() -> i32 {
+    %false = util.unfoldable_constant 0 : i32
+    check.expect_true(%false) : i32
+    return %false : i32
+  }
+}
diff --git a/runtime/src/iree/modules/hal/BUILD b/runtime/src/iree/modules/hal/BUILD
new file mode 100644
index 0000000..9afa643
--- /dev/null
+++ b/runtime/src/iree/modules/hal/BUILD
@@ -0,0 +1,32 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_runtime_cc_library(
+    name = "hal",
+    srcs = [
+        "module.c",
+    ],
+    hdrs = [
+        "module.h",
+    ],
+    textual_hdrs = [
+        "exports.inl",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/vm",
+    ],
+)
diff --git a/runtime/src/iree/modules/hal/CMakeLists.txt b/runtime/src/iree/modules/hal/CMakeLists.txt
new file mode 100644
index 0000000..14b2612
--- /dev/null
+++ b/runtime/src/iree/modules/hal/CMakeLists.txt
@@ -0,0 +1,30 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/modules/hal/BUILD                                           #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    hal
+  HDRS
+    "module.h"
+  TEXTUAL_HDRS
+    "exports.inl"
+  SRCS
+    "module.c"
+  DEPS
+    iree::base
+    iree::base::tracing
+    iree::hal
+    iree::vm
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/modules/hal/exports.inl b/runtime/src/iree/modules/hal/exports.inl
new file mode 100644
index 0000000..8bca87f
--- /dev/null
+++ b/runtime/src/iree/modules/hal/exports.inl
@@ -0,0 +1,81 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//===----------------------------------------------------------------------===//
+//
+//         ██     ██  █████  ██████  ███    ██ ██ ███    ██  ██████
+//         ██     ██ ██   ██ ██   ██ ████   ██ ██ ████   ██ ██
+//         ██  █  ██ ███████ ██████  ██ ██  ██ ██ ██ ██  ██ ██   ███
+//         ██ ███ ██ ██   ██ ██   ██ ██  ██ ██ ██ ██  ██ ██ ██    ██
+//          ███ ███  ██   ██ ██   ██ ██   ████ ██ ██   ████  ██████
+//
+//===----------------------------------------------------------------------===//
+//
+// This file will be auto generated from hal.imports.mlir in the future; for
+// now it's modified by hand but with strict alphabetical sorting required.
+// The order of these functions must be sorted ascending by name in a way
+// compatible with iree_string_view_compare.
+//
+// Users are meant to `#define EXPORT_FN` to be able to access the information.
+// #define EXPORT_FN(name, arg_type, ret_type, target_fn)
+
+// clang-format off
+
+EXPORT_FN("allocator.allocate", iree_hal_module_allocator_allocate, riii, r)
+EXPORT_FN("allocator.map.byte_buffer", iree_hal_module_allocator_map_byte_buffer, riiirii, r)
+EXPORT_FN("allocator.wrap.byte_buffer", iree_hal_module_allocator_wrap_byte_buffer, riirii, r)
+
+EXPORT_FN("buffer.assert", iree_hal_module_buffer_assert, rrriii, v)
+EXPORT_FN("buffer.length", iree_hal_module_buffer_length, r, i)
+EXPORT_FN("buffer.load", iree_hal_module_buffer_load, rii, i)
+EXPORT_FN("buffer.store", iree_hal_module_buffer_store, irii, v)
+EXPORT_FN("buffer.subspan", iree_hal_module_buffer_subspan, rii, r)
+
+EXPORT_FN("buffer_view.assert", iree_hal_module_buffer_view_assert, rriiCiD, v)
+EXPORT_FN("buffer_view.buffer", iree_hal_module_buffer_view_buffer, r, r)
+EXPORT_FN("buffer_view.byte_length", iree_hal_module_buffer_view_byte_length, r, i)
+EXPORT_FN("buffer_view.create", iree_hal_module_buffer_view_create, riiCiD, r)
+EXPORT_FN("buffer_view.dim", iree_hal_module_buffer_view_dim, ri, i)
+EXPORT_FN("buffer_view.element_type", iree_hal_module_buffer_view_element_type, r, i)
+EXPORT_FN("buffer_view.encoding_type", iree_hal_module_buffer_view_encoding_type, r, i)
+EXPORT_FN("buffer_view.rank", iree_hal_module_buffer_view_rank, r, i)
+EXPORT_FN("buffer_view.trace", iree_hal_module_buffer_view_trace, rCrD, v)
+
+EXPORT_FN("command_buffer.begin", iree_hal_module_command_buffer_begin, r, v)
+EXPORT_FN("command_buffer.begin_debug_group", iree_hal_module_command_buffer_begin_debug_group, rr, v)
+EXPORT_FN("command_buffer.bind_descriptor_set", iree_hal_module_command_buffer_bind_descriptor_set, rrirCiD, v)
+EXPORT_FN("command_buffer.copy_buffer", iree_hal_module_command_buffer_copy_buffer, rririi, v)
+EXPORT_FN("command_buffer.create", iree_hal_module_command_buffer_create, rii, r)
+EXPORT_FN("command_buffer.dispatch", iree_hal_module_command_buffer_dispatch, rriiii, v)
+EXPORT_FN("command_buffer.dispatch.indirect", iree_hal_module_command_buffer_dispatch_indirect, rriri, v)
+EXPORT_FN("command_buffer.end", iree_hal_module_command_buffer_end, r, v)
+EXPORT_FN("command_buffer.end_debug_group", iree_hal_module_command_buffer_end_debug_group, r, v)
+EXPORT_FN("command_buffer.execution_barrier", iree_hal_module_command_buffer_execution_barrier, riii, v)
+EXPORT_FN("command_buffer.fill_buffer", iree_hal_module_command_buffer_fill_buffer, rriiii, v)
+EXPORT_FN("command_buffer.push_constants", iree_hal_module_command_buffer_push_constants, rriCiD, v)
+EXPORT_FN("command_buffer.push_descriptor_set", iree_hal_module_command_buffer_push_descriptor_set, rriCiriiD, v)
+
+EXPORT_FN("descriptor_set.create", iree_hal_module_descriptor_set_create, rrCiriiD, r)
+
+EXPORT_FN("descriptor_set_layout.create", iree_hal_module_descriptor_set_layout_create, riCiiD, r)
+
+EXPORT_FN("device.allocator", iree_hal_module_device_allocator, r, r)
+EXPORT_FN("device.query.i32", iree_hal_module_device_query_i32, rrr, ii)
+
+EXPORT_FN("ex.shared_device", iree_hal_module_ex_shared_device, v, r)
+EXPORT_FN("ex.submit_and_wait", iree_hal_module_ex_submit_and_wait, rr, v)
+
+EXPORT_FN("executable.create", iree_hal_module_executable_create, rrrrCrD, r)
+
+EXPORT_FN("executable_layout.create", iree_hal_module_executable_layout_create, riCrD, r)
+
+EXPORT_FN("semaphore.await", iree_hal_module_semaphore_await, ri, i)
+EXPORT_FN("semaphore.create", iree_hal_module_semaphore_create, ri, r)
+EXPORT_FN("semaphore.fail", iree_hal_module_semaphore_fail, r, i)
+EXPORT_FN("semaphore.query", iree_hal_module_semaphore_query, r, ii)
+EXPORT_FN("semaphore.signal", iree_hal_module_semaphore_signal, ri, v)
+
+// clang-format on
diff --git a/runtime/src/iree/modules/hal/module.c b/runtime/src/iree/modules/hal/module.c
new file mode 100644
index 0000000..bf0534b
--- /dev/null
+++ b/runtime/src/iree/modules/hal/module.c
@@ -0,0 +1,1473 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/modules/hal/module.h"
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/vm/api.h"
+
+// Limit the number of bindings we pass down through the HAL. This can be tuned
+// in the future but right now guards the stack from blowing up during calls.
+#define IREE_HAL_MODULE_MAX_DESCRIPTOR_BINDING_COUNT ((iree_host_size_t)32)
+
+//===----------------------------------------------------------------------===//
+// Type registration
+//===----------------------------------------------------------------------===//
+
+static iree_vm_ref_type_descriptor_t iree_hal_allocator_descriptor = {0};
+static iree_vm_ref_type_descriptor_t iree_hal_buffer_descriptor = {0};
+static iree_vm_ref_type_descriptor_t iree_hal_buffer_view_descriptor = {0};
+static iree_vm_ref_type_descriptor_t iree_hal_command_buffer_descriptor = {0};
+static iree_vm_ref_type_descriptor_t iree_hal_descriptor_set_descriptor = {0};
+static iree_vm_ref_type_descriptor_t iree_hal_descriptor_set_layout_descriptor =
+    {0};
+static iree_vm_ref_type_descriptor_t iree_hal_device_descriptor = {0};
+static iree_vm_ref_type_descriptor_t iree_hal_event_descriptor = {0};
+static iree_vm_ref_type_descriptor_t iree_hal_executable_descriptor = {0};
+static iree_vm_ref_type_descriptor_t iree_hal_executable_layout_descriptor = {
+    0};
+static iree_vm_ref_type_descriptor_t iree_hal_semaphore_descriptor = {0};
+
+#define IREE_VM_REGISTER_HAL_C_TYPE(type, name, destroy_fn, descriptor)   \
+  descriptor.type_name = iree_make_cstring_view(name);                    \
+  descriptor.offsetof_counter = offsetof(iree_hal_resource_t, ref_count); \
+  descriptor.destroy = (iree_vm_ref_destroy_t)destroy_fn;                 \
+  IREE_RETURN_IF_ERROR(iree_vm_ref_register_type(&descriptor));
+
+// Registers all HAL object types (hal.allocator, hal.buffer, ...) with the
+// VM ref system so they can be carried in iree_vm_ref_t values. Idempotent:
+// after the first fully-successful call subsequent calls are no-ops.
+// NOTE(review): |has_registered| is an unsynchronized function-local static;
+// presumably registration happens once at startup before threads can race on
+// it - confirm with callers.
+IREE_API_EXPORT iree_status_t iree_hal_module_register_types(void) {
+  static bool has_registered = false;
+  if (has_registered) return iree_ok_status();
+
+  // Each IREE_VM_REGISTER_HAL_C_TYPE expands to descriptor setup plus an
+  // iree_vm_ref_register_type call and returns early on failure; in that case
+  // |has_registered| stays false and a later call will retry registration.
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_allocator_t, "hal.allocator",
+                              iree_hal_allocator_destroy,
+                              iree_hal_allocator_descriptor);
+  // NOTE: buffers route through iree_hal_buffer_recycle rather than a direct
+  // destroy when their last reference drops.
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_buffer_t, "hal.buffer",
+                              iree_hal_buffer_recycle,
+                              iree_hal_buffer_descriptor);
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_buffer_view_t, "hal.buffer_view",
+                              iree_hal_buffer_view_destroy,
+                              iree_hal_buffer_view_descriptor);
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_command_buffer_t, "hal.command_buffer",
+                              iree_hal_command_buffer_destroy,
+                              iree_hal_command_buffer_descriptor);
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_descriptor_set_t, "hal.descriptor_set",
+                              iree_hal_descriptor_set_destroy,
+                              iree_hal_descriptor_set_descriptor);
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_descriptor_set_layout_t,
+                              "hal.descriptor_set_layout",
+                              iree_hal_descriptor_set_layout_destroy,
+                              iree_hal_descriptor_set_layout_descriptor);
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_device_t, "hal.device",
+                              iree_hal_device_destroy,
+                              iree_hal_device_descriptor);
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_event_t, "hal.event",
+                              iree_hal_event_destroy,
+                              iree_hal_event_descriptor);
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_executable_t, "hal.executable",
+                              iree_hal_executable_destroy,
+                              iree_hal_executable_descriptor);
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_executable_layout_t,
+                              "hal.executable_layout",
+                              iree_hal_executable_layout_destroy,
+                              iree_hal_executable_layout_descriptor);
+  IREE_VM_REGISTER_HAL_C_TYPE(iree_hal_semaphore_t, "hal.semaphore",
+                              iree_hal_semaphore_destroy,
+                              iree_hal_semaphore_descriptor);
+
+  // Only mark success once every type registered without error.
+  has_registered = true;
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// Type wrappers
+//===----------------------------------------------------------------------===//
+
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_allocator, iree_hal_allocator_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_buffer, iree_hal_buffer_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_buffer_view, iree_hal_buffer_view_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_command_buffer,
+                             iree_hal_command_buffer_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_descriptor_set,
+                             iree_hal_descriptor_set_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_descriptor_set_layout,
+                             iree_hal_descriptor_set_layout_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_device, iree_hal_device_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_event, iree_hal_event_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_executable, iree_hal_executable_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_executable_layout,
+                             iree_hal_executable_layout_t);
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_hal_semaphore, iree_hal_semaphore_t);
+
+//===----------------------------------------------------------------------===//
+// Module type definitions
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_module_t {
+  iree_allocator_t host_allocator;
+  iree_hal_device_t* shared_device;
+  // TODO(benvanik): types.
+} iree_hal_module_t;
+
+#define IREE_HAL_MODULE_CAST(module) \
+  (iree_hal_module_t*)((uint8_t*)(module) + iree_vm_native_module_size());
+
+typedef struct iree_hal_module_state_t {
+  iree_allocator_t host_allocator;
+  iree_hal_device_t* shared_device;
+  iree_status_t loop_status;
+  iree_hal_executable_cache_t* executable_cache;
+
+  iree_hal_semaphore_t* submit_semaphore;
+  uint64_t submit_value;
+} iree_hal_module_state_t;
+
+// Module-level destructor: releases the device reference held by the module
+// itself (per-context resources are freed in iree_hal_module_free_state).
+static void IREE_API_PTR iree_hal_module_destroy(void* base_module) {
+  iree_hal_module_t* module = IREE_HAL_MODULE_CAST(base_module);
+  iree_hal_device_release(module->shared_device);
+}
+
+// Allocates and initializes per-context module state: retains the shared
+// device and creates the executable cache and submission semaphore used by
+// the module exports. On failure no partially-initialized state is leaked.
+static iree_status_t IREE_API_PTR
+iree_hal_module_alloc_state(void* self, iree_allocator_t host_allocator,
+                            iree_vm_module_state_t** out_module_state) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_module_t* module = IREE_HAL_MODULE_CAST(self);
+  iree_hal_module_state_t* state = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0,
+      iree_allocator_malloc(host_allocator, sizeof(*state), (void**)&state));
+  // Zero-fill so that the failure path below can unconditionally release
+  // sub-resources (handles for stages never reached remain NULL).
+  memset(state, 0, sizeof(*state));
+  state->host_allocator = host_allocator;
+  state->shared_device = module->shared_device;
+  iree_hal_device_retain(state->shared_device);
+  state->loop_status = iree_ok_status();
+
+  // Create sub-resources, accumulating status instead of returning early:
+  // the original early-returns leaked |state| (and the device retain) when
+  // cache or semaphore creation failed.
+  iree_status_t status = iree_hal_executable_cache_create(
+      state->shared_device, iree_string_view_empty(),
+      iree_loop_inline(&state->loop_status), &state->executable_cache);
+  if (iree_status_is_ok(status)) {
+    state->submit_value = 0ull;
+    status = iree_hal_semaphore_create(
+        state->shared_device, state->submit_value, &state->submit_semaphore);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_module_state = (iree_vm_module_state_t*)state;
+  } else {
+    // Unwind in reverse order of creation.
+    // NOTE(review): relies on the iree_hal_*_release helpers tolerating NULL
+    // for resources that were never created - confirm.
+    iree_hal_semaphore_release(state->submit_semaphore);
+    iree_hal_executable_cache_release(state->executable_cache);
+    iree_status_ignore(state->loop_status);
+    iree_hal_device_release(state->shared_device);
+    iree_allocator_free(host_allocator, state);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Tears down per-context state in reverse order of creation and returns the
+// memory to the allocator captured at alloc time.
+static void IREE_API_PTR
+iree_hal_module_free_state(void* self, iree_vm_module_state_t* module_state) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_module_state_t* state = (iree_hal_module_state_t*)module_state;
+  iree_hal_semaphore_release(state->submit_semaphore);
+  iree_hal_executable_cache_release(state->executable_cache);
+  // Any sticky failure captured by the inline loop is deliberately discarded.
+  iree_status_ignore(state->loop_status);
+  iree_hal_device_release(state->shared_device);
+  iree_allocator_free(state->host_allocator, state);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// VM lifecycle hook: when the VM signals suspend or low-memory we ask the
+// device to trim cached/pooled resources; all other signals are ignored.
+static iree_status_t IREE_API_PTR iree_hal_module_notify(
+    void* self, iree_vm_module_state_t* module_state, iree_vm_signal_t signal) {
+  iree_hal_module_state_t* state = (iree_hal_module_state_t*)module_state;
+  const bool should_trim = signal == IREE_VM_SIGNAL_SUSPEND ||
+                           signal == IREE_VM_SIGNAL_LOW_MEMORY;
+  if (should_trim) {
+    return iree_hal_device_trim(state->shared_device);
+  }
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// Experimental APIs
+//===----------------------------------------------------------------------===//
+// NOTE: Ex* APIs are experimental and likely to be removed soon. Modules
+// using these APIs are not forward compatible.
+
+// ex.shared_device: returns a retained reference to the device this module
+// instance was created with.
+IREE_VM_ABI_EXPORT(iree_hal_module_ex_shared_device,  //
+                   iree_hal_module_state_t,           //
+                   v, r) {
+  rets->r0 = iree_hal_device_retain_ref(state->shared_device);
+  return iree_ok_status();
+}
+
+// ex.submit_and_wait: submits a single command buffer to the device and
+// blocks until the module's submit semaphore reaches the freshly-bumped
+// payload value (i.e. until this submission retires).
+IREE_VM_ABI_EXPORT(iree_hal_module_ex_submit_and_wait,  //
+                   iree_hal_module_state_t,             //
+                   rr, v) {
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_command_buffer_check_deref(args->r1, &command_buffer));
+
+  // Single command buffer signaled by the shared submit semaphore at the next
+  // monotonically-increasing payload value.
+  iree_hal_command_buffer_t* command_buffer_ptrs[] = {command_buffer};
+  uint64_t next_semaphore_value = ++state->submit_value;
+  iree_hal_semaphore_t* signal_semaphore_ptrs[] = {state->submit_semaphore};
+  uint64_t signal_semaphore_values[] = {next_semaphore_value};
+
+  // Designated initializer zero-fills the fields we don't set (such as the
+  // wait semaphore list), matching the old memset.
+  iree_hal_submission_batch_t batch = {
+      .command_buffer_count = IREE_ARRAYSIZE(command_buffer_ptrs),
+      .command_buffers = command_buffer_ptrs,
+      .signal_semaphores =
+          {
+              .count = IREE_ARRAYSIZE(signal_semaphore_ptrs),
+              .semaphores = signal_semaphore_ptrs,
+              .payload_values = signal_semaphore_values,
+          },
+  };
+
+  return iree_hal_device_submit_and_wait(
+      device, IREE_HAL_COMMAND_CATEGORY_ANY, 0, 1, &batch,
+      state->submit_semaphore, next_semaphore_value, iree_infinite_timeout());
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_allocator_t
+//===----------------------------------------------------------------------===//
+
+// allocator.allocate: allocates a new buffer of |allocation_size| (i3) bytes
+// from the allocator (r0) with the requested memory type (i1) and usage (i2)
+// bits. Contents are uninitialized (empty initial-data span below).
+IREE_VM_ABI_EXPORT(iree_hal_module_allocator_allocate,  //
+                   iree_hal_module_state_t,             //
+                   riii, r) {
+  iree_hal_allocator_t* allocator = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_allocator_check_deref(args->r0, &allocator));
+  iree_hal_memory_type_t memory_types = (iree_hal_memory_type_t)args->i1;
+  iree_hal_buffer_usage_t buffer_usage = (iree_hal_buffer_usage_t)args->i2;
+  iree_vm_size_t allocation_size = (iree_vm_size_t)args->i3;
+
+  const iree_hal_buffer_params_t params = {
+      .type = memory_types,
+      .usage = buffer_usage,
+  };
+  iree_hal_buffer_t* buffer = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer(
+      allocator, params, allocation_size, iree_const_byte_span_empty(),
+      &buffer));
+  rets->r0 = iree_hal_buffer_move_ref(buffer);
+  return iree_ok_status();
+}
+
+// Release callback installed on HAL buffers imported from VM byte buffers:
+// drops the VM buffer reference that was retained when the mapping was made.
+static void iree_hal_module_mapped_buffer_release(void* user_data,
+                                                  iree_hal_buffer_t* buffer) {
+  iree_vm_buffer_t* backing_buffer = (iree_vm_buffer_t*)user_data;
+  iree_vm_buffer_release(backing_buffer);
+}
+
+// allocator.map.byte_buffer: attempts to import the host memory backing a VM
+// byte buffer (r4, range i5/i6, length -1 = remainder) as a HAL buffer
+// without copying. When the try flag (i1) is set a mapping failure yields a
+// null ref instead of a failing status so callers can fall back to a copy.
+IREE_VM_ABI_EXPORT(iree_hal_module_allocator_map_byte_buffer,  //
+                   iree_hal_module_state_t,                    //
+                   riiirii, r) {
+  iree_hal_allocator_t* allocator = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_allocator_check_deref(args->r0, &allocator));
+  bool is_try = args->i1 != 0;
+  iree_hal_memory_type_t memory_types = (iree_hal_memory_type_t)args->i2;
+  iree_hal_buffer_usage_t buffer_usage = (iree_hal_buffer_usage_t)args->i3;
+  iree_vm_buffer_t* source = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r4, &source));
+  iree_vm_size_t offset = (iree_vm_size_t)args->i5;
+  iree_vm_size_t length = (iree_vm_size_t)args->i6;
+
+  iree_host_size_t buffer_length = source->data.data_length;
+  if (length == -1) {
+    length = buffer_length;
+  }
+  // NOTE(review): offset + length could overflow iree_vm_size_t before this
+  // comparison; presumably sizes are validated upstream - confirm.
+  if (length < 0 || offset < 0 || offset > buffer_length ||
+      offset + length > buffer_length) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "byte range out of bounds (requested %d-%d of available %zu)", offset,
+        (offset + length - 1), buffer_length);
+  }
+
+  // Derive the allowed access bits from the mutability of the source buffer.
+  iree_hal_memory_access_t allowed_access = IREE_HAL_MEMORY_ACCESS_READ;
+  if (!iree_all_bits_set(source->access, IREE_VM_BUFFER_ACCESS_MUTABLE)) {
+    // Source buffer is read-only; require that the access request matches.
+    if (!iree_all_bits_set(buffer_usage, IREE_HAL_BUFFER_USAGE_CONSTANT)) {
+      return iree_make_status(IREE_STATUS_PERMISSION_DENIED,
+                              "source buffer is immutable and can only be "
+                              "mapped for constant usage");
+    }
+
+    // NOTE: if we wanted to lock things down for when there's no MMU to ensure
+    // that the loaded program doesn't touch the memory then we could just fail
+    // the request - the program will then perform an alloc+copy and can do
+    // whatever it wants with the memory.
+  } else {
+    // Source buffer is mutable; allow in-place writes.
+    if (!iree_all_bits_set(buffer_usage, IREE_HAL_BUFFER_USAGE_CONSTANT)) {
+      allowed_access |= IREE_HAL_MEMORY_ACCESS_WRITE;
+    }
+  }
+
+  // Try mapping - note that this may fail if the target device cannot map the
+  // memory into the given type (for example, mapping a host buffer into
+  // device-local memory is only going to work on unified memory systems).
+  const iree_hal_buffer_params_t params = {
+      .type = memory_types,
+      .usage = buffer_usage,
+      .access = allowed_access,
+  };
+  iree_hal_external_buffer_t external_buffer = {
+      .type = IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION,
+      .flags = IREE_HAL_EXTERNAL_BUFFER_FLAG_NONE,
+      .size = length,
+      .handle.host_allocation.ptr = source->data.data + offset,
+  };
+  iree_hal_buffer_release_callback_t release_callback = {
+      .fn = iree_hal_module_mapped_buffer_release,
+      .user_data = source,
+  };
+  iree_hal_buffer_t* buffer = NULL;
+  iree_status_t status = iree_hal_allocator_import_buffer(
+      allocator, params, &external_buffer, release_callback, &buffer);
+  if (iree_status_is_ok(status)) {
+    // Mapping succeeded - retain the source buffer; it will be released by
+    // iree_hal_module_mapped_buffer_release when the mapping is dropped.
+    iree_vm_buffer_retain(source);
+    rets->r0 = iree_hal_buffer_move_ref(buffer);
+    return iree_ok_status();
+  }
+
+  // Failed to map - if this was a try then don't fail and just rely on the
+  // result being nullptr to indicate to the caller that things failed.
+  memset(&rets->r0, 0, sizeof(rets->r0));
+  if (is_try) {
+    iree_status_ignore(status);
+    return iree_ok_status();
+  }
+  return status;
+}
+
+// TODO(#7277): drop this method (use map instead) with streams.
+// allocator.wrap.byte_buffer: despite the name this *copies* the selected
+// byte range out of the VM buffer into a newly allocated HAL buffer (see the
+// allocate_buffer call below with an initial-data span). length (i5) of -1
+// means "the remainder of the buffer from offset (i4)".
+IREE_VM_ABI_EXPORT(iree_hal_module_allocator_wrap_byte_buffer,  //
+                   iree_hal_module_state_t,                     //
+                   riirii, r) {
+  iree_hal_allocator_t* allocator = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_allocator_check_deref(args->r0, &allocator));
+  iree_hal_memory_type_t memory_types = (iree_hal_memory_type_t)args->i1;
+  iree_hal_buffer_usage_t buffer_usage = (iree_hal_buffer_usage_t)args->i2;
+  iree_vm_buffer_t* source = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r3, &source));
+  iree_vm_size_t offset = (iree_vm_size_t)args->i4;
+  iree_vm_size_t length = (iree_vm_size_t)args->i5;
+
+  iree_host_size_t buffer_length = source->data.data_length;
+  if (length == -1) {
+    length = buffer_length;
+  }
+  // NOTE(review): same potential offset + length overflow as in
+  // map_byte_buffer above - confirm upstream validation.
+  if (length < 0 || offset < 0 || offset > buffer_length ||
+      offset + length > buffer_length) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "byte range out of bounds (requested %d-%d of available %zu)", offset,
+        (offset + length - 1), buffer_length);
+  }
+
+  const iree_hal_buffer_params_t params = {
+      .type = memory_types,
+      .usage = buffer_usage,
+  };
+  iree_hal_buffer_t* buffer = NULL;
+  // Allocates and copies the source range in one step; the source VM buffer
+  // is not retained since the HAL buffer owns its own storage.
+  IREE_RETURN_IF_ERROR(
+      iree_hal_allocator_allocate_buffer(
+          allocator, params, length,
+          iree_make_const_byte_span(source->data.data + offset, length),
+          &buffer),
+      "failed to allocate buffer of length %d", length);
+
+  rets->r0 = iree_hal_buffer_move_ref(buffer);
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_buffer_t
+//===----------------------------------------------------------------------===//
+
+// Validates that |buffer| satisfies the minimum byte length, memory type
+// bits, and usage bits the compiled program requires. The buffer may be
+// larger or carry more capability bits than requested; only missing
+// capabilities fail. Errors are prefixed with the caller-supplied |message|.
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_assert,  //
+                   iree_hal_module_state_t,        //
+                   rrriii, v) {
+  iree_hal_buffer_t* buffer = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r0, &buffer));
+  iree_vm_buffer_t* message = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r1, &message));
+  iree_string_view_t message_str IREE_ATTRIBUTE_UNUSED =
+      iree_vm_buffer_as_string(message);
+  iree_hal_allocator_t* allocator = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_allocator_check_deref(args->r2, &allocator));
+  iree_vm_size_t minimum_length = (iree_vm_size_t)args->i3;
+  iree_hal_memory_type_t required_memory_types =
+      (iree_hal_memory_type_t)args->i4;
+  iree_hal_buffer_usage_t required_buffer_usage =
+      (iree_hal_buffer_usage_t)args->i5;
+
+  // Ensure we have enough bytes in the buffer for the encoding we have.
+  // Note that having more bytes is fine:
+  //   assert(expected_length <= actual_length);
+  iree_device_size_t actual_length = iree_hal_buffer_byte_length(buffer);
+  if (actual_length < minimum_length) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "%.*s buffer byte length %" PRIdsz " less than expected minimum %d",
+        (int)message_str.size, message_str.data, actual_length, minimum_length);
+  }
+
+  // TODO(benvanik): assert that the buffer view is accessible from the
+  // target device. This needs some iree_hal_allocator_* methods for checking
+  // whether the external buffer can be used. To start we just compare if the
+  // allocators are identical.
+
+  // All memory type bits expected (indicating where the program intends to use
+  // the buffer data) must be set in the buffer while the buffer is allowed to
+  // have more bits.
+  iree_hal_memory_type_t actual_memory_type =
+      iree_hal_buffer_memory_type(buffer);
+  if (!iree_all_bits_set(actual_memory_type, required_memory_types)) {
+#if IREE_HAL_MODULE_STRING_UTIL_ENABLE
+    iree_bitfield_string_temp_t temp0, temp1;
+    iree_string_view_t actual_memory_type_str =
+        iree_hal_memory_type_format(actual_memory_type, &temp0);
+    iree_string_view_t expected_memory_type_str =
+        iree_hal_memory_type_format(required_memory_types, &temp1);
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "%.*s buffer memory type is not compatible; buffer has %.*s, operation "
+        "requires %.*s",
+        (int)message_str.size, message_str.data,
+        (int)actual_memory_type_str.size, actual_memory_type_str.data,
+        (int)expected_memory_type_str.size, expected_memory_type_str.data);
+#else
+    // NOTE: must reference |required_memory_types| declared above; the prior
+    // `expected_memory_type` identifier did not exist and broke builds with
+    // string utils disabled.
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "%.*s buffer memory type is not compatible; buffer has %08X, operation "
+        "requires %08X",
+        (int)message_str.size, message_str.data, actual_memory_type,
+        required_memory_types);
+#endif  // IREE_HAL_MODULE_STRING_UTIL_ENABLE
+  }
+
+  // All usage bits expected (indicating what the program intends to use the
+  // buffer for) must be set in the buffer while the buffer is allowed to have
+  // more bits.
+  iree_hal_buffer_usage_t actual_buffer_usage =
+      iree_hal_buffer_allowed_usage(buffer);
+  if (!iree_all_bits_set(actual_buffer_usage, required_buffer_usage)) {
+#if IREE_HAL_MODULE_STRING_UTIL_ENABLE
+    iree_bitfield_string_temp_t temp0, temp1;
+    iree_string_view_t allowed_usage_str =
+        iree_hal_buffer_usage_format(actual_buffer_usage, &temp0);
+    iree_string_view_t required_usage_str =
+        iree_hal_buffer_usage_format(required_buffer_usage, &temp1);
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "%.*s requested usage was not specified when the buffer was allocated; "
+        "buffer allows %.*s, operation requires %.*s",
+        (int)message_str.size, message_str.data, (int)allowed_usage_str.size,
+        allowed_usage_str.data, (int)required_usage_str.size,
+        required_usage_str.data);
+#else
+    // NOTE: must reference |actual_buffer_usage| declared above; the prior
+    // `allowed_buffer_usage` identifier was undefined in this branch.
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "%.*s requested usage was not specified when the buffer was allocated; "
+        "buffer allows %08X, operation requires %08X",
+        (int)message_str.size, message_str.data, actual_buffer_usage,
+        required_buffer_usage);
+#endif  // IREE_HAL_MODULE_STRING_UTIL_ENABLE
+  }
+
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_subspan,  //
+                   iree_hal_module_state_t,         //
+                   rii, r) {
+  // Resolve the parent buffer and the requested byte range.
+  iree_hal_buffer_t* parent = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r0, &parent));
+  iree_vm_size_t span_offset = (iree_vm_size_t)args->i1;
+  iree_vm_size_t span_length = (iree_vm_size_t)args->i2;
+
+  // Create a buffer referencing the byte range of the parent; range
+  // validation happens inside iree_hal_buffer_subspan.
+  iree_hal_buffer_t* span = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_subspan(parent, span_offset, span_length, &span),
+      "invalid subspan of an existing buffer (source_offset=%d, length=%d)",
+      span_offset, span_length);
+  rets->r0 = iree_hal_buffer_move_ref(span);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_length,  //
+                   iree_hal_module_state_t,        //
+                   r, i) {
+  // Returns the byte length of the buffer as an i32 register value.
+  iree_hal_buffer_t* queried_buffer = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r0, &queried_buffer));
+  rets->i0 = iree_hal_buffer_byte_length(queried_buffer);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_load,  //
+                   iree_hal_module_state_t,      //
+                   rii, i) {
+  // Reads up to 4 bytes from a device buffer into an i32 register.
+  iree_hal_buffer_t* src = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r0, &src));
+  iree_vm_size_t src_offset = (iree_vm_size_t)args->i1;
+  iree_vm_size_t byte_count = (iree_vm_size_t)args->i2;
+
+  uint32_t loaded_value = 0;
+  if (byte_count > sizeof(loaded_value)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "load length byte count %d exceeds max",
+                            byte_count);
+  }
+
+  // Device->host transfer into the stack value; waits with an infinite
+  // timeout before returning.
+  IREE_RETURN_IF_ERROR(iree_hal_device_transfer_d2h(
+      state->shared_device, src, src_offset, &loaded_value, byte_count,
+      IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));
+
+  rets->i0 = loaded_value;
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_store,  //
+                   iree_hal_module_state_t,       //
+                   irii, v) {
+  // Writes up to 4 bytes of |stored_value| into a device buffer.
+  int32_t stored_value = args->i0;
+  iree_hal_buffer_t* dst = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r1, &dst));
+  iree_vm_size_t dst_offset = (iree_vm_size_t)args->i2;
+  iree_vm_size_t byte_count = (iree_vm_size_t)args->i3;
+
+  // Validate both the store width and that the range fits in the buffer.
+  if (byte_count > sizeof(stored_value)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "store length byte count %d exceeds max",
+                            byte_count);
+  } else if (dst_offset + byte_count > iree_hal_buffer_byte_length(dst)) {
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "store out of bounds (target_offset=%d, length=%d into max %" PRIdsz
+        ")",
+        dst_offset, byte_count, iree_hal_buffer_byte_length(dst));
+  }
+
+  // Host->device transfer; waits with an infinite timeout.
+  return iree_hal_device_transfer_h2d(
+      state->shared_device, &stored_value, dst, dst_offset, byte_count,
+      IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout());
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_buffer_view_t
+//===----------------------------------------------------------------------===//
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_create,  //
+                   iree_hal_module_state_t,             //
+                   riiCiD, r) {
+  // Wraps a buffer in a view carrying shape/element-type/encoding metadata.
+  iree_hal_buffer_t* backing_buffer = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r0, &backing_buffer));
+  iree_hal_element_type_t element_type = (iree_hal_element_type_t)args->i1;
+  iree_hal_encoding_type_t encoding_type = (iree_hal_encoding_type_t)args->i2;
+
+  // Stage the variadic dimension list on the stack (bounded at 128 dims).
+  iree_host_size_t shape_rank = 0;
+  iree_hal_dim_t* shape_dims = NULL;
+  IREE_VM_ABI_VLA_STACK_CAST(args, a3_count, a3, iree_hal_dim_t, 128,
+                             &shape_rank, &shape_dims);
+
+  iree_hal_buffer_view_t* view = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_create(
+      backing_buffer, shape_dims, shape_rank, element_type, encoding_type,
+      state->host_allocator, &view));
+  rets->r0 = iree_hal_buffer_view_move_ref(view);
+  return iree_ok_status();
+}
+
+// Returns true if a request for |expected_type| can be satisfied by a buffer
+// view whose element type is |actual_type|. Exact matches are not required:
+// opaque data bypasses checking entirely and same-width integer types are
+// treated as interchangeable.
+static bool iree_hal_element_types_are_compatible(
+    iree_hal_element_type_t actual_type,
+    iree_hal_element_type_t expected_type) {
+  // Opaque data matches anything; applications use this to skip verification
+  // when shuttling around buffer contents as binary blobs.
+  if (iree_hal_element_numerical_type_is_opaque(actual_type)) return true;
+
+  // Integers that only differ in signedness are allowed to cast: data can be
+  // treated as signless while still permitting signed/unsigned views (e.g.
+  // tensor<1xi32> satisfies a tensor<1xui32> expectation).
+  const bool both_integers =
+      iree_hal_element_numerical_type_is_integer(actual_type) &&
+      iree_hal_element_numerical_type_is_integer(expected_type);
+  if (both_integers && iree_hal_element_bit_count(actual_type) ==
+                           iree_hal_element_bit_count(expected_type)) {
+    return true;
+  }
+
+  // Conservative fallback: demand an exact match. Callers can pass OPAQUE
+  // types to bypass this if it proves too strict.
+  return actual_type == expected_type;
+}
+
+// Verifies that |buffer_view| matches the encoding, element type, and shape
+// the program expects, with errors prefixed by the caller-provided |message|.
+// Checks are ordered from most to least severe: encoding, then element type,
+// then rank, then individual dimensions.
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_assert,  //
+                   iree_hal_module_state_t,             //
+                   rriiCiD, v) {
+  iree_hal_buffer_view_t* buffer_view = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_view_check_deref(args->r0, &buffer_view));
+  iree_vm_buffer_t* message = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r1, &message));
+  iree_string_view_t message_str IREE_ATTRIBUTE_UNUSED =
+      iree_vm_buffer_as_string(message);
+  iree_hal_element_type_t expected_element_type =
+      (iree_hal_element_type_t)args->i2;
+  iree_hal_encoding_type_t expected_encoding_type =
+      (iree_hal_encoding_type_t)args->i3;
+  // Stage the variadic expected-dimension list on the stack (max 128 dims).
+  iree_host_size_t expected_shape_rank = 0;
+  iree_hal_dim_t* expected_shape_dims = NULL;
+  IREE_VM_ABI_VLA_STACK_CAST(args, a4_count, a4, iree_hal_dim_t, 128,
+                             &expected_shape_rank, &expected_shape_dims);
+
+  // Check encoding first; getting the encoding wrong is worse than the shape.
+  // If the actual encoding is opaque we allow it to pass through - this lets
+  // users override the assertion in the case where they are just passing data
+  // around and don't care about the contents.
+  iree_hal_encoding_type_t actual_encoding_type =
+      iree_hal_buffer_view_encoding_type(buffer_view);
+  if (actual_encoding_type != IREE_HAL_ENCODING_TYPE_OPAQUE &&
+      actual_encoding_type != expected_encoding_type) {
+    // TODO(benvanik): string formatting of encodings.
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "%.*s encoding mismatch; expected %08X but have %08X",
+        (int)message_str.size, message_str.data, expected_encoding_type,
+        actual_encoding_type);
+  }
+
+  // Element types determine the storage requirements.
+  // If the actual element type is opaque we allow it to pass through.
+  iree_hal_element_type_t actual_element_type =
+      iree_hal_buffer_view_element_type(buffer_view);
+  if (!iree_hal_element_types_are_compatible(actual_element_type,
+                                             expected_element_type)) {
+#if IREE_HAL_MODULE_STRING_UTIL_ENABLE
+    // Format both types into human-readable names for the error message.
+    char actual_element_type_str[32];
+    iree_host_size_t actual_element_type_str_length = 0;
+    char expected_element_type_str[32];
+    iree_host_size_t expected_element_type_str_length = 0;
+    IREE_RETURN_IF_ERROR(iree_hal_format_element_type(
+        actual_element_type, sizeof(actual_element_type_str),
+        actual_element_type_str, &actual_element_type_str_length));
+    IREE_RETURN_IF_ERROR(iree_hal_format_element_type(
+        expected_element_type, sizeof(expected_element_type_str),
+        expected_element_type_str, &expected_element_type_str_length));
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "%.*s element type mismatch; expected %.*s (%08X) but have %.*s (%08X)",
+        (int)message_str.size, message_str.data,
+        (int)expected_element_type_str_length, expected_element_type_str,
+        expected_element_type, (int)actual_element_type_str_length,
+        actual_element_type_str, actual_element_type);
+#else
+    // Numeric-only message when string utils are compiled out.
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "%.*s element type mismatch; expected %08X but have %08X",
+        (int)message_str.size, message_str.data, expected_element_type,
+        actual_element_type);
+#endif  // IREE_HAL_MODULE_STRING_UTIL_ENABLE
+  }
+
+  // Rank check before the individual shape dimensions.
+  iree_host_size_t actual_shape_rank =
+      iree_hal_buffer_view_shape_rank(buffer_view);
+  const iree_hal_dim_t* actual_shape_dims =
+      iree_hal_buffer_view_shape_dims(buffer_view);
+  iree_status_t shape_status = iree_ok_status();
+  if (actual_shape_rank != expected_shape_rank) {
+    shape_status =
+        iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                         "%.*s shape rank mismatch; expected %zu but have %zu",
+                         (int)message_str.size, message_str.data,
+                         expected_shape_rank, actual_shape_rank);
+  }
+  // Dimension-by-dimension comparison only runs when ranks already match;
+  // the first mismatching dimension stops the scan.
+  if (iree_status_is_ok(shape_status)) {
+    for (iree_host_size_t i = 0; i < actual_shape_rank; ++i) {
+      if (actual_shape_dims[i] == expected_shape_dims[i]) continue;
+      // Dimension mismatch.
+      shape_status = iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "%.*s shape dimension %zu mismatch; expected %d but have %d",
+          (int)message_str.size, message_str.data, i, expected_shape_dims[i],
+          actual_shape_dims[i]);
+      break;
+    }
+  }
+
+#if IREE_HAL_MODULE_STRING_UTIL_ENABLE
+  // When string utils are available annotate any shape failure with both full
+  // shapes to make the mismatch easier to diagnose.
+  if (!iree_status_is_ok(shape_status)) {
+    char actual_shape_str[32];
+    iree_host_size_t actual_shape_str_length = 0;
+    char expected_shape_str[32];
+    iree_host_size_t expected_shape_str_length = 0;
+    IREE_RETURN_IF_ERROR(iree_hal_format_shape(
+        actual_shape_dims, actual_shape_rank, sizeof(actual_shape_str),
+        actual_shape_str, &actual_shape_str_length));
+    IREE_RETURN_IF_ERROR(iree_hal_format_shape(
+        expected_shape_dims, expected_shape_rank, sizeof(expected_shape_str),
+        expected_shape_str, &expected_shape_str_length));
+    shape_status = iree_status_annotate_f(
+        shape_status, "expected shape %.*s, actual shape %.*s",
+        (int)expected_shape_str_length, expected_shape_str,
+        (int)actual_shape_str_length, actual_shape_str);
+  }
+#endif  // IREE_HAL_MODULE_STRING_UTIL_ENABLE
+
+  return shape_status;
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_buffer,  //
+                   iree_hal_module_state_t,             //
+                   r, r) {
+  // Hands out a retained reference to the buffer backing the view.
+  iree_hal_buffer_view_t* view = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_check_deref(args->r0, &view));
+  rets->r0 = iree_hal_buffer_retain_ref(iree_hal_buffer_view_buffer(view));
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_byte_length,  //
+                   iree_hal_module_state_t,                  //
+                   r, i) {
+  // Queries the byte length of the view and returns it as an i32.
+  iree_hal_buffer_view_t* view = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_check_deref(args->r0, &view));
+  rets->i0 = (iree_vm_size_t)iree_hal_buffer_view_byte_length(view);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_element_type,  //
+                   iree_hal_module_state_t,                   //
+                   r, i) {
+  // Returns the element type enum value of the view as an i32.
+  iree_hal_buffer_view_t* view = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_check_deref(args->r0, &view));
+  rets->i0 = (uint32_t)iree_hal_buffer_view_element_type(view);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_encoding_type,  //
+                   iree_hal_module_state_t,                    //
+                   r, i) {
+  // Returns the encoding type enum value of the view as an i32.
+  iree_hal_buffer_view_t* view = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_check_deref(args->r0, &view));
+  rets->i0 = (uint32_t)iree_hal_buffer_view_encoding_type(view);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_rank,  //
+                   iree_hal_module_state_t,           //
+                   r, i) {
+  // Returns the number of shape dimensions of the view.
+  iree_hal_buffer_view_t* view = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_check_deref(args->r0, &view));
+  rets->i0 = (iree_vm_size_t)iree_hal_buffer_view_shape_rank(view);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_dim,  //
+                   iree_hal_module_state_t,          //
+                   ri, i) {
+  // Returns the extent of the view's shape at |dim_index|.
+  iree_hal_buffer_view_t* view = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_check_deref(args->r0, &view));
+  iree_vm_size_t dim_index = (iree_vm_size_t)args->i1;
+  rets->i0 = (iree_vm_size_t)iree_hal_buffer_view_shape_dim(view, dim_index);
+  return iree_ok_status();
+}
+
+// Prints each provided buffer view to stderr under a `=== key ===` banner.
+// Becomes a no-op when IREE_HAL_MODULE_STRING_UTIL_ENABLE is 0 (the whole
+// body is compiled out).
+IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_trace,  //
+                   iree_hal_module_state_t,            //
+                   rCrD, v) {
+#if IREE_HAL_MODULE_STRING_UTIL_ENABLE
+
+  iree_vm_buffer_t* key = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r0, &key));
+  iree_string_view_t key_str = iree_vm_buffer_as_string(key);
+
+  fprintf(stderr, "=== %.*s ===\n", (int)key_str.size, key_str.data);
+  for (iree_host_size_t i = 0; i < args->a1_count; ++i) {
+    iree_hal_buffer_view_t* buffer_view = NULL;
+    IREE_RETURN_IF_ERROR(
+        iree_hal_buffer_view_check_deref(args->a1[i].r0, &buffer_view));
+
+    // NOTE: this export is for debugging only and a no-op in min-size builds.
+    // We heap-alloc here because at the point this export is used performance
+    // is not a concern.
+
+    // Query total length (excluding NUL terminator).
+    // The sizing pass is expected to fail with OUT_OF_RANGE (no storage was
+    // provided); any other status - including OK - is propagated as-is.
+    iree_host_size_t result_length = 0;
+    iree_status_t status = iree_hal_buffer_view_format(buffer_view, SIZE_MAX, 0,
+                                                       NULL, &result_length);
+    if (!iree_status_is_out_of_range(status)) {
+      return status;
+    }
+    ++result_length;  // include NUL
+
+    // Allocate scratch heap memory to contain the result and format into it.
+    char* result_str = NULL;
+    IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+        state->host_allocator, result_length, (void**)&result_str));
+    status = iree_hal_buffer_view_format(buffer_view, SIZE_MAX, result_length,
+                                         result_str, &result_length);
+    if (iree_status_is_ok(status)) {
+      fprintf(stderr, "%.*s\n", (int)result_length, result_str);
+    }
+    // Free the scratch string before surfacing any formatting failure so the
+    // allocation never leaks.
+    iree_allocator_free(state->host_allocator, result_str);
+    IREE_RETURN_IF_ERROR(status);
+  }
+  fprintf(stderr, "\n");
+
+#endif  // IREE_HAL_MODULE_STRING_UTIL_ENABLE
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_command_buffer_t
+//===----------------------------------------------------------------------===//
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_create,  //
+                   iree_hal_module_state_t,                //
+                   rii, r) {
+  // Creates a command buffer on |device| with caller-specified mode and
+  // category bits; queue affinity is left as ANY.
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
+  iree_hal_command_buffer_mode_t mode_flags =
+      (iree_hal_command_buffer_mode_t)args->i1;
+  iree_hal_command_category_t category_flags =
+      (iree_hal_command_category_t)args->i2;
+
+  iree_hal_command_buffer_t* created = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create(
+      device, mode_flags, category_flags, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &created));
+  rets->r0 = iree_hal_command_buffer_move_ref(created);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_begin,  //
+                   iree_hal_module_state_t,               //
+                   r, v) {
+  // Thin forwarder to iree_hal_command_buffer_begin.
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  return iree_hal_command_buffer_begin(cmdbuf);
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_end,  //
+                   iree_hal_module_state_t,             //
+                   r, v) {
+  // Thin forwarder to iree_hal_command_buffer_end.
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  return iree_hal_command_buffer_end(cmdbuf);
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_begin_debug_group,  //
+                   iree_hal_module_state_t,                           //
+                   rr, v) {
+  // Opens a labeled debug group in the command buffer.
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  iree_vm_buffer_t* label = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r1, &label));
+  iree_string_view_t label_str = iree_vm_buffer_as_string(label);
+
+  // Source location is not yet plumbed through; record an empty one.
+  // TODO(benvanik): query from VM.
+  iree_hal_label_location_t location;
+  location.file = iree_string_view_empty();
+  location.line = 0;
+
+  iree_hal_command_buffer_begin_debug_group(
+      cmdbuf, label_str, iree_hal_label_color_unspecified(), &location);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_end_debug_group,  //
+                   iree_hal_module_state_t,                         //
+                   r, v) {
+  // Closes the most recently opened debug group.
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  iree_hal_command_buffer_end_debug_group(cmdbuf);
+  return iree_ok_status();
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_execution_barrier,  //
+                   iree_hal_module_state_t,                           //
+                   riii, v) {
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  iree_hal_execution_stage_t src_stage_mask =
+      (iree_hal_execution_stage_t)args->i1;
+  iree_hal_execution_stage_t dst_stage_mask =
+      (iree_hal_execution_stage_t)args->i2;
+  iree_hal_execution_barrier_flags_t barrier_flags =
+      (iree_hal_execution_barrier_flags_t)args->i3;
+
+  // Until fine-grained barriers are decoded we emit a single conservative
+  // global barrier covering dispatch writes -> dispatch reads.
+  // TODO(benvanik): decode barriers.
+  iree_hal_memory_barrier_t global_barrier;
+  global_barrier.source_scope = IREE_HAL_ACCESS_SCOPE_DISPATCH_WRITE;
+  global_barrier.target_scope = IREE_HAL_ACCESS_SCOPE_DISPATCH_READ;
+  return iree_hal_command_buffer_execution_barrier(
+      cmdbuf, src_stage_mask, dst_stage_mask, barrier_flags,
+      /*memory_barrier_count=*/1, &global_barrier,
+      /*buffer_barrier_count=*/0, NULL);
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_fill_buffer,  //
+                   iree_hal_module_state_t,                     //
+                   rriiii, v) {
+  // Records a fill of |fill_length| bytes with a repeating |pattern| of
+  // |pattern_length| bytes.
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  iree_hal_buffer_t* dst = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r1, &dst));
+  iree_vm_size_t dst_offset = (iree_vm_size_t)args->i2;
+  iree_vm_size_t fill_length = (iree_vm_size_t)args->i3;
+  uint32_t pattern = (uint32_t)args->i4;
+  uint32_t pattern_length = (uint32_t)args->i5;
+  return iree_hal_command_buffer_fill_buffer(
+      cmdbuf, dst, dst_offset, fill_length, &pattern, pattern_length);
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_copy_buffer,  //
+                   iree_hal_module_state_t,                     //
+                   rririi, v) {
+  // Records a |copy_length|-byte copy between two buffer ranges.
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  iree_hal_buffer_t* src = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r1, &src));
+  iree_vm_size_t src_offset = (iree_vm_size_t)args->i2;
+  iree_hal_buffer_t* dst = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r3, &dst));
+  iree_vm_size_t dst_offset = (iree_vm_size_t)args->i4;
+  iree_vm_size_t copy_length = (iree_vm_size_t)args->i5;
+  return iree_hal_command_buffer_copy_buffer(cmdbuf, src, src_offset, dst,
+                                             dst_offset, copy_length);
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_push_constants,  //
+                   iree_hal_module_state_t,                        //
+                   rriCiD, v) {
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  iree_hal_executable_layout_t* layout = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_executable_layout_check_deref(args->r1, &layout));
+  iree_vm_size_t base_offset = (iree_vm_size_t)args->i2;
+
+  // The variadic i32 values are reinterpreted in place - no copy is needed.
+  const uint32_t* constant_values = (const uint32_t*)&args->a3[0].i0;
+  iree_host_size_t constant_count = args->a3_count;
+
+  // Offset and count arrive in uint32 elements and are converted to bytes.
+  return iree_hal_command_buffer_push_constants(
+      cmdbuf, layout, base_offset * sizeof(uint32_t), constant_values,
+      constant_count * sizeof(uint32_t));
+}
+
+// Pushes an inline descriptor set of buffer bindings onto the command buffer.
+// Each variadic tuple is (i0=binding ordinal, r1=buffer, i2=byte offset,
+// i3=byte length).
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_push_descriptor_set,  //
+                   iree_hal_module_state_t,                             //
+                   rriCiriiD, v) {
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_command_buffer_check_deref(args->r0, &command_buffer));
+  iree_hal_executable_layout_t* executable_layout = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_executable_layout_check_deref(args->r1, &executable_layout));
+  iree_vm_size_t set = args->i2;
+
+  // Bound the count before the alloca below so a hostile module cannot blow
+  // the stack.
+  iree_host_size_t binding_count = args->a3_count;
+  if (IREE_UNLIKELY(binding_count >
+                    IREE_HAL_MODULE_MAX_DESCRIPTOR_BINDING_COUNT)) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE, "binding count %zu > %zu",
+                            binding_count,
+                            IREE_HAL_MODULE_MAX_DESCRIPTOR_BINDING_COUNT);
+  }
+  // Stage the decoded bindings on the stack for the duration of the call.
+  iree_hal_descriptor_set_binding_t* bindings =
+      (iree_hal_descriptor_set_binding_t*)iree_alloca(
+          binding_count * sizeof(iree_hal_descriptor_set_binding_t));
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    IREE_RETURN_IF_ERROR(
+        iree_hal_buffer_check_deref(args->a3[i].r1, &bindings[i].buffer));
+    bindings[i].binding = (uint32_t)args->a3[i].i0;
+    bindings[i].offset = (iree_device_size_t)args->a3[i].i2;
+    bindings[i].length = (iree_device_size_t)args->a3[i].i3;
+  }
+
+  return iree_hal_command_buffer_push_descriptor_set(
+      command_buffer, executable_layout, set, binding_count, bindings);
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_bind_descriptor_set,  //
+                   iree_hal_module_state_t,                             //
+                   rrirCiD, v) {
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  iree_hal_executable_layout_t* layout = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_executable_layout_check_deref(args->r1, &layout));
+  int32_t set_ordinal = args->i2;
+  iree_hal_descriptor_set_t* set = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_descriptor_set_check_deref(args->r3, &set));
+
+  // Widen the variadic i32 offsets to device-size values on the stack
+  // (bounded at 64 entries).
+  iree_host_size_t dynamic_offset_count = 0;
+  iree_device_size_t* dynamic_offsets = NULL;
+  IREE_VM_ABI_VLA_STACK_CAST(args, a4_count, a4, iree_device_size_t, 64,
+                             &dynamic_offset_count, &dynamic_offsets);
+
+  return iree_hal_command_buffer_bind_descriptor_set(
+      cmdbuf, layout, set_ordinal, set, dynamic_offset_count, dynamic_offsets);
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_dispatch,  //
+                   iree_hal_module_state_t,                  //
+                   rriiii, v) {
+  // Records a dispatch of |entry_point| with explicit XYZ workgroup counts.
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  iree_hal_executable_t* executable = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_executable_check_deref(args->r1, &executable));
+  uint32_t entry_point = (uint32_t)args->i2;
+  uint32_t workgroup_count_x = (uint32_t)args->i3;
+  uint32_t workgroup_count_y = (uint32_t)args->i4;
+  uint32_t workgroup_count_z = (uint32_t)args->i5;
+  return iree_hal_command_buffer_dispatch(cmdbuf, executable, entry_point,
+                                          workgroup_count_x, workgroup_count_y,
+                                          workgroup_count_z);
+}
+
+IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_dispatch_indirect,  //
+                   iree_hal_module_state_t,                           //
+                   rriri, v) {
+  // Records a dispatch whose workgroup counts are sourced from
+  // |workgroups_buffer| at |workgroups_offset|.
+  iree_hal_command_buffer_t* cmdbuf = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_check_deref(args->r0, &cmdbuf));
+  iree_hal_executable_t* executable = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_executable_check_deref(args->r1, &executable));
+  uint32_t entry_point = (uint32_t)args->i2;
+  iree_hal_buffer_t* workgroups_buffer = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_buffer_check_deref(args->r3, &workgroups_buffer));
+  iree_vm_size_t workgroups_offset = (iree_vm_size_t)args->i4;
+  return iree_hal_command_buffer_dispatch_indirect(
+      cmdbuf, executable, entry_point, workgroups_buffer, workgroups_offset);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_descriptor_set_t
+//===----------------------------------------------------------------------===//
+
+// Creates a descriptor set from a variadic list of buffer bindings.
+// Each variadic tuple is (i0=binding ordinal, r1=buffer, i2=byte offset,
+// i3=byte length).
+IREE_VM_ABI_EXPORT(iree_hal_module_descriptor_set_create,  //
+                   iree_hal_module_state_t,                //
+                   rrCiriiD, r) {
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
+  iree_hal_descriptor_set_layout_t* set_layout = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_descriptor_set_layout_check_deref(args->r1, &set_layout));
+
+  // Bound the count before the alloca below so a hostile module cannot blow
+  // the stack.
+  iree_host_size_t binding_count = args->a2_count;
+  if (IREE_UNLIKELY(binding_count >
+                    IREE_HAL_MODULE_MAX_DESCRIPTOR_BINDING_COUNT)) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE, "binding count %zu > %zu",
+                            binding_count,
+                            IREE_HAL_MODULE_MAX_DESCRIPTOR_BINDING_COUNT);
+  }
+  // Stage the decoded bindings on the stack for the duration of the call.
+  iree_hal_descriptor_set_binding_t* bindings =
+      (iree_hal_descriptor_set_binding_t*)iree_alloca(
+          binding_count * sizeof(iree_hal_descriptor_set_binding_t));
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    IREE_RETURN_IF_ERROR(
+        iree_hal_buffer_check_deref(args->a2[i].r1, &bindings[i].buffer));
+    bindings[i].binding = (uint32_t)args->a2[i].i0;
+    bindings[i].offset = (iree_device_size_t)args->a2[i].i2;
+    bindings[i].length = (iree_device_size_t)args->a2[i].i3;
+  }
+
+  iree_hal_descriptor_set_t* descriptor_set = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_descriptor_set_create(
+      device, set_layout, binding_count, bindings, &descriptor_set));
+  rets->r0 = iree_hal_descriptor_set_move_ref(descriptor_set);
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_descriptor_set_layout
+//===----------------------------------------------------------------------===//
+
+// Creates a descriptor set layout from a variadic list of binding
+// declarations. Each variadic tuple is (i0=binding ordinal,
+// i1=descriptor type).
+IREE_VM_ABI_EXPORT(iree_hal_module_descriptor_set_layout_create,  //
+                   iree_hal_module_state_t,                       //
+                   riCiiD, r) {
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
+  iree_hal_descriptor_set_layout_usage_type_t usage_type =
+      (iree_hal_descriptor_set_layout_usage_type_t)args->i1;
+
+  // Bound the count before the alloca below so a hostile module cannot blow
+  // the stack.
+  iree_host_size_t binding_count = args->a2_count;
+  if (IREE_UNLIKELY(binding_count >
+                    IREE_HAL_MODULE_MAX_DESCRIPTOR_BINDING_COUNT)) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE, "binding count %zu > %zu",
+                            binding_count,
+                            IREE_HAL_MODULE_MAX_DESCRIPTOR_BINDING_COUNT);
+  }
+  // Stage the decoded binding declarations on the stack.
+  iree_hal_descriptor_set_layout_binding_t* bindings =
+      (iree_hal_descriptor_set_layout_binding_t*)iree_alloca(
+          binding_count * sizeof(iree_hal_descriptor_set_layout_binding_t));
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    bindings[i].binding = (uint32_t)args->a2[i].i0;
+    bindings[i].type = (iree_hal_descriptor_type_t)args->a2[i].i1;
+  }
+
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_descriptor_set_layout_create(
+      device, usage_type, binding_count, bindings, &descriptor_set_layout));
+  rets->r0 = iree_hal_descriptor_set_layout_move_ref(descriptor_set_layout);
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_device_t
+//===----------------------------------------------------------------------===//
+
+// Returns the allocator owned by |device| (r -> r) with a retained reference
+// that is transferred to the VM caller.
+IREE_VM_ABI_EXPORT(iree_hal_module_device_allocator,  //
+                   iree_hal_module_state_t,           //
+                   r, r) {
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
+  rets->r0 = iree_hal_allocator_retain_ref(iree_hal_device_allocator(device));
+  return iree_ok_status();
+}
+
+// Queries an i32 device attribute by (category, key) string pair (rrr -> ii).
+// i0 receives 1 when the query succeeded and 0 otherwise — the failure status
+// is consumed here rather than propagated, so a missing key does not fail the
+// VM call. i1 receives the queried value (left 0 unless the query wrote it).
+IREE_VM_ABI_EXPORT(iree_hal_module_device_query_i32,  //
+                   iree_hal_module_state_t,           //
+                   rrr, ii) {
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
+  iree_vm_buffer_t* category = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r1, &category));
+  iree_string_view_t category_str = iree_vm_buffer_as_string(category);
+  iree_vm_buffer_t* key = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r2, &key));
+  iree_string_view_t key_str = iree_vm_buffer_as_string(key);
+
+  int32_t value = 0;
+  iree_status_t query_status =
+      iree_hal_device_query_i32(device, category_str, key_str, &value);
+  // consume_code releases any status payload; only the code is inspected.
+  rets->i0 = iree_status_consume_code(query_status) == IREE_STATUS_OK ? 1 : 0;
+  rets->i1 = (int32_t)value;
+  return iree_ok_status();
+}
+
+//===--------------------------------------------------------------------===//
+// iree_hal_executable_t
+//===--------------------------------------------------------------------===//
+
+// Creates an executable from a format string and binary data (rrrrCrD -> r).
+// r3 optionally carries executable constants as a buffer of 4-byte values;
+// the variadic segment lists the executable layouts the executable uses.
+IREE_VM_ABI_EXPORT(iree_hal_module_executable_create,  //
+                   iree_hal_module_state_t,            //
+                   rrrrCrD, r) {
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
+  iree_vm_buffer_t* executable_format = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_vm_buffer_check_deref(args->r1, &executable_format));
+  iree_string_view_t executable_format_str =
+      iree_vm_buffer_as_string(executable_format);
+  iree_vm_buffer_t* executable_data = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r2, &executable_data));
+  // Constants are optional: r3 may not be a buffer ref at all.
+  iree_host_size_t constant_count = 0;
+  const uint32_t* constants = NULL;
+  if (iree_vm_buffer_isa(args->r3)) {
+    iree_vm_buffer_t* constant_buffer = NULL;
+    IREE_RETURN_IF_ERROR(
+        iree_vm_buffer_check_deref(args->r3, &constant_buffer));
+    if (constant_buffer->data.data_length % 4 != 0) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "constant buffer data must contain 4-byte "
+                              "elements but data length is %" PRIhsz,
+                              constant_buffer->data.data_length);
+    }
+    constant_count = constant_buffer->data.data_length / sizeof(uint32_t);
+    // Aliases the VM buffer storage; only valid for the duration of the call.
+    constants = (const uint32_t*)constant_buffer->data.data;
+  }
+  // Layout count is unbounded so heap-allocate (vs the alloca used elsewhere
+  // for capped VLAs).
+  iree_host_size_t executable_layout_count = args->a4_count;
+  iree_hal_executable_layout_t** executable_layouts = NULL;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      state->host_allocator,
+      executable_layout_count * sizeof(executable_layouts[0]),
+      (void**)&executable_layouts));
+  // From here on failures must fall through so executable_layouts is freed.
+  iree_status_t status = iree_ok_status();
+  for (iree_host_size_t i = 0; i < executable_layout_count; ++i) {
+    status = iree_hal_executable_layout_check_deref(args->a4[i].r0,
+                                                    &executable_layouts[i]);
+    if (!iree_status_is_ok(status)) break;
+  }
+
+  iree_hal_executable_t* executable = NULL;
+  if (iree_status_is_ok(status)) {
+    iree_hal_executable_params_t executable_params;
+    iree_hal_executable_params_initialize(&executable_params);
+    // Module-origin data outlives the executable so it may be aliased
+    // in place instead of copied.
+    // NOTE(review): 'access' is compared with == rather than a bit test —
+    // confirm it cannot carry other flag bits alongside ORIGIN_MODULE.
+    executable_params.caching_mode |=
+        executable_data->access == IREE_VM_BUFFER_ACCESS_ORIGIN_MODULE
+            ? IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA
+            : 0;
+    executable_params.executable_format = executable_format_str;
+    executable_params.executable_data = iree_make_const_byte_span(
+        executable_data->data.data, executable_data->data.data_length);
+    executable_params.executable_layout_count = executable_layout_count;
+    executable_params.executable_layouts = executable_layouts;
+    executable_params.constant_count = constant_count;
+    executable_params.constants = constants;
+    status = iree_hal_executable_cache_prepare_executable(
+        state->executable_cache, &executable_params, &executable);
+  }
+
+  iree_allocator_free(state->host_allocator, executable_layouts);
+  // On failure |executable| is still NULL and rets->r0 becomes a null ref.
+  rets->r0 = iree_hal_executable_move_ref(executable);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_executable_layout_t
+//===----------------------------------------------------------------------===//
+
+// Creates an executable layout from a push constant count and a variadic
+// list of descriptor set layout refs (riCrD -> r).
+IREE_VM_ABI_EXPORT(iree_hal_module_executable_layout_create,  //
+                   iree_hal_module_state_t,                   //
+                   riCrD, r) {
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
+  int32_t push_constants = (int32_t)args->i1;
+  // Derefs up to 32 set layouts into stack storage; presumably the macro
+  // returns from this function on an invalid ref — confirm in its definition.
+  iree_host_size_t set_layout_count = 0;
+  iree_hal_descriptor_set_layout_t** set_layouts = NULL;
+  IREE_VM_ABI_VLA_STACK_DEREF(args, a2_count, a2,
+                              iree_hal_descriptor_set_layout, 32,
+                              &set_layout_count, &set_layouts);
+
+  iree_hal_executable_layout_t* executable_layout = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_executable_layout_create(
+      device, push_constants, set_layout_count, set_layouts,
+      &executable_layout));
+  // Ownership of the new layout moves to the VM caller.
+  rets->r0 = iree_hal_executable_layout_move_ref(executable_layout);
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_semaphore_t
+//===----------------------------------------------------------------------===//
+
+// Creates a timeline semaphore on |device| starting at |initial_value|
+// (ri -> r). Ownership of the semaphore moves to the VM caller.
+IREE_VM_ABI_EXPORT(iree_hal_module_semaphore_create,  //
+                   iree_hal_module_state_t,           //
+                   ri, r) {
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
+  uint32_t initial_value = (uint32_t)args->i1;
+
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_create(device, initial_value, &semaphore));
+  rets->r0 = iree_hal_semaphore_move_ref(semaphore);
+  return iree_ok_status();
+}
+
+// Queries the current payload value of |semaphore| (r -> ii).
+// i0 receives the status code of the query (0 = OK, payload consumed here);
+// i1 receives the value truncated to 32 bits.
+// NOTE(review): the 64-bit timeline value is narrowed to uint32 — confirm
+// callers only use 32-bit payloads.
+IREE_VM_ABI_EXPORT(iree_hal_module_semaphore_query,  //
+                   iree_hal_module_state_t,          //
+                   r, ii) {
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_check_deref(args->r0, &semaphore));
+
+  uint64_t value = 0;
+  iree_status_t query_status = iree_hal_semaphore_query(semaphore, &value);
+  rets->i0 = iree_status_consume_code(query_status);
+  rets->i1 = (uint32_t)value;
+  return iree_ok_status();
+}
+
+// Signals |semaphore| to |new_value| (ri -> v); any signal failure is
+// propagated to the VM as a call failure. The i32 argument is zero-extended
+// via the uint32 cast.
+IREE_VM_ABI_EXPORT(iree_hal_module_semaphore_signal,  //
+                   iree_hal_module_state_t,           //
+                   ri, v) {
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_check_deref(args->r0, &semaphore));
+  uint32_t new_value = (uint32_t)args->i1;
+
+  return iree_hal_semaphore_signal(semaphore, new_value);
+}
+
+// Puts |semaphore| into a failure state carrying the given status code
+// (ri -> v). Only the code bits of i1 are honored; the VM call itself
+// always succeeds.
+IREE_VM_ABI_EXPORT(iree_hal_module_semaphore_fail,  //
+                   iree_hal_module_state_t,         //
+                   ri, v) {
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_check_deref(args->r0, &semaphore));
+  iree_status_code_t status_code =
+      (iree_status_code_t)(args->i1 & IREE_STATUS_CODE_MASK);
+
+  iree_hal_semaphore_fail(semaphore, iree_make_status(status_code));
+  return iree_ok_status();
+}
+
+// Blocks until |semaphore| reaches |new_value| (ri -> i).
+// i0 receives 0 on success or IREE_STATUS_DEADLINE_EXCEEDED as a catchable
+// result value; any other wait failure propagates as a call failure.
+// The i32 argument is zero-extended into the 64-bit timeline value.
+IREE_VM_ABI_EXPORT(iree_hal_module_semaphore_await,  //
+                   iree_hal_module_state_t,          //
+                   ri, i) {
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_check_deref(args->r0, &semaphore));
+  uint64_t new_value = (uint32_t)args->i1;
+
+  // TODO(benvanik): coroutine magic.
+  iree_status_t status =
+      iree_hal_semaphore_wait(semaphore, new_value, iree_infinite_timeout());
+  if (iree_status_is_ok(status)) {
+    rets->i0 = 0;
+  } else if (iree_status_is_deadline_exceeded(status)) {
+    // Propagate deadline exceeded back to the VM as a result value instead of
+    // failing the call: consume (and free) the status and succeed with the
+    // code in i0 so guest code can handle the timeout.
+    // (Previously the consumed status was returned, failing the call and
+    // returning a status whose payload had already been released.)
+    rets->i0 = (int32_t)iree_status_consume_code(status);
+    status = iree_ok_status();
+  }
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// VM module interface implementation
+//===----------------------------------------------------------------------===//
+
+// NOTE: this must match the ordering of the iree_hal_module_exports_ table.
+// Each entry pairs a calling-convention shim with its target function; both
+// are generated from the shared exports.inl X-macro list.
+static const iree_vm_native_function_ptr_t iree_hal_module_funcs_[] = {
+#define EXPORT_FN(name, target_fn, arg_types, ret_types)       \
+  {                                                            \
+      .shim = (iree_vm_native_function_shim_t)                 \
+          iree_vm_shim_##arg_types##_##ret_types,              \
+      .target = (iree_vm_native_function_target_t)(target_fn), \
+  },
+#include "iree/modules/hal/exports.inl"  // IWYU pragma: keep
+#undef EXPORT_FN
+};
+
+// NOTE: 0 length, but can't express that in C.
+static const iree_vm_native_import_descriptor_t iree_hal_module_imports_[1];
+
+// Export metadata (name + "0<args>_<rets>" calling convention string),
+// generated from the same exports.inl so ordering matches funcs_ above.
+static const iree_vm_native_export_descriptor_t iree_hal_module_exports_[] = {
+#define EXPORT_FN(name, target_fn, arg_types, ret_types)           \
+  {                                                                \
+      .local_name = iree_string_view_literal(name),                \
+      .calling_convention =                                        \
+          iree_string_view_literal("0" #arg_types "_" #ret_types), \
+      .reflection_attr_count = 0,                                  \
+      .reflection_attrs = NULL,                                    \
+  },
+#include "iree/modules/hal/exports.inl"  // IWYU pragma: keep
+#undef EXPORT_FN
+};
+static_assert(IREE_ARRAYSIZE(iree_hal_module_funcs_) ==
+                  IREE_ARRAYSIZE(iree_hal_module_exports_),
+              "function pointer table must be 1:1 with exports");
+
+// Immutable module descriptor wiring the tables above into the base native
+// module implementation.
+static const iree_vm_native_module_descriptor_t iree_hal_module_descriptor_ = {
+    .module_name = iree_string_view_literal("hal"),
+    .import_count = 0,  // workaround for 0-length C struct
+    .imports = iree_hal_module_imports_,
+    .export_count = IREE_ARRAYSIZE(iree_hal_module_exports_),
+    .exports = iree_hal_module_exports_,
+    .function_count = IREE_ARRAYSIZE(iree_hal_module_funcs_),
+    .functions = iree_hal_module_funcs_,
+    .reflection_attr_count = 0,
+    .reflection_attrs = NULL,
+};
+
+// Creates the HAL module bound to |device| using |allocator| for all module
+// storage. The base native module and our iree_hal_module_t suffix live in a
+// single allocation; the module retains |device| for its lifetime.
+// NOTE(review): the retained device ref is presumably released in
+// iree_hal_module_destroy (not visible here) — confirm.
+IREE_API_EXPORT iree_status_t
+iree_hal_module_create(iree_hal_device_t* device, iree_allocator_t allocator,
+                       iree_vm_module_t** out_module) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(out_module);
+  *out_module = NULL;
+
+  // Setup the interface with the functions we implement ourselves. Any function
+  // we omit will be handled by the base native module.
+  static const iree_vm_module_t interface = {
+      .destroy = iree_hal_module_destroy,
+      .alloc_state = iree_hal_module_alloc_state,
+      .free_state = iree_hal_module_free_state,
+      .notify = iree_hal_module_notify,
+  };
+
+  // Allocate shared module state.
+  iree_host_size_t total_size =
+      iree_vm_native_module_size() + sizeof(iree_hal_module_t);
+  iree_vm_module_t* base_module = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, total_size, (void**)&base_module));
+  memset(base_module, 0, total_size);
+  iree_status_t status = iree_vm_native_module_initialize(
+      &interface, &iree_hal_module_descriptor_, allocator, base_module);
+  if (!iree_status_is_ok(status)) {
+    // Initialization failed before the module took ownership; free here.
+    iree_allocator_free(allocator, base_module);
+    return status;
+  }
+
+  iree_hal_module_t* module = IREE_HAL_MODULE_CAST(base_module);
+  module->host_allocator = allocator;
+  module->shared_device = device;
+  iree_hal_device_retain(module->shared_device);
+
+  *out_module = base_module;
+  return iree_ok_status();
+}
+
+// Returns the device shared by |module_state| without adding a reference;
+// the caller must not release it.
+IREE_API_EXPORT iree_hal_device_t* iree_hal_module_state_device(
+    iree_vm_module_state_t* module_state) {
+  iree_hal_module_state_t* state = (iree_hal_module_state_t*)module_state;
+  return state->shared_device;
+}
+
+//===--------------------------------------------------------------------===//
+// Utilities
+//===--------------------------------------------------------------------===//
+
+// Returns the buffer view at |list|[i] without adding a reference; the value
+// is only guaranteed valid while the list retains it.
+IREE_API_EXPORT iree_hal_buffer_view_t* iree_vm_list_get_buffer_view_assign(
+    const iree_vm_list_t* list, iree_host_size_t i) {
+  return (iree_hal_buffer_view_t*)iree_vm_list_get_ref_deref(
+      list, i, iree_hal_buffer_view_get_descriptor());
+}
+
+// As iree_vm_list_get_buffer_view_assign but retains the value for the
+// caller, who must release it.
+IREE_API_EXPORT iree_hal_buffer_view_t* iree_vm_list_get_buffer_view_retain(
+    const iree_vm_list_t* list, iree_host_size_t i) {
+  iree_hal_buffer_view_t* value = iree_vm_list_get_buffer_view_assign(list, i);
+  iree_hal_buffer_view_retain(value);
+  return value;
+}
+
+// Stores |value| at |list|[i], retaining it; the caller keeps its own
+// reference.
+IREE_API_EXPORT iree_status_t iree_vm_list_set_buffer_view_retain(
+    iree_vm_list_t* list, iree_host_size_t i, iree_hal_buffer_view_t* value) {
+  // Wrap without retaining; the list set below performs the retain.
+  iree_vm_ref_t value_ref;
+  IREE_RETURN_IF_ERROR(iree_vm_ref_wrap_assign(
+      value, iree_hal_buffer_view_type_id(), &value_ref));
+  return iree_vm_list_set_ref_retain(list, i, &value_ref);
+}
diff --git a/runtime/src/iree/modules/hal/module.h b/runtime/src/iree/modules/hal/module.h
new file mode 100644
index 0000000..4a66ccd
--- /dev/null
+++ b/runtime/src/iree/modules/hal/module.h
@@ -0,0 +1,69 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Public API of the HAL VM module: exposes HAL device/buffer/executable types
+// and functions to VM bytecode.
+
+#ifndef IREE_MODULES_HAL_MODULE_H_
+#define IREE_MODULES_HAL_MODULE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/vm/api.h"
+
+// VM ref-type adapters for the HAL object types exposed to the VM (these
+// declare the *_ref/_deref helper families used by module.c).
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_allocator, iree_hal_allocator_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_buffer, iree_hal_buffer_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_buffer_view, iree_hal_buffer_view_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_command_buffer,
+                              iree_hal_command_buffer_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_descriptor_set,
+                              iree_hal_descriptor_set_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_descriptor_set_layout,
+                              iree_hal_descriptor_set_layout_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_device, iree_hal_device_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_event, iree_hal_event_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_executable, iree_hal_executable_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_executable_cache,
+                              iree_hal_executable_cache_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_executable_layout,
+                              iree_hal_executable_layout_t);
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_hal_semaphore, iree_hal_semaphore_t);
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Registers the custom types used by the HAL module.
+// WARNING: not thread-safe; call at startup before using.
+IREE_API_EXPORT iree_status_t iree_hal_module_register_types(void);
+
+// Creates the HAL module initialized to use a specific |device|.
+// Each context using this module will share the device and have compatible
+// allocations.
+IREE_API_EXPORT iree_status_t
+iree_hal_module_create(iree_hal_device_t* device, iree_allocator_t allocator,
+                       iree_vm_module_t** out_module);
+
+// Returns the device currently in use by the HAL module.
+// Returns NULL if no device has been initialized yet.
+IREE_API_EXPORT iree_hal_device_t* iree_hal_module_state_device(
+    iree_vm_module_state_t* module_state);
+
+// TODO(benvanik): generate these list helpers:
+
+// 'assign' returns without adding a reference; 'retain' variants add one for
+// the caller.
+IREE_API_EXPORT iree_hal_buffer_view_t* iree_vm_list_get_buffer_view_assign(
+    const iree_vm_list_t* list, iree_host_size_t i);
+
+IREE_API_EXPORT iree_hal_buffer_view_t* iree_vm_list_get_buffer_view_retain(
+    const iree_vm_list_t* list, iree_host_size_t i);
+
+IREE_API_EXPORT iree_status_t iree_vm_list_set_buffer_view_retain(
+    iree_vm_list_t* list, iree_host_size_t i, iree_hal_buffer_view_t* value);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_MODULES_HAL_MODULE_H_
diff --git a/runtime/src/iree/modules/vmvx/BUILD b/runtime/src/iree/modules/vmvx/BUILD
new file mode 100644
index 0000000..1a49b24
--- /dev/null
+++ b/runtime/src/iree/modules/vmvx/BUILD
@@ -0,0 +1,31 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_runtime_cc_library(
+    name = "vmvx",
+    srcs = [
+        "module.c",
+    ],
+    hdrs = [
+        "module.h",
+    ],
+    textual_hdrs = [
+        "exports.inl",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/vm",
+    ],
+)
diff --git a/runtime/src/iree/modules/vmvx/CMakeLists.txt b/runtime/src/iree/modules/vmvx/CMakeLists.txt
new file mode 100644
index 0000000..5b6bcf9
--- /dev/null
+++ b/runtime/src/iree/modules/vmvx/CMakeLists.txt
@@ -0,0 +1,29 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/modules/vmvx/BUILD                                          #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    vmvx
+  HDRS
+    "module.h"
+  TEXTUAL_HDRS
+    "exports.inl"
+  SRCS
+    "module.c"
+  DEPS
+    iree::base
+    iree::base::tracing
+    iree::vm
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/modules/vmvx/exports.inl b/runtime/src/iree/modules/vmvx/exports.inl
new file mode 100644
index 0000000..70b3ef0
--- /dev/null
+++ b/runtime/src/iree/modules/vmvx/exports.inl
@@ -0,0 +1,28 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//===----------------------------------------------------------------------===//
+//
+//         ██     ██  █████  ██████  ███    ██ ██ ███    ██  ██████
+//         ██     ██ ██   ██ ██   ██ ████   ██ ██ ████   ██ ██
+//         ██  █  ██ ███████ ██████  ██ ██  ██ ██ ██ ██  ██ ██   ███
+//         ██ ███ ██ ██   ██ ██   ██ ██  ██ ██ ██ ██  ██ ██ ██    ██
+//          ███ ███  ██   ██ ██   ██ ██   ████ ██ ██   ████  ██████
+//
+//===----------------------------------------------------------------------===//
+//
+// This file matches the vmvx.imports.mlir in the compiler. It'd be nice to
+// autogenerate this as the order of these functions must be sorted ascending by
+// name in a way compatible with iree_string_view_compare.
+//
+// Users are meant to `#define EXPORT_FN` to be able to access the information.
+// #define EXPORT_FN(name, target_fn, arg_types, ret_types)
+
+// clang-format off
+
+EXPORT_FN("_placeholder", iree_vmvx_module_placeholder, v, v)
+
+// clang-format on
diff --git a/runtime/src/iree/modules/vmvx/module.c b/runtime/src/iree/modules/vmvx/module.c
new file mode 100644
index 0000000..2133f67
--- /dev/null
+++ b/runtime/src/iree/modules/vmvx/module.c
@@ -0,0 +1,183 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/modules/vmvx/module.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/vm/api.h"
+
+//===----------------------------------------------------------------------===//
+// Type registration
+//===----------------------------------------------------------------------===//
+
+// NOTE: we aren't exporting any types yet; this is just the empty boilerplate.
+
+// static iree_vm_ref_type_descriptor_t iree_vmvx_interface_descriptor = {0};
+
+// Fills a VM ref-type descriptor in place and registers it, returning from
+// the enclosing function on failure. Unused until VMVX exports real types.
+#define IREE_VM_REGISTER_VMVX_C_TYPE(type, name, destroy_fn, descriptor) \
+  descriptor.type_name = iree_make_cstring_view(name);                   \
+  descriptor.offsetof_counter = offsetof(type, ref_object);              \
+  descriptor.destroy = (iree_vm_ref_destroy_t)destroy_fn;                \
+  IREE_RETURN_IF_ERROR(iree_vm_ref_register_type(&descriptor));
+
+// Registers VMVX types with the global VM type registry. Idempotent via a
+// static flag but not thread-safe; call at startup before use (see header).
+IREE_API_EXPORT iree_status_t iree_vmvx_module_register_types() {
+  static bool has_registered = false;
+  if (has_registered) return iree_ok_status();
+
+  // No types registered yet; example usage kept for when the first one lands:
+  // IREE_VM_REGISTER_VMVX_C_TYPE(iree_vmvx_interface_t, "vmvx.interface",
+  //                              iree_vmvx_interface_destroy,
+  //                              iree_vmvx_interface_descriptor);
+
+  has_registered = true;
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// Type wrappers
+//===----------------------------------------------------------------------===//
+
+// IREE_VM_DEFINE_TYPE_ADAPTERS(iree_vmvx_interface, iree_vmvx_interface_t);
+
+//===----------------------------------------------------------------------===//
+// Module type definitions
+//===----------------------------------------------------------------------===//
+
+// Module storage shared across contexts; allocated as a suffix of the base
+// native module allocation (recovered via IREE_VMVX_MODULE_CAST).
+typedef struct iree_vmvx_module_t {
+  iree_allocator_t host_allocator;
+  // TODO(benvanik): types when we are not registering them globally.
+} iree_vmvx_module_t;
+
+// Recovers the iree_vmvx_module_t suffix from a base module pointer.
+#define IREE_VMVX_MODULE_CAST(module) \
+  (iree_vmvx_module_t*)((uint8_t*)(module) + iree_vm_native_module_size());
+
+// Per-context module state, allocated/freed by the alloc_state/free_state
+// hooks below.
+typedef struct iree_vmvx_module_state_t {
+  iree_allocator_t host_allocator;
+
+  // If we have any external libraries we want to interact with that are
+  // stateful we could store their state here. Note that VMVX invocations may
+  // happen from any thread and concurrently and if the state is not thread-safe
+  // we'll have to perform the synchronization ourselves here.
+} iree_vmvx_module_state_t;
+
+// Module destroy hook: nothing module-owned to release yet.
+static void IREE_API_PTR iree_vmvx_module_destroy(void* base_module) {
+  // No state to clean up (yet).
+}
+
+// Allocates zero-initialized per-context state from |host_allocator|.
+static iree_status_t IREE_API_PTR
+iree_vmvx_module_alloc_state(void* self, iree_allocator_t host_allocator,
+                             iree_vm_module_state_t** out_module_state) {
+  iree_vmvx_module_state_t* state = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(host_allocator, sizeof(*state), (void**)&state));
+  memset(state, 0, sizeof(*state));
+  // Stash the allocator so free_state can release with the same one.
+  state->host_allocator = host_allocator;
+  *out_module_state = (iree_vm_module_state_t*)state;
+  return iree_ok_status();
+}
+
+// Frees state allocated by iree_vmvx_module_alloc_state using the allocator
+// captured at allocation time.
+static void IREE_API_PTR
+iree_vmvx_module_free_state(void* self, iree_vm_module_state_t* module_state) {
+  iree_vmvx_module_state_t* state = (iree_vmvx_module_state_t*)module_state;
+  iree_allocator_free(state->host_allocator, state);
+}
+
+//===----------------------------------------------------------------------===//
+// TODO
+//===----------------------------------------------------------------------===//
+
+// Placeholder to make the function pointer arrays happy (they can't be empty).
+// (v, v): takes no arguments and returns no results.
+IREE_VM_ABI_EXPORT(iree_vmvx_module_placeholder,  //
+                   iree_vmvx_module_state_t,      //
+                   v, v) {
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// VM module interface implementation
+//===----------------------------------------------------------------------===//
+
+// NOTE: this must match the ordering of the iree_vmvx_module_exports_ table.
+// Each entry pairs a calling-convention shim with its target function; both
+// are generated from the shared exports.inl X-macro list.
+static const iree_vm_native_function_ptr_t iree_vmvx_module_funcs_[] = {
+#define EXPORT_FN(name, target_fn, arg_types, ret_types)       \
+  {                                                            \
+      .shim = (iree_vm_native_function_shim_t)                 \
+          iree_vm_shim_##arg_types##_##ret_types,              \
+      .target = (iree_vm_native_function_target_t)(target_fn), \
+  },
+#include "iree/modules/vmvx/exports.inl"  // IWYU pragma: keep
+#undef EXPORT_FN
+};
+
+// NOTE: 0 length, but can't express that in C.
+static const iree_vm_native_import_descriptor_t iree_vmvx_module_imports_[1];
+
+// Export metadata (name + "0<args>_<rets>" calling convention string),
+// generated from the same exports.inl so ordering matches funcs_ above.
+static const iree_vm_native_export_descriptor_t iree_vmvx_module_exports_[] = {
+#define EXPORT_FN(name, target_fn, arg_types, ret_types)           \
+  {                                                                \
+      .local_name = iree_string_view_literal(name),                \
+      .calling_convention =                                        \
+          iree_string_view_literal("0" #arg_types "_" #ret_types), \
+      .reflection_attr_count = 0,                                  \
+      .reflection_attrs = NULL,                                    \
+  },
+#include "iree/modules/vmvx/exports.inl"  // IWYU pragma: keep
+#undef EXPORT_FN
+};
+static_assert(IREE_ARRAYSIZE(iree_vmvx_module_funcs_) ==
+                  IREE_ARRAYSIZE(iree_vmvx_module_exports_),
+              "function pointer table must be 1:1 with exports");
+
+// Immutable module descriptor wiring the tables above into the base native
+// module implementation.
+static const iree_vm_native_module_descriptor_t iree_vmvx_module_descriptor_ = {
+    .module_name = iree_string_view_literal("vmvx"),
+    .import_count = 0,  // workaround for 0-length C struct
+    .imports = iree_vmvx_module_imports_,
+    .export_count = IREE_ARRAYSIZE(iree_vmvx_module_exports_),
+    .exports = iree_vmvx_module_exports_,
+    .function_count = IREE_ARRAYSIZE(iree_vmvx_module_funcs_),
+    .functions = iree_vmvx_module_funcs_,
+    .reflection_attr_count = 0,
+    .reflection_attrs = NULL,
+};
+
+// Creates the VMVX module using |allocator| for all module storage. The base
+// native module and our iree_vmvx_module_t suffix live in one allocation.
+IREE_API_EXPORT iree_status_t iree_vmvx_module_create(
+    iree_allocator_t allocator, iree_vm_module_t** out_module) {
+  IREE_ASSERT_ARGUMENT(out_module);
+  *out_module = NULL;
+
+  // Setup the interface with the functions we implement ourselves. Any function
+  // we omit will be handled by the base native module.
+  static const iree_vm_module_t interface = {
+      .destroy = iree_vmvx_module_destroy,
+      .alloc_state = iree_vmvx_module_alloc_state,
+      .free_state = iree_vmvx_module_free_state,
+  };
+
+  // Allocate shared module state.
+  iree_host_size_t total_size =
+      iree_vm_native_module_size() + sizeof(iree_vmvx_module_t);
+  iree_vm_module_t* base_module = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, total_size, (void**)&base_module));
+  memset(base_module, 0, total_size);
+  iree_status_t status = iree_vm_native_module_initialize(
+      &interface, &iree_vmvx_module_descriptor_, allocator, base_module);
+  if (!iree_status_is_ok(status)) {
+    // Initialization failed before the module took ownership; free here.
+    iree_allocator_free(allocator, base_module);
+    return status;
+  }
+
+  iree_vmvx_module_t* module = IREE_VMVX_MODULE_CAST(base_module);
+  module->host_allocator = allocator;
+
+  *out_module = base_module;
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree/modules/vmvx/module.h b/runtime/src/iree/modules/vmvx/module.h
new file mode 100644
index 0000000..61ec691
--- /dev/null
+++ b/runtime/src/iree/modules/vmvx/module.h
@@ -0,0 +1,31 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_MODULES_VMVX_MODULE_H_
+#define IREE_MODULES_VMVX_MODULE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/vm/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Registers the custom types used by the VMVX module.
+// WARNING: not thread-safe; call at startup before using.
+IREE_API_EXPORT iree_status_t iree_vmvx_module_register_types();
+
+// Creates the VMVX module with a default configuration.
+IREE_API_EXPORT iree_status_t iree_vmvx_module_create(
+    iree_allocator_t allocator, iree_vm_module_t** out_module);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_MODULES_VMVX_MODULE_H_
diff --git a/runtime/src/iree/runtime/BUILD.bazel b/runtime/src/iree/runtime/BUILD.bazel
new file mode 100644
index 0000000..2c435fb
--- /dev/null
+++ b/runtime/src/iree/runtime/BUILD.bazel
@@ -0,0 +1,58 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+#===------------------------------------------------------------------------===#
+# Public API
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "runtime",
+    hdrs = [
+        "api.h",
+    ],
+    deps = [
+        ":impl",
+        "//runtime/src/iree/base",
+    ],
+)
+
+#===------------------------------------------------------------------------===#
+# Implementation
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "impl",
+    srcs = [
+        "call.c",
+        "instance.c",
+        "session.c",
+    ],
+    hdrs = [
+        "call.h",
+        "instance.h",
+        "session.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:file_io",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/drivers",
+        "//runtime/src/iree/modules/hal",
+        "//runtime/src/iree/vm",
+        "//runtime/src/iree/vm:bytecode_module",
+    ],
+)
diff --git a/runtime/src/iree/runtime/CMakeLists.txt b/runtime/src/iree/runtime/CMakeLists.txt
new file mode 100644
index 0000000..143fd67
--- /dev/null
+++ b/runtime/src/iree/runtime/CMakeLists.txt
@@ -0,0 +1,56 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# iree/runtime/BUILD                                                           #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    runtime
+  HDRS
+    "api.h"
+  DEPS
+    ::impl
+    iree::base
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    impl
+  HDRS
+    "call.h"
+    "instance.h"
+    "session.h"
+  SRCS
+    "call.c"
+    "instance.c"
+    "session.c"
+  DEPS
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::file_io
+    iree::base::tracing
+    iree::hal
+    iree::hal::drivers
+    iree::modules::hal
+    iree::vm
+    iree::vm::bytecode_module
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+
+iree_cc_unified_library(
+  NAME
+    unified
+  ROOT
+    ::impl
+)
diff --git a/runtime/src/iree/runtime/README.md b/runtime/src/iree/runtime/README.md
new file mode 100644
index 0000000..45dd876
--- /dev/null
+++ b/runtime/src/iree/runtime/README.md
@@ -0,0 +1,11 @@
+# IREE Higher-Level Runtime API
+
+This directory implements a higher-level runtime API on top of the low level
+APIs split across `iree/base/api.h`, `iree/hal/api.h`, and `iree/vm/api.h`.
+
+Using this higher level API may pull in additional dependencies and perform
+additional allocations compared to what you can get by directly going to the
+lower levels. For the most part, the higher-level and lower-level APIs may be
+mixed.
+
+See [the demo directory](./demo/) for sample usage.
diff --git a/runtime/src/iree/runtime/api.h b/runtime/src/iree/runtime/api.h
new file mode 100644
index 0000000..850ac52
--- /dev/null
+++ b/runtime/src/iree/runtime/api.h
@@ -0,0 +1,20 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_RUNTIME_API_H_
+#define IREE_RUNTIME_API_H_
+
+// Lower-level APIs:
+#include "iree/base/api.h"  // IWYU pragma: export
+#include "iree/hal/api.h"   // IWYU pragma: export
+#include "iree/vm/api.h"    // IWYU pragma: export
+
+// Runtime API:
+#include "iree/runtime/call.h"      // IWYU pragma: export
+#include "iree/runtime/instance.h"  // IWYU pragma: export
+#include "iree/runtime/session.h"   // IWYU pragma: export
+
+#endif  // IREE_RUNTIME_API_H_
diff --git a/runtime/src/iree/runtime/call.c b/runtime/src/iree/runtime/call.c
new file mode 100644
index 0000000..764668a
--- /dev/null
+++ b/runtime/src/iree/runtime/call.c
@@ -0,0 +1,124 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/runtime/call.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/modules/hal/module.h"
+#include "iree/runtime/session.h"
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_call_t
+//===----------------------------------------------------------------------===//
+
+IREE_API_EXPORT iree_status_t iree_runtime_call_initialize(
+    iree_runtime_session_t* session, iree_vm_function_t function,
+    iree_runtime_call_t* out_call) {
+  IREE_ASSERT_ARGUMENT(session);
+  IREE_ASSERT_ARGUMENT(out_call);
+  memset(out_call, 0, sizeof(*out_call));
+
+  // Query the signature of the function to determine the sizes of the lists.
+  iree_vm_function_signature_t signature =
+      iree_vm_function_signature(&function);
+  iree_string_view_t arguments;
+  iree_string_view_t results;
+  IREE_RETURN_IF_ERROR(iree_vm_function_call_get_cconv_fragments(
+      &signature, &arguments, &results));
+
+  out_call->session = session;
+  iree_runtime_session_retain(session);
+  out_call->function = function;
+
+  // Allocate the input and output lists with the required capacity.
+  // A user wanting to avoid dynamic allocations could instead create on-stack
+  // storage for these and use iree_vm_list_initialize instead. This high-level
+  // API keeps things simple, though, and for the frequency of calls through
+  // this interface a few small pooled malloc calls should be fine.
+  iree_allocator_t host_allocator =
+      iree_runtime_session_host_allocator(session);
+  iree_status_t status = iree_vm_list_create(
+      /*element_type=*/NULL, arguments.size, host_allocator, &out_call->inputs);
+  if (iree_status_is_ok(status)) {
+    status = iree_vm_list_create(
+        /*element_type=*/NULL, results.size, host_allocator,
+        &out_call->outputs);
+  }
+
+  if (!iree_status_is_ok(status)) {
+    iree_runtime_call_deinitialize(out_call);
+  }
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_runtime_call_initialize_by_name(
+    iree_runtime_session_t* session, iree_string_view_t full_name,
+    iree_runtime_call_t* out_call) {
+  iree_vm_function_t function;
+  IREE_RETURN_IF_ERROR(
+      iree_runtime_session_lookup_function(session, full_name, &function));
+  return iree_runtime_call_initialize(session, function, out_call);
+}
+
+IREE_API_EXPORT void iree_runtime_call_deinitialize(iree_runtime_call_t* call) {
+  IREE_ASSERT_ARGUMENT(call);
+  iree_vm_list_release(call->inputs);
+  iree_vm_list_release(call->outputs);
+  iree_runtime_session_release(call->session);
+}
+
+IREE_API_EXPORT void iree_runtime_call_reset(iree_runtime_call_t* call) {
+  IREE_ASSERT_ARGUMENT(call);
+  iree_status_ignore(iree_vm_list_resize(call->inputs, 0));
+  iree_status_ignore(iree_vm_list_resize(call->outputs, 0));
+}
+
+IREE_API_EXPORT iree_vm_list_t* iree_runtime_call_inputs(
+    const iree_runtime_call_t* call) {
+  IREE_ASSERT_ARGUMENT(call);
+  return call->inputs;
+}
+
+IREE_API_EXPORT iree_vm_list_t* iree_runtime_call_outputs(
+    const iree_runtime_call_t* call) {
+  IREE_ASSERT_ARGUMENT(call);
+  return call->outputs;
+}
+
+IREE_API_EXPORT iree_status_t iree_runtime_call_invoke(
+    iree_runtime_call_t* call, iree_runtime_call_flags_t flags) {
+  return iree_runtime_session_call(call->session, &call->function, call->inputs,
+                                   call->outputs);
+}
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining call I/O
+//===----------------------------------------------------------------------===//
+
+IREE_API_EXPORT iree_status_t iree_runtime_call_inputs_push_back_buffer_view(
+    iree_runtime_call_t* call, iree_hal_buffer_view_t* buffer_view) {
+  IREE_ASSERT_ARGUMENT(call);
+  IREE_ASSERT_ARGUMENT(buffer_view);
+  iree_vm_ref_t value = {0};
+  IREE_RETURN_IF_ERROR(iree_vm_ref_wrap_assign(
+      buffer_view, iree_hal_buffer_view_type_id(), &value));
+  return iree_vm_list_push_ref_retain(call->inputs, &value);
+}
+
+// Pops a buffer view from the front of the call outputs list.
+// Ownership of the buffer view transfers to the caller.
+IREE_API_EXPORT iree_status_t iree_runtime_call_outputs_pop_front_buffer_view(
+    iree_runtime_call_t* call, iree_hal_buffer_view_t** out_buffer_view) {
+  IREE_ASSERT_ARGUMENT(call);
+  IREE_ASSERT_ARGUMENT(out_buffer_view);
+  *out_buffer_view = NULL;
+  iree_vm_ref_t value = {0};
+  IREE_RETURN_IF_ERROR(iree_vm_list_pop_front_ref_move(call->outputs, &value));
+  return iree_hal_buffer_view_check_deref(value, out_buffer_view);
+}
diff --git a/runtime/src/iree/runtime/call.h b/runtime/src/iree/runtime/call.h
new file mode 100644
index 0000000..69d0540
--- /dev/null
+++ b/runtime/src/iree/runtime/call.h
@@ -0,0 +1,118 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_RUNTIME_CALL_H_
+#define IREE_RUNTIME_CALL_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/vm/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_runtime_session_t iree_runtime_session_t;
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_call_t
+//===----------------------------------------------------------------------===//
+
+// TODO(benvanik): determine if we want to control behavior like non-blocking
+// or whether to consume inputs like this or by having separate call types.
+// For example, an async_call may make things more clear when using semaphores
+// without having to pollute this interface.
+enum iree_runtime_call_flag_bits_t {
+  IREE_RUNTIME_CALL_FLAG_RESERVED = 0u,
+};
+typedef uint32_t iree_runtime_call_flags_t;
+
+// A stateful VM function call builder.
+//
+// Applications that will be calling the same function repeatedly can reuse the
+// call to avoid having to construct the inputs lists each time. Outputs of
+// prior calls will be retained unless iree_runtime_call_reset is used and will
+// be provided to the VM on subsequent calls to reuse (if able): when reusing a
+// call like this callers are required to either reset the call, copy their
+// data out, or reset the particular output they are consuming.
+//
+// Thread-compatible; these are designed to be stack-local or embedded in a user
+// data structure that can provide synchronization when required.
+typedef struct iree_runtime_call_t {
+  iree_runtime_session_t* session;
+  iree_vm_function_t function;
+  iree_vm_list_t* inputs;
+  iree_vm_list_t* outputs;
+} iree_runtime_call_t;
+
+// Initializes call state for a call to |function| within |session|.
+IREE_API_EXPORT iree_status_t iree_runtime_call_initialize(
+    iree_runtime_session_t* session, iree_vm_function_t function,
+    iree_runtime_call_t* out_call);
+
+// Initializes call state for a call to |full_name| within |session|.
+//
+// The function name matches the original MLIR module and function symbols.
+// Example:
+//   module @foo {
+//     func.func @bar()
+//   }
+// The full name of '@bar' is 'foo.bar'.
+// By default modules have the name 'module'.
+IREE_API_EXPORT iree_status_t iree_runtime_call_initialize_by_name(
+    iree_runtime_session_t* session, iree_string_view_t full_name,
+    iree_runtime_call_t* out_call);
+
+// Deinitializes a call by releasing its input and output lists.
+IREE_API_EXPORT void iree_runtime_call_deinitialize(iree_runtime_call_t* call);
+
+// Resets the input and output lists back to 0-length in preparation for
+// construction of another call.
+IREE_API_EXPORT void iree_runtime_call_reset(iree_runtime_call_t* call);
+
+// Returns an initially-empty variant list for passing in function inputs.
+// The list must be fully populated based on the required arguments of the
+// function.
+IREE_API_EXPORT iree_vm_list_t* iree_runtime_call_inputs(
+    const iree_runtime_call_t* call);
+
+// Returns an initially-empty variant list for passing in function outputs or
+// for reading back the results of a call.
+IREE_API_EXPORT iree_vm_list_t* iree_runtime_call_outputs(
+    const iree_runtime_call_t* call);
+
+// Synchronously invokes the call and returns the status.
+// The inputs list will remain unchanged to allow for subsequent reuse and the
+// output list will be populated with the results of the call.
+IREE_API_EXPORT iree_status_t iree_runtime_call_invoke(
+    iree_runtime_call_t* call, iree_runtime_call_flags_t flags);
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining call I/O
+//===----------------------------------------------------------------------===//
+// NOTE: these are mostly useful for one-shot tests and samples. Applications
+// that will be reusing the same inputs and outputs should prefer to track them
+// themselves. If applications are able it's strongly recommended that they
+// produce and consume the iree_hal_buffer_ts directly to avoid additional
+// copies and allocations.
+
+// Pushes |buffer_view| to the call inputs list.
+// The value will be retained by the list.
+IREE_API_EXPORT iree_status_t iree_runtime_call_inputs_push_back_buffer_view(
+    iree_runtime_call_t* call, iree_hal_buffer_view_t* buffer_view);
+
+// Pops a buffer view from the front of the call outputs list.
+// Ownership of the buffer view transfers to the caller.
+IREE_API_EXPORT iree_status_t iree_runtime_call_outputs_pop_front_buffer_view(
+    iree_runtime_call_t* call, iree_hal_buffer_view_t** out_buffer_view);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_RUNTIME_CALL_H_
diff --git a/runtime/src/iree/runtime/demo/BUILD b/runtime/src/iree/runtime/demo/BUILD
new file mode 100644
index 0000000..84a5f73
--- /dev/null
+++ b/runtime/src/iree/runtime/demo/BUILD
@@ -0,0 +1,75 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content")
+load("//build_tools/bazel:native_binary.bzl", "native_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+#===------------------------------------------------------------------------===#
+# Hello World!
+#===------------------------------------------------------------------------===#
+
+cc_binary(
+    name = "hello_world_file",
+    srcs = ["hello_world_explained.c"],
+    defines = [
+        # Load data from a file passed on the command line.
+        "IREE_RUNTIME_DEMO_LOAD_FILE_FROM_COMMAND_LINE_ARG",
+    ],
+    deps = [
+        "//runtime/src/iree/runtime",
+    ],
+)
+
+# TODO(benvanik): native_test that passes the file as a flag. Right now we
+# can't specify data through native_test, though, so this isn't possible to
+# automate.
+
+iree_cmake_extra_content(
+    content = """
+if (NOT ${IREE_HAL_DRIVER_VMVX} OR NOT ${IREE_TARGET_BACKEND_VMVX})
+  return()
+endif()
+""",
+    inline = True,
+)
+
+cc_binary(
+    name = "hello_world_embedded",
+    srcs = ["hello_world_explained.c"],
+    defines = [
+        # Load data directly from memory.
+        "IREE_RUNTIME_DEMO_LOAD_FILE_FROM_EMBEDDED_DATA",
+    ],
+    deps = [
+        "//runtime/src/iree/runtime",
+        "//runtime/src/iree/runtime/testdata:simple_mul_module_c",
+    ],
+)
+
+native_test(
+    name = "hello_world_embedded_test",
+    src = ":hello_world_embedded",
+)
+
+cc_binary(
+    name = "hello_world_terse",
+    srcs = ["hello_world_terse.c"],
+    deps = [
+        "//runtime/src/iree/runtime",
+        "//runtime/src/iree/runtime/testdata:simple_mul_module_c",
+    ],
+)
+
+native_test(
+    name = "hello_world_terse_test",
+    src = ":hello_world_terse",
+)
diff --git a/runtime/src/iree/runtime/demo/CMakeLists.txt b/runtime/src/iree/runtime/demo/CMakeLists.txt
new file mode 100644
index 0000000..c06f15a
--- /dev/null
+++ b/runtime/src/iree/runtime/demo/CMakeLists.txt
@@ -0,0 +1,53 @@
+# NOTE: not using bazel-to-cmake here because of the runtime unified rule.
+# We should figure out how to make bazel/cmake consistent with that.
+
+iree_cc_binary(
+  NAME
+    hello_world_file
+  SRCS
+    "hello_world_explained.c"
+  DEFINES
+    "IREE_RUNTIME_DEMO_LOAD_FILE_FROM_COMMAND_LINE_ARG"
+  DEPS
+    iree::runtime::unified
+)
+
+if (NOT ${IREE_HAL_DRIVER_VMVX} OR NOT ${IREE_TARGET_BACKEND_VMVX})
+  return()
+endif()
+
+iree_cc_binary(
+  NAME
+    hello_world_embedded
+  SRCS
+    "hello_world_explained.c"
+  DEFINES
+    "IREE_RUNTIME_DEMO_LOAD_FILE_FROM_EMBEDDED_DATA"
+  DEPS
+    iree::runtime::unified
+    iree::runtime::testdata::simple_mul_module_c
+)
+
+iree_native_test(
+  NAME
+    "hello_world_embedded_test"
+  SRC
+    ::hello_world_embedded
+)
+
+iree_cc_binary(
+  NAME
+    hello_world_terse
+  SRCS
+    "hello_world_terse.c"
+  DEPS
+    iree::runtime::unified
+    iree::runtime::testdata::simple_mul_module_c
+)
+
+iree_native_test(
+  NAME
+    "hello_world_terse_test"
+  SRC
+    ::hello_world_terse
+)
diff --git a/runtime/src/iree/runtime/demo/README.md b/runtime/src/iree/runtime/demo/README.md
new file mode 100644
index 0000000..b4b0f02
--- /dev/null
+++ b/runtime/src/iree/runtime/demo/README.md
@@ -0,0 +1,33 @@
+# IREE C Runtime API Demo
+
+This demonstrates how to use the higher-level IREE C API to load a compiled
+module and call the functions within it.
+
+The module used has a single exported function `@simple_mul` that multiplies two
+tensors and returns the result:
+
+```mlir
+func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32>
+    {
+  %0 = "mhlo.multiply"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
+  return %0 : tensor<4xf32>
+}
+```
+
+The demo here sets up the shared `iree_runtime_instance_t`, loads the module
+into an `iree_runtime_session_t`, and makes a call via `iree_runtime_call_t`.
+
+[`hello_world_terse.c`](hello_world_terse.c) highlights the steps while
+[`hello_world_explained.c`](hello_world_explained.c) has more discussion over
+what is happening and things to watch out for.
+
+Modules can be loaded from the file system or into memory by the application.
+The `iree_runtime_demo_hello_world_file` target shows loading from a file
+passed in as a command line argument and
+`iree_runtime_demo_hello_world_embedded` shows loading from a blob of memory
+where the test file has been built directly into the binary.
+
+NOTE: for brevity the `_terse.c` example uses `IREE_CHECK_OK` to abort the
+program on errors. Real applications - especially ones hosting IREE such as
+Android apps - would want to follow the patterns in `_explained.c` for how to
+propagate errors and clean up allocated resources.
diff --git a/runtime/src/iree/runtime/demo/hello_world_explained.c b/runtime/src/iree/runtime/demo/hello_world_explained.c
new file mode 100644
index 0000000..3abaca0
--- /dev/null
+++ b/runtime/src/iree/runtime/demo/hello_world_explained.c
@@ -0,0 +1,277 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdio.h>
+
+#include "iree/runtime/api.h"
+
+static int iree_runtime_demo_main(void);
+static iree_status_t iree_runtime_demo_run_session(
+    iree_runtime_instance_t* instance);
+static iree_status_t iree_runtime_demo_perform_mul(
+    iree_runtime_session_t* session);
+
+#if defined(IREE_RUNTIME_DEMO_LOAD_FILE_FROM_COMMAND_LINE_ARG)
+
+static const char* demo_file_path = NULL;
+
+// Takes the first argument on the command line as a file path and loads it.
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    fprintf(stderr, "usage: session_demo module_file.vmfb\n");
+    return 1;
+  }
+  demo_file_path = argv[1];
+  return iree_runtime_demo_main();
+}
+
+// Loads a compiled IREE module from the file system.
+static iree_status_t iree_runtime_demo_load_module(
+    iree_runtime_session_t* session) {
+  return iree_runtime_session_append_bytecode_module_from_file(session,
+                                                               demo_file_path);
+}
+
+#elif defined(IREE_RUNTIME_DEMO_LOAD_FILE_FROM_EMBEDDED_DATA)
+
+#include "iree/runtime/testdata/simple_mul_module_c.h"
+
+int main(int argc, char** argv) { return iree_runtime_demo_main(); }
+
+// Loads the bytecode module directly from memory.
+//
+// Embedding the compiled output into your binary is not always possible (or
+// recommended) but is a fairly painless way to get things working on a variety
+// of targets without worrying about how to deploy files or pass flags.
+//
+// In cases like this the module file is in .rodata and does not need to be
+// freed; if the memory needs to be released when the module is unloaded then a
+// custom allocator can be provided to get a callback instead.
+static iree_status_t iree_runtime_demo_load_module(
+    iree_runtime_session_t* session) {
+  const iree_file_toc_t* module_file =
+      iree_runtime_testdata_simple_mul_module_create();
+  return iree_runtime_session_append_bytecode_module_from_memory(
+      session, iree_make_const_byte_span(module_file->data, module_file->size),
+      iree_allocator_null());
+}
+
+#else
+#error "must specify a way to load the module data"
+#endif  // IREE_RUNTIME_DEMO_LOAD_FILE_FROM_*
+
+//===----------------------------------------------------------------------===//
+// 1. Entry point / shared iree_runtime_instance_t setup
+//===----------------------------------------------------------------------===//
+// Applications should create and share a single instance across all sessions.
+
+// This would live in your application startup/shutdown code or scoped to the
+// usage of IREE. Creating and destroying instances is expensive and should be
+// avoided.
+static int iree_runtime_demo_main(void) {
+  // Set up the shared runtime instance.
+  // An application should usually only have one of these and share it across
+  // all of the sessions it has. The instance is thread-safe, while the
+  // sessions are only thread-compatible (you need to lock if it's required).
+  iree_runtime_instance_options_t instance_options;
+  iree_runtime_instance_options_initialize(IREE_API_VERSION_LATEST,
+                                           &instance_options);
+  iree_runtime_instance_options_use_all_available_drivers(&instance_options);
+  iree_runtime_instance_t* instance = NULL;
+  iree_status_t status = iree_runtime_instance_create(
+      &instance_options, iree_allocator_system(), &instance);
+
+  // Run the demo.
+  // A real application would load its models (at startup, on-demand, etc) and
+  // retain them somewhere to be reused. Startup time and likelihood of failure
+  // varies across different HAL backends; the synchronous CPU backend is nearly
+  // instantaneous and will never fail (unless out of memory) while the Vulkan
+  // backend may take significantly longer and fail if there are not supported
+  // devices.
+  if (iree_status_is_ok(status)) {
+    status = iree_runtime_demo_run_session(instance);
+  }
+
+  // Release the shared instance - it will be deallocated when all sessions
+  // using it have been released (here it is deallocated immediately).
+  iree_runtime_instance_release(instance);
+
+  int ret = (int)iree_status_code(status);
+  if (!iree_status_is_ok(status)) {
+    // Dump nice status messages to stderr on failure.
+    // An application can route these through its own logging infrastructure as
+    // needed. Note that the status is a handle and must be freed!
+    iree_status_fprint(stderr, status);
+    iree_status_ignore(status);
+  }
+  return ret;
+}
+
+//===----------------------------------------------------------------------===//
+// 2. Load modules and initialize state in iree_runtime_session_t
+//===----------------------------------------------------------------------===//
+// Each instantiation of a module will live in its own session. Module state
+// like variables will be retained across calls within the same session.
+
+// Loads the demo module and uses it to perform some math.
+// In a real application you'd want to hang on to the iree_runtime_session_t
+// and reuse it for future calls - especially if it holds state internally.
+static iree_status_t iree_runtime_demo_run_session(
+    iree_runtime_instance_t* instance) {
+  // TODO(#5724): move device selection into the compiled modules.
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_runtime_instance_try_create_default_device(
+      instance, iree_make_cstring_view("vmvx"), &device));
+
+  // Set up the session to run the demo module.
+  // Sessions are like OS processes and are used to isolate modules from each
+  // other and hold runtime state such as the variables used within the module.
+  // The same module loaded into two sessions will see their own private state.
+  iree_runtime_session_options_t session_options;
+  iree_runtime_session_options_initialize(&session_options);
+  iree_runtime_session_t* session = NULL;
+  iree_status_t status = iree_runtime_session_create_with_device(
+      instance, &session_options, device,
+      iree_runtime_instance_host_allocator(instance), &session);
+  iree_hal_device_release(device);
+
+  // Load the compiled user module in a demo-specific way.
+  // Applications could specify files, embed the outputs directly in their
+  // binaries, fetch them over the network, etc.
+  if (iree_status_is_ok(status)) {
+    status = iree_runtime_demo_load_module(session);
+  }
+
+  // Build and issue the call.
+  if (iree_status_is_ok(status)) {
+    status = iree_runtime_demo_perform_mul(session);
+  }
+
+  // Release the session and free all resources.
+  iree_runtime_session_release(session);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// 3. Call a function within a module with buffer views
+//===----------------------------------------------------------------------===//
+// The inputs and outputs of a call are reusable across calls (and possibly
+// across sessions depending on device compatibility) and can be setup by the
+// application as needed. For example, an application could perform
+// multi-threaded buffer view creation and then issue the call from a single
+// thread when all inputs are ready. This simple demo just allocates them
+// per-call and throws them away.
+
+// Sets up and calls the simple_mul function and dumps the results:
+// func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) ->
+// tensor<4xf32>
+//
+// NOTE: this is a demo and as such this performs no memoization; a real
+// application could reuse a lot of these structures and cache lookups of
+// iree_vm_function_t to reduce the amount of per-call overhead.
+static iree_status_t iree_runtime_demo_perform_mul(
+    iree_runtime_session_t* session) {
+  // Initialize the call to the function.
+  iree_runtime_call_t call;
+  IREE_RETURN_IF_ERROR(iree_runtime_call_initialize_by_name(
+      session, iree_make_cstring_view("module.simple_mul"), &call));
+
+  // Append the function inputs with the HAL device allocator in use by the
+  // session. The buffers will be usable within the session and _may_ be usable
+  // in other sessions depending on whether they share a compatible device.
+  iree_hal_allocator_t* device_allocator =
+      iree_runtime_session_device_allocator(session);
+  iree_allocator_t host_allocator =
+      iree_runtime_session_host_allocator(session);
+  iree_status_t status = iree_ok_status();
+  {
+    // %arg0: tensor<4xf32>
+    iree_hal_buffer_view_t* arg0 = NULL;
+    if (iree_status_is_ok(status)) {
+      static const iree_hal_dim_t arg0_shape[1] = {4};
+      static const float arg0_data[4] = {1.0f, 1.1f, 1.2f, 1.3f};
+      status = iree_hal_buffer_view_allocate_buffer(
+          device_allocator,
+          // Shape dimensions and rank:
+          arg0_shape, IREE_ARRAYSIZE(arg0_shape),
+          // Element type:
+          IREE_HAL_ELEMENT_TYPE_FLOAT_32,
+          // Encoding type:
+          IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
+          (iree_hal_buffer_params_t){
+              // Where to allocate (host or device):
+              .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+              // Access to allow to this memory (this is .rodata so READ only):
+              .access = IREE_HAL_MEMORY_ACCESS_READ,
+              // Intended usage of the buffer (transfers, dispatches, etc):
+              .usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                       IREE_HAL_BUFFER_USAGE_TRANSFER,
+          },
+          // The actual heap buffer to wrap or clone and its allocator:
+          iree_make_const_byte_span(arg0_data, sizeof(arg0_data)),
+          // Buffer view + storage are returned and owned by the caller:
+          &arg0);
+    }
+    if (iree_status_is_ok(status)) {
+      IREE_IGNORE_ERROR(iree_hal_buffer_view_fprint(
+          stdout, arg0, /*max_element_count=*/4096, host_allocator));
+      // Add to the call inputs list (which retains the buffer view).
+      status = iree_runtime_call_inputs_push_back_buffer_view(&call, arg0);
+    }
+    // Since the call retains the buffer view we can release it here.
+    iree_hal_buffer_view_release(arg0);
+
+    fprintf(stdout, "\n * \n");
+
+    // %arg1: tensor<4xf32>
+    iree_hal_buffer_view_t* arg1 = NULL;
+    if (iree_status_is_ok(status)) {
+      static const iree_hal_dim_t arg1_shape[1] = {4};
+      static const float arg1_data[4] = {10.0f, 100.0f, 1000.0f, 10000.0f};
+      status = iree_hal_buffer_view_allocate_buffer(
+          device_allocator, arg1_shape, IREE_ARRAYSIZE(arg1_shape),
+          IREE_HAL_ELEMENT_TYPE_FLOAT_32,
+          IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
+          (iree_hal_buffer_params_t){
+              .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+              .access = IREE_HAL_MEMORY_ACCESS_READ,
+              .usage = IREE_HAL_BUFFER_USAGE_DISPATCH |
+                       IREE_HAL_BUFFER_USAGE_TRANSFER,
+          },
+          iree_make_const_byte_span(arg1_data, sizeof(arg1_data)), &arg1);
+    }
+    if (iree_status_is_ok(status)) {
+      IREE_IGNORE_ERROR(iree_hal_buffer_view_fprint(
+          stdout, arg1, /*max_element_count=*/4096, host_allocator));
+      status = iree_runtime_call_inputs_push_back_buffer_view(&call, arg1);
+    }
+    iree_hal_buffer_view_release(arg1);
+  }
+
+  // Synchronously perform the call.
+  if (iree_status_is_ok(status)) {
+    status = iree_runtime_call_invoke(&call, /*flags=*/0);
+  }
+
+  fprintf(stdout, "\n = \n");
+
+  // Dump the function outputs.
+  iree_hal_buffer_view_t* ret0 = NULL;
+  if (iree_status_is_ok(status)) {
+    // Try to get the first call result as a buffer view.
+    status = iree_runtime_call_outputs_pop_front_buffer_view(&call, &ret0);
+  }
+  if (iree_status_is_ok(status)) {
+    // This prints the buffer view out but an application could read its
+    // contents, pass it to another call, etc.
+    status = iree_hal_buffer_view_fprint(
+        stdout, ret0, /*max_element_count=*/4096, host_allocator);
+  }
+  iree_hal_buffer_view_release(ret0);
+
+  iree_runtime_call_deinitialize(&call);
+  return status;
+}
diff --git a/runtime/src/iree/runtime/demo/hello_world_terse.c b/runtime/src/iree/runtime/demo/hello_world_terse.c
new file mode 100644
index 0000000..35ca476
--- /dev/null
+++ b/runtime/src/iree/runtime/demo/hello_world_terse.c
@@ -0,0 +1,136 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdio.h>
+
+#include "iree/runtime/api.h"
+#include "iree/runtime/testdata/simple_mul_module_c.h"
+
+static void iree_runtime_demo_run_session(iree_runtime_instance_t* instance);
+static void iree_runtime_demo_perform_mul(iree_runtime_session_t* session);
+
+//===----------------------------------------------------------------------===//
+// 1. Entry point / shared iree_runtime_instance_t setup
+//===----------------------------------------------------------------------===//
+
+// Demo entry point: builds one shared instance, runs a single session against
+// it, and tears everything down. argc/argv are accepted but unused. Errors are
+// handled via IREE_CHECK_OK rather than propagated as statuses, which keeps
+// this "terse" variant free of error-handling plumbing.
+int main(int argc, char** argv) {
+  // Create and configure the instance shared across all sessions.
+  iree_runtime_instance_options_t instance_options;
+  iree_runtime_instance_options_initialize(IREE_API_VERSION_LATEST,
+                                           &instance_options);
+  iree_runtime_instance_options_use_all_available_drivers(&instance_options);
+  iree_runtime_instance_t* instance = NULL;
+  IREE_CHECK_OK(iree_runtime_instance_create(
+      &instance_options, iree_allocator_system(), &instance));
+
+  // All sessions should share the same instance.
+  iree_runtime_demo_run_session(instance);
+
+  iree_runtime_instance_release(instance);
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// 2. Load modules and initialize state in iree_runtime_session_t
+//===----------------------------------------------------------------------===//
+
+// Creates a session on the default "vmvx" device, loads the embedded
+// simple_mul bytecode module from the compiled-in testdata, and performs one
+// demo invocation before releasing the session.
+static void iree_runtime_demo_run_session(iree_runtime_instance_t* instance) {
+  // TODO(#5724): move device selection into the compiled modules.
+  iree_hal_device_t* device = NULL;
+  IREE_CHECK_OK(iree_runtime_instance_try_create_default_device(
+      instance, iree_make_cstring_view("vmvx"), &device));
+
+  // Create one session per loaded module to hold the module state.
+  iree_runtime_session_options_t session_options;
+  iree_runtime_session_options_initialize(&session_options);
+  iree_runtime_session_t* session = NULL;
+  IREE_CHECK_OK(iree_runtime_session_create_with_device(
+      instance, &session_options, device,
+      iree_runtime_instance_host_allocator(instance), &session));
+  // NOTE: our local device reference is dropped here; the session/HAL module
+  // is expected to hold its own reference for the session's lifetime.
+  iree_hal_device_release(device);
+
+  // Load your user module into the session (from memory, from file, etc).
+  const iree_file_toc_t* module_file =
+      iree_runtime_testdata_simple_mul_module_create();
+  // iree_allocator_null(): the module bytes are compiled into the binary and
+  // must not be freed when the module is destroyed.
+  IREE_CHECK_OK(iree_runtime_session_append_bytecode_module_from_memory(
+      session, iree_make_const_byte_span(module_file->data, module_file->size),
+      iree_allocator_null()));
+
+  // Run your functions; you should reuse the session to make multiple calls.
+  iree_runtime_demo_perform_mul(session);
+
+  iree_runtime_session_release(session);
+}
+
+//===----------------------------------------------------------------------===//
+// 3. Call a function within a module with buffer views
+//===----------------------------------------------------------------------===//
+
+// func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) ->
+// tensor<4xf32>
+// Invokes module.simple_mul with two constant 4xf32 buffer views, printing
+// both inputs and the result to stdout. All failures abort via IREE_CHECK_OK.
+static void iree_runtime_demo_perform_mul(iree_runtime_session_t* session) {
+  iree_runtime_call_t call;
+  IREE_CHECK_OK(iree_runtime_call_initialize_by_name(
+      session, iree_make_cstring_view("module.simple_mul"), &call));
+
+  // %arg0: tensor<4xf32>
+  iree_hal_buffer_view_t* arg0 = NULL;
+  static const iree_hal_dim_t arg0_shape[1] = {4};
+  static const float arg0_data[4] = {1.0f, 1.1f, 1.2f, 1.3f};
+  // Allocate a device-local buffer and copy the host constants into it.
+  IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
+      iree_runtime_session_device_allocator(session), arg0_shape,
+      IREE_ARRAYSIZE(arg0_shape), IREE_HAL_ELEMENT_TYPE_FLOAT_32,
+      IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
+      (iree_hal_buffer_params_t){
+          .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+          .access = IREE_HAL_MEMORY_ACCESS_READ,
+          .usage =
+              IREE_HAL_BUFFER_USAGE_DISPATCH | IREE_HAL_BUFFER_USAGE_TRANSFER,
+      },
+      iree_make_const_byte_span(arg0_data, sizeof(arg0_data)), &arg0));
+  IREE_CHECK_OK(iree_hal_buffer_view_fprint(
+      stdout, arg0, /*max_element_count=*/4096,
+      iree_runtime_session_host_allocator(session)));
+  // The call list retains the buffer view; release our local reference.
+  IREE_CHECK_OK(iree_runtime_call_inputs_push_back_buffer_view(&call, arg0));
+  iree_hal_buffer_view_release(arg0);
+
+  fprintf(stdout, "\n * \n");
+
+  // %arg1: tensor<4xf32>
+  iree_hal_buffer_view_t* arg1 = NULL;
+  static const iree_hal_dim_t arg1_shape[1] = {4};
+  static const float arg1_data[4] = {10.0f, 100.0f, 1000.0f, 10000.0f};
+  IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
+      iree_runtime_session_device_allocator(session), arg1_shape,
+      IREE_ARRAYSIZE(arg1_shape), IREE_HAL_ELEMENT_TYPE_FLOAT_32,
+      IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
+      (iree_hal_buffer_params_t){
+          .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+          .access = IREE_HAL_MEMORY_ACCESS_READ,
+          .usage =
+              IREE_HAL_BUFFER_USAGE_DISPATCH | IREE_HAL_BUFFER_USAGE_TRANSFER,
+      },
+      iree_make_const_byte_span(arg1_data, sizeof(arg1_data)), &arg1));
+  IREE_CHECK_OK(iree_hal_buffer_view_fprint(
+      stdout, arg1, /*max_element_count=*/4096,
+      iree_runtime_session_host_allocator(session)));
+  IREE_CHECK_OK(iree_runtime_call_inputs_push_back_buffer_view(&call, arg1));
+  iree_hal_buffer_view_release(arg1);
+
+  // Synchronous invocation: returns once outputs are available.
+  IREE_CHECK_OK(iree_runtime_call_invoke(&call, /*flags=*/0));
+
+  fprintf(stdout, "\n = \n");
+
+  // -> tensor<4xf32>
+  iree_hal_buffer_view_t* ret0 = NULL;
+  IREE_CHECK_OK(iree_runtime_call_outputs_pop_front_buffer_view(&call, &ret0));
+  IREE_CHECK_OK(iree_hal_buffer_view_fprint(
+      stdout, ret0, /*max_element_count=*/4096,
+      iree_runtime_session_host_allocator(session)));
+  iree_hal_buffer_view_release(ret0);
+
+  iree_runtime_call_deinitialize(&call);
+}
diff --git a/runtime/src/iree/runtime/instance.c b/runtime/src/iree/runtime/instance.c
new file mode 100644
index 0000000..352bfc5
--- /dev/null
+++ b/runtime/src/iree/runtime/instance.c
@@ -0,0 +1,166 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/runtime/instance.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/hal/drivers/init.h"
+#include "iree/modules/hal/module.h"
+#include "iree/vm/api.h"
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_instance_options_t
+//===----------------------------------------------------------------------===//
+
+// Zeroes |out_options| and stamps it with the requested |api_version|; all
+// other fields take their zero/NULL defaults (notably no driver registry).
+IREE_API_EXPORT void iree_runtime_instance_options_initialize(
+    iree_api_version_t api_version,
+    iree_runtime_instance_options_t* out_options) {
+  memset(out_options, 0, sizeof(*out_options));
+  out_options->api_version = api_version;
+}
+
+// Points |options| at the process-wide default driver registry and registers
+// every driver that was linked into this binary with it.
+IREE_API_EXPORT void iree_runtime_instance_options_use_all_available_drivers(
+    iree_runtime_instance_options_t* options) {
+  options->driver_registry = iree_hal_driver_registry_default();
+  // TODO(benvanik): remove status result from this; it can't (meaningfully)
+  // fail and is just extra bookkeeping.
+  iree_status_ignore(
+      iree_hal_register_all_available_drivers(options->driver_registry));
+}
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_instance_t
+//===----------------------------------------------------------------------===//
+
+// Internal instance state; opaque to API users (forward-declared in the
+// public header). Lifetime is managed by retain/release below.
+struct iree_runtime_instance_t {
+  // Atomic reference count; the instance is destroyed when it reaches zero.
+  iree_atomic_ref_count_t ref_count;
+
+  // Allocator used to allocate the instance and all of its resources.
+  iree_allocator_t host_allocator;
+
+  // An optional driver registry used to enumerate and create HAL devices.
+  iree_hal_driver_registry_t* driver_registry;
+
+  // TODO(#5724): we should have a device cache here so that multiple sessions
+  // can find the same devices. This may mean a new HAL type like
+  // iree_hal_device_pool_t to prevent too much coupling and make weak
+  // references easier.
+};
+
+// Allocates and initializes a new instance after validating the API version
+// and registering the builtin VM/HAL types. On success the caller owns the
+// returned reference. NOTE(review): the driver registry pointer is borrowed
+// (no ref counting yet; see TODO below), so it should outlive the instance.
+IREE_API_EXPORT iree_status_t iree_runtime_instance_create(
+    const iree_runtime_instance_options_t* options,
+    iree_allocator_t host_allocator, iree_runtime_instance_t** out_instance) {
+  IREE_ASSERT_ARGUMENT(options);
+  IREE_ASSERT_ARGUMENT(out_instance);
+  *out_instance = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Check that the API version matches what the runtime expects. The check here
+  // should always succeed when the runtime and the underlying system are linked
+  // together into the same binary.
+  iree_api_version_t actual_version = IREE_API_VERSION_0;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_api_version_check(options->api_version, &actual_version));
+
+  // Register builtin types.
+  // TODO(benvanik): change to per-instance type registries to avoid these
+  // global (UNSAFE!) calls. For now hosting applications should really only
+  // be using a single instance anyway.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, iree_vm_register_builtin_types());
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, iree_hal_module_register_types());
+
+  // Allocate the instance state.
+  iree_runtime_instance_t* instance = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, sizeof(*instance),
+                                (void**)&instance));
+  instance->host_allocator = host_allocator;
+  iree_atomic_ref_count_init(&instance->ref_count);
+
+  instance->driver_registry = options->driver_registry;
+  // TODO(benvanik): driver registry ref counting.
+
+  *out_instance = instance;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Frees the instance with the allocator it was created from. Only reached via
+// iree_runtime_instance_release once the reference count hits zero.
+static void iree_runtime_instance_destroy(iree_runtime_instance_t* instance) {
+  IREE_ASSERT_ARGUMENT(instance);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(instance->host_allocator, instance);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Adds a reference for the caller. NULL-tolerant no-op.
+IREE_API_EXPORT void iree_runtime_instance_retain(
+    iree_runtime_instance_t* instance) {
+  if (instance) {
+    iree_atomic_ref_count_inc(&instance->ref_count);
+  }
+}
+
+// Drops the caller's reference, destroying the instance when the last
+// reference is released. NULL-tolerant no-op.
+IREE_API_EXPORT void iree_runtime_instance_release(
+    iree_runtime_instance_t* instance) {
+  if (instance && iree_atomic_ref_count_dec(&instance->ref_count) == 1) {
+    iree_runtime_instance_destroy(instance);
+  }
+}
+
+// Returns the allocator the instance was created with (by value; always valid
+// for the instance's lifetime).
+IREE_API_EXPORT iree_allocator_t
+iree_runtime_instance_host_allocator(const iree_runtime_instance_t* instance) {
+  IREE_ASSERT_ARGUMENT(instance);
+  return instance->host_allocator;
+}
+
+// Returns the registry captured at creation time; may be NULL if the instance
+// was created without one. The pointer is borrowed, not retained.
+IREE_API_EXPORT iree_hal_driver_registry_t*
+iree_runtime_instance_driver_registry(const iree_runtime_instance_t* instance) {
+  IREE_ASSERT_ARGUMENT(instance);
+  return instance->driver_registry;
+}
+
+// Creates the driver registered under |driver_name| and asks it for its
+// default device. Fails with FAILED_PRECONDITION when the instance has no
+// driver registry. NOTE(review): the transient |driver| handle is released
+// before returning; the created device is assumed to retain whatever driver
+// resources it needs.
+IREE_API_EXPORT iree_status_t iree_runtime_instance_try_create_default_device(
+    iree_runtime_instance_t* instance, iree_string_view_t driver_name,
+    iree_hal_device_t** out_device) {
+  IREE_ASSERT_ARGUMENT(instance);
+  IREE_ASSERT_ARGUMENT(out_device);
+  *out_device = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, driver_name.data, driver_name.size);
+
+  // This is only supported when we have a driver registry we can use to create
+  // the drivers.
+  iree_hal_driver_registry_t* driver_registry =
+      iree_runtime_instance_driver_registry(instance);
+  if (!driver_registry) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "instance was created without a driver registry "
+                            "and cannot perform enumeration");
+  }
+
+  // Create a driver with the given name (if one exists).
+  iree_allocator_t host_allocator =
+      iree_runtime_instance_host_allocator(instance);
+  iree_hal_driver_t* driver = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_driver_registry_try_create_by_name(
+              driver_registry, driver_name, host_allocator, &driver));
+
+  // Create the default device on that driver.
+  iree_status_t status =
+      iree_hal_driver_create_default_device(driver, host_allocator, out_device);
+
+  iree_hal_driver_release(driver);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/runtime/instance.h b/runtime/src/iree/runtime/instance.h
new file mode 100644
index 0000000..6bf5423
--- /dev/null
+++ b/runtime/src/iree/runtime/instance.h
@@ -0,0 +1,122 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_RUNTIME_INSTANCE_H_
+#define IREE_RUNTIME_INSTANCE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Shared runtime instance responsible for isolating runtime usage, enumerating
+// and creating hardware device interfaces, and managing device resource pools.
+//
+// A single runtime instance can service multiple sessions and hosting
+// applications should try to reuse instances as much as possible. This ensures
+// that resource allocation across contexts is handled and extraneous device
+// interaction is avoided. For devices that may have exclusive access
+// restrictions it is mandatory to share instances, so plan accordingly.
+//
+// In multi-tenant systems separate instances can be used to isolate each tenant
+// in cases where the underlying devices do not cleanly support isolation
+// themselves and otherwise multiple tenants can share the same instance.
+// Consider an instance as isolating IREE from itself rather than being the only
+// mechanism that can be used to isolate individual tenants or sessions.
+//
+// Caches and allocator pools are associated with an instance and resources may
+// be reused among any sessions sharing the same instance. In multi-tenant
+// environments where all tenants are trusted (and here "tenant" may just mean
+// "a single session" where there are many sessions) then they can often receive
+// large benefits in terms of peak memory consumption, startup time, and
+// interoperation by sharing an instance. If two tenants must never share any
+// data (PII) then they should be placed in different instances.
+//
+// As with all of iree/runtime/ this API is a higher-level wrapper for the
+// low-level IREE HAL and VM. Using this may pull in additional dependencies and
+// perform additional allocations compared to what you can get by directly going
+// to the lower levels.
+//
+// Thread-safe.
+typedef struct iree_runtime_instance_t iree_runtime_instance_t;
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_instance_options_t
+//===----------------------------------------------------------------------===//
+
+// Options used to configure instance creation.
+typedef struct iree_runtime_instance_options_t {
+  // Should be set to IREE_API_VERSION_LATEST.
+  iree_api_version_t api_version;
+
+  // TODO(benvanik): inject logging hooks.
+
+  // A driver registry used to enumerate and create HAL devices.
+  // When not provided a device must be specified when creating sessions via
+  // iree_runtime_session_create_with_device.
+  iree_hal_driver_registry_t* driver_registry;
+} iree_runtime_instance_options_t;
+
+// Initializes |out_options| to its default values.
+IREE_API_EXPORT void iree_runtime_instance_options_initialize(
+    iree_api_version_t api_version,
+    iree_runtime_instance_options_t* out_options);
+
+// Sets the instance to use all available drivers registered in the current
+// binary. This allows for control over driver selection from the build system
+// using the IREE_HAL_DRIVER_* CMake options.
+// Sessions may query for the driver listing and select one(s) that are
+// appropriate.
+IREE_API_EXPORT void iree_runtime_instance_options_use_all_available_drivers(
+    iree_runtime_instance_options_t* options);
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_instance_t
+//===----------------------------------------------------------------------===//
+
+// Creates a new instance with the given |options|.
+// Instances should be shared with as many sessions in an application as is
+// reasonable to ensure that resources are tracked properly and threads are
+// managed correctly.
+//
+// |host_allocator| will be used to allocate the instance and any associated
+// resources. |out_instance| must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_runtime_instance_create(
+    const iree_runtime_instance_options_t* options,
+    iree_allocator_t host_allocator, iree_runtime_instance_t** out_instance);
+
+// Retains the given |instance| for the caller.
+IREE_API_EXPORT void iree_runtime_instance_retain(
+    iree_runtime_instance_t* instance);
+
+// Releases the given |instance| from the caller.
+IREE_API_EXPORT void iree_runtime_instance_release(
+    iree_runtime_instance_t* instance);
+
+// Returns the host allocator used to allocate the instance and its resources.
+// Callers should use this to allocate resources so that any memory tracking
+// being performed correctly attributes the allocations to the instance.
+IREE_API_EXPORT iree_allocator_t
+iree_runtime_instance_host_allocator(const iree_runtime_instance_t* instance);
+
+// Returns the optional driver registry used to enumerate drivers and devices.
+// If not provided then iree_runtime_session_create_with_device must be used
+// to specify the device that a session should use.
+IREE_API_EXPORT iree_hal_driver_registry_t*
+iree_runtime_instance_driver_registry(const iree_runtime_instance_t* instance);
+
+// TODO(#5724): remove this once user modules query devices themselves.
+IREE_API_EXPORT iree_status_t iree_runtime_instance_try_create_default_device(
+    iree_runtime_instance_t* instance, iree_string_view_t driver_name,
+    iree_hal_device_t** out_device);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_RUNTIME_INSTANCE_H_
diff --git a/runtime/src/iree/runtime/session.c b/runtime/src/iree/runtime/session.c
new file mode 100644
index 0000000..2394a39
--- /dev/null
+++ b/runtime/src/iree/runtime/session.c
@@ -0,0 +1,309 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/runtime/session.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/file_io.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "iree/modules/hal/module.h"
+#include "iree/runtime/instance.h"
+#include "iree/vm/api.h"
+#include "iree/vm/bytecode_module.h"
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_session_options_t
+//===----------------------------------------------------------------------===//
+
+// Zeroes |out_options| and applies the defaults: no special context flags and
+// all builtin modules enabled.
+IREE_API_EXPORT void iree_runtime_session_options_initialize(
+    iree_runtime_session_options_t* out_options) {
+  memset(out_options, 0, sizeof(*out_options));
+  out_options->context_flags = IREE_VM_CONTEXT_FLAG_NONE;
+  out_options->builtin_modules = IREE_RUNTIME_SESSION_BUILTIN_ALL;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_session_t
+//===----------------------------------------------------------------------===//
+
+// Internal session state; opaque to API users. Lifetime is managed by
+// retain/release below and the session keeps its owning instance retained.
+struct iree_runtime_session_t {
+  // Atomic reference count; the session is destroyed when it reaches zero.
+  iree_atomic_ref_count_t ref_count;
+
+  // Allocator used to allocate the session and all of its resources.
+  // Independent sessions within the same instance can have unique allocators to
+  // enable session-level tagging of allocations and pooling.
+  iree_allocator_t host_allocator;
+
+  // The instance this session is a part of; may be shared across many sessions.
+  // Devices and pools are stored on the instance so that multiple sessions can
+  // share resources. The session will keep the instance retained for its
+  // lifetime to ensure that these resources remain available.
+  iree_runtime_instance_t* instance;
+
+  // VM context containing the loaded modules (both builtins and user).
+  // Thread-compatible; a context carries state that must be externally
+  // synchronized.
+  iree_vm_context_t* context;
+
+  // The HAL module state bound to the target devices.
+  // This is used internally by the loaded modules to interact with the devices
+  // but can also be used by the caller to perform allocation and custom device
+  // execution.
+  //
+  // The state is owned by the context and we have it cached here for faster
+  // lookup. An application directly using the API may never need this, or could
+  // perform VM calls into HAL module exports to gain more portability.
+  iree_vm_module_state_t* hal_module_state;
+};
+
+// Creates a session bound to |device|: allocates the state, retains the
+// instance, creates an empty VM context, then registers the HAL module built
+// around the device and caches its resolved state. On any failure the partial
+// session is torn down via iree_runtime_session_release, which also drops the
+// instance reference taken here.
+IREE_API_EXPORT iree_status_t iree_runtime_session_create_with_device(
+    iree_runtime_instance_t* instance,
+    const iree_runtime_session_options_t* options, iree_hal_device_t* device,
+    iree_allocator_t host_allocator, iree_runtime_session_t** out_session) {
+  IREE_ASSERT_ARGUMENT(instance);
+  IREE_ASSERT_ARGUMENT(options);
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(out_session);
+  *out_session = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate the session state.
+  iree_runtime_session_t* session = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, sizeof(*session),
+                                (void**)&session));
+  session->host_allocator = host_allocator;
+  iree_atomic_ref_count_init(&session->ref_count);
+
+  session->instance = instance;
+  iree_runtime_instance_retain(session->instance);
+
+  // Create the context empty so that we can add our modules to it.
+  iree_status_t status = iree_vm_context_create(
+      /*instance=*/NULL, options->context_flags, host_allocator,
+      &session->context);
+
+  // Add the HAL module; it is always required when using the runtime API.
+  // Lower-level usage of the VM can avoid the HAL if it's not required.
+  iree_vm_module_t* hal_module = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_module_create(device, host_allocator, &hal_module);
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_vm_context_register_modules(session->context, &hal_module, 1);
+  }
+  if (iree_status_is_ok(status)) {
+    // Cache the HAL module state so device lookups don't need a context query.
+    status = iree_vm_context_resolve_module_state(session->context, hal_module,
+                                                  &session->hal_module_state);
+  }
+  // The context holds the module (when registration succeeded); drop ours.
+  iree_vm_module_release(hal_module);
+
+  if (iree_status_is_ok(status)) {
+    *out_session = session;
+  } else {
+    iree_runtime_session_release(session);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Tears down the session: releases the VM context (and with it all loaded
+// module state), drops the instance reference, and frees the session memory.
+// Only reached via iree_runtime_session_release at refcount zero.
+static void iree_runtime_session_destroy(iree_runtime_session_t* session) {
+  IREE_ASSERT_ARGUMENT(session);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_vm_context_release(session->context);
+  iree_runtime_instance_release(session->instance);
+
+  iree_allocator_free(session->host_allocator, session);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Adds a reference for the caller. NULL-tolerant no-op.
+IREE_API_EXPORT void iree_runtime_session_retain(
+    iree_runtime_session_t* session) {
+  if (session) {
+    iree_atomic_ref_count_inc(&session->ref_count);
+  }
+}
+
+// Drops the caller's reference, destroying the session when the last
+// reference is released. NULL-tolerant no-op.
+IREE_API_EXPORT void iree_runtime_session_release(
+    iree_runtime_session_t* session) {
+  if (session && iree_atomic_ref_count_dec(&session->ref_count) == 1) {
+    iree_runtime_session_destroy(session);
+  }
+}
+
+// Accessors below return borrowed references valid for the session's
+// lifetime; callers must retain any pointer they hold beyond that.
+
+// Returns the allocator the session was created with (by value).
+IREE_API_EXPORT iree_allocator_t
+iree_runtime_session_host_allocator(const iree_runtime_session_t* session) {
+  IREE_ASSERT_ARGUMENT(session);
+  return session->host_allocator;
+}
+
+// Returns the owning instance (retained by the session, not by the caller).
+IREE_API_EXPORT iree_runtime_instance_t* iree_runtime_session_instance(
+    const iree_runtime_session_t* session) {
+  IREE_ASSERT_ARGUMENT(session);
+  return session->instance;
+}
+
+// Returns the VM context holding the session's loaded modules.
+IREE_API_EXPORT iree_vm_context_t* iree_runtime_session_context(
+    const iree_runtime_session_t* session) {
+  IREE_ASSERT_ARGUMENT(session);
+  return session->context;
+}
+
+// Returns the HAL device the session is bound to via the cached HAL module
+// state.
+IREE_API_EXPORT iree_hal_device_t* iree_runtime_session_device(
+    const iree_runtime_session_t* session) {
+  IREE_ASSERT_ARGUMENT(session);
+  return iree_hal_module_state_device(session->hal_module_state);
+}
+
+// Returns the allocator of the session's device, or NULL when no device is
+// bound.
+IREE_API_EXPORT iree_hal_allocator_t* iree_runtime_session_device_allocator(
+    const iree_runtime_session_t* session) {
+  iree_hal_device_t* device = iree_runtime_session_device(session);
+  if (!device) return NULL;
+  return iree_hal_device_allocator(device);
+}
+
+// Broadcasts IREE_VM_SIGNAL_LOW_MEMORY to the session's context so that
+// loaded modules may release cached/pooled resources.
+IREE_API_EXPORT iree_status_t
+iree_runtime_session_trim(iree_runtime_session_t* session) {
+  IREE_ASSERT_ARGUMENT(session);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = iree_vm_context_notify(
+      iree_runtime_session_context(session), IREE_VM_SIGNAL_LOW_MEMORY);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Registers an already-created VM |module| with the session's context. The
+// module name is appended to the trace zone for profiling visibility.
+IREE_API_EXPORT iree_status_t iree_runtime_session_append_module(
+    iree_runtime_session_t* session, iree_vm_module_t* module) {
+  IREE_ASSERT_ARGUMENT(session);
+  IREE_ASSERT_ARGUMENT(module);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, iree_vm_module_name(module).data,
+                              iree_vm_module_name(module).size);
+
+  iree_status_t status = iree_vm_context_register_modules(
+      iree_runtime_session_context(session), &module, 1);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Wraps |flatbuffer_data| in a bytecode module and registers it with the
+// session. |flatbuffer_allocator| is handed to the module to dispose of the
+// data when no longer needed (pass iree_allocator_null() for unowned data).
+// Our local module reference is released unconditionally; the context keeps
+// the module alive after successful registration.
+IREE_API_EXPORT iree_status_t
+iree_runtime_session_append_bytecode_module_from_memory(
+    iree_runtime_session_t* session, iree_const_byte_span_t flatbuffer_data,
+    iree_allocator_t flatbuffer_allocator) {
+  IREE_ASSERT_ARGUMENT(session);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_vm_module_t* module = NULL;
+  iree_status_t status = iree_vm_bytecode_module_create(
+      flatbuffer_data, flatbuffer_allocator,
+      iree_runtime_session_host_allocator(session), &module);
+  if (iree_status_is_ok(status)) {
+    status = iree_runtime_session_append_module(session, module);
+  }
+  iree_vm_module_release(module);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Loads the module file at |file_path| into memory and appends it as a
+// bytecode module. Ownership of the file contents transfers to the module via
+// its deallocator on success; on failure we free the contents here to avoid
+// leaking them.
+IREE_API_EXPORT iree_status_t
+iree_runtime_session_append_bytecode_module_from_file(
+    iree_runtime_session_t* session, const char* file_path) {
+  IREE_ASSERT_ARGUMENT(session);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, file_path);
+
+  // TODO(#3909): actually map the memory here. For now we just load the
+  // contents.
+  iree_file_contents_t* flatbuffer_contents = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_file_read_contents(file_path,
+                                  iree_runtime_session_host_allocator(session),
+                                  &flatbuffer_contents));
+
+  iree_status_t status =
+      iree_runtime_session_append_bytecode_module_from_memory(
+          session, flatbuffer_contents->const_buffer,
+          iree_file_contents_deallocator(flatbuffer_contents));
+  if (!iree_status_is_ok(status)) {
+    iree_file_contents_free(flatbuffer_contents);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Resolves |full_name| (e.g. "module.function") against the modules loaded in
+// the session's context, populating |out_function| on success.
+IREE_API_EXPORT iree_status_t iree_runtime_session_lookup_function(
+    const iree_runtime_session_t* session, iree_string_view_t full_name,
+    iree_vm_function_t* out_function) {
+  IREE_ASSERT_ARGUMENT(session);
+  IREE_ASSERT_ARGUMENT(out_function);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_vm_context_resolve_function(
+      iree_runtime_session_context(session), full_name, out_function);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Synchronously invokes |function| with |input_list|, writing results to
+// |output_list|. Thin wrapper over iree_vm_invoke using the session's context
+// and host allocator with default invocation flags and no scheduling policy.
+IREE_API_EXPORT iree_status_t iree_runtime_session_call(
+    iree_runtime_session_t* session, const iree_vm_function_t* function,
+    iree_vm_list_t* input_list, iree_vm_list_t* output_list) {
+  IREE_ASSERT_ARGUMENT(session);
+  IREE_ASSERT_ARGUMENT(function);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status =
+      iree_vm_invoke(iree_runtime_session_context(session), *function,
+                     IREE_VM_INVOCATION_FLAG_NONE,
+                     /*policy=*/NULL, input_list, output_list,
+                     iree_runtime_session_host_allocator(session));
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Convenience wrapper: resolves |full_name| then forwards to
+// iree_runtime_session_call. Callers making repeated calls should resolve the
+// function once and use iree_runtime_session_call directly.
+IREE_API_EXPORT iree_status_t iree_runtime_session_call_by_name(
+    iree_runtime_session_t* session, iree_string_view_t full_name,
+    iree_vm_list_t* input_list, iree_vm_list_t* output_list) {
+  IREE_ASSERT_ARGUMENT(session);
+  iree_vm_function_t function;
+  IREE_RETURN_IF_ERROR(
+      iree_runtime_session_lookup_function(session, full_name, &function));
+  return iree_runtime_session_call(session, &function, input_list, output_list);
+}
+
+// Lowest-level call path: dispatches |call| straight to its owning module's
+// begin_call, bypassing iree_vm_invoke. A VM stack is carved out of the host
+// stack (IREE_VM_INLINE_STACK_INITIALIZE) to avoid heap allocation for the
+// invocation.
+IREE_API_EXPORT iree_status_t iree_runtime_session_call_direct(
+    iree_runtime_session_t* session, const iree_vm_function_call_t* call) {
+  IREE_ASSERT_ARGUMENT(session);
+  IREE_ASSERT_ARGUMENT(call);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate a VM stack on the host stack and initialize it.
+  IREE_VM_INLINE_STACK_INITIALIZE(
+      stack, IREE_VM_INVOCATION_FLAG_NONE,
+      iree_vm_context_state_resolver(iree_runtime_session_context(session)),
+      iree_runtime_session_host_allocator(session));
+
+  // Issue the call.
+  iree_vm_execution_result_t result;
+  iree_status_t status = call->function.module->begin_call(
+      call->function.module->self, stack, call, &result);
+
+  // Cleanup the stack.
+  iree_vm_stack_deinitialize(stack);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/runtime/session.h b/runtime/src/iree/runtime/session.h
new file mode 100644
index 0000000..b96a497
--- /dev/null
+++ b/runtime/src/iree/runtime/session.h
@@ -0,0 +1,226 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_RUNTIME_SESSION_H_
+#define IREE_RUNTIME_SESSION_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/vm/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_runtime_instance_t iree_runtime_instance_t;
+
+// A session containing a set of loaded VM modules and their runtime state.
+// Each session has its own isolated module state and though multiple sessions
+// may share the same device they will all see their own individual timelines.
+// Think of a session like a process in an operating system: able to communicate
+// and share syscalls but with a strict separation.
+//
+// Only sessions that share an instance may directly share resources as
+// different instances may have different HAL devices and have incompatible
+// memory. Import and export APIs must be used to transfer the resources across
+// instances or incompatible devices within the same instance.
+//
+// As with all of iree/runtime/ this API is a higher-level wrapper for the
+// low-level IREE HAL and VM. Using this may pull in additional dependencies and
+// perform additional allocations compared to what you can get by directly going
+// to the lower levels.
+//
+// Thread-compatible; only a single thread may use the session at any time and
+// the caller must use external synchronization if they will be using it or any
+// resource derived from it concurrently. Any two sessions may be executed
+// concurrently without interference.
+typedef struct iree_runtime_session_t iree_runtime_session_t;
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_session_options_t
+//===----------------------------------------------------------------------===//
+
+// Builtin modules that are provided by the runtime.
+enum iree_runtime_session_builtins_bits_t {
+  // All built-in modules that are compiled into the runtime will be available.
+  IREE_RUNTIME_SESSION_BUILTIN_ALL = UINT64_MAX,
+};
+typedef uint64_t iree_runtime_session_builtins_t;
+
+// Options used to configure session creation.
+typedef struct iree_runtime_session_options_t {
+  // Flags controlling the execution environment.
+  iree_vm_context_flags_t context_flags;
+
+  // A bitmask identifying which IREE builtin modules should be enabled.
+  // Session creation will fail if a requested module is not built into the
+  // runtime binary.
+  iree_runtime_session_builtins_t builtin_modules;
+} iree_runtime_session_options_t;
+
+// Initializes |out_options| to its default values.
+IREE_API_EXPORT void iree_runtime_session_options_initialize(
+    iree_runtime_session_options_t* out_options);
+
+//===----------------------------------------------------------------------===//
+// iree_runtime_session_t
+//===----------------------------------------------------------------------===//
+
+// Creates a new session forced to use the given |device|.
+// This bypasses any device enumeration performed by the loaded modules but
+// the loaded modules will still verify that the device matches their
+// requirements.
+//
+// A base set of modules may be added by the runtime during creation based on
+// |options| and users may load additional modules - such as the one containing
+// their user code - by using the iree_vm_context_t provided by
+// iree_runtime_session_context.
+//
+// |host_allocator| will be used to allocate the session and any associated
+// resources. |out_session| must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_runtime_session_create_with_device(
+    iree_runtime_instance_t* instance,
+    const iree_runtime_session_options_t* options, iree_hal_device_t* device,
+    iree_allocator_t host_allocator, iree_runtime_session_t** out_session);
+
+// Retains the given |session| for the caller.
+IREE_API_EXPORT void iree_runtime_session_retain(
+    iree_runtime_session_t* session);
+
+// Releases the given |session| from the caller.
+IREE_API_EXPORT void iree_runtime_session_release(
+    iree_runtime_session_t* session);
+
+// Returns the host allocator used to allocate the session and its resources.
+// Callers should use this to allocate resources so that any memory tracking
+// being performed correctly attributes the allocations to the session.
+IREE_API_EXPORT iree_allocator_t
+iree_runtime_session_host_allocator(const iree_runtime_session_t* session);
+
+// Returns the instance the session uses for shared resources.
+IREE_API_EXPORT iree_runtime_instance_t* iree_runtime_session_instance(
+    const iree_runtime_session_t* session);
+
+// Returns the VM context used to load and link modules.
+// The context can be used to perform additional reflection over the loaded
+// modules or load additional modules (if supported).
+IREE_API_EXPORT iree_vm_context_t* iree_runtime_session_context(
+    const iree_runtime_session_t* session);
+
+// Returns the HAL device being used for execution.
+//
+// NOTE: this device will not be available until initialized by a user module
+// and will return NULL if queried prior.
+IREE_API_EXPORT iree_hal_device_t* iree_runtime_session_device(
+    const iree_runtime_session_t* session);
+
+// Returns the device allocator used to allocate compatible buffers.
+// Buffers from other allocators may not be compatible and require importing
+// prior to being usable by the session.
+//
+// NOTE: this device allocator will not be available until initialized by a
+// user module and will return NULL if queried prior.
+IREE_API_EXPORT iree_hal_allocator_t* iree_runtime_session_device_allocator(
+    const iree_runtime_session_t* session);
+
+// Trims transient/cached resources used by the session.
+// Upon resuming these resources may be expensive to rematerialize/reload and
+// as such this should only be called when it is known the resources will not
+// be needed soon.
+IREE_API_EXPORT iree_status_t
+iree_runtime_session_trim(iree_runtime_session_t* session);
+
+// Appends the given |module| to the context.
+// The module will be retained by the context.
+//
+// NOTE: only valid if the context is not yet frozen; see
+// iree_vm_context_freeze for more information.
+IREE_API_EXPORT iree_status_t iree_runtime_session_append_module(
+    iree_runtime_session_t* session, iree_vm_module_t* module);
+
+// Appends a bytecode module to the context loaded from the given memory blob.
+// If a |flatbuffer_allocator| is provided then it will be used to free the
+// |flatbuffer_data| when the module is destroyed and otherwise the ownership of
+// the |flatbuffer_data| remains with the caller. The data must remain valid for
+// the lifetime of the session.
+//
+// If the module exists as a file prefer instead to use
+// iree_runtime_session_append_bytecode_module_from_file to use memory mapped
+// I/O and reduce total memory consumption.
+//
+// NOTE: only valid if the context is not yet frozen; see
+// iree_vm_context_freeze for more information.
+IREE_API_EXPORT iree_status_t
+iree_runtime_session_append_bytecode_module_from_memory(
+    iree_runtime_session_t* session, iree_const_byte_span_t flatbuffer_data,
+    iree_allocator_t flatbuffer_allocator);
+
+// Appends a bytecode module to the context loaded from the given |file_path|.
+//
+// NOTE: only valid if the context is not yet frozen; see
+// iree_vm_context_freeze for more information.
+IREE_API_EXPORT iree_status_t
+iree_runtime_session_append_bytecode_module_from_file(
+    iree_runtime_session_t* session, const char* file_path);
+
+// Sets |out_function| to an exported function with the fully-qualified name
+// of |full_name| or returns IREE_STATUS_NOT_FOUND. The function reference is
+// valid for the lifetime of |session|.
+//
+// The function name matches the original MLIR module and function symbols.
+// Example:
+//   module @foo {
+//     func.func @bar()
+//   }
+// The full name of '@bar' is 'foo.bar'.
+// By default modules have the name 'module'.
+IREE_API_EXPORT iree_status_t iree_runtime_session_lookup_function(
+    const iree_runtime_session_t* session, iree_string_view_t full_name,
+    iree_vm_function_t* out_function);
+
+// Synchronously issues a generic function call.
+//
+// |input_list| is used to pass values and objects into the target function and
+// must match the signature defined by the compiled function. List ownership
+// remains with the caller.
+//
+// |output_list| is populated after the function completes execution with the
+// output values and objects of the function. List ownership remains with the
+// caller.
+//
+// Functions with either no inputs or outputs may provide NULL for the
+// respective list.
+IREE_API_EXPORT iree_status_t iree_runtime_session_call(
+    iree_runtime_session_t* session, const iree_vm_function_t* function,
+    iree_vm_list_t* input_list, iree_vm_list_t* output_list);
+
+// Synchronously issues a generic function call by fully-qualified name.
+// This is equivalent to performing a iree_runtime_session_lookup_function
+// followed by a iree_runtime_session_call. When calling the same function
+// repeatedly callers should perform the lookup and cache the resulting function
+// handle to avoid repeated lookups.
+IREE_API_EXPORT iree_status_t iree_runtime_session_call_by_name(
+    iree_runtime_session_t* session, iree_string_view_t full_name,
+    iree_vm_list_t* input_list, iree_vm_list_t* output_list);
+
+// Synchronously issues a direct function call.
+// This bypasses signature verification and directly calls through the VM ABI.
+// Though still safe(ish) the errors reported on a signature mismatch will be
+// much less useful than a call performed via the more generic methods. Treat
+// this as a low-level technique only to be used when the calling host code and
+// callee modules are known to be compatible.
+//
+// See iree_vm_function_call_t for more information.
+IREE_API_EXPORT iree_status_t iree_runtime_session_call_direct(
+    iree_runtime_session_t* session, const iree_vm_function_call_t* call);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_RUNTIME_SESSION_H_
diff --git a/runtime/src/iree/runtime/testdata/BUILD b/runtime/src/iree/runtime/testdata/BUILD
new file mode 100644
index 0000000..974ed60
--- /dev/null
+++ b/runtime/src/iree/runtime/testdata/BUILD
@@ -0,0 +1,34 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content")
+load("//build_tools/bazel:iree_bytecode_module.bzl", "iree_bytecode_module")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_cmake_extra_content(
+    content = """
+if (NOT ${IREE_HAL_DRIVER_VMVX} OR NOT ${IREE_TARGET_BACKEND_VMVX})
+  return()
+endif()
+""",
+    inline = True,
+)
+
+iree_bytecode_module(
+    name = "simple_mul_module",
+    src = "simple_mul.mlir",
+    c_identifier = "iree_runtime_testdata_simple_mul_module",
+    flags = [
+        "-iree-input-type=mhlo",
+        "-iree-mlir-to-vm-bytecode-module",
+        "-iree-hal-target-backends=vmvx",
+    ],
+)
diff --git a/runtime/src/iree/runtime/testdata/CMakeLists.txt b/runtime/src/iree/runtime/testdata/CMakeLists.txt
new file mode 100644
index 0000000..8278eea
--- /dev/null
+++ b/runtime/src/iree/runtime/testdata/CMakeLists.txt
@@ -0,0 +1,31 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/runtime/testdata/BUILD                                      #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+if (NOT ${IREE_HAL_DRIVER_VMVX} OR NOT ${IREE_TARGET_BACKEND_VMVX})
+  return()
+endif()
+
+iree_bytecode_module(
+  NAME
+    simple_mul_module
+  SRC
+    "simple_mul.mlir"
+  C_IDENTIFIER
+    "iree_runtime_testdata_simple_mul_module"
+  FLAGS
+    "-iree-input-type=mhlo"
+    "-iree-mlir-to-vm-bytecode-module"
+    "-iree-hal-target-backends=vmvx"
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/runtime/testdata/simple_mul.mlir b/runtime/src/iree/runtime/testdata/simple_mul.mlir
new file mode 100644
index 0000000..a7369d2
--- /dev/null
+++ b/runtime/src/iree/runtime/testdata/simple_mul.mlir
@@ -0,0 +1,4 @@
+func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
+  %0 = "mhlo.multiply"(%arg0, %arg1) {name = "mul.1"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
+  return %0 : tensor<4xf32>
+}
diff --git a/runtime/src/iree/schemas/BUILD b/runtime/src/iree/schemas/BUILD
new file mode 100644
index 0000000..80acb67
--- /dev/null
+++ b/runtime/src/iree/schemas/BUILD
@@ -0,0 +1,67 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_build_test")
+load("//build_tools/bazel:iree_flatcc.bzl", "iree_flatbuffer_c_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+FLATCC_ARGS = [
+    "--reader",
+    "--builder",
+    "--verifier",
+    "--json",
+]
+
+iree_flatbuffer_c_library(
+    name = "bytecode_module_def_c_fbs",
+    srcs = ["bytecode_module_def.fbs"],
+    flatcc_args = FLATCC_ARGS,
+)
+
+iree_flatbuffer_c_library(
+    name = "cuda_executable_def_c_fbs",
+    srcs = ["cuda_executable_def.fbs"],
+    flatcc_args = FLATCC_ARGS,
+)
+
+iree_flatbuffer_c_library(
+    name = "rocm_executable_def_c_fbs",
+    srcs = ["rocm_executable_def.fbs"],
+    flatcc_args = FLATCC_ARGS,
+)
+
+iree_flatbuffer_c_library(
+    name = "metal_executable_def_c_fbs",
+    srcs = ["metal_executable_def.fbs"],
+    flatcc_args = FLATCC_ARGS,
+)
+
+iree_flatbuffer_c_library(
+    name = "spirv_executable_def_c_fbs",
+    srcs = ["spirv_executable_def.fbs"],
+    flatcc_args = FLATCC_ARGS,
+)
+
+iree_flatbuffer_c_library(
+    name = "wgsl_executable_def_c_fbs",
+    srcs = ["wgsl_executable_def.fbs"],
+    flatcc_args = FLATCC_ARGS,
+)
+
+iree_build_test(
+    name = "schema_build_test",
+    targets = [
+        ":bytecode_module_def_c_fbs",
+        ":metal_executable_def_c_fbs",
+        ":spirv_executable_def_c_fbs",
+        ":wgsl_executable_def_c_fbs",
+    ],
+)
diff --git a/runtime/src/iree/schemas/CMakeLists.txt b/runtime/src/iree/schemas/CMakeLists.txt
new file mode 100644
index 0000000..bc0b4f5
--- /dev/null
+++ b/runtime/src/iree/schemas/CMakeLists.txt
@@ -0,0 +1,91 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/schemas/BUILD                                               #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+flatbuffer_c_library(
+  NAME
+    bytecode_module_def_c_fbs
+  SRCS
+    "bytecode_module_def.fbs"
+  FLATCC_ARGS
+    "--reader"
+    "--builder"
+    "--verifier"
+    "--json"
+  PUBLIC
+)
+
+flatbuffer_c_library(
+  NAME
+    cuda_executable_def_c_fbs
+  SRCS
+    "cuda_executable_def.fbs"
+  FLATCC_ARGS
+    "--reader"
+    "--builder"
+    "--verifier"
+    "--json"
+  PUBLIC
+)
+
+flatbuffer_c_library(
+  NAME
+    rocm_executable_def_c_fbs
+  SRCS
+    "rocm_executable_def.fbs"
+  FLATCC_ARGS
+    "--reader"
+    "--builder"
+    "--verifier"
+    "--json"
+  PUBLIC
+)
+
+flatbuffer_c_library(
+  NAME
+    metal_executable_def_c_fbs
+  SRCS
+    "metal_executable_def.fbs"
+  FLATCC_ARGS
+    "--reader"
+    "--builder"
+    "--verifier"
+    "--json"
+  PUBLIC
+)
+
+flatbuffer_c_library(
+  NAME
+    spirv_executable_def_c_fbs
+  SRCS
+    "spirv_executable_def.fbs"
+  FLATCC_ARGS
+    "--reader"
+    "--builder"
+    "--verifier"
+    "--json"
+  PUBLIC
+)
+
+flatbuffer_c_library(
+  NAME
+    wgsl_executable_def_c_fbs
+  SRCS
+    "wgsl_executable_def.fbs"
+  FLATCC_ARGS
+    "--reader"
+    "--builder"
+    "--verifier"
+    "--json"
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/schemas/bytecode_module_def.fbs b/runtime/src/iree/schemas/bytecode_module_def.fbs
new file mode 100644
index 0000000..df15e9e
--- /dev/null
+++ b/runtime/src/iree/schemas/bytecode_module_def.fbs
@@ -0,0 +1,246 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+namespace iree.vm;
+
+// IREE bytecode module.
+file_identifier "IREE";
+file_extension "vmfb";
+
+// Arbitrary key/value reflection attribute.
+table ReflectionAttrDef {
+  key:string;
+  value:string;
+}
+
+// Defines a type within the type table.
+table TypeDef {
+  // Fully-qualified name of the type, such as `hal.buffer`.
+  full_name:string;
+}
+
+// Defines a function signature.
+table FunctionSignatureDef {
+  // Arguments, in order, as described in the FunctionSignatureDef.
+  // Maps to an entry in the module type table.
+  argument_types:[int32];
+
+  // Results, in order, as described in the FunctionSignatureDef.
+  // Maps to an entry in the module type table.
+  result_types:[int32];
+
+  // The VM calling convention declaration used to marshal arguments and
+  // results into and out of the function.
+  // Optional for imports and internal functions but required for exports.
+  //
+  // See iree/vm/module.h for more information.
+  calling_convention:string;
+
+  // Function level reflection attributes.
+  // These are typically used to communicate additional ABI metadata needed
+  // for dynamic invocation and host language mapping.
+  // See: docs/developers/design_docs/function_abi.md
+  reflection_attrs:[ReflectionAttrDef];
+}
+
+enum ImportFlagBits:uint32 (bit_flags) {
+  REQUIRED = 0,  // 1u << 0
+  OPTIONAL = 1,  // 1u << 1
+}
+
+// Defines a runtime-resolved import function.
+table ImportFunctionDef {
+  // Fully-qualified name of the function (including the module namespace).
+  full_name:string;
+
+  // Expected signature of the function, used for verifying that imports match.
+  signature:FunctionSignatureDef;
+
+  // Version flags controlling the behavior of import resolution.
+  flags:ImportFlagBits = REQUIRED;
+}
+
+// Defines a runtime-resolved export function.
+table ExportFunctionDef {
+  // Local name of the function (excluding the module namespace).
+  local_name:string;
+
+  // Expected signature of the function, used for verifying that exports match.
+  signature:FunctionSignatureDef;
+
+  // Ordinal in the internal_functions table that implements this function.
+  internal_ordinal:int32;
+}
+
+// Defines a bytecode function.
+table InternalFunctionDef {
+  // Local name of the function or empty if the names have been stripped.
+  // The full name of the function when referenced from external modules will
+  // include the BytecodeModuleDef.name prefix.
+  local_name:string;
+
+  // Signature of the function used for reflection.
+  signature:FunctionSignatureDef;
+}
+
+table UncompressedDataDef {
+}
+
+union CompressionTypeDef {
+  UncompressedDataDef,
+}
+
+// Read-only data segment.
+table RodataSegmentDef {
+  // The compression format used for the data, including required decompression
+  // arguments. Omitted if the data is uncompressed.
+  compression_type:CompressionTypeDef;
+
+  // Contents in a format defined by CompressionTypeDef.
+  data:[uint8];
+}
+
+// Read-write data segment.
+table RwdataSegmentDef {
+  // Total byte capacity.
+  byte_size:int32;
+}
+
+// Defines the per-instance module state.
+table ModuleStateDef {
+  // Bytes used for global primitive value storage. All are initialized to zero
+  // on module state allocation.
+  global_bytes_capacity:int32;
+
+  // Total number of global ref values.
+  global_ref_count:int32;
+}
+
+// Static function descriptor used for stack frame allocation.
+struct FunctionDescriptor {
+  // Offset and length within the larger bytecode data block.
+  bytecode_offset:int32;
+  bytecode_length:int32;
+
+  // TODO(benvanik): remove counts and embed directly in bytecode.
+  // Total number of i32 registers used by the function.
+  i32_register_count:int16;
+  // Total number of ref registers used by the function.
+  ref_register_count:int16;
+}
+
+// mlir/IR/BuiltinLocationAttributes.td : CallSiteLoc
+table CallSiteLocDef {
+  callee:int32;
+  caller:int32;
+}
+
+// mlir/IR/BuiltinLocationAttributes.td : FileLineColLoc
+table FileLineColLocDef {
+  filename:string;
+  line:int32;
+  column:int32;
+}
+
+// mlir/IR/BuiltinLocationAttributes.td : FusedLoc
+table FusedLocDef {
+  metadata:string;
+  locations:[int32];
+}
+
+// mlir/IR/BuiltinLocationAttributes.td : NameLoc
+table NameLocDef {
+  name:string;
+  child_location:int32;
+}
+
+// A location - possibly nested.
+union LocationTypeDef {
+  CallSiteLocDef,
+  FileLineColLocDef,
+  FusedLocDef,
+  NameLocDef,
+}
+
+// Maps a relative bytecode offset within a function to a source location.
+struct BytecodeLocationDef {
+  // Bytecode offset of the start of the operation.
+  bytecode_offset:int32;
+  // Index into the debug database location_table.
+  location:int32;
+}
+
+// Debug data for a single function mapping back into source IR.
+table FunctionSourceMapDef {
+  // Operation locations for all ops within the function.
+  locations:[BytecodeLocationDef];
+}
+
+// VM debug information database.
+table DebugDatabaseDef {
+  // Location table. Source maps reference this table.
+  location_table:[LocationTypeDef];
+
+  // Internal function source maps; 1:1 with the module function_descriptors.
+  functions:[FunctionSourceMapDef];
+}
+
+// Defines a bytecode module containing the information required to serve the
+// iree_vm_module_interface_t interface.
+//
+// Modules are similar to shared objects in that they provide a set of exported
+// functions that can be queried and called externally as well as any number of
+// internal function definitions. Imports can also be used to have the loader
+// dynamically link in exports of other modules upon loading.
+//
+// Modules can contain read-only segments containing (optionally) compressed
+// data that is used by the module. Read-write segments define uninitialized
+// reservations and are similar to .bss, and custom initializer functions can
+// be embedded to treat them as if they were .data segments.
+//
+// State can be defined per active runtime context (effectively like
+// thread-local storage) using ModuleStateDef. The runtime will prepare this
+// state and maintain it for the lifetime of contexts and ensure that ops that
+// use it (such as vm.global.load.*) are always associated with the appropriate
+// state.
+table BytecodeModuleDef {
+  // Module namespace used for fully-qualified function lookups.
+  name:string (required);
+
+  // Type table mapping type IDs used within the module to type signatures.
+  types:[TypeDef];
+
+  // Imported function definitions used to resolve imports.
+  imported_functions:[ImportFunctionDef];
+
+  // Exported function definitions used to resolve exports.
+  exported_functions:[ExportFunctionDef];
+
+  // Read-only data segments (like non-code .text).
+  // May optionally be compressed and decompressed by the loader.
+  rodata_segments:[RodataSegmentDef];
+
+  // Read-write data segments of uninitialized memory (like .bss).
+  rwdata_segments:[RwdataSegmentDef];
+
+  // Global module state information (like TLS).
+  module_state:ModuleStateDef;
+
+  // References to ranges in the bytecode contents buffer where each internal
+  // function is located. This table is kept unnested within InternalFunctionDef
+  // to avoid the need to walk the FlatBuffer hierarchy at runtime when
+  // resolving call targets. Multiple functions may alias the same ranges in
+  // bytecode_data.
+  function_descriptors:[FunctionDescriptor];
+
+  // Bytecode contents. One large buffer containing all of the function op data.
+  bytecode_data:[uint8];
+
+  // Optional module debug database.
+  debug_database:DebugDatabaseDef;
+}
+
+root_type BytecodeModuleDef;
diff --git a/runtime/src/iree/schemas/cuda_executable_def.fbs b/runtime/src/iree/schemas/cuda_executable_def.fbs
new file mode 100644
index 0000000..77e5290
--- /dev/null
+++ b/runtime/src/iree/schemas/cuda_executable_def.fbs
@@ -0,0 +1,39 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+namespace iree;
+
+// 'CUDA Executable'.
+file_identifier "CUDA";
+file_extension "cuda";
+
+// A struct for the kernel block size along each dimensions.
+struct CUDABlockSizeDef {
+  x:uint32;
+  y:uint32;
+  z:uint32;
+}
+
+table CUDAExecutableDef {
+  // A map of entry point ordinals to string names as used in the shader
+  // library.
+  entry_points:[string];
+
+  // Block sizes for each entry point.
+  //
+  // Currently the thread group size/block size is decided during code gen but
+  // in CUDA it is set by the runtime.
+  block_sizes:[CUDABlockSizeDef];
+  // Size of dynamic shared memory.
+  shared_memory_size:[uint32];
+
+  // PTX string of the module.
+  ptx_image:string;
+
+  // TODO(thomasraoux): Add potential cuBin binary specialized for some targets.
+}
+
+root_type CUDAExecutableDef;
diff --git a/runtime/src/iree/schemas/metal_executable_def.fbs b/runtime/src/iree/schemas/metal_executable_def.fbs
new file mode 100644
index 0000000..19a81ab
--- /dev/null
+++ b/runtime/src/iree/schemas/metal_executable_def.fbs
@@ -0,0 +1,47 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+namespace iree;
+
+// 'Metal Executable'.
+file_identifier "MTLE";
+file_extension "mtle";
+
+// A struct for Metal threadgroup size along each dimension.
+struct MetalThreadgroupSize {
+  x:uint32;
+  y:uint32;
+  z:uint32;
+}
+
+// A Metal shader library and runtime pipeline state description.
+// This information is used to create MTLLibrary, MTLFunction and pipeline
+// state objects.
+table MetalExecutableDef {
+  // A map of entry point ordinals to string names as used in the shader
+  // library.
+  entry_points:[string];
+
+  // Threadgroup sizes for each entry point.
+  //
+  // We need this because workgroup size is directly baked inside SPIR-V code,
+  // but in Metal it's specified when dispatching workload. So when cross
+  // compiling SPIR-V to MSL, we need to persist the information here so that
+  // later it can be used for dispatching.
+  // TODO(antiagainst): support SPIR-V specialization constant.
+  threadgroup_sizes:[MetalThreadgroupSize];
+
+  // Shader content can be provided as either a serialized library or in the
+  // form of source code strings.
+
+  // Serialized Metal shader library.
+  shader_library:[uint8];
+  // Original Metal shader source code.
+  shader_sources:[string];
+}
+
+root_type MetalExecutableDef;
+
diff --git a/runtime/src/iree/schemas/rocm_executable_def.fbs b/runtime/src/iree/schemas/rocm_executable_def.fbs
new file mode 100644
index 0000000..e88d8ed
--- /dev/null
+++ b/runtime/src/iree/schemas/rocm_executable_def.fbs
@@ -0,0 +1,33 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+namespace iree;
+
+// 'ROCM Executable'.
+file_identifier "ROCM";
+file_extension "rocm";
+
+// A struct for the kernel block size along each dimensions.
+struct ROCMBlockSizeDef {
+  x:uint32;
+  y:uint32;
+  z:uint32;
+}
+
+table ROCMExecutableDef {
+  // A map of entry point ordinals to string names as used in the shader
+  // library.
+  entry_points:[string];
+
+  // Block sizes for each entry point.
+  //
+  block_sizes:[ROCMBlockSizeDef];
+
+  // HSACO string of the module.
+  hsaco_image:string;
+}
+
+root_type ROCMExecutableDef;
diff --git a/runtime/src/iree/schemas/spirv_executable_def.fbs b/runtime/src/iree/schemas/spirv_executable_def.fbs
new file mode 100644
index 0000000..4dec3a4
--- /dev/null
+++ b/runtime/src/iree/schemas/spirv_executable_def.fbs
@@ -0,0 +1,24 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+namespace iree;
+
+// 'SPIR-V Executable'.
+file_identifier "SPVE";
+file_extension "spve";
+
+// A SPIR-V shader module and runtime pipeline layout description.
+// This information is used to create the VkShaderModule, VkPipelineLayout, and
+// any required VkDescriptorSetLayouts.
+table SpirVExecutableDef {
+  // A map of entry point ordinals to string names as used in the shader module.
+  entry_points:[string];
+
+  // SPIR-V code words.
+  code:[uint32];
+}
+
+root_type SpirVExecutableDef;
diff --git a/runtime/src/iree/schemas/wgsl_executable_def.fbs b/runtime/src/iree/schemas/wgsl_executable_def.fbs
new file mode 100644
index 0000000..c3ac1f6
--- /dev/null
+++ b/runtime/src/iree/schemas/wgsl_executable_def.fbs
@@ -0,0 +1,33 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+namespace iree;
+
+// 'WGSL Executable'.
+file_identifier "WGSL";
+file_extension "wgsl";
+
+// Contents of one WGPUShaderModule, possibly with multiple entry points.
+// Entry points have the name "dN" where N is the executable-wide entry point
+// ordinal.
+table WGSLShaderModuleDef {
+  // WGSL source code.
+  code:string;
+
+  // Optional `source-map-v3` format source map.
+  source_map:string;
+}
+
+table WGSLExecutableDef {
+  // An ordered list of shader modules, each containing 1+ entry points.
+  shader_modules:[WGSLShaderModuleDef];
+
+  // A mapping of executable entry point ordinals to the shader module in which
+  // they reside.
+  entry_points:[int];
+}
+
+root_type WGSLExecutableDef;
diff --git a/runtime/src/iree/task/BUILD b/runtime/src/iree/task/BUILD
new file mode 100644
index 0000000..6ef13e3
--- /dev/null
+++ b/runtime/src/iree/task/BUILD
@@ -0,0 +1,199 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library", "iree_runtime_cc_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_cmake_extra_content(
+    content = """
+# Task-based executor requires threading support.
+if(NOT ${IREE_ENABLE_THREADING})
+  return()
+endif()
+
+# cpuinfo can be conditionally disabled when it is not supported.
+# If disabled then by default the task system will use 1 thread.
+set(IREE_CPUINFO_TARGET)
+if(IREE_ENABLE_CPUINFO)
+  set(IREE_CPUINFO_TARGET cpuinfo)
+endif()
+""",
+    inline = True,
+)
+
+iree_runtime_cc_library(
+    name = "api",
+    srcs = ["api.c"],
+    hdrs = ["api.h"],
+    deps = [
+        ":task",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal:flags",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "task",
+    srcs = [
+        "executor.c",
+        "executor_impl.h",
+        "list.c",
+        "poller.c",
+        "pool.c",
+        "post_batch.c",
+        "post_batch.h",
+        "queue.c",
+        "scope.c",
+        "submission.c",
+        "task.c",
+        "task_impl.h",
+        "topology.c",
+        "topology_cpuinfo.c",
+        "worker.c",
+        "worker.h",
+    ],
+    hdrs = [
+        "affinity_set.h",
+        "executor.h",
+        "list.h",
+        "poller.h",
+        "pool.h",
+        "queue.h",
+        "scope.h",
+        "submission.h",
+        "task.h",
+        "topology.h",
+        "tuning.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:atomic_slist",
+        "//runtime/src/iree/base/internal:cpu",
+        "//runtime/src/iree/base/internal:event_pool",
+        "//runtime/src/iree/base/internal:fpu_state",
+        "//runtime/src/iree/base/internal:prng",
+        "//runtime/src/iree/base/internal:synchronization",
+        "//runtime/src/iree/base/internal:threading",
+        "//runtime/src/iree/base/internal:wait_handle",
+        "@cpuinfo",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "executor_demo",
+    srcs = ["executor_demo.cc"],
+    deps = [
+        ":task",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal:prng",
+        "//runtime/src/iree/task/testing:test_util",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "executor_test",
+    srcs = ["executor_test.cc"],
+    deps = [
+        ":task",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/task/testing:test_util",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "list_test",
+    srcs = ["list_test.cc"],
+    deps = [
+        ":task",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/task/testing:test_util",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "pool_test",
+    srcs = ["pool_test.cc"],
+    deps = [
+        ":task",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/task/testing:test_util",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "queue_test",
+    srcs = ["queue_test.cc"],
+    deps = [
+        ":task",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/task/testing:test_util",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "scope_test",
+    srcs = [
+        "scope_test.cc",
+        "task_impl.h",
+    ],
+    deps = [
+        ":task",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/task/testing:test_util",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "task_tests",
+    srcs = [
+        "task_test_barrier.cc",
+        "task_test_call.cc",
+        "task_test_dispatch.cc",
+        "task_test_fence.cc",
+        "task_test_nop.cc",
+        "task_test_wait.cc",
+    ],
+    deps = [
+        ":task",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/task/testing:task_test",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "topology_test",
+    srcs = ["topology_test.cc"],
+    tags = [
+        "noasan",  # TODO(8469): Does not work on machines with large numbers of cores.
+    ],
+    deps = [
+        ":task",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
diff --git a/runtime/src/iree/task/CMakeLists.txt b/runtime/src/iree/task/CMakeLists.txt
new file mode 100644
index 0000000..0e55722
--- /dev/null
+++ b/runtime/src/iree/task/CMakeLists.txt
@@ -0,0 +1,206 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/task/BUILD                                                  #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+# Task-based executor requires threading support.
+if(NOT ${IREE_ENABLE_THREADING})
+  return()
+endif()
+
+# cpuinfo can be conditionally disabled when it is not supported.
+# If disabled then by default the task system will use 1 thread.
+set(IREE_CPUINFO_TARGET)
+if(IREE_ENABLE_CPUINFO)
+  set(IREE_CPUINFO_TARGET cpuinfo)
+endif()
+
+iree_cc_library(
+  NAME
+    api
+  HDRS
+    "api.h"
+  SRCS
+    "api.c"
+  DEPS
+    ::task
+    iree::base::internal::flags
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    task
+  HDRS
+    "affinity_set.h"
+    "executor.h"
+    "list.h"
+    "poller.h"
+    "pool.h"
+    "queue.h"
+    "scope.h"
+    "submission.h"
+    "task.h"
+    "topology.h"
+    "tuning.h"
+  SRCS
+    "executor.c"
+    "executor_impl.h"
+    "list.c"
+    "poller.c"
+    "pool.c"
+    "post_batch.c"
+    "post_batch.h"
+    "queue.c"
+    "scope.c"
+    "submission.c"
+    "task.c"
+    "task_impl.h"
+    "topology.c"
+    "topology_cpuinfo.c"
+    "worker.c"
+    "worker.h"
+  DEPS
+    ${IREE_CPUINFO_TARGET}
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::atomic_slist
+    iree::base::internal::cpu
+    iree::base::internal::event_pool
+    iree::base::internal::fpu_state
+    iree::base::internal::prng
+    iree::base::internal::synchronization
+    iree::base::internal::threading
+    iree::base::internal::wait_handle
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    executor_demo
+  SRCS
+    "executor_demo.cc"
+  DEPS
+    ::task
+    iree::base
+    iree::base::internal::prng
+    iree::base::tracing
+    iree::task::testing::test_util
+)
+
+iree_cc_test(
+  NAME
+    executor_test
+  SRCS
+    "executor_test.cc"
+  DEPS
+    ::task
+    iree::base
+    iree::task::testing::test_util
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    list_test
+  SRCS
+    "list_test.cc"
+  DEPS
+    ::task
+    iree::base
+    iree::task::testing::test_util
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    pool_test
+  SRCS
+    "pool_test.cc"
+  DEPS
+    ::task
+    iree::base
+    iree::task::testing::test_util
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    queue_test
+  SRCS
+    "queue_test.cc"
+  DEPS
+    ::task
+    iree::base
+    iree::task::testing::test_util
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    scope_test
+  SRCS
+    "scope_test.cc"
+    "task_impl.h"
+  DEPS
+    ::task
+    iree::base
+    iree::task::testing::test_util
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    task_tests
+  SRCS
+    "task_test_barrier.cc"
+    "task_test_call.cc"
+    "task_test_dispatch.cc"
+    "task_test_fence.cc"
+    "task_test_nop.cc"
+    "task_test_wait.cc"
+  DEPS
+    ::task
+    iree::base
+    iree::task::testing::task_test
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    topology_test
+  SRCS
+    "topology_test.cc"
+  DEPS
+    ::task
+    iree::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+  LABELS
+    "noasan"
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+
+if(NOT IREE_ENABLE_CPUINFO)
+  target_compile_definitions(iree_task_task
+    PUBLIC
+      "IREE_TASK_CPUINFO_DISABLED=1"
+  )
+endif()
diff --git a/runtime/src/iree/task/affinity_set.h b/runtime/src/iree/task/affinity_set.h
new file mode 100644
index 0000000..e81e549
--- /dev/null
+++ b/runtime/src/iree/task/affinity_set.h
@@ -0,0 +1,85 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_AFFINITY_SET_H_
+#define IREE_TASK_AFFINITY_SET_H_
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/math.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// TODO(benvanik): if IREE_TASK_EXECUTOR_MAX_WORKER_COUNT <= 32 then switch
+// these to using the 32-bit primitives. No real effect on larger 64-bit systems
+// but if we were on a smaller 32-bit system with 2 cores it's kind of silly to
+// be doing expensive 64-bit atomics on a 32-bit bus all for just 2 bits of
+// data :)
+
+//===----------------------------------------------------------------------===//
+// iree_task_affinity_set_t
+//===----------------------------------------------------------------------===//
+
+typedef uint64_t iree_task_affinity_set_t;
+
+// Allows for only a specific worker to be selected.
+static inline iree_task_affinity_set_t iree_task_affinity_for_worker(
+    uint8_t worker_index) {
+  return 1ull << worker_index;
+}
+
+// Allows for a range of workers to be selected.
+static inline iree_task_affinity_set_t iree_task_affinity_for_worker_range(
+    uint8_t worker_start, uint8_t worker_end) {
+  return ((1ull << (worker_start - 1)) - 1) ^ ((1ull << worker_end) - 1);
+}
+
+// Allows for any worker to be selected.
+static inline iree_task_affinity_set_t iree_task_affinity_for_any_worker(void) {
+  return UINT64_MAX;
+}
+
+#define iree_task_affinity_set_count_trailing_zeros \
+  iree_math_count_trailing_zeros_u64
+#define iree_task_affinity_set_count_ones iree_math_count_ones_u64
+#define iree_task_affinity_set_rotr iree_math_rotr_u64
+
+//===----------------------------------------------------------------------===//
+// iree_atomic_task_affinity_set_t
+//===----------------------------------------------------------------------===//
+
+typedef iree_atomic_int64_t iree_atomic_task_affinity_set_t;
+
+static inline iree_task_affinity_set_t iree_atomic_task_affinity_set_load(
+    iree_atomic_task_affinity_set_t* set, iree_memory_order_t order) {
+  return iree_atomic_load_int64(set, order);
+}
+
+static inline void iree_atomic_task_affinity_set_store(
+    iree_atomic_task_affinity_set_t* set, iree_task_affinity_set_t value,
+    iree_memory_order_t order) {
+  iree_atomic_store_int64(set, value, order);
+}
+
+static inline iree_task_affinity_set_t iree_atomic_task_affinity_set_fetch_and(
+    iree_atomic_task_affinity_set_t* set, iree_task_affinity_set_t value,
+    iree_memory_order_t order) {
+  return iree_atomic_fetch_and_int64(set, value, order);
+}
+
+static inline iree_task_affinity_set_t iree_atomic_task_affinity_set_fetch_or(
+    iree_atomic_task_affinity_set_t* set, iree_task_affinity_set_t value,
+    iree_memory_order_t order) {
+  return iree_atomic_fetch_or_int64(set, value, order);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_AFFINITY_SET_H_
diff --git a/runtime/src/iree/task/api.c b/runtime/src/iree/task/api.c
new file mode 100644
index 0000000..d09e5ed
--- /dev/null
+++ b/runtime/src/iree/task/api.c
@@ -0,0 +1,113 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/api.h"
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "iree/base/internal/flags.h"
+#include "iree/base/tracing.h"
+#include "iree/task/topology.h"
+
+//===----------------------------------------------------------------------===//
+// Executor configuration
+//===----------------------------------------------------------------------===//
+
+IREE_FLAG(
+    bool, task_scheduling_defer_worker_startup, false,
+    "Creates all workers suspended and waits until work is first scheduled to\n"
+    "them to resume. This trades off initial blocking startup time waking the\n"
+    "threads for potential latency additions later on as threads take longer\n"
+    "to wake on their first use.");
+
+// TODO(benvanik): enable this when we use it - though hopefully we don't!
+IREE_FLAG(
+    int32_t, task_worker_local_memory, 0,  // 64 * 1024,
+    "Specifies the bytes of per-worker local memory allocated for use by\n"
+    "dispatched tiles. Tiles may use less than this but will fail to dispatch\n"
+    "if they require more. Conceptually it is like a stack reservation and\n"
+    "should be treated the same way: the source programs must be built to\n"
+    "only use a specific maximum amount of local memory and the runtime must\n"
+    "be configured to make at least that amount of local memory available.");
+
+//===----------------------------------------------------------------------===//
+// Topology configuration
+//===----------------------------------------------------------------------===//
+
+IREE_FLAG(
+    string, task_topology_mode, "physical_cores",
+    "Available modes:\n"
+    " --task_topology_group_count=non-zero:\n"
+    "   Uses whatever the specified group count is and ignores the set mode.\n"
+    " 'physical_cores':\n"
+    "   Creates one group per physical core in the machine up to\n"
+    "   the value specified by --task_topology_max_group_count.\n");
+
+IREE_FLAG(
+    int32_t, task_topology_group_count, 0,
+    "Defines the total number of task system workers that will be created.\n"
+    "Workers will be distributed across cores. Specifying 0 will use a\n"
+    "heuristic defined by --task_topology_mode= to automatically select the\n"
+    "worker count and distribution.");
+
+IREE_FLAG(
+    int32_t, task_topology_max_group_count, 8,
+    "Sets a maximum value on the worker count that can be automatically\n"
+    "detected and used when --task_topology_group_count=0 and is ignored\n"
+    "otherwise.\n");
+
+// TODO(benvanik): add --task_topology_dump to dump out the current machine
+// configuration as seen by the topology utilities.
+
+//===----------------------------------------------------------------------===//
+// Task system factory functions
+//===----------------------------------------------------------------------===//
+
+iree_status_t iree_task_executor_create_from_flags(
+    iree_allocator_t host_allocator, iree_task_executor_t** out_executor) {
+  IREE_ASSERT_ARGUMENT(out_executor);
+  *out_executor = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_task_scheduling_mode_t scheduling_mode = 0;
+  if (FLAG_task_scheduling_defer_worker_startup) {
+    scheduling_mode |= IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP;
+  }
+
+  iree_host_size_t worker_local_memory =
+      (iree_host_size_t)FLAG_task_worker_local_memory;
+
+  iree_status_t status = iree_ok_status();
+
+  iree_task_topology_t topology;
+  iree_task_topology_initialize(&topology);
+
+  if (FLAG_task_topology_group_count != 0) {
+    iree_task_topology_initialize_from_group_count(
+        FLAG_task_topology_group_count, &topology);
+  } else if (strcmp(FLAG_task_topology_mode, "physical_cores") == 0) {
+    iree_task_topology_initialize_from_physical_cores(
+        FLAG_task_topology_max_group_count, &topology);
+  } else {
+    status = iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "one of --task_topology_group_count or --task_topology_mode must be "
+        "specified and be a valid value; have --task_topology_mode=%s.",
+        FLAG_task_topology_mode);
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_task_executor_create(scheduling_mode, &topology,
+                                       worker_local_memory, host_allocator,
+                                       out_executor);
+  }
+
+  iree_task_topology_deinitialize(&topology);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/task/api.h b/runtime/src/iree/task/api.h
new file mode 100644
index 0000000..bebaf57
--- /dev/null
+++ b/runtime/src/iree/task/api.h
@@ -0,0 +1,46 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_API_H_
+#define IREE_TASK_API_H_
+
+#include "iree/base/api.h"
+#include "iree/task/executor.h"  // IWYU pragma: export
+#include "iree/task/topology.h"  // IWYU pragma: export
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Task system factory functions
+//===----------------------------------------------------------------------===//
+
+// Creates a task system executor from the current command line flags.
+// This configures a topology and all of the executor parameters and returns
+// a newly created instance in |out_executor| that must be released by the
+// caller.
+//
+// This utility method is useful when only a single executor exists within a
+// process as the flags are globals. When multiple executors may exist or
+// programmatic configuration is needed use the iree_task_executor_create method
+// directly.
+iree_status_t iree_task_executor_create_from_flags(
+    iree_allocator_t host_allocator, iree_task_executor_t** out_executor);
+
+//===----------------------------------------------------------------------===//
+// Task system simple invocation utilities
+//===----------------------------------------------------------------------===//
+
+// TODO(benvanik): simple IO completion event callback.
+// TODO(benvanik): simple async function call dispatch.
+// TODO(benvanik): simple parallel-for grid-style function call dispatch.
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_API_H_
diff --git a/runtime/src/iree/task/executor.c b/runtime/src/iree/task/executor.c
new file mode 100644
index 0000000..9c09ee6
--- /dev/null
+++ b/runtime/src/iree/task/executor.c
@@ -0,0 +1,590 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/executor.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/internal/math.h"
+#include "iree/base/tracing.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor_impl.h"
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/post_batch.h"
+#include "iree/task/queue.h"
+#include "iree/task/task_impl.h"
+#include "iree/task/tuning.h"
+#include "iree/task/worker.h"
+
+static void iree_task_executor_destroy(iree_task_executor_t* executor);
+
+iree_status_t iree_task_executor_create(
+    iree_task_scheduling_mode_t scheduling_mode,
+    const iree_task_topology_t* topology,
+    iree_host_size_t worker_local_memory_size, iree_allocator_t allocator,
+    iree_task_executor_t** out_executor) {
+  iree_host_size_t worker_count = iree_task_topology_group_count(topology);
+  if (worker_count > IREE_TASK_EXECUTOR_MAX_WORKER_COUNT) {
+    return iree_make_status(
+        IREE_STATUS_RESOURCE_EXHAUSTED,
+        "requested %zu workers but a maximum of %d is allowed", worker_count,
+        IREE_TASK_EXECUTOR_MAX_WORKER_COUNT);
+  }
+
+  // TODO(benvanik): support a threadless mode where we have one dummy worker
+  // that just holds the lists but is pumped from donate_caller.
+  if (worker_count == 0) {
+    return iree_make_status(
+        IREE_STATUS_UNIMPLEMENTED,
+        "threadless donate-only executor mode not yet implemented");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_executor);
+  *out_executor = NULL;
+
+  // The executor is followed in memory by worker[] + worker_local_memory[].
+  // The whole point is that we don't want destructive sharing between workers
+  // so ensure we are aligned to at least the destructive interference size.
+  worker_local_memory_size = iree_host_align(
+      worker_local_memory_size, iree_hardware_destructive_interference_size);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)worker_local_memory_size);
+  iree_host_size_t executor_base_size =
+      iree_host_align(sizeof(iree_task_executor_t),
+                      iree_hardware_destructive_interference_size);
+  iree_host_size_t worker_list_size =
+      iree_host_align(worker_count * sizeof(iree_task_worker_t),
+                      iree_hardware_destructive_interference_size);
+  iree_host_size_t executor_size = executor_base_size + worker_list_size +
+                                   worker_count * worker_local_memory_size;
+
+  iree_task_executor_t* executor = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(allocator, executor_size, (void**)&executor));
+  memset(executor, 0, executor_size);
+  iree_atomic_ref_count_init(&executor->ref_count);
+  executor->allocator = allocator;
+  executor->scheduling_mode = scheduling_mode;
+  iree_atomic_task_slist_initialize(&executor->incoming_ready_slist);
+  iree_slim_mutex_initialize(&executor->coordinator_mutex);
+
+  // Simple PRNG used to generate seeds for the per-worker PRNGs used to
+  // distribute work. This isn't strong (and doesn't need to be); it's just
+  // enough to ensure each worker gets a sufficiently random seed for itself to
+  // then generate entropy with. As a hack we use out_executor's address, as
+  // that should live on the caller stack and with ASLR that's likely pretty
+// random itself. I'm sure somewhere a mathematician just cringed :)
+  iree_prng_splitmix64_state_t seed_prng;
+  iree_prng_splitmix64_initialize(/*seed=*/(uint64_t)(out_executor),
+                                  &seed_prng);
+  iree_prng_minilcg128_initialize(iree_prng_splitmix64_next(&seed_prng),
+                                  &executor->donation_theft_prng);
+
+  iree_status_t status = iree_ok_status();
+
+  // Pool used for system events; exposed to users of the task system to ensure
+  // we minimize the number of live events and reduce overheads in
+  // high-frequency transient parking operations.
+  if (iree_status_is_ok(status)) {
+    status = iree_event_pool_allocate(IREE_TASK_EXECUTOR_EVENT_POOL_CAPACITY,
+                                      allocator, &executor->event_pool);
+  }
+
+  // Pool used for all fanout tasks. These only live within the executor and
+  // since we know the precise lifetime of them we can keep them entirely within
+  // the system here.
+  if (iree_status_is_ok(status)) {
+    status = iree_task_pool_initialize(
+        allocator,
+        iree_max(sizeof(iree_task_fence_t), sizeof(iree_task_dispatch_shard_t)),
+        worker_count * IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER,
+        &executor->transient_task_pool);
+  }
+
+  // Wait handling polling and waiting use a dedicated thread to ensure that
+  // blocking syscalls stay off the workers.
+  if (iree_status_is_ok(status)) {
+    // For now we allow the poller to run anywhere - we should allow callers to
+    // specify it via the topology (or something).
+    iree_thread_affinity_t poller_thread_affinity;
+    iree_thread_affinity_set_any(&poller_thread_affinity);
+    status = iree_task_poller_initialize(executor, poller_thread_affinity,
+                                         &executor->poller);
+  }
+
+  // Bring up the workers; the threads will be created here but be suspended
+  // (if the platform supports it) awaiting the first tasks getting scheduled.
+  if (iree_status_is_ok(status)) {
+    executor->worker_count = worker_count;
+    executor->workers =
+        (iree_task_worker_t*)((uint8_t*)executor + executor_base_size);
+    uint8_t* worker_local_memory =
+        (uint8_t*)executor->workers + worker_list_size;
+
+    iree_task_affinity_set_t worker_idle_mask = 0;
+    iree_task_affinity_set_t worker_live_mask = 0;
+    iree_task_affinity_set_t worker_suspend_mask = 0;
+    for (iree_host_size_t i = 0; i < worker_count; ++i) {
+      iree_task_affinity_set_t worker_bit = iree_task_affinity_for_worker(i);
+      worker_idle_mask |= worker_bit;
+      worker_live_mask |= worker_bit;
+      if (executor->scheduling_mode &
+          IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP) {
+        worker_suspend_mask |= worker_bit;
+      }
+
+      iree_task_worker_t* worker = &executor->workers[i];
+      status = iree_task_worker_initialize(
+          executor, i, iree_task_topology_get_group(topology, i),
+          iree_make_byte_span(worker_local_memory, worker_local_memory_size),
+          &seed_prng, worker);
+      worker_local_memory += worker_local_memory_size;
+      if (!iree_status_is_ok(status)) break;
+    }
+    iree_atomic_task_affinity_set_store(&executor->worker_suspend_mask,
+                                        worker_suspend_mask,
+                                        iree_memory_order_relaxed);
+    iree_atomic_task_affinity_set_store(&executor->worker_idle_mask,
+                                        worker_idle_mask,
+                                        iree_memory_order_relaxed);
+    iree_atomic_task_affinity_set_store(&executor->worker_live_mask,
+                                        worker_live_mask,
+                                        iree_memory_order_release);
+  }
+
+  if (!iree_status_is_ok(status)) {
+    // NOTE: destroy will ensure that any workers we have initialized are
+    // properly cleaned up.
+    iree_task_executor_destroy(executor);
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  *out_executor = executor;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+static void iree_task_executor_destroy(iree_task_executor_t* executor) {
+  if (!executor) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // First ask all workers to exit. We do this prior to waiting on them to exit
+  // so that we parallelize the shutdown logic (which may flush pending tasks).
+  for (iree_host_size_t i = 0; i < executor->worker_count; ++i) {
+    iree_task_worker_t* worker = &executor->workers[i];
+    iree_task_worker_request_exit(worker);
+  }
+
+  // Also ask the poller to exit - it'll wake from any system waits it's in and
+  // abort all the remaining waits.
+  iree_task_poller_request_exit(&executor->poller);
+
+  // Now that all workers and the poller should be in the process of exiting we
+  // can join with them. Some may take longer than others to exit but that's
+  // fine as we can't return from here until they exit anyway.
+  for (iree_host_size_t i = 0; i < executor->worker_count; ++i) {
+    iree_task_worker_t* worker = &executor->workers[i];
+    iree_task_worker_await_exit(worker);
+  }
+  iree_task_poller_await_exit(&executor->poller);
+
+  // Tear down all workers and the poller now that no more threads are live.
+  // Any live threads may still be touching their own data structures or those
+  // of others (for example when trying to steal work).
+  for (iree_host_size_t i = 0; i < executor->worker_count; ++i) {
+    iree_task_worker_t* worker = &executor->workers[i];
+    iree_task_worker_deinitialize(worker);
+  }
+  iree_task_poller_deinitialize(&executor->poller);
+
+  iree_event_pool_free(executor->event_pool);
+  iree_slim_mutex_deinitialize(&executor->coordinator_mutex);
+  iree_atomic_task_slist_deinitialize(&executor->incoming_ready_slist);
+  iree_task_pool_deinitialize(&executor->transient_task_pool);
+  iree_allocator_free(executor->allocator, executor);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_executor_retain(iree_task_executor_t* executor) {
+  if (executor) {
+    iree_atomic_ref_count_inc(&executor->ref_count);
+  }
+}
+
+void iree_task_executor_release(iree_task_executor_t* executor) {
+  if (executor && iree_atomic_ref_count_dec(&executor->ref_count) == 1) {
+    iree_task_executor_destroy(executor);
+  }
+}
+
+void iree_task_executor_trim(iree_task_executor_t* executor) {
+  // TODO(benvanik): figure out a good way to do this; the pools require that
+  // no tasks are in-flight to trim but our caller can't reliably make that
+  // guarantee. We'd need some global executor lock that we did here and
+  // on submit - or rework pools to not have this limitation.
+  // iree_task_pool_trim(&executor->fence_task_pool);
+  // iree_task_pool_trim(&executor->transient_task_pool);
+}
+
+iree_host_size_t iree_task_executor_worker_count(
+    iree_task_executor_t* executor) {
+  return executor->worker_count;
+}
+
+iree_event_pool_t* iree_task_executor_event_pool(
+    iree_task_executor_t* executor) {
+  return executor->event_pool;
+}
+
+iree_status_t iree_task_executor_acquire_fence(iree_task_executor_t* executor,
+                                               iree_task_scope_t* scope,
+                                               iree_task_fence_t** out_fence) {
+  *out_fence = NULL;
+
+  iree_task_fence_t* fence = NULL;
+  IREE_RETURN_IF_ERROR(iree_task_pool_acquire(&executor->transient_task_pool,
+                                              (iree_task_t**)&fence));
+  iree_task_fence_initialize(scope, iree_wait_primitive_immediate(), fence);
+  fence->header.pool = &executor->transient_task_pool;
+
+  *out_fence = fence;
+  return iree_ok_status();
+}
+
+// Schedules a generic task to a worker matching its affinity.
+// The task will be posted to the worker mailbox and available for the worker to
+// begin processing as soon as the |post_batch| is submitted.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+static void iree_task_executor_relay_to_worker(
+    iree_task_executor_t* executor, iree_task_post_batch_t* post_batch,
+    iree_task_t* task) {
+  iree_host_size_t worker_index =
+      iree_task_post_batch_select_worker(post_batch, task->affinity_set);
+  iree_task_post_batch_enqueue(post_batch, worker_index, task);
+}
+
+// Schedules all ready tasks in the |pending_submission| list.
+// Task may enqueue zero or more new tasks (or newly-ready/waiting tasks) to
+// |pending_submission| or queue work for posting to workers via the
+// |post_batch|.
+//
+// NOTE: the pending submission list we walk here is in FIFO order and the
+// post batch we are building is in LIFO; this means that as we pop off the
+// least recently added tasks from the submission (nice in-order traversal) we
+// are pushing them as what will become the least recent tasks in the batch.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_executor_schedule_ready_tasks(
+    iree_task_executor_t* executor, iree_task_submission_t* pending_submission,
+    iree_task_post_batch_t* post_batch) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_task_t* task = NULL;
+  while ((task = iree_task_list_pop_front(&pending_submission->ready_list))) {
+    // If the scope has been marked as failing then we abort the task.
+    // This needs to happen as a poll here because one or more of the tasks we
+    // are joining may have failed.
+    if (IREE_UNLIKELY(iree_task_scope_has_failed(task->scope))) {
+      // Discarding may transitively ready up (and then discard) dependents;
+      // the worklist collects them so the whole chain is dropped.
+      iree_task_list_t discard_worklist;
+      iree_task_list_initialize(&discard_worklist);
+      iree_task_discard(task, &discard_worklist);
+      iree_task_list_discard(&discard_worklist);
+      continue;
+    }
+
+    switch (task->type) {
+      case IREE_TASK_TYPE_NOP:
+        // Doesn't do anything; just retire and continue on to any dependents.
+        iree_task_nop_retire((iree_task_nop_t*)task, pending_submission);
+        break;
+      case IREE_TASK_TYPE_CALL: {
+        // Generic routing to workers for tasks that should always run there.
+        iree_task_executor_relay_to_worker(executor, post_batch, task);
+        break;
+      }
+      case IREE_TASK_TYPE_BARRIER: {
+        // Retire the barrier to (possibly) ready up all dependent tasks.
+        // This acts as a fan-out in cases where the dependent task count >1.
+        iree_task_barrier_retire((iree_task_barrier_t*)task,
+                                 pending_submission);
+        break;
+      }
+      case IREE_TASK_TYPE_FENCE: {
+        // Scope fence hit; notifies the scope so that anyone waiting on the
+        // fence can be notified without us having to do so explicitly.
+        iree_task_fence_retire((iree_task_fence_t*)task, pending_submission);
+        break;
+      }
+      case IREE_TASK_TYPE_WAIT: {
+        // We should only ever see completed waits here; ones that have yet to
+        // resolve are sent to the poller.
+        iree_task_wait_retire(
+            (iree_task_wait_t*)task, pending_submission,
+            iree_all_bits_set(task->flags, IREE_TASK_FLAG_WAIT_COMPLETED)
+                ? iree_ok_status()
+                : iree_make_status(IREE_STATUS_INTERNAL,
+                                   "unresolved wait task ended up in the "
+                                   "executor run queue"));
+        break;
+      }
+      case IREE_TASK_TYPE_DISPATCH: {
+        // Dispatches may need to be issued (fanning out the tiles to workers)
+        // or retired (after all tiles have completed).
+        if (task->flags & IREE_TASK_FLAG_DISPATCH_RETIRE) {
+          iree_task_dispatch_retire((iree_task_dispatch_t*)task,
+                                    pending_submission);
+        } else {
+          iree_task_dispatch_issue((iree_task_dispatch_t*)task,
+                                   &executor->transient_task_pool,
+                                   pending_submission, post_batch);
+        }
+        break;
+      }
+        // NOTE(review): task types without a case above would fall through the
+        // switch untouched and be dropped from scheduling; presumably only the
+        // listed types ever reach the coordinator's ready list — confirm.
+    }
+  }
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Moves the ready tasks in |submission| onto the executor's shared incoming
+// list and hands waiting tasks to the poller; |submission| is reset on return.
+void iree_task_executor_merge_submission(iree_task_executor_t* executor,
+                                         iree_task_submission_t* submission) {
+  // Concatenate all of the incoming tasks into the submission list.
+  // Note that the submission stores tasks in LIFO order such that when they are
+  // put into the LIFO atomic slist they match the order across all concats
+  // (earlier concats are later in the LIFO list).
+  iree_atomic_task_slist_concat(&executor->incoming_ready_slist,
+                                submission->ready_list.head,
+                                submission->ready_list.tail);
+
+  // Enqueue waiting tasks with the poller immediately: this may issue a
+  // syscall to kick the poller. If we see bad context switches here then we
+  // should split this into an enqueue/flush pair.
+  iree_task_poller_enqueue(&executor->poller, &submission->waiting_list);
+
+  // NOTE: after concatenating the intrusive next_task pointers may immediately
+  // be modified by other threads. We can no longer assume anything about the
+  // submission lists and can only discard them.
+  iree_task_submission_reset(submission);
+}
+
+void iree_task_executor_submit(iree_task_executor_t* executor,
+                               iree_task_submission_t* submission) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Concatenate the submitted tasks onto our primary LIFO incoming lists.
+  // Note that this only queues the work; it does not kick off coordination —
+  // callers that need immediate scheduling follow with
+  // iree_task_executor_flush.
+  iree_task_executor_merge_submission(executor, submission);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Runs one coordination pass on the calling thread to distribute any
+// previously submitted tasks to workers.
+void iree_task_executor_flush(iree_task_executor_t* executor) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Mostly a no-op today as we aren't deferring submission with the scheduling
+  // mode. Instead, we'll just run the coordinator inline to ensure all tasks
+  // are pushed to workers. This will not wait - but may block.
+  iree_task_executor_coordinate(executor, /*current_worker=*/NULL);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Dispatches tasks in the global submission queue to workers.
+// This is called by users upon submission of new tasks or by workers when they
+// run out of tasks to process. If |current_worker| is provided then tasks will
+// prefer to be routed back to it for immediate processing.
+//
+// If a coordination run ends up with no ready tasks and |current_worker| is
+// provided the calling thread will enter a wait until the worker has more tasks
+// posted to it.
+void iree_task_executor_coordinate(iree_task_executor_t* executor,
+                                   iree_task_worker_t* current_worker) {
+  // Only one thread wears the coordinator hat at a time.
+  iree_slim_mutex_lock(&executor->coordinator_mutex);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // We may be adding tasks/waiting/etc on each pass through coordination - to
+  // ensure we completely drain the incoming queues and satisfied waits we loop
+  // until there's nothing left to coordinate.
+  bool schedule_dirty = true;
+  do {
+    // Check for incoming submissions and move their posted tasks into our
+    // local lists. Any of the tasks here are ready to execute immediately and
+    // ones we should be able to distribute to workers without delay. The
+    // waiting tasks are to the best of the caller's knowledge not ready yet.
+    //
+    // Note that we only do this once per coordination; that's so we don't
+    // starve if submissions come in faster than we can schedule them.
+    // Coordination will run again when workers become idle and will pick up
+    // any changes then.
+    //
+    // As we schedule tasks we may spawn new ones (like a dispatch -> many
+    // dispatch shards) and we keep track of those here. By doing a pass through
+    // all ready tasks and only then merging in the new submission we get
+    // breadth-first traversal of task graphs even if they originate from
+    // various places and have no relation - hopefully leading to better average
+    // latency.
+    iree_task_submission_t pending_submission;
+    iree_task_submission_initialize_from_lifo_slist(
+        &executor->incoming_ready_slist, &pending_submission);
+    // Nothing ready: exit the coordination loop.
+    if (iree_task_list_is_empty(&pending_submission.ready_list)) break;
+
+    // Scratch coordinator submission batch used during scheduling to batch up
+    // all tasks that will be posted to each worker. We could stash this on the
+    // executor but given that which thread is playing the role of the
+    // coordinator is random it's better to ensure that these bytes never incur
+    // a cache miss by making them live here in the stack of the chosen thread.
+    // One iree_task_list_t per worker follows the batch header in memory.
+    iree_task_post_batch_t* post_batch =
+        iree_alloca(sizeof(iree_task_post_batch_t) +
+                    executor->worker_count * sizeof(iree_task_list_t));
+    iree_task_post_batch_initialize(executor, current_worker, post_batch);
+
+    // Schedule all ready tasks in this batch. Some may complete inline (such
+    // as ready barriers with all their dependencies resolved) while others may
+    // be scheduled on workers via the post batch.
+    iree_task_executor_schedule_ready_tasks(executor, &pending_submission,
+                                            post_batch);
+
+    // Route waiting tasks to the poller.
+    iree_task_poller_enqueue(&executor->poller,
+                             &pending_submission.waiting_list);
+
+    // Post all new work to workers; they may wake and begin executing
+    // immediately. Returns whether this worker has new tasks for it to work on.
+    schedule_dirty = iree_task_post_batch_submit(post_batch);
+  } while (schedule_dirty);
+
+  iree_slim_mutex_unlock(&executor->coordinator_mutex);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Attempts to steal tasks from up to |max_theft_attempts| workers in
+// |victim_mask|, starting the scan at |rotation_offset|. Stolen tasks are
+// moved into |local_task_queue|; returns the first task to execute or NULL if
+// no victim yielded any work.
+static iree_task_t* iree_task_executor_try_steal_task_from_affinity_set(
+    iree_task_executor_t* executor, iree_task_affinity_set_t victim_mask,
+    uint32_t max_theft_attempts, int rotation_offset,
+    iree_task_queue_t* local_task_queue) {
+  if (!victim_mask) return NULL;
+  // No point attempting more thefts than there are candidate victims.
+  max_theft_attempts = iree_min(max_theft_attempts,
+                                iree_task_affinity_set_count_ones(victim_mask));
+  victim_mask = iree_task_affinity_set_rotr(victim_mask, rotation_offset);
+
+  int worker_index = rotation_offset;
+  // NOTE(review): |victim_mask| was already rotated by |rotation_offset| above
+  // and is rotated again by the same amount here, making the effective
+  // rotation 2*rotation_offset. Presumably benign (one random rotation is as
+  // good as another) but confirm this double rotation is intentional.
+  iree_task_affinity_set_t mask =
+      iree_task_affinity_set_rotr(victim_mask, worker_index);
+  for (uint32_t i = 0; i < max_theft_attempts; ++i) {
+    // Find the last set bit and skip to it. This avoids the need for doing
+    // a full O(n) scan and instead gets us at O(popcnt) * O(ctz).
+    //
+    // Example: sharing mask = 0b01010101
+    //          mask_rotation = 3 (randomly selected)
+    //          mask = 0b01010101 rotr 3 = 0b10101010
+    //          for (i = 0; i < 4; ++i)
+    //            offset = ctz(0b10101010) = 1
+    //            mask_rotation += 1 = 4
+    //            mask >>= 1 = 0b01010101
+    //            victim_index = 4 % 64 = 4
+    int offset = iree_task_affinity_set_count_trailing_zeros(mask);
+    int victim_index = (worker_index + offset) % executor->worker_count;
+    worker_index += offset + 1;
+    mask = iree_shr(mask, offset + 1);
+    iree_task_worker_t* victim_worker = &executor->workers[victim_index];
+
+    // Policy: steal a chunk of tasks at the tail of the victim queue.
+    // This will steal multiple tasks from the victim up to the specified max
+    // and move them into our local task queue. Not all tasks will be stolen
+    // and the assumption is that over a large-enough random distribution of
+    // thievery taking ~half of the tasks each time (across all queues) will
+    // lead to a relatively even distribution.
+    iree_task_t* task = iree_task_worker_try_steal_task(
+        victim_worker, local_task_queue,
+        /*max_tasks=*/IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT);
+    if (task) return task;
+  }
+
+  // No tasks found in victim_mask.
+  return NULL;
+}
+
+// Tries to steal an entire task from a sibling worker (based on topology).
+// Returns a task that is available (has not yet begun processing at all).
+// May steal multiple tasks and add them to the |local_task_queue|.
+//
+// We do a scan through ideal victims indicated by the
+// |constructive_sharing_mask|; these are the workers most likely to have some
+// cache benefits to taking their work as they share some level of the cache
+// hierarchy and should be better to steal from than any random worker.
+//
+// To prevent biasing any particular victim we use a fast prng function to
+// select where in the set of potential victims defined by the topology
+// group we steal. We (probably) don't need anything super complex here so
+// instead of bouncing around at random we just select the starting point in
+// our search and then go in-order.
+iree_task_t* iree_task_executor_try_steal_task(
+    iree_task_executor_t* executor,
+    iree_task_affinity_set_t constructive_sharing_mask,
+    uint32_t max_theft_attempts, iree_prng_minilcg128_state_t* theft_prng,
+    iree_task_queue_t* local_task_queue) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Acquire ordering on the live mask pairs with workers publishing their
+  // liveness; the idle mask is advisory so relaxed is sufficient.
+  iree_task_affinity_set_t worker_live_mask =
+      iree_atomic_task_affinity_set_load(&executor->worker_live_mask,
+                                         iree_memory_order_acquire);
+  iree_task_affinity_set_t worker_idle_mask =
+      iree_atomic_task_affinity_set_load(&executor->worker_idle_mask,
+                                         iree_memory_order_relaxed);
+  // Limit the workers we will steal from to the ones that are currently live
+  // and not idle.
+  iree_task_affinity_set_t victim_mask = worker_live_mask & ~worker_idle_mask;
+
+  // TODO(benvanik): it may be possible to rework this such that we better
+  // use the prng; for example, instead of all this rotating stuff we could just
+  // generate an 8-bit number (or even split it into two 4-bit numbers) per
+  // theft attempt. The current rotation strategy is biased toward the same try
+  // ordering vs. what we may really want with an unbiased random selection.
+  //
+  // Random starting rotation in [0, affinity-set-bit-width - 1].
+  int rotation_offset = iree_prng_minilcg128_next_uint8(theft_prng) &
+                        (8 * sizeof(iree_task_affinity_set_t) - 1);
+
+  // Try first with the workers we may have some caches shared with. This
+  // helps to prevent cache invalidations/availability updates as it's likely
+  // that we won't need to go back to main memory (or higher cache tiers) in the
+  // event that the thief and victim are running close to each other in time.
+  iree_task_t* task = iree_task_executor_try_steal_task_from_affinity_set(
+      executor, victim_mask & constructive_sharing_mask, max_theft_attempts,
+      rotation_offset, local_task_queue);
+  if (task) {
+    IREE_TRACE_ZONE_APPEND_TEXT(z0, "local");
+  } else {
+    // Fall back to the remaining (non-cache-sharing) victims.
+    task = iree_task_executor_try_steal_task_from_affinity_set(
+        executor, victim_mask & ~constructive_sharing_mask, max_theft_attempts,
+        rotation_offset, local_task_queue);
+    if (task) {
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "non-local");
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return task;
+}
+
+// Flushes any pending work and then blocks the calling thread on
+// |wait_source| until it resolves or |timeout| elapses.
+iree_status_t iree_task_executor_donate_caller(iree_task_executor_t* executor,
+                                               iree_wait_source_t wait_source,
+                                               iree_timeout_t timeout) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Perform an immediate flush/coordination (in case the caller queued).
+  iree_task_executor_flush(executor);
+
+  // Wait until completed.
+  // TODO(benvanik): make this steal tasks until wait_handle resolves?
+  // Somewhat dangerous as we don't know what kind of thread we are running on;
+  // it may have a smaller stack than we are expecting or have some weird thread
+  // local state (FPU rounding modes/etc).
+  iree_status_t status = iree_wait_source_wait_one(wait_source, timeout);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/task/executor.h b/runtime/src/iree/task/executor.h
new file mode 100644
index 0000000..f060ac4
--- /dev/null
+++ b/runtime/src/iree/task/executor.h
@@ -0,0 +1,396 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_EXECUTOR_H_
+#define IREE_TASK_EXECUTOR_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/event_pool.h"
+#include "iree/task/scope.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+#include "iree/task/topology.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//==============================================================================
+// IREE Task Executor
+//==============================================================================
+//
+// Roughly models wavefront-style GPU dispatch. Users submit task DAGs with
+// fine-grained dependency information for the executor to schedule across a set
+// of workers. As tasks become ready to execute they are placed into per-worker
+// FIFOs and workers run through them in a breadth-first fashion executing and
+// resolving tasks and building up new waves of ready tasks. Workers will always
+// make forward progress and only when they run out of work will they attempt to
+// self-nominate to play the role of coordinator and schedule any newly-
+// submitted or readied tasks. Only once all tasks have been retired and
+// waits on external resources remain does the task system suspend itself until
+// more tasks are submitted or an external wait resolves.
+//
+// Our goal is to do the minimal amount of work to get the maximum amount of
+// concurrency the user requests or allows (by way of their dependencies).
+// Whether on a single core where you want to timeshare with an application or
+// across hundreds the same architecture holds. Where there is inefficiency it's
+// almost always surmountable with properly constructed tasks: choose the right
+// granularity for dispatches, choose the right fan-out for tiles within those
+// dispatches, choose the right places to insert barriers to force fan-in to
+// reduce memory utilization or right places to batch barriers to allow less
+// synchronization with the work queue, etc. All of those choices are ones this
+// system is designed to handle dynamically via the task graphs provided that
+// are themselves (in the IREE world) mapped 1:1 with the GPU-esque grid
+// dispatch and command buffer model. It's a super-power if a human is authoring
+// all that information but what makes it particularly powerful here is that we
+// are authoring that in the compiler based on a tremendous amount of
+// higher-level information we can derive from the whole program. Every bit of
+// dynamism here is matched with the ability to tighten down the screws and gain
+// back anything lost by way of compiler improvements while also being able to
+// generalize out to far more complex systems (higher parallelism, higher and
+// more efficient concurrency, etc).
+//
+// The design of this system allows for a spectrum of dynamic behavior based on
+// desired usage scenarios:
+// - variable number of persistent workers based on compute/memory topology
+// - per-task scope and per-task worker affinity to control for:
+//   - power islands on multi-core systems with fine-grained power management
+//   - heterogeneous microarchitectures in big.LITTLE/etc compute complexes
+//   - task isolation between multiple active requests or users
+//   - latency prioritization by partitioning workloads by priority
+// - scheduling overhead tradeoffs by varying:
+//   - coordination/flush frequency to reduce cross-thread communication
+//   - by statically inserting dispatch shards to avoid dynamic fan-out
+//   - thread donation to avoid likely context switches upon submit+wait
+//   - multi-wait across all users by sharing a wait set
+//   - per-worker work-stealing specification of victim workers in the topology
+//   - limited work-stealing to prevent chained stealing/cascading theft
+//
+// Required reading:
+//  https://www.usenix.org/conference/osdi20/presentation/ma
+//    (closest equivalent to this scheduling model)
+//  https://www.cister-labs.pt/summer2017/w3/Parallelism%20-%20Dag%20Model.pdf
+//    (good overall, our worker local lists/mailboxes are work-stealing queues)
+//  http://people.csail.mit.edu/shanir/publications/Flat%20Combining%20SPAA%2010.pdf
+//    (what we model with the coordinator)
+//  http://mcg.cs.tau.ac.il/papers/opodis2010-quasi.pdf
+//    (we exploit relaxed consistency for all our cross-thread queuing, see ^)
+//  https://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++.htm
+//    (moodycamel is the state of the art on scaling queues; read it all)
+//  https://blog.molecular-matters.com/2015/08/24/job-system-2-0-lock-free-work-stealing-part-1-basics/
+//  https://blog.molecular-matters.com/2015/09/08/job-system-2-0-lock-free-work-stealing-part-2-a-specialized-allocator/
+//  https://blog.molecular-matters.com/2015/09/25/job-system-2-0-lock-free-work-stealing-part-3-going-lock-free/
+//  https://blog.molecular-matters.com/2015/11/09/job-system-2-0-lock-free-work-stealing-part-4-parallel_for/
+//  https://blog.molecular-matters.com/2016/04/04/job-system-2-0-lock-free-work-stealing-part-5-dependencies/
+//    (fantastic 5 part blog series; very similar to this)
+//  http://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/jacm99.pdf
+//    (provably optimal dynamic nested parallelism in 1999; basically: GPUs)
+//  http://www.cs.cmu.edu/~blelloch/papers/locality2000.pdf
+//    (followup to jacm99; using locality now to guide work stealing)
+//  https://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/CGK07.pdf
+//    (worker affinity and task locality for constructive cache sharing)
+//
+//==============================================================================
+// Life of an iree_task_t / high level algorithm
+//==============================================================================
+//
+// 1. Users allocate (from iree_task_pool_t, slice from arenas, etc) and
+//    construct a DAG of iree_task_ts.
+//
+//   a. Task dependency information is setup via completion_tasks for simple
+//      dependencies, implicit fan-out/fan-in (dispatches), or explicit fan-in
+//      (barriers).
+//
+//   b. Tasks are pushed into iree_task_submission_t (LIFO, thread-local list).
+//      If the task has no unmet initial dependencies it is placed into
+//      the ready_list. If it is initially waiting on an external resource such
+//      as iree_wait_handle_t then it is placed into the waiting_list.
+//
+// 2. iree_task_executor_submit (LIFO, atomic slist)
+//    Submissions have their task thread-local lists concatenated into a LIFO
+//    incoming_ready_slist or the wait poller shared by the executor.
+//
+// 3. iree_task_executor_flush (or a worker puts on its coordinator hat 🎩)
+//
+//   a. Tasks are flushed from the incoming_ready_slist into a coordinator-local
+//      FIFO task queue. This centralizes enqueuing from all threads into a
+//      single ordered list.
+//
+//   b. iree_task_executor_schedule_ready_tasks: walks the FIFO task queue and
+//      builds a iree_task_post_batch_t containing the per-worker tasks
+//      in LIFO order.
+//
+//   c. iree_task_post_batch_submit: per-worker tasks are pushed to their
+//      respective iree_task_worker_t mailbox_slist and the workers with new
+//      tasks are notified to wake up (if not already awake).
+//
+// 4. iree_task_worker_main_pump_once (LIFO mailbox -> FIFO thread-local list)
+//    When either woken or after completing all available thread-local work
+//    each worker will check its mailbox_slist to see if any tasks have been
+//    posted.
+//
+//    a. Tasks are flushed from the LIFO mailbox into the local_task_queue FIFO
+//       for the particular worker.
+//
+//    b. If the mailbox is empty the worker *may* attempt to steal work from
+//       another nearby worker in the topology.
+//
+//    c. Any tasks in the local_task_queue are executed until empty.
+//       Tasks are retired and dependent tasks (via completion_task or barriers)
+//       are made ready and placed in the executor incoming_ready_slist as with
+//       iree_task_executor_submit.
+//
+//    d. If no more thread-local work is available and the mailbox_slist is
+//       empty the worker will self-nominate for coordination and attempt to don
+//       the coordinator hat with iree_task_executor_coordinate. If new work
+//       becomes available after coordination step 5 repeats.
+//
+//    e. If another worker (or iree_task_executor_flush) is already wearing the
+//       coordinator hat then the worker will go to sleep.
+//
+//==============================================================================
+// Scaling Down
+//==============================================================================
+//
+// IREE is built at all levels - and both in the compiler and runtime - to scale
+// to different needs. Everything that IREE imposes on the runtime performance
+// and binary size is a spectrum of choices made that allows a user to only pay
+// for what they use.
+//
+// If a deployment scenario does not need complex multithreading and
+// out-of-order execution then this task system can be used in single-threaded
+// mode to at least allow for offloading from the main application thread. In
+// even more constrained scenarios (or embeddings within other systems that have
+// thread pools of their own) it can be used in zero-threaded mode with only
+// donated threads from the user performing work when the user wants it to
+// happen within its control. It still gives the benefits of wave-style
+// scheduling, multi-waiting, locality-aware work distribution, etc as well as
+// giving us a single target interface from the compiler to communicate
+// fine-grained dependency information to the runtime.
+//
+// If the cost of a few KB of data structures and some cheap uncontended atomic
+// linked list concatenations is still scary (it shouldn't be for 95% of uses)
+// then it's also possible to have a HAL driver that doesn't use this task
+// system at all and instead just executes the command buffers directly just
+// like our Vulkan/Metal/etc GPU backends do. Even though I don't recommend that
+// (one wouldn't be saving as much as they think and be losing a lot instead)
+// the layering holds and it can be useful if there's an existing external
+// sophisticated task execution system (ala taskflow) that is already present
+// in an application.
+//
+// One assertion of IREE is that for models that take more than milliseconds to
+// execute then asynchronous scheduling is almost always worth it even on
+// systems with single cores. The ability to cooperatively schedule model
+// execution allows applications significant control over their total program
+// scheduling behavior; just as on a Commodore 64 you'd have to interrupt work
+// on vsync to begin scanning out pixels to the screen and then resume afterward
+// it's rare to see any system even scaling down to double-digit MHz
+// microcontrollers that doesn't benefit from the ability to cleanly suspend and
+// resume execution.
+//
+// But even if *all* of that is too much, the compile-time representations in
+// the HAL IR are designed to be lowered away: execution modeling does not need
+// to bottom out on a hal.command_buffer.dispatch that maps 1:1 with the runtime
+// iree_hal_command_buffer_dispatch call: dispatch can be lowered into LLVM
+// IR calls and finally into native code to do precisely what you want. The HAL
+// at runtime is a useful abstraction to allow for switching your target
+// execution system (statically or dynamically across deployments) and to share
+// the same execution system across multiple models that may be executing
+// simultaneously but it is _not_ a requirement that the IREE HAL runtime
+// implementation is used. It's called multi-level IR for a reason and the HAL
+// IR is just one level that may have many more below it.
+//
+// So yeah: don't worry. It's almost certain that the thing making or breaking
+// the performance of models over 1ms of execution time is not the HAL, and that
+// in models at or above that scale the benefits we get from being able to
+// holistically schedule the work far outstrip any specialization that can be
+// done by hand. That's to say: only worry about this if your model is literally
+// 4 floats coming from an IMU and a few hundred scalar instructions to predict
+// whether the user is walking, and that shouldn't be using the runtime HAL at
+// all and really likely doesn't benefit from using IREE at any scale - just go
+// straight to LLVM IR from the source.
+//
+//==============================================================================
+// Scaling Up
+//==============================================================================
+//
+// The task system has an implicit limit of 64 workers. This intentional
+// limitation simplifies several parts of the code while also preventing misuse:
+// it rarely (if ever) makes sense to have more than 64 compute-dominated
+// threads working on a single problem. Achieving high performance in such
+// situations requires extremely careful control over the OS scheduler, memory
+// bandwidth consumption, and synchronization. It's always possible to make the
+// problem more compute-bound or very carefully try to fit in specific cache
+// sizes to avoid more constrained bandwidth paths but it's a non-portable
+// whack-a-mole style solution that is in conflict with a lot of what IREE seeks
+// to do with respect to low-latency and multi-tenant workloads.
+//
+// If more than 64 unique L1/L2 caches (or realistically more than probably ~32)
+// are available *and* all of them are attached to the same memory controllers
+// (no NUMA involved) then the solution is straightforward: use multiple IREE
+// task executors. Either within a process or in separate processes the
+// granularity is coarse enough to not be a burden and changes the problem from
+// needing 100% perfect work scaling of a single task to needing a naive
+// distributed workload solution at the algorithm level.
+//
+// Many useful effects also fall out of solving the work distribution problem.
+// Even for single-tenant workloads being able to split work between two
+// executors allows for natural mappings on NUMA systems or completely
+// independent machines. When supporting multi-tenant workloads (even if the
+// same program is acting as multiple-tenants in a minibatched-style algorithm)
+// the improvements of isolation both in memory access patterns and in variance
+// from potentially bad system behavior dramatically improve: there aren't many
+// opportunities for contention in this system but one can guarantee zero
+// contention by simply not sharing the resources!
+
+// A bitfield specifying the scheduling mode used for configuring how (or if)
+// work is balanced across queues.
+enum iree_task_scheduling_mode_bits_t {
+  // TODO(benvanik): batch, round-robin, FCFS, SJF, etc.
+  // We can also allow for custom scheduling, though I'm skeptical of the value
+  // of that. We should look into what GPUs do in hardware for balancing things
+  // (if anything this sophisticated at all). The potential benefit here is that
+  // we can optimize for offline workloads by allowing each queue to be drained
+  // until blocking - hopefully optimizing cache coherency and reducing the
+  // total memory high-water mark - or optimize for latency across all queues by
+  // taking tasks from all queues equally. There are other more interesting
+  // scheduling strategies such as preferring the widest tasks available from
+  // any queue such that we are keeping as many workers active as possible to
+  // reach peak utilization or artificially limiting which tasks we allow
+  // through to keep certain CPU cores asleep unless absolutely required.
+  IREE_TASK_SCHEDULING_MODE_RESERVED = 0u,
+
+  // Creates all workers suspended and waits until work is first scheduled to
+  // them to resume. This trades off initial blocking startup time waking the
+  // threads for potential latency additions later on as threads take longer to
+  // wake on their first use.
+  //
+  // Prefer this setting in systems where startup time is the priority and work
+  // may not be scheduled for a while or scheduled unevenly to start; otherwise
+  // the executor creation will take longer and a thundering herd will occur
+  // forcing context switches even if no work is needed.
+  //
+  // Avoid in systems where the latency from initial submission to worker
+  // execution is critical as this will ensure all worker threads are waiting
+  // for their respective wake notifications. The kernel then will be able to
+  // much faster schedule all worker quantums and in many cases all workers will
+  // begin processing simultaneously immediately after the submission is made.
+  IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP = 1u << 0,
+};
+// Typedef'd as a plain uint32_t so modes can be OR'd together as flags.
+typedef uint32_t iree_task_scheduling_mode_t;
+
+// Base task system executor interface.
+typedef struct iree_task_executor_t iree_task_executor_t;
+
+// Creates a task executor using the specified topology.
+//
+// |worker_local_memory_size| defines the bytes to be allocated and reserved for
+// each worker to use for local memory operations. Will be rounded up to the
+// next power of two. Dispatches performed will be able to request up to this
+// amount of memory for their invocations and no more. May be 0 if no worker
+// local memory is required.
+//
+// |topology| is only used during creation and need not live beyond this call.
+// |out_executor| must be released by the caller.
+iree_status_t iree_task_executor_create(
+    iree_task_scheduling_mode_t scheduling_mode,
+    const iree_task_topology_t* topology,
+    iree_host_size_t worker_local_memory_size, iree_allocator_t allocator,
+    iree_task_executor_t** out_executor);
+
+// Retains the given |executor| for the caller.
+void iree_task_executor_retain(iree_task_executor_t* executor);
+
+// Releases the given |executor| from the caller.
+void iree_task_executor_release(iree_task_executor_t* executor);
+
+// Trims pools and caches used by the executor and its workers.
+void iree_task_executor_trim(iree_task_executor_t* executor);
+
+// Returns the number of live workers usable by the executor.
+// The actual number used for any particular operation is dynamic.
+iree_host_size_t iree_task_executor_worker_count(
+    iree_task_executor_t* executor);
+
+// Returns an iree_event_t pool managed by the executor.
+// Users of the task system should acquire their transient events from this.
+// Long-lived events should be allocated on their own in order to avoid
+// expending the pool and harming high-frequency event acquisition.
+iree_event_pool_t* iree_task_executor_event_pool(
+    iree_task_executor_t* executor);
+
+// Acquires a fence for the given |scope| from the executor fence pool.
+iree_status_t iree_task_executor_acquire_fence(iree_task_executor_t* executor,
+                                               iree_task_scope_t* scope,
+                                               iree_task_fence_t** out_fence);
+
+// TODO(benvanik): scheduling mode mutation, compute quota control, etc.
+
+// Submits a batch of tasks for execution.
+// The submission represents a DAG of tasks all reachable from the initial
+// submission lists.
+//
+// Ownership of the tasks remains with the caller for the lifetime of the
+// submission unless tasks have a custom pool specified that they can be
+// returned to.
+//
+// Safe to call from any thread. Wait-free but may block for a small duration
+// during initial scheduling of the submitted tasks.
+//
+// NOTE: it's possible for all work in the submission to complete prior to this
+// function returning.
+void iree_task_executor_submit(iree_task_executor_t* executor,
+                               iree_task_submission_t* submission);
+
+// Flushes any pending task batches for execution.
+//
+// Safe to call from any thread. Wait-free but may block for a small duration
+// during initial scheduling of the submitted tasks.
+//
+// NOTE: due to races it's possible for new work to arrive from other threads
+// after the flush has occurred but prior to this call returning.
+void iree_task_executor_flush(iree_task_executor_t* executor);
+
+// Donates the calling thread to the executor until either |wait_source|
+// resolves or |timeout| is exceeded. Flushes any pending task batches prior
+// to doing any work or waiting.
+//
+// If there are no tasks available then the calling thread will block as if
+// iree_wait_source_wait_one had been used on |wait_source|. If tasks are ready
+// then the caller will not block prior to starting to perform work on behalf of
+// the executor.
+//
+// Donation is intended as an optimization to elide context switches when the
+// caller would have waited anyway; now instead of performing a kernel wait and
+// most certainly incurring a context switch the caller immediately begins
+// taking work from the queue - likely even prior to any of the executor workers
+// waking (assuming they were idle).
+//
+// Note that donation may not always be strictly a win: the caller may have an
+// arbitrary thread affinity that may cause oversubscription of resources within
+// the topology. This can cause additional contention for compute resources and
+// increase kernel scheduling overhead as threads are swapped or migrated.
+// Measure, measure, measure! If there is any IO that can be performed during
+// the time that a caller would otherwise donate themselves to the executor that
+// should always be preferred as should smaller computation (again to not
+// oversubscribe resources). Treat donation as a hail mary to prevent a kernel
+// wait and not something that will magically make things execute faster.
+// Especially in large applications it's almost certainly better to do something
+// useful with the calling thread (even if that's go to sleep).
+//
+// Safe to call from any thread (though bad to reentrantly call from workers).
+iree_status_t iree_task_executor_donate_caller(iree_task_executor_t* executor,
+                                               iree_wait_source_t wait_source,
+                                               iree_timeout_t timeout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_EXECUTOR_H_
diff --git a/runtime/src/iree/task/executor_demo.cc b/runtime/src/iree/task/executor_demo.cc
new file mode 100644
index 0000000..b8869d8
--- /dev/null
+++ b/runtime/src/iree/task/executor_demo.cc
@@ -0,0 +1,170 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cstddef>
+
+#include "iree/base/internal/prng.h"
+#include "iree/base/tracing.h"
+#include "iree/task/executor.h"
+
+// TODO(benvanik): clean this up into a reasonable demo; it's currently staging
+// area for testing executor behavior across different platforms and topologies.
+
+namespace {
+
+static thread_local volatile uint64_t xxx = 0;
+
+// Spins on a splitmix64 PRNG for a fixed iteration count to burn CPU time,
+// simulating per-tile computation. Results accumulate into the thread-local
+// volatile |xxx| so the compiler cannot elide the loop as dead code.
+static void simulate_work(const iree_task_tile_context_t* tile_context) {
+  iree_prng_splitmix64_state_t state;
+  iree_prng_splitmix64_initialize(xxx, &state);
+  // NOTE(review): |slow| is currently always false; the commented-out
+  // expressions here and below are kept for experimenting with work-stealing
+  // behavior under uneven tile costs.
+  bool slow = false;  // tile_context->workgroup_xyz[0] % 3 == 1;
+  if (tile_context->workgroup_xyz[0] == 128 ||
+      tile_context->workgroup_xyz[0] == 1023) {
+    // Introduce big variance to highlight work stealing.
+    // std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+  for (int i = 0; i < 256 * 1024; ++i) {
+    uint64_t value = iree_prng_splitmix64_next(&state);
+    xxx += value;
+    if (slow) {
+      for (int j = 0; j < 4; ++j) {
+        value = iree_prng_splitmix64_next(&state);
+        xxx += value;
+      }
+    }
+  }
+}
+
+// Demo entry point: builds a small task DAG
+//   call0 -> barrier -> {dispatch0, dispatch1} -> call1 -> fence
+// submits it to an executor constructed from the queried machine topology, and
+// blocks until the scope goes idle.
+//
+// Fix: |argv| must be char** (was char*), matching the standard hosted
+// signature int main(int argc, char** argv).
+extern "C" int main(int argc, char** argv) {
+  (void)argc;  // unused
+  (void)argv;  // unused
+  IREE_TRACE_SCOPE0("ExecutorTest::Any");
+
+  iree_allocator_t allocator = iree_allocator_system();
+
+  iree_task_topology_t topology;
+#if 1
+  iree_task_topology_initialize_from_physical_cores(
+      /*max_core_count=*/6, &topology);
+#else
+  iree_task_topology_initialize_from_group_count(/*group_count=*/6, &topology);
+#endif
+
+  iree_task_executor_t* executor = NULL;
+  iree_task_scheduling_mode_t scheduling_mode =
+      IREE_TASK_SCHEDULING_MODE_RESERVED;
+  iree_host_size_t worker_local_memory_size = 0;  // 64 * 1024;
+  IREE_CHECK_OK(iree_task_executor_create(scheduling_mode, &topology,
+                                          worker_local_memory_size, allocator,
+                                          &executor));
+  // Topology is only used during creation and can be dropped immediately.
+  iree_task_topology_deinitialize(&topology);
+
+  //
+  iree_task_scope_t scope_a;
+  iree_task_scope_initialize(iree_make_cstring_view("a"), &scope_a);
+
+  //
+  iree_task_call_t call0;
+  iree_task_call_initialize(&scope_a,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  IREE_TRACE_SCOPE0("call0");
+                                  IREE_ASSERT_EQ(0, user_context);
+                                  return iree_ok_status();
+                                },
+                                0),
+                            &call0);
+
+  const uint32_t workgroup_size_0[3] = {256, 1, 1};
+  const uint32_t workgroup_count_0[3] = {32, 4, 2};
+  iree_task_dispatch_t dispatch0;
+  iree_task_dispatch_initialize(
+      &scope_a,
+      iree_task_make_dispatch_closure(
+          [](void* user_context, const iree_task_tile_context_t* tile_context,
+             iree_task_submission_t* pending_submission) {
+            IREE_TRACE_SCOPE0("tile0");
+            IREE_ASSERT_EQ(0, user_context);
+            simulate_work(tile_context);
+            iree_atomic_fetch_add_int32(&tile_context->statistics->reserved, 1,
+                                        iree_memory_order_relaxed);
+            return iree_ok_status();
+          },
+          0),
+      workgroup_size_0, workgroup_count_0, &dispatch0);
+
+  const uint32_t workgroup_size_1[3] = {128, 1, 1};
+  const uint32_t workgroup_count_1[3] = {16, 2, 1};
+  iree_task_dispatch_t dispatch1;
+  iree_task_dispatch_initialize(
+      &scope_a,
+      iree_task_make_dispatch_closure(
+          [](void* user_context, const iree_task_tile_context_t* tile_context,
+             iree_task_submission_t* pending_submission) {
+            IREE_TRACE_SCOPE0("tile1");
+            IREE_ASSERT_EQ(0, user_context);
+            simulate_work(tile_context);
+            iree_atomic_fetch_add_int32(&tile_context->statistics->reserved, 1,
+                                        iree_memory_order_relaxed);
+            return iree_ok_status();
+          },
+          0),
+      workgroup_size_1, workgroup_count_1, &dispatch1);
+
+  //
+  iree_task_call_t call1;
+  iree_task_call_initialize(&scope_a,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  IREE_TRACE_SCOPE0("call1");
+                                  IREE_ASSERT_EQ((void*)1, user_context);
+                                  return iree_ok_status();
+                                },
+                                (void*)1),
+                            &call1);
+
+#if 1
+  // no barrier between dispatches; fanout
+  iree_task_t* barrier0_tasks[2] = {&dispatch0.header, &dispatch1.header};
+  iree_task_barrier_t barrier0;
+  iree_task_barrier_initialize(&scope_a, IREE_ARRAYSIZE(barrier0_tasks),
+                               barrier0_tasks, &barrier0);
+  iree_task_set_completion_task(&call0.header, &barrier0.header);
+  iree_task_set_completion_task(&dispatch0.header, &call1.header);
+  iree_task_set_completion_task(&dispatch1.header, &call1.header);
+#else
+  // barrier between dispatches
+  iree_task_set_completion_task(&call0.header, &dispatch0.header);
+  iree_task_set_completion_task(&dispatch0.header, &dispatch1.header);
+  iree_task_set_completion_task(&dispatch1.header, &call1.header);
+#endif
+
+  // fence
+  iree_task_fence_t* fence0 = NULL;
+  IREE_CHECK_OK(iree_task_executor_acquire_fence(executor, &scope_a, &fence0));
+  iree_task_set_completion_task(&call1.header, &fence0->header);
+
+  //
+  iree_task_submission_t sub0;
+  iree_task_submission_initialize(&sub0);
+  iree_task_submission_enqueue(&sub0, &call0.header);
+  iree_task_executor_submit(executor, &sub0);
+
+  //
+  // iree_task_submission_t sub1;
+  // iree_task_submission_initialize(&sub1);
+  // IREE_CHECK_OK(iree_task_executor_submit(executor, &sub1));
+
+  iree_task_executor_flush(executor);
+
+  IREE_CHECK_OK(iree_task_scope_wait_idle(&scope_a, IREE_TIME_INFINITE_FUTURE));
+
+  iree_task_scope_deinitialize(&scope_a);
+  iree_task_executor_release(executor);
+  return 0;
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/executor_impl.h b/runtime/src/iree/task/executor_impl.h
new file mode 100644
index 0000000..fc1c04b
--- /dev/null
+++ b/runtime/src/iree/task/executor_impl.h
@@ -0,0 +1,151 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_EXECUTOR_IMPL_H_
+#define IREE_TASK_EXECUTOR_IMPL_H_
+
+#include "iree/base/internal/math.h"
+#include "iree/base/internal/prng.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/wait_handle.h"
+#include "iree/base/tracing.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor.h"
+#include "iree/task/list.h"
+#include "iree/task/poller.h"
+#include "iree/task/pool.h"
+#include "iree/task/post_batch.h"
+#include "iree/task/queue.h"
+#include "iree/task/tuning.h"
+#include "iree/task/worker.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+struct iree_task_executor_t {
+  // Reference count manipulated via iree_task_executor_retain/release.
+  iree_atomic_ref_count_t ref_count;
+  // Allocator provided upon executor creation; used for internal allocations
+  // over the executor lifetime.
+  iree_allocator_t allocator;
+
+  // Defines how work is selected across queues.
+  // TODO(benvanik): make mutable; currently always the same reserved value.
+  iree_task_scheduling_mode_t scheduling_mode;
+
+  // State used by the work-stealing operations performed by donated threads.
+  // This is **NOT SYNCHRONIZED** and relies on the fact that we actually don't
+  // much care about the precise selection of workers enough to mind any tears
+  // we get in the PRNG state that lives inside. Cache write-back order and
+  // incidental cache line availability/visibility update frequency is like an
+  // extra layer of PRNG anyway ;)
+  iree_prng_minilcg128_state_t donation_theft_prng;
+
+  // Pools of transient dispatch tasks shared across all workers.
+  // Depending on configuration the task pool may allocate after creation using
+  // the allocator provided upon executor creation.
+  //
+  // Sized to be able to fit at least:
+  //   iree_task_fence_t
+  //   iree_task_dispatch_shard_t
+  // Increasing the size larger than these will waste memory.
+  iree_task_pool_t transient_task_pool;
+
+  // A list of incoming tasks that are ready to execute immediately.
+  // The list is LIFO and we require that task lists are reversed by the
+  // submitter so we can use iree_atomic_slist_concat to quickly prepend the
+  // LIFO list to the atomic slist. By doing this we can construct the task
+  // lists in LIFO order prior to submission, concat with a pointer swap into
+  // this list, flush from the list in LIFO order during coordination, and do a
+  // single LIFO->FIFO conversion while distributing work. What could have been
+  // half a dozen task list pointer walks and inverted sequential memory access
+  // becomes one.
+  //
+  // Example:
+  //   existing tasks: C B A
+  //        new tasks: 1 2 3
+  //    updated tasks: 3 2 1 C B A
+  iree_atomic_task_slist_t incoming_ready_slist;
+
+  // iree_event_t pool used to acquire system wait handles.
+  // Many subsystems interacting with the executor will need events to park
+  // their work in the wait set and sharing the pool across all of them ensures
+  // we limit the number we have outstanding and avoid syscalls to allocate
+  // them.
+  iree_event_pool_t* event_pool;
+
+  // Guards coordination logic; only one thread at a time may be acting as the
+  // coordinator.
+  iree_slim_mutex_t coordinator_mutex;
+
+  // Wait task polling and wait thread manager.
+  // This handles all system waits so that we can keep the syscalls off the
+  // worker threads and lower wake latencies (the wait thread can enqueue
+  // completed waits immediately after they resolve instead of waiting for
+  // existing computation on the workers to finish).
+  iree_task_poller_t poller;
+
+  // A bitset indicating which workers are live and usable; all attempts to
+  // push work onto a particular worker should check first with this mask. This
+  // may change over time either automatically or by user request ("don't use
+  // these cores for awhile I'm going to be using them" etc).
+  iree_atomic_task_affinity_set_t worker_live_mask;
+
+  // A bitset indicating which workers may be suspended and need to be resumed
+  // via iree_thread_resume prior to them being able to execute work.
+  iree_atomic_task_affinity_set_t worker_suspend_mask;
+
+  // A bitset indicating which workers are currently idle. Used to bias incoming
+  // tasks to workers that aren't doing much else. This is a balance of latency
+  // to wake the idle workers vs. latency to wait for existing work to complete
+  // on already woken workers.
+  iree_atomic_task_affinity_set_t worker_idle_mask;
+
+  // Specifies how many workers threads there are.
+  // For now this number is fixed per executor however if we wanted to enable
+  // live join/leave behavior we could change this to a registration mechanism.
+  iree_host_size_t worker_count;
+  iree_task_worker_t* workers;  // [worker_count]
+};
+
+// Merges a submission into the primary FIFO queues.
+// Coordinators will fetch items from here as workers demand them but otherwise
+// not be notified of the changes (waiting until coordination runs again).
+//
+// May be called from any thread.
+void iree_task_executor_merge_submission(iree_task_executor_t* executor,
+                                         iree_task_submission_t* submission);
+
+// Schedules all ready tasks in the |pending_submission| list.
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_executor_schedule_ready_tasks(
+    iree_task_executor_t* executor, iree_task_submission_t* pending_submission,
+    iree_task_post_batch_t* post_batch);
+
+// Dispatches tasks in the global submission queue to workers.
+// |current_worker| will be NULL if called from a non-worker thread and
+// otherwise be the current worker; used to avoid round-tripping through the
+// whole system to post to oneself.
+//
+// If the |current_worker| has no more work remaining then the calling thread
+// may wait on any pending wait tasks until one resolves or more work is
+// scheduled for the worker. If no worker is provided the call will return
+// without waiting.
+void iree_task_executor_coordinate(iree_task_executor_t* executor,
+                                   iree_task_worker_t* current_worker);
+
+// Tries to steal an entire task from a sibling worker (based on topology).
+// Returns a task that is available (has not yet begun processing at all).
+// May steal multiple tasks and add them to the |local_task_queue|.
+iree_task_t* iree_task_executor_try_steal_task(
+    iree_task_executor_t* executor,
+    iree_task_affinity_set_t constructive_sharing_mask,
+    uint32_t max_theft_attempts, iree_prng_minilcg128_state_t* theft_prng,
+    iree_task_queue_t* local_task_queue);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_EXECUTOR_IMPL_H_
diff --git a/runtime/src/iree/task/executor_test.cc b/runtime/src/iree/task/executor_test.cc
new file mode 100644
index 0000000..7e96a8e
--- /dev/null
+++ b/runtime/src/iree/task/executor_test.cc
@@ -0,0 +1,139 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/executor.h"
+
+#include <atomic>
+#include <cstddef>
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+// Creates and destroys executors in a loop to shake out leaked system
+// resources (threads, events, etc). There is no guaranteed failure on any
+// particular system, but it gives ASAN/TSAN plenty to chew on.
+TEST(ExecutorTest, Lifetime) {
+  iree_task_topology_t topology;
+  iree_task_topology_initialize_from_group_count(/*group_count=*/4, &topology);
+
+  for (int iteration = 0; iteration < 100; ++iteration) {
+    iree_task_executor_t* executor = NULL;
+    IREE_ASSERT_OK(iree_task_executor_create(
+        IREE_TASK_SCHEDULING_MODE_RESERVED, &topology,
+        /*worker_local_memory_size=*/64 * 1024, iree_allocator_system(),
+        &executor));
+    // -- idle --
+    iree_task_executor_release(executor);
+  }
+
+  iree_task_topology_deinitialize(&topology);
+}
+
+// Tests lifetime when issuing submissions before exiting.
+// This tries to catch races in shutdown with pending work.
+TEST(ExecutorTest, LifetimeStress) {
+  iree_task_topology_t topology;
+  iree_task_topology_initialize_from_group_count(/*group_count=*/4, &topology);
+
+  for (int i = 0; i < 100; ++i) {
+    iree_task_executor_t* executor = NULL;
+    iree_task_scheduling_mode_t scheduling_mode =
+        IREE_TASK_SCHEDULING_MODE_RESERVED;
+    iree_host_size_t worker_local_memory_size = 64 * 1024;
+    IREE_ASSERT_OK(iree_task_executor_create(
+        scheduling_mode, &topology, worker_local_memory_size,
+        iree_allocator_system(), &executor));
+    iree_task_scope_t scope;
+    iree_task_scope_initialize(iree_make_cstring_view("scope"), &scope);
+
+    // static: the closure below is a capture-less lambda (so it converts to a
+    // plain function pointer for the C closure API) and cannot capture a
+    // local; the loop index reaches it via user_context instead.
+    static std::atomic<int> received_value = {0};
+    iree_task_call_t call;
+    iree_task_call_initialize(
+        &scope,
+        iree_task_make_call_closure(
+            [](void* user_context, iree_task_t* task,
+               iree_task_submission_t* pending_submission) {
+              received_value = (int)(uintptr_t)user_context;
+              return iree_ok_status();
+            },
+            (void*)(uintptr_t)i),
+        &call);
+
+    // Fence the call so the scope wait below can observe completion.
+    iree_task_fence_t* fence = NULL;
+    IREE_ASSERT_OK(iree_task_executor_acquire_fence(executor, &scope, &fence));
+    iree_task_set_completion_task(&call.header, &fence->header);
+
+    iree_task_submission_t submission;
+    iree_task_submission_initialize(&submission);
+    iree_task_submission_enqueue(&submission, &call.header);
+    iree_task_executor_submit(executor, &submission);
+    iree_task_executor_flush(executor);
+    IREE_ASSERT_OK(
+        iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_FUTURE));
+
+    EXPECT_EQ(received_value, i) << "call did not correlate to loop";
+
+    iree_task_scope_deinitialize(&scope);
+    iree_task_executor_release(executor);
+  }
+
+  iree_task_topology_deinitialize(&topology);
+}
+
+// Tests heavily serialized submission to an executor.
+// This puts pressure on the overheads involved in spilling up threads.
+TEST(ExecutorTest, SubmissionStress) {
+  iree_task_topology_t topology;
+  iree_task_topology_initialize_from_group_count(/*group_count=*/4, &topology);
+  iree_task_executor_t* executor = NULL;
+  iree_task_scheduling_mode_t scheduling_mode =
+      IREE_TASK_SCHEDULING_MODE_RESERVED;
+  iree_host_size_t worker_local_memory_size = 64 * 1024;
+  IREE_ASSERT_OK(iree_task_executor_create(scheduling_mode, &topology,
+                                           worker_local_memory_size,
+                                           iree_allocator_system(), &executor));
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("scope"), &scope);
+
+  for (int i = 0; i < 1000; ++i) {
+    // static: the closure below is a capture-less lambda (so it converts to a
+    // plain function pointer for the C closure API) and cannot capture a
+    // local; the loop index reaches it via user_context instead.
+    static std::atomic<int> received_value = {0};
+    iree_task_call_t call;
+    iree_task_call_initialize(
+        &scope,
+        iree_task_make_call_closure(
+            [](void* user_context, iree_task_t* task,
+               iree_task_submission_t* pending_submission) {
+              received_value = (int)(uintptr_t)user_context;
+              return iree_ok_status();
+            },
+            (void*)(uintptr_t)i),
+        &call);
+
+    // Fence the call so the scope wait below can observe completion.
+    iree_task_fence_t* fence = NULL;
+    IREE_ASSERT_OK(iree_task_executor_acquire_fence(executor, &scope, &fence));
+    iree_task_set_completion_task(&call.header, &fence->header);
+
+    iree_task_submission_t submission;
+    iree_task_submission_initialize(&submission);
+    iree_task_submission_enqueue(&submission, &call.header);
+    iree_task_executor_submit(executor, &submission);
+    iree_task_executor_flush(executor);
+    IREE_ASSERT_OK(
+        iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_FUTURE));
+
+    EXPECT_EQ(received_value, i) << "call did not correlate to loop";
+  }
+
+  iree_task_scope_deinitialize(&scope);
+  iree_task_executor_release(executor);
+  iree_task_topology_deinitialize(&topology);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/list.c b/runtime/src/iree/task/list.c
new file mode 100644
index 0000000..765e0b6
--- /dev/null
+++ b/runtime/src/iree/task/list.c
@@ -0,0 +1,207 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/list.h"
+
+#include <string.h>
+
+// Drains the atomic slist into a local list and discards every task in it
+// (including transitive dependents).
+void iree_atomic_task_slist_discard(iree_atomic_task_slist_t* slist) {
+  iree_task_list_t tasks;
+  iree_task_list_initialize(&tasks);
+  iree_task_list_append_from_fifo_slist(&tasks, slist);
+  iree_task_list_discard(&tasks);
+}
+
+// Resets |out_list| to the empty state.
+void iree_task_list_initialize(iree_task_list_t* out_list) {
+  out_list->head = NULL;
+  out_list->tail = NULL;
+}
+
+// Transfers all tasks from |list| into |out_list|, leaving |list| empty.
+// Any prior contents of |out_list| are overwritten (not merged).
+void iree_task_list_move(iree_task_list_t* list, iree_task_list_t* out_list) {
+  *out_list = *list;  // plain struct copy (head/tail pointers)
+  iree_task_list_initialize(list);
+}
+
+// Discards every task in |list| (and their transitive dependents).
+//
+// Implemented as fixed-point iteration rather than recursion: each discarded
+// task may append its dependents back onto |list|, so we simply loop until
+// the list is drained. A recursive discard could be thousands of calls deep
+// in a large graph.
+void iree_task_list_discard(iree_task_list_t* list) {
+  iree_task_t* task = NULL;
+  while ((task = iree_task_list_pop_front(list)) != NULL) {
+    // |task| is invalidated by the discard; never touch it afterward.
+    iree_task_discard(task, list);
+  }
+}
+
+// Returns true if |list| contains no tasks.
+bool iree_task_list_is_empty(const iree_task_list_t* list) {
+  return !list->head;
+}
+
+// Counts the tasks in |list| by walking the full chain; O(n) in list length.
+iree_host_size_t iree_task_list_calculate_size(const iree_task_list_t* list) {
+  iree_host_size_t count = 0;
+  for (const iree_task_t* task = list->head; task != NULL;
+       task = task->next_task) {
+    ++count;
+  }
+  return count;
+}
+
+// Returns the head task of |list| without removing it (NULL if empty).
+iree_task_t* iree_task_list_front(iree_task_list_t* list) {
+  return list->head;
+}
+
+// Returns the tail task of |list| without removing it (NULL if empty).
+iree_task_t* iree_task_list_back(iree_task_list_t* list) {
+  return list->tail;
+}
+
+// Links |task| in as the new tail of |list|.
+// |task| must not already be contained in a list.
+void iree_task_list_push_back(iree_task_list_t* list, iree_task_t* task) {
+  if (list->head == NULL) {
+    // List was empty: the new task is also the head.
+    list->head = task;
+  }
+  if (list->tail != NULL) {
+    list->tail->next_task = task;
+  }
+  list->tail = task;
+  task->next_task = NULL;  // terminate last so self-links can't persist
+}
+
+// Links |task| in as the new head of |list|.
+// |task| must not already be contained in a list.
+void iree_task_list_push_front(iree_task_list_t* list, iree_task_t* task) {
+  task->next_task = list->head;
+  list->head = task;
+  if (list->tail == NULL) {
+    // List was empty: the new task is also the tail.
+    list->tail = task;
+  }
+}
+
+// Unlinks and returns the head task of |list|, or NULL if the list is empty.
+// The returned task has its next_task pointer reset.
+iree_task_t* iree_task_list_pop_front(iree_task_list_t* list) {
+  iree_task_t* task = list->head;
+  if (task == NULL) return NULL;
+  list->head = task->next_task;
+  if (list->tail == task) {
+    // Popped the only task: list is now empty.
+    list->tail = NULL;
+  }
+  task->next_task = NULL;
+  return task;
+}
+
+// Unlinks |task| from |list|.
+// |prev_task| must be the task immediately preceding |task| in the list; it
+// is only dereferenced when |task| is not the head (pass anything, e.g. NULL,
+// when erasing the head).
+void iree_task_list_erase(iree_task_list_t* list, iree_task_t* prev_task,
+                          iree_task_t* task) {
+  if (task == list->head) {
+    // Removing head (which may _also_ be the tail).
+    list->head = task->next_task;
+    if (list->tail == task) list->tail = task->next_task;
+  } else if (task == list->tail) {
+    // Removing tail.
+    list->tail = prev_task;
+    prev_task->next_task = NULL;
+  } else {
+    // Removing inner.
+    prev_task->next_task = task->next_task;
+  }
+  // Reset the intrusive link so the task can be reinserted elsewhere.
+  task->next_task = NULL;
+}
+
+// Concatenates |prefix| onto the front of |list|, leaving |prefix| empty.
+void iree_task_list_prepend(iree_task_list_t* list, iree_task_list_t* prefix) {
+  if (prefix->head == NULL) return;  // nothing to prepend
+  if (list->head != NULL) {
+    // Splice: the prefix tail chains into the existing head.
+    prefix->tail->next_task = list->head;
+  } else {
+    // List was empty: it inherits the prefix tail as well.
+    list->tail = prefix->tail;
+  }
+  list->head = prefix->head;
+  iree_task_list_initialize(prefix);
+}
+
+// Concatenates |suffix| onto the back of |list|, leaving |suffix| empty.
+void iree_task_list_append(iree_task_list_t* list, iree_task_list_t* suffix) {
+  if (suffix->head == NULL) return;  // nothing to append
+  if (list->head == NULL) {
+    // List was empty: it inherits the suffix head as well.
+    list->head = suffix->head;
+  } else {
+    // Splice: the existing tail chains into the suffix head.
+    list->tail->next_task = suffix->head;
+  }
+  list->tail = suffix->tail;
+  iree_task_list_initialize(suffix);
+}
+
+// Flushes all tasks currently in |slist| (in approximately FIFO order) and
+// appends them to |list|. No-op if the slist is empty.
+void iree_task_list_append_from_fifo_slist(iree_task_list_t* list,
+                                           iree_atomic_task_slist_t* slist) {
+  iree_task_list_t flushed;
+  iree_task_list_initialize(&flushed);
+  if (iree_atomic_task_slist_flush(
+          slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO, &flushed.head,
+          &flushed.tail)) {
+    iree_task_list_append(list, &flushed);
+  }
+}
+
+// Reverses |list| in place using the classic prev/curr pointer walk.
+void iree_task_list_reverse(iree_task_list_t* list) {
+  if (iree_task_list_is_empty(list)) return;
+  iree_task_t* new_tail = list->head;  // old head becomes the tail
+  iree_task_t* prev = NULL;
+  iree_task_t* curr = list->head;
+  while (curr != NULL) {
+    iree_task_t* next = curr->next_task;
+    curr->next_task = prev;
+    prev = curr;
+    curr = next;
+  }
+  list->head = prev;  // old tail
+  list->tail = new_tail;
+}
+
+// Splits up to |max_tasks| tasks off the back of |head_list| into
+// |out_tail_list|, taking at most half of the tasks (except the single-task
+// fast path below). Used by work stealing to take a victim's back half.
+void iree_task_list_split(iree_task_list_t* head_list,
+                          iree_host_size_t max_tasks,
+                          iree_task_list_t* out_tail_list) {
+  iree_task_list_initialize(out_tail_list);
+  if (head_list->head == NULL) return;
+  if (head_list->head == head_list->tail) {
+    // 1 task in the source list; always prefer to steal it.
+    // This is because the victim is likely working on their last item and we
+    // can help them out by popping this off. It also has the side-effect of
+    // handling cases of donated workers wanting to steal all tasks to
+    // synchronously execute things.
+    iree_task_list_move(head_list, out_tail_list);
+    return;
+  }
+
+  // Walk through the |head_list| with two iterators; one at double-rate.
+  // If we ever notice this function showing up in profiling then we should
+  // build an acceleration structure to avoid the full walk of the first half
+  // (e.g. skip list).
+  // When the x2 pointer hits the end, the x1 pointer is at the midpoint.
+  iree_task_t* p_x1_m1 = head_list->head;  // p_x1 - 1 (previous to p_x1)
+  iree_task_t* p_x1 = head_list->head;     // x1 speed ptr
+  iree_task_t* p_x2 = head_list->head;     // x2 speed ptr
+  while (p_x2->next_task != NULL) {
+    p_x1_m1 = p_x1;
+    p_x1 = p_x1->next_task;
+    p_x2 = p_x2->next_task;
+    if (p_x2->next_task) p_x2 = p_x2->next_task;
+  }
+
+  // p_x1 now points at the half way point in the head_list. This is where we
+  // *start* our windowed walk for pulling out max_tasks, implicitly limiting us
+  // to take at most half of the tasks from the list.
+
+  // Advance the tail list keeping an iterator -max_tasks back; when we hit the
+  // end we have our head and tail to form the list.
+  iree_task_t* p_window_prev = p_x1_m1;
+  iree_task_t* p_window_head = p_x1;
+  iree_task_t* p_window_tail = p_x1;
+  // First grow the window to at most max_tasks wide...
+  while (p_window_tail->next_task != NULL && --max_tasks > 0) {
+    p_window_tail = p_window_tail->next_task;
+  }
+  // ...then slide the fixed-width window until its tail reaches the end.
+  while (p_window_tail->next_task != NULL) {
+    p_window_prev = p_window_head;
+    p_window_head = p_window_head->next_task;
+    p_window_tail = p_window_tail->next_task;
+  }
+
+  // Splice: terminate the retained front half and hand off the window.
+  head_list->tail = p_window_prev;
+  p_window_prev->next_task = NULL;
+
+  out_tail_list->head = p_window_head;
+  out_tail_list->tail = p_window_tail;
+}
diff --git a/runtime/src/iree/task/list.h b/runtime/src/iree/task/list.h
new file mode 100644
index 0000000..ee35361
--- /dev/null
+++ b/runtime/src/iree/task/list.h
@@ -0,0 +1,109 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_LIST_H_
+#define IREE_TASK_LIST_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomic_slist.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Defines iree_atomic_task_slist_t, an atomic approximately-LIFO
+// singly-linked list. iree_task_list_t should be preferred when working with
+// uncontended/thread-local lists as it has no overhead, while the
+// iree_atomic_task_slist_t should be used when multiple threads may need to
+// share lists of tasks (free lists, mailboxes, etc).
+IREE_TYPED_ATOMIC_SLIST_WRAPPER(iree_atomic_task, iree_task_t,
+                                offsetof(iree_task_t, next_task));
+
+// Discards a task list; should be used for failure cleanup during list
+// construction to ensure intrusive pointers are reset.
+void iree_atomic_task_slist_discard(iree_atomic_task_slist_t* slist);
+
+// A singly-linked list of tasks using the embedded task next_task pointer.
+//
+// Thread-compatible; designed to be used from a single thread manipulating a
+// list for passing to an API that accepts lists.
+typedef struct iree_task_list_t {
+  iree_task_t* head;  // first task in the list
+  iree_task_t* tail;  // last task in the list
+} iree_task_list_t;
+
+// Initializes an empty task list.
+void iree_task_list_initialize(iree_task_list_t* out_list);
+
+// Moves |list| into |out_list|, leaving |list| empty.
+void iree_task_list_move(iree_task_list_t* list, iree_task_list_t* out_list);
+
+// Discards a task list; should be used for failure cleanup during list
+// construction to ensure intrusive pointers are reset. List is immediately
+// reusable as if it had been initialized.
+void iree_task_list_discard(iree_task_list_t* list);
+
+// Returns true if the list is empty.
+bool iree_task_list_is_empty(const iree_task_list_t* list);
+
+// Counts the total number of tasks in the list.
+// WARNING: this requires an O(n) walk of the entire list; use this only for
+// debugging or when the list is known to be small and hot in cache.
+iree_host_size_t iree_task_list_calculate_size(const iree_task_list_t* list);
+
+// Returns the first task in the list or NULL if the list is empty.
+iree_task_t* iree_task_list_front(iree_task_list_t* list);
+
+// Returns the last task in the list or NULL if the list is empty.
+iree_task_t* iree_task_list_back(iree_task_list_t* list);
+
+// Pushes a task onto the back of the task list. The task list takes ownership
+// of |task|.
+void iree_task_list_push_back(iree_task_list_t* list, iree_task_t* task);
+
+// Pushes a task onto the front of the task list. The task list takes ownership
+// of |task|.
+void iree_task_list_push_front(iree_task_list_t* list, iree_task_t* task);
+
+// Pops a task from the front of the task list or returns NULL if the list is
+// empty. Caller takes ownership of the returned task.
+iree_task_t* iree_task_list_pop_front(iree_task_list_t* list);
+
+// Erases |task| from the list.
+// |prev_task| must point to the task immediately prior to |task| in the list
+// or NULL if the task was at the head.
+void iree_task_list_erase(iree_task_list_t* list, iree_task_t* prev_task,
+                          iree_task_t* task);
+
+// Prepends |prefix| onto the beginning of |list|. |prefix| will be reset.
+void iree_task_list_prepend(iree_task_list_t* list, iree_task_list_t* prefix);
+
+// Appends |suffix| onto the end of |list|. |suffix| will be reset.
+void iree_task_list_append(iree_task_list_t* list, iree_task_list_t* suffix);
+
+// Flushes the given |slist| and appends all tasks to the list in FIFO order.
+void iree_task_list_append_from_fifo_slist(iree_task_list_t* list,
+                                           iree_atomic_task_slist_t* slist);
+
+// Reverses the list in-place.
+// Requires a full O(n) traversal.
+void iree_task_list_reverse(iree_task_list_t* list);
+
+// Splits |head_list| in half and retains the first half in |head_list| while
+// moving the second half (at most |max_tasks| tasks) into |out_tail_list|.
+void iree_task_list_split(iree_task_list_t* head_list,
+                          iree_host_size_t max_tasks,
+                          iree_task_list_t* out_tail_list);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_LIST_H_
diff --git a/runtime/src/iree/task/list_test.cc b/runtime/src/iree/task/list_test.cc
new file mode 100644
index 0000000..c5cb5b2
--- /dev/null
+++ b/runtime/src/iree/task/list_test.cc
@@ -0,0 +1,655 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/list.h"
+
+#include "iree/task/testing/test_util.h"
+#include "iree/testing/gtest.h"
+
+namespace {
+
+TEST(TaskListTest, Empty) {
+  // A freshly initialized list is empty with zero size; discarding an empty
+  // list is a safe no-op.
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+  EXPECT_EQ(0, iree_task_list_calculate_size(&task_list));
+  iree_task_list_discard(&task_list);
+}
+
+TEST(TaskListTest, CalculateSize) {
+  // Size is recomputed by walking the list and must grow with each push.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+  EXPECT_EQ(0, iree_task_list_calculate_size(&task_list));
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&task_list, t0);
+  EXPECT_FALSE(iree_task_list_is_empty(&task_list));
+  EXPECT_EQ(1, iree_task_list_calculate_size(&task_list));
+
+  iree_task_list_push_back(&task_list, t1);
+  EXPECT_EQ(2, iree_task_list_calculate_size(&task_list));
+  iree_task_list_push_back(&task_list, t2);
+  EXPECT_EQ(3, iree_task_list_calculate_size(&task_list));
+  iree_task_list_push_back(&task_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&task_list));
+}
+
+TEST(TaskListTest, Move) {
+  // Moving transfers all tasks in order and leaves the source list empty.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t from_list, to_list;
+  iree_task_list_initialize(&from_list);
+  iree_task_list_initialize(&to_list);
+
+  EXPECT_TRUE(iree_task_list_is_empty(&from_list));
+  EXPECT_TRUE(iree_task_list_is_empty(&to_list));
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+  iree_task_list_push_back(&from_list, t0);
+  iree_task_list_push_back(&from_list, t1);
+  iree_task_list_push_back(&from_list, t2);
+  iree_task_list_push_back(&from_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&from_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&from_list));
+
+  iree_task_list_move(&from_list, &to_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&from_list));
+  EXPECT_EQ(4, iree_task_list_calculate_size(&to_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&to_list));
+}
+
+TEST(TaskListTest, DiscardEmpty) {
+  // Discarding an already-empty list leaves it empty.
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+  iree_task_list_discard(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+}
+
+TEST(TaskListTest, Discard) {
+  // Discarding a populated list empties it and returns the tasks to the pool.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+  iree_task_list_push_back(&task_list, t0);
+  iree_task_list_push_back(&task_list, t1);
+  iree_task_list_push_back(&task_list, t2);
+  iree_task_list_push_back(&task_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+
+  iree_task_list_discard(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+
+  // IMPLICIT: if the tasks were not released back to the pool we'll leak.
+}
+
+TEST(TaskListTest, DiscardSequence) {
+  // Discard must handle a chain of tasks linked via completion dependencies.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+  iree_task_set_completion_task(t0, t1);
+  iree_task_set_completion_task(t1, t2);
+  iree_task_set_completion_task(t2, t3);
+  iree_task_list_push_back(&task_list, t0);
+  iree_task_list_push_back(&task_list, t1);
+  iree_task_list_push_back(&task_list, t2);
+  iree_task_list_push_back(&task_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+
+  iree_task_list_discard(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+
+  // IMPLICIT: if the tasks were not released back to the pool we'll leak.
+}
+
+TEST(TaskListTest, DiscardJoin) {
+  // Discard must handle many tasks sharing a single completion task (a join).
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+  iree_task_set_completion_task(t0, t3);
+  iree_task_set_completion_task(t1, t3);
+  iree_task_set_completion_task(t2, t3);
+  iree_task_list_push_back(&task_list, t0);
+  iree_task_list_push_back(&task_list, t1);
+  iree_task_list_push_back(&task_list, t2);
+  iree_task_list_push_back(&task_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+
+  iree_task_list_discard(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+
+  // IMPLICIT: if the tasks were not released back to the pool we'll leak.
+}
+
+TEST(TaskListTest, PushFront) {
+  // Pushing at the front yields LIFO order; pops return the newest task first.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_front(&task_list, t0);
+  iree_task_list_push_front(&task_list, t1);
+  iree_task_list_push_front(&task_list, t2);
+  iree_task_list_push_front(&task_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderLIFO(&task_list));
+
+  EXPECT_EQ(3, iree_task_list_pop_front(&task_list)->flags);
+  EXPECT_EQ(2, iree_task_list_pop_front(&task_list)->flags);
+  EXPECT_EQ(1, iree_task_list_pop_front(&task_list)->flags);
+  EXPECT_EQ(0, iree_task_list_pop_front(&task_list)->flags);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+}
+
+TEST(TaskListTest, PopFront) {
+  // Pushing at the back and popping at the front drains in FIFO order.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&task_list, t0);
+  iree_task_list_push_back(&task_list, t1);
+  iree_task_list_push_back(&task_list, t2);
+  iree_task_list_push_back(&task_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+
+  EXPECT_EQ(0, iree_task_list_pop_front(&task_list)->flags);
+  EXPECT_EQ(1, iree_task_list_pop_front(&task_list)->flags);
+  EXPECT_EQ(2, iree_task_list_pop_front(&task_list)->flags);
+  EXPECT_EQ(3, iree_task_list_pop_front(&task_list)->flags);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+}
+
+TEST(TaskListTest, Erase) {
+  // Erase supports head, tail, and interior removal via |prev_task| linkage.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&task_list, t0);
+  iree_task_list_push_back(&task_list, t1);
+  iree_task_list_push_back(&task_list, t2);
+  iree_task_list_push_back(&task_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+
+  // Erase the head (no previous task).
+  iree_task_list_erase(&task_list, NULL, t0);
+  EXPECT_EQ(3, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+  EXPECT_EQ(t1, iree_task_list_front(&task_list));
+
+  // Erase the tail.
+  iree_task_list_erase(&task_list, t2, t3);
+  EXPECT_EQ(2, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+  EXPECT_EQ(t2, iree_task_list_back(&task_list));
+
+  // Erase down to a single task.
+  iree_task_list_erase(&task_list, t1, t2);
+  EXPECT_EQ(1, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+  EXPECT_EQ(t1, iree_task_list_front(&task_list));
+  EXPECT_EQ(t1, iree_task_list_back(&task_list));
+
+  // Erase the final task; the list must be fully empty afterward.
+  iree_task_list_erase(&task_list, NULL, t1);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+  EXPECT_EQ(NULL, iree_task_list_front(&task_list));
+  EXPECT_EQ(NULL, iree_task_list_back(&task_list));
+}
+
+TEST(TaskListTest, PrependEmpty) {
+  // Prepending an empty list is a no-op on the target list.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t into_list, prefix_list;
+  iree_task_list_initialize(&into_list);
+  iree_task_list_initialize(&prefix_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+
+  iree_task_list_push_back(&into_list, t0);
+  iree_task_list_push_back(&into_list, t1);
+
+  EXPECT_TRUE(iree_task_list_is_empty(&prefix_list));
+  iree_task_list_prepend(&into_list, &prefix_list);
+  EXPECT_EQ(2, iree_task_list_calculate_size(&into_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&into_list));
+}
+
+TEST(TaskListTest, PrependIntoEmpty) {
+  // Prepending into an empty list transfers everything and resets the prefix.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t into_list, prefix_list;
+  iree_task_list_initialize(&into_list);
+  iree_task_list_initialize(&prefix_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&prefix_list, t0);
+  iree_task_list_push_back(&prefix_list, t1);
+  iree_task_list_push_back(&prefix_list, t2);
+  iree_task_list_push_back(&prefix_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&prefix_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&prefix_list));
+
+  EXPECT_TRUE(iree_task_list_is_empty(&into_list));
+  iree_task_list_prepend(&into_list, &prefix_list);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&into_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&into_list));
+  EXPECT_TRUE(iree_task_list_is_empty(&prefix_list));
+}
+
+TEST(TaskListTest, PrependInto1) {
+  // Prepending [0,1,2] onto [3] yields [0,1,2,3] and resets the prefix.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t into_list, prefix_list;
+  iree_task_list_initialize(&into_list);
+  iree_task_list_initialize(&prefix_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&prefix_list, t0);
+  iree_task_list_push_back(&prefix_list, t1);
+  iree_task_list_push_back(&prefix_list, t2);
+
+  iree_task_list_push_back(&into_list, t3);
+  iree_task_list_prepend(&into_list, &prefix_list);
+
+  EXPECT_EQ(4, iree_task_list_calculate_size(&into_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&into_list));
+  EXPECT_TRUE(iree_task_list_is_empty(&prefix_list));
+}
+
+TEST(TaskListTest, PrependInto2) {
+  // Prepending [0,1] onto [2,3] yields [0,1,2,3] and resets the prefix.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t into_list, prefix_list;
+  iree_task_list_initialize(&into_list);
+  iree_task_list_initialize(&prefix_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&prefix_list, t0);
+  iree_task_list_push_back(&prefix_list, t1);
+  iree_task_list_push_back(&into_list, t2);
+  iree_task_list_push_back(&into_list, t3);
+  iree_task_list_prepend(&into_list, &prefix_list);
+
+  EXPECT_EQ(4, iree_task_list_calculate_size(&into_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&into_list));
+  EXPECT_TRUE(iree_task_list_is_empty(&prefix_list));
+}
+
+TEST(TaskListTest, AppendIntoEmpty) {
+  // Appending into an empty list transfers everything and resets the suffix.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t into_list, suffix_list;
+  iree_task_list_initialize(&into_list);
+  iree_task_list_initialize(&suffix_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&suffix_list, t0);
+  iree_task_list_push_back(&suffix_list, t1);
+  iree_task_list_push_back(&suffix_list, t2);
+  iree_task_list_push_back(&suffix_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&suffix_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&suffix_list));
+
+  EXPECT_TRUE(iree_task_list_is_empty(&into_list));
+  iree_task_list_append(&into_list, &suffix_list);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&into_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&into_list));
+  EXPECT_TRUE(iree_task_list_is_empty(&suffix_list));
+}
+
+TEST(TaskListTest, AppendInto1) {
+  // Appending [1,2,3] onto [0] yields [0,1,2,3] and resets the suffix.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t into_list, suffix_list;
+  iree_task_list_initialize(&into_list);
+  iree_task_list_initialize(&suffix_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&suffix_list, t1);
+  iree_task_list_push_back(&suffix_list, t2);
+  iree_task_list_push_back(&suffix_list, t3);
+  iree_task_list_push_back(&into_list, t0);
+
+  iree_task_list_append(&into_list, &suffix_list);
+
+  EXPECT_EQ(4, iree_task_list_calculate_size(&into_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&into_list));
+  EXPECT_TRUE(iree_task_list_is_empty(&suffix_list));
+}
+
+TEST(TaskListTest, AppendInto2) {
+  // Appending [2,3] onto [0,1] yields [0,1,2,3] and resets the suffix.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t into_list, suffix_list;
+  iree_task_list_initialize(&into_list);
+  iree_task_list_initialize(&suffix_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&suffix_list, t2);
+  iree_task_list_push_back(&suffix_list, t3);
+
+  iree_task_list_push_back(&into_list, t0);
+  iree_task_list_push_back(&into_list, t1);
+
+  iree_task_list_append(&into_list, &suffix_list);
+
+  EXPECT_EQ(4, iree_task_list_calculate_size(&into_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&into_list));
+  EXPECT_TRUE(iree_task_list_is_empty(&suffix_list));
+}
+
+TEST(TaskListTest, Reverse0) {
+  // Reversing an empty list is a no-op.
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+  iree_task_list_reverse(&task_list);
+  EXPECT_TRUE(iree_task_list_is_empty(&task_list));
+}
+
+TEST(TaskListTest, Reverse1) {
+  // Reversing a single-element list preserves its one task.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+
+  iree_task_list_push_back(&task_list, t0);
+  EXPECT_EQ(1, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+  iree_task_list_reverse(&task_list);
+  EXPECT_TRUE(CheckListOrderLIFO(&task_list));
+}
+
+TEST(TaskListTest, Reverse2) {
+  // Reversing two tasks swaps FIFO order to LIFO order.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+
+  iree_task_list_push_back(&task_list, t0);
+  iree_task_list_push_back(&task_list, t1);
+  EXPECT_EQ(2, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+  iree_task_list_reverse(&task_list);
+  EXPECT_TRUE(CheckListOrderLIFO(&task_list));
+}
+
+TEST(TaskListTest, Reverse4) {
+  // Reversing four tasks swaps FIFO order to LIFO order.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t task_list;
+  iree_task_list_initialize(&task_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&task_list, t0);
+  iree_task_list_push_back(&task_list, t1);
+  iree_task_list_push_back(&task_list, t2);
+  iree_task_list_push_back(&task_list, t3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&task_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&task_list));
+  iree_task_list_reverse(&task_list);
+  EXPECT_TRUE(CheckListOrderLIFO(&task_list));
+}
+
+TEST(TaskListTest, SplitEmpty) {
+  // Splitting an empty list yields two empty lists.
+  iree_task_list_t work_list;
+  iree_task_list_initialize(&work_list);
+
+  iree_task_list_t taken_list;
+  iree_task_list_split(&work_list, /*max_tasks=*/64, &taken_list);
+
+  EXPECT_TRUE(iree_task_list_is_empty(&work_list));
+  EXPECT_TRUE(iree_task_list_is_empty(&taken_list));
+}
+
+TEST(TaskListTest, Split1) {
+  // A single task moves entirely into the tail list.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t work_list;
+  iree_task_list_initialize(&work_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  iree_task_list_push_back(&work_list, t0);
+  EXPECT_EQ(1, iree_task_list_calculate_size(&work_list));
+
+  iree_task_list_t taken_list;
+  iree_task_list_split(&work_list, /*max_tasks=*/64, &taken_list);
+
+  EXPECT_TRUE(iree_task_list_is_empty(&work_list));
+  EXPECT_EQ(1, iree_task_list_calculate_size(&taken_list));
+}
+
+TEST(TaskListTest, Split2) {
+  // Two tasks split evenly: one retained, one taken.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t work_list;
+  iree_task_list_initialize(&work_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+
+  iree_task_list_push_back(&work_list, t0);
+  iree_task_list_push_back(&work_list, t1);
+
+  iree_task_list_t taken_list;
+  iree_task_list_split(&work_list, /*max_tasks=*/64, &taken_list);
+
+  EXPECT_EQ(1, iree_task_list_calculate_size(&work_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&work_list));
+  EXPECT_EQ(1, iree_task_list_calculate_size(&taken_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&taken_list));
+}
+
+TEST(TaskListTest, Split3) {
+  // Three tasks split as one retained, two taken.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t work_list;
+  iree_task_list_initialize(&work_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+
+  iree_task_list_push_back(&work_list, t0);
+  iree_task_list_push_back(&work_list, t1);
+  iree_task_list_push_back(&work_list, t2);
+
+  iree_task_list_t taken_list;
+  iree_task_list_split(&work_list, /*max_tasks=*/64, &taken_list);
+
+  EXPECT_EQ(1, iree_task_list_calculate_size(&work_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&work_list));
+  EXPECT_EQ(2, iree_task_list_calculate_size(&taken_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&taken_list));
+}
+
+TEST(TaskListTest, Split4) {
+  // Four tasks split evenly into two and two, both in FIFO order.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t work_list;
+  iree_task_list_initialize(&work_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&work_list, t0);
+  iree_task_list_push_back(&work_list, t1);
+  iree_task_list_push_back(&work_list, t2);
+  iree_task_list_push_back(&work_list, t3);
+
+  iree_task_list_t taken_list;
+  iree_task_list_split(&work_list, /*max_tasks=*/64, &taken_list);
+
+  EXPECT_EQ(2, iree_task_list_calculate_size(&work_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&work_list));
+  EXPECT_EQ(2, iree_task_list_calculate_size(&taken_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&taken_list));
+}
+
+TEST(TaskListTest, SplitMaxTasks1) {
+  // max_tasks=1 caps the taken tail at a single task.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t work_list;
+  iree_task_list_initialize(&work_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&work_list, t0);
+  iree_task_list_push_back(&work_list, t1);
+  iree_task_list_push_back(&work_list, t2);
+  iree_task_list_push_back(&work_list, t3);
+
+  iree_task_list_t taken_list;
+  iree_task_list_split(&work_list, /*max_tasks=*/1, &taken_list);
+
+  EXPECT_EQ(3, iree_task_list_calculate_size(&work_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&work_list));
+  EXPECT_EQ(1, iree_task_list_calculate_size(&taken_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&taken_list));
+}
+
+TEST(TaskListTest, SplitMaxTasks2) {
+  // max_tasks=2 matches the half-split of four tasks exactly.
+  auto nop_pool = AllocateNopPool();
+  auto test_scope = AllocateScope("a");
+
+  iree_task_list_t work_list;
+  iree_task_list_initialize(&work_list);
+
+  auto t0 = AcquireNopTask(nop_pool, test_scope, 0);
+  auto t1 = AcquireNopTask(nop_pool, test_scope, 1);
+  auto t2 = AcquireNopTask(nop_pool, test_scope, 2);
+  auto t3 = AcquireNopTask(nop_pool, test_scope, 3);
+
+  iree_task_list_push_back(&work_list, t0);
+  iree_task_list_push_back(&work_list, t1);
+  iree_task_list_push_back(&work_list, t2);
+  iree_task_list_push_back(&work_list, t3);
+
+  iree_task_list_t taken_list;
+  iree_task_list_split(&work_list, /*max_tasks=*/2, &taken_list);
+
+  EXPECT_EQ(2, iree_task_list_calculate_size(&work_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&work_list));
+  EXPECT_EQ(2, iree_task_list_calculate_size(&taken_list));
+  EXPECT_TRUE(CheckListOrderFIFO(&taken_list));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/poller.c b/runtime/src/iree/task/poller.c
new file mode 100644
index 0000000..ee6465b
--- /dev/null
+++ b/runtime/src/iree/task/poller.c
@@ -0,0 +1,535 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/poller.h"
+
+#include "iree/base/tracing.h"
+#include "iree/task/executor.h"
+#include "iree/task/executor_impl.h"
+#include "iree/task/submission.h"
+#include "iree/task/task_impl.h"
+#include "iree/task/tuning.h"
+
+// Thread entry point for the poller wait thread (defined at end of file).
+static int iree_task_poller_main(iree_task_poller_t* poller);
+
+// Initializes |out_poller| and spawns its dedicated wait thread.
+// On failure partial state may have been initialized; the caller is expected
+// to clean up by calling iree_task_poller_deinitialize.
+iree_status_t iree_task_poller_initialize(
+    iree_task_executor_t* executor,
+    iree_thread_affinity_t ideal_thread_affinity,
+    iree_task_poller_t* out_poller) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  out_poller->executor = executor;
+  out_poller->ideal_thread_affinity = ideal_thread_affinity;
+  iree_notification_initialize(&out_poller->state_notification);
+  iree_atomic_task_slist_initialize(&out_poller->mailbox_slist);
+  iree_task_list_initialize(&out_poller->wait_list);
+
+  iree_task_poller_state_t initial_state = IREE_TASK_POLLER_STATE_RUNNING;
+  // TODO(benvanik): support initially suspended wait threads. This can reduce
+  // startup time as we won't give the system a chance to deschedule the calling
+  // thread as it performs the initial resume of the wait thread. We'll need to
+  // check in enqueue to see if the wait thread needs to be resumed.
+  // initial_state = IREE_TASK_POLLER_STATE_SUSPENDED;
+  iree_atomic_store_int32(&out_poller->state, initial_state,
+                          iree_memory_order_seq_cst);
+
+  // Acquire an event we can use to wake the wait thread from other threads.
+  iree_status_t status = iree_event_pool_acquire(
+      iree_task_executor_event_pool(out_poller->executor), 1,
+      &out_poller->wake_event);
+
+  // Wait set used to batch syscalls for polling/waiting on wait handles.
+  // This is currently limited to a relatively small max to make bad behavior
+  // clearer with nice RESOURCE_EXHAUSTED errors. If we start to hit that limit
+  // (~63+ simultaneous system waits) we'll need to shard out the wait sets -
+  // possibly with multiple wait threads (one per set).
+  if (iree_status_is_ok(status)) {
+    status = iree_wait_set_allocate(IREE_TASK_EXECUTOR_MAX_OUTSTANDING_WAITS,
+                                    executor->allocator, &out_poller->wait_set);
+  }
+  // The wake_event lives in the wait set for the poller's entire lifetime so
+  // that any system wait can be interrupted by other threads.
+  if (iree_status_is_ok(status)) {
+    status = iree_wait_set_insert(out_poller->wait_set, out_poller->wake_event);
+  }
+
+  iree_thread_create_params_t thread_params;
+  memset(&thread_params, 0, sizeof(thread_params));
+  thread_params.name = iree_make_cstring_view("iree-poller");
+  thread_params.create_suspended = false;
+  // TODO(benvanik): make high so to reduce latency? The sooner we wake the
+  // sooner we get ready tasks back in the execution queue, though we don't
+  // want to preempt any of the workers.
+  thread_params.priority_class = IREE_THREAD_PRIORITY_CLASS_NORMAL;
+  thread_params.initial_affinity = out_poller->ideal_thread_affinity;
+
+  // NOTE: if the thread creation fails we'll bail here and let the caller
+  // cleanup by calling deinitialize (which is safe because we zero init
+  // everything).
+  // NOTE: the thread is created running and may begin executing
+  // iree_task_poller_main before this call returns.
+  if (iree_status_is_ok(status)) {
+    status = iree_thread_create((iree_thread_entry_t)iree_task_poller_main,
+                                out_poller, thread_params, executor->allocator,
+                                &out_poller->thread);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Asynchronously requests that the poller's wait thread begin exiting.
+// Safe to call from any thread and safe to call multiple times.
+void iree_task_poller_request_exit(iree_task_poller_t* poller) {
+  if (!poller->thread) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // If the thread is already in the exiting/zombie state we don't need to do
+  // anything.
+  iree_task_poller_state_t prev_state =
+      (iree_task_poller_state_t)iree_atomic_exchange_int32(
+          &poller->state, IREE_TASK_POLLER_STATE_EXITING,
+          iree_memory_order_acq_rel);
+  switch (prev_state) {
+    case IREE_TASK_POLLER_STATE_SUSPENDED:
+      // Poller was suspended; resume it so that it can exit itself.
+      iree_thread_resume(poller->thread);
+      break;
+    case IREE_TASK_POLLER_STATE_ZOMBIE:
+      // Poller already exited; reset state to ZOMBIE.
+      // (The exchange above clobbered ZOMBIE with EXITING so we restore it
+      // here to keep is_zombie/await_exit working.)
+      iree_atomic_store_int32(&poller->state, IREE_TASK_POLLER_STATE_ZOMBIE,
+                              iree_memory_order_seq_cst);
+      break;
+    default:
+      // Poller now set to EXITING and should exit soon.
+      break;
+  }
+
+  // Kick the wait thread to exit the system wait API, if needed.
+  // It'll check the state and abort ASAP.
+  iree_event_set(&poller->wake_event);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns true if the poller's wait thread has exited and is awaiting
+// teardown (the ZOMBIE state). Used as a notification condition.
+static bool iree_task_poller_is_zombie(iree_task_poller_t* poller) {
+  const iree_task_poller_state_t current_state =
+      (iree_task_poller_state_t)iree_atomic_load_int32(
+          &poller->state, iree_memory_order_seq_cst);
+  return current_state == IREE_TASK_POLLER_STATE_ZOMBIE;
+}
+
+// Requests exit (idempotent) and then blocks the caller until the wait thread
+// has entered the ZOMBIE state. Safe to call from any thread except the wait
+// thread itself.
+void iree_task_poller_await_exit(iree_task_poller_t* poller) {
+  if (!poller->thread) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_task_poller_request_exit(poller);
+  // Block until iree_task_poller_main posts the state_notification after
+  // storing ZOMBIE.
+  iree_notification_await(&poller->state_notification,
+                          (iree_condition_fn_t)iree_task_poller_is_zombie,
+                          poller, iree_infinite_timeout());
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Tears down poller resources after the wait thread has fully exited.
+// Must only be called once the poller is in the ZOMBIE state (see the
+// request_exit/await_exit/deinitialize sequence documented in the header).
+void iree_task_poller_deinitialize(iree_task_poller_t* poller) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Must have called request_exit/await_exit.
+  IREE_ASSERT_TRUE(iree_task_poller_is_zombie(poller));
+
+  iree_thread_release(poller->thread);
+  poller->thread = NULL;
+
+  iree_wait_set_free(poller->wait_set);
+  // wake_event may be an immediate (no-op) handle if initialization failed
+  // before the event pool acquire succeeded; only release real events.
+  if (!iree_wait_handle_is_immediate(poller->wake_event)) {
+    iree_event_pool_release(iree_task_executor_event_pool(poller->executor), 1,
+                            &poller->wake_event);
+  }
+
+  // Drop any tasks still registered with the poller before destroying the
+  // containers themselves.
+  iree_task_list_discard(&poller->wait_list);
+  iree_atomic_task_slist_discard(&poller->mailbox_slist);
+  iree_atomic_task_slist_deinitialize(&poller->mailbox_slist);
+  iree_notification_deinitialize(&poller->state_notification);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Hands |wait_tasks| off to the poller's mailbox and wakes the wait thread.
+// On return |wait_tasks| is reset to empty; ownership of the tasks transfers
+// to the poller. Safe to call from any thread.
+void iree_task_poller_enqueue(iree_task_poller_t* poller,
+                              iree_task_list_t* wait_tasks) {
+  if (iree_task_list_is_empty(wait_tasks)) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Move the list into the mailbox. Note that the mailbox is LIFO and this list
+  // is concatenated with its current order preserved (which should be LIFO),
+  // though we don't really care about order here.
+  iree_atomic_task_slist_concat(&poller->mailbox_slist, wait_tasks->head,
+                                wait_tasks->tail);
+  // Reset the caller's list so the tasks cannot be accidentally reused.
+  memset(wait_tasks, 0, sizeof(*wait_tasks));
+
+  // Kick the wait thread to exit the system wait API, if needed.
+  // It'll merge the new wait tasks and reset the event.
+  iree_event_set(&poller->wake_event);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Acquires a wait handle for |task| and inserts it into |wait_set|.
+// If the task's wait source is not already a wait handle it is exported to a
+// system wait primitive and the source is re-imported to reference it.
+static iree_status_t iree_task_poller_insert_wait_handle(
+    iree_wait_set_t* wait_set, iree_task_wait_t* task) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_ok_status();
+
+  // Starts as an immediate (no-op) handle and is replaced below.
+  iree_wait_handle_t wait_handle = iree_wait_handle_immediate();
+  iree_wait_handle_t* wait_handle_ptr =
+      iree_wait_handle_from_source(&task->wait_source);
+  if (wait_handle_ptr) {
+    // Already a wait handle - can directly insert it.
+    wait_handle = *wait_handle_ptr;
+  } else {
+    iree_wait_primitive_t wait_primitive = iree_wait_primitive_immediate();
+    status =
+        iree_wait_source_export(task->wait_source, IREE_WAIT_PRIMITIVE_TYPE_ANY,
+                                iree_immediate_timeout(), &wait_primitive);
+    if (iree_status_is_ok(status)) {
+      // Swap the wait handle with the exported handle so we can wake it later.
+      // It'd be ideal if we retained the wait handle separate so that we could
+      // still do fast queries for local wait sources.
+      iree_wait_handle_wrap_primitive(wait_primitive.type, wait_primitive.value,
+                                      &wait_handle);
+      status = iree_wait_source_import(wait_primitive, &task->wait_source);
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_wait_set_insert(wait_set, wait_handle);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Bitfield result of preparing a single wait task (see prepare_task below).
+enum iree_task_poller_prepare_result_bits_e {
+  // Task remains in the wait list and is ready for the system wait.
+  IREE_TASK_POLLER_PREPARE_OK = 0,
+  // Task resolved/failed and was retired; remove it from the wait list.
+  IREE_TASK_POLLER_PREPARE_RETIRED = 1u << 0,
+  // Task set a wait-any cancellation flag; the caller must rescan the list.
+  IREE_TASK_POLLER_PREPARE_CANCELLED = 1u << 1,
+};
+typedef uint32_t iree_task_poller_prepare_result_t;
+
+// Prepares a wait |task| for waiting.
+// The task will be checked for completion or failure such as deadline exceeded
+// and removed from the wait list if resolved. If unresolved the wait will be
+// prepared for the system wait by ensuring a wait handle is available.
+//
+// |pending_submission| receives retired tasks for resubmission to the
+// executor. |now_ns| is the timestamp captured once for the whole scan.
+// |earliest_deadline_ns| is min-accumulated with any unresolved deadline or
+// delay so the caller knows how long the system wait may block.
+static iree_task_poller_prepare_result_t iree_task_poller_prepare_task(
+    iree_task_poller_t* poller, iree_task_wait_t* task,
+    iree_task_submission_t* pending_submission, iree_time_t now_ns,
+    iree_time_t* earliest_deadline_ns) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Status of the preparation - failures propagate to the task scope.
+  iree_status_t status = iree_ok_status();
+  // Wait status:
+  //   OK: wait resolved successfully
+  //   DEFERRED: wait unresolved
+  //   DEADLINE_EXCEEDED: deadline was hit before the wait resolved
+  //   CANCELLED: wait was cancelled via the cancellation flag
+  iree_status_code_t wait_status_code = IREE_STATUS_DEFERRED;
+  if (iree_all_bits_set(task->header.flags, IREE_TASK_FLAG_WAIT_COMPLETED)) {
+    // Wait was marked as resolved and we just pass that through here.
+    // This allows us to bypass more expensive queries when doing a post-wake
+    // scan of tasks.
+    wait_status_code = IREE_STATUS_OK;
+  } else if (task->cancellation_flag != NULL &&
+             iree_atomic_load_int32(task->cancellation_flag,
+                                    iree_memory_order_acquire) != 0) {
+    // Task was cancelled by the user (or a wait-any). These retire without
+    // failure and it's up to the user to handle what happens to them.
+    wait_status_code = IREE_STATUS_CANCELLED;
+  } else if (iree_wait_source_is_immediate(task->wait_source)) {
+    // Task has been neutered and is treated as an immediately resolved wait.
+    wait_status_code = IREE_STATUS_OK;
+  } else if (iree_wait_source_is_delay(task->wait_source)) {
+    // Task is a delay until some future time; factor that in to our earliest
+    // deadline so that we'll wait in the system until that time. If we wake
+    // earlier because another wait resolved it's still possible for the delay
+    // to have been reached before we get back to this check.
+    iree_time_t delay_deadline_ns = (iree_time_t)task->wait_source.data;
+    if (delay_deadline_ns <= now_ns + IREE_TASK_EXECUTOR_DELAY_SLOP_NS) {
+      // Wait deadline reached.
+      wait_status_code = IREE_STATUS_OK;
+    } else {
+      // Still waiting.
+      *earliest_deadline_ns =
+          iree_min(*earliest_deadline_ns, delay_deadline_ns);
+      wait_status_code = IREE_STATUS_DEFERRED;
+    }
+  } else {
+    // An actual wait. Ensure that the deadline has not been exceeded yet.
+    // If it hasn't yet been hit we'll propagate the deadline to the system wait
+    // API - then on the next pump we'll hit this case and retire the task.
+    IREE_TRACE_ZONE_APPEND_VALUE(z0, task->deadline_ns);
+    IREE_TRACE_ZONE_APPEND_VALUE(z0, now_ns);
+    if (task->deadline_ns <= now_ns) {
+      wait_status_code = IREE_STATUS_DEADLINE_EXCEEDED;
+    } else {
+      // Query the status of the wait source to see if it has already been
+      // resolved. Under load we can get lucky and end up with resolved waits
+      // before ever needing to export them for a full system wait. This query
+      // can also avoid making a syscall to check the state of the source such
+      // as when the source is a process-local type.
+      // NOTE: the OK assignment here is just the default in case the query
+      // does not write |wait_status_code|.
+      wait_status_code = IREE_STATUS_OK;
+      status = iree_wait_source_query(task->wait_source, &wait_status_code);
+
+      // TODO(benvanik): avoid this query for wait handles: we don't want to
+      // make one syscall per handle and could rely on the completed bit being
+      // set to retire these.
+    }
+
+    // If the wait has not been resolved then we need to ensure there's an
+    // exported wait handle in the wait set. We only do this on the first time
+    // we prepare the task (tracked via the WAIT_EXPORTED flag).
+    if (wait_status_code == IREE_STATUS_DEFERRED) {
+      if (!iree_all_bits_set(task->header.flags,
+                             IREE_TASK_FLAG_WAIT_EXPORTED)) {
+        task->header.flags |= IREE_TASK_FLAG_WAIT_EXPORTED;
+        status = iree_task_poller_insert_wait_handle(poller->wait_set, task);
+      }
+      *earliest_deadline_ns =
+          iree_min(*earliest_deadline_ns, task->deadline_ns);
+    }
+  }
+
+  if (iree_status_is_ok(status) && wait_status_code == IREE_STATUS_DEFERRED) {
+    // Wait is prepared for use and can be waited on.
+    IREE_TRACE_ZONE_END(z0);
+    return IREE_TASK_POLLER_PREPARE_OK;
+  }
+
+  // If the task was able to be retired (deadline elapsed, completed, etc)
+  // then we need to unregister it from the poller and send it back to the
+  // workers for completion.
+  iree_task_poller_prepare_result_t result = IREE_TASK_POLLER_PREPARE_RETIRED;
+
+  // If this was part of a wait-any operation then set the cancellation flag
+  // such that other waits are cancelled.
+  if (iree_any_bit_set(task->header.flags, IREE_TASK_FLAG_WAIT_ANY)) {
+    if (iree_atomic_fetch_add_int32(task->cancellation_flag, 1,
+                                    iree_memory_order_release) == 0) {
+      // Ensure we scan again to clean up any potentially cancelled tasks.
+      // If this was task 4 in a wait-any list then tasks 0-3 need to be
+      // retired.
+      result |= IREE_TASK_POLLER_PREPARE_CANCELLED;
+    }
+  }
+
+  // Remove the system wait handle from the wait set, if assigned.
+  if (iree_all_bits_set(task->header.flags, IREE_TASK_FLAG_WAIT_EXPORTED)) {
+    iree_wait_handle_t* wait_handle =
+        iree_wait_handle_from_source(&task->wait_source);
+    if (wait_handle) {
+      iree_wait_set_erase(poller->wait_set, *wait_handle);
+    }
+    task->header.flags &= ~IREE_TASK_FLAG_WAIT_EXPORTED;
+  }
+
+  // Retire the task and enqueue any available completion task.
+  // Note that we pass in the status of the wait query above: that propagates
+  // any query failure into the task/task scope.
+  if (iree_status_is_ok(status) && wait_status_code != IREE_STATUS_OK) {
+    // Cancellation is ok - we just ignore those.
+    if (wait_status_code != IREE_STATUS_CANCELLED) {
+      status = iree_status_from_code(wait_status_code);
+    }
+  }
+  iree_task_wait_retire(task, pending_submission, status);
+
+  IREE_TRACE_ZONE_END(z0);
+  return result;
+}
+
+// Scans all wait tasks in |poller| to see if they have resolved.
+// Resolved/failed waits are enqueued on |pending_submission|.
+// If there are any unresolved delay tasks the earliest deadline will be stored
+// in |out_earliest_deadline_ns| and otherwise it'll be set to
+// IREE_TIME_INFINITE_FUTURE.
+static void iree_task_poller_prepare_wait(
+    iree_task_poller_t* poller, iree_task_submission_t* pending_submission,
+    iree_time_t* out_earliest_deadline_ns) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  *out_earliest_deadline_ns = IREE_TIME_INFINITE_FUTURE;
+
+  // TODO(benvanik): only query if there are pending delays; this is (likely) a
+  // syscall that we only need to perform if we're going to delay.
+  iree_time_t now_ns = iree_time_now();
+
+  // Perform the scan over the task list; we may need to retry the scan if we
+  // encounter a situation that would invalidate other waits - such as
+  // cancellation or scope errors.
+  bool retry_scan = false;
+  do {
+    retry_scan = false;
+
+    // Note that we walk the singly-linked list inline and need to keep track of
+    // the previous task in case we need to unlink one.
+    iree_task_t* prev_task = NULL;
+    iree_task_t* task = iree_task_list_front(&poller->wait_list);
+    while (task != NULL) {
+      // Capture next_task before prepare_task may retire (and invalidate)
+      // the current task.
+      iree_task_t* next_task = task->next_task;
+
+      iree_task_poller_prepare_result_t result = iree_task_poller_prepare_task(
+          poller, (iree_task_wait_t*)task, pending_submission, now_ns,
+          out_earliest_deadline_ns);
+      if (iree_all_bits_set(result, IREE_TASK_POLLER_PREPARE_CANCELLED)) {
+        // A task was cancelled; we'll need to retry the scan to clean up any
+        // waits we may have already checked.
+        retry_scan = true;
+      }
+
+      if (iree_all_bits_set(result, IREE_TASK_POLLER_PREPARE_RETIRED)) {
+        // Erase the retired task from the wait list.
+        // prev_task intentionally stays put: it is still the predecessor of
+        // next_task after the erase.
+        iree_task_list_erase(&poller->wait_list, prev_task, task);
+      } else {
+        prev_task = task;
+      }
+      task = next_task;
+    }
+  } while (retry_scan);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Finds tasks in |poller| using the given wait handle and marks them as
+// completed.
+// NOTE(review): currently a stub - no tasks are actually marked; resolution
+// instead happens on the next prepare scan via per-task queries.
+static void iree_task_poller_wake_task(iree_task_poller_t* poller,
+                                       iree_wait_handle_t wake_handle) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(benvanik): scan the list. We need a way to map wake_handle back to
+  // the zero or more tasks that match it but don't currently store the
+  // handle. Ideally we'd have the wait set tell us precisely which things
+  // woke - possibly by having a bitmap of original insertions that match the
+  // handle - but for now we just eat the extra query syscall.
+  int woken_tasks = 0;
+
+  // Referenced only so tracing-disabled builds don't warn about unused vars.
+  (void)woken_tasks;
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, woken_tasks);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Commits a system wait on the current wait set in |poller|.
+// The wait will time out after |deadline_ns| is reached and return even if no
+// wait handles were resolved.
+static void iree_task_poller_commit_wait(iree_task_poller_t* poller,
+                                         iree_time_t deadline_ns) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Enter the system wait API.
+  iree_wait_handle_t wake_handle = iree_wait_handle_immediate();
+  iree_status_t status =
+      iree_wait_any(poller->wait_set, deadline_ns, &wake_handle);
+  if (iree_status_is_ok(status)) {
+    // One or more waiters is ready. We don't support multi-wake right now so
+    // we'll just take the one we got back and try again.
+    //
+    // To avoid extra syscalls we scan the list and mark whatever tasks were
+    // using the handle the wait set reported waking as completed. On the next
+    // scan they'll be retired immediately. Ideally we'd have the wait set be
+    // able to tell us this precise list.
+    if (iree_wait_handle_is_immediate(wake_handle)) {
+      // No-op wait - ignore.
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "nop");
+    } else if (wake_handle.type == poller->wake_event.type &&
+               memcmp(&wake_handle.value, &poller->wake_event.value,
+                      sizeof(wake_handle.value)) == 0) {
+      // Woken on the wake_event used to exit the system wait early.
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "wake_event");
+    } else {
+      // Route to zero or more tasks using this handle.
+      IREE_TRACE_ZONE_APPEND_TEXT(z0, "task(s)");
+      iree_task_poller_wake_task(poller, wake_handle);
+    }
+  } else if (iree_status_is_deadline_exceeded(status)) {
+    // Indicates nothing was woken within the deadline. We gracefully bail here
+    // and let the scan check for per-task deadline exceeded events or delay
+    // completion.
+    IREE_TRACE_ZONE_APPEND_TEXT(z0, "deadline exceeded");
+  } else {
+    // (Spurious?) error during wait.
+    // TODO(#4026): propagate failure to all scopes involved.
+    // Failures during waits are serious: ignoring them could lead to live-lock
+    // as tasks further in the pipeline expect them to have completed or - even
+    // worse - user code/other processes/drivers/etc may expect them to
+    // complete.
+    IREE_TRACE_ZONE_APPEND_TEXT(z0, "failure");
+    // Intentionally always-failing assert: traps in debug builds so the
+    // unexpected wait failure is noticed; release builds ignore the status.
+    IREE_ASSERT_TRUE(iree_status_is_ok(status));
+    iree_status_ignore(status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Pumps the |poller| until it is requested to exit.
+// Each iteration: merge newly-enqueued waits, scan/retire resolved tasks,
+// then block in the system wait until a handle resolves or the earliest
+// deadline is reached.
+static void iree_task_poller_pump_until_exit(iree_task_poller_t* poller) {
+  while (true) {
+    // Check state to see if we've been asked to exit.
+    if (iree_atomic_load_int32(&poller->state, iree_memory_order_seq_cst) ==
+        IREE_TASK_POLLER_STATE_EXITING) {
+      // Thread exit requested - cancel pumping.
+      break;
+    }
+
+    IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_task_poller_pump");
+
+    // Reset the wake event and merge any incoming tasks to the wait list.
+    // To avoid races we reset and then merge: this allows another thread
+    // coming in and enqueuing tasks to set the event and ensure that we'll
+    // get the tasks as we'll fall through on the wait below and loop again.
+    iree_event_reset(&poller->wake_event);
+    iree_task_list_append_from_fifo_slist(&poller->wait_list,
+                                          &poller->mailbox_slist);
+
+    // Scan all wait tasks to see if any have resolved and if so we'll enqueue
+    // their retirement on the executor and drop them from the list.
+    iree_task_submission_t pending_submission;
+    iree_task_submission_initialize(&pending_submission);
+    iree_time_t earliest_deadline_ns = IREE_TIME_INFINITE_FUTURE;
+    iree_task_poller_prepare_wait(poller, &pending_submission,
+                                  &earliest_deadline_ns);
+    if (!iree_task_submission_is_empty(&pending_submission)) {
+      iree_task_executor_submit(poller->executor, &pending_submission);
+      iree_task_executor_flush(poller->executor);
+    }
+
+    // Enter the system multi-wait API.
+    // We unconditionally do this: if we have nothing to wait on we'll still
+    // wait on the wake_event for new waits to be enqueued - or the first delay
+    // to be reached.
+    iree_task_poller_commit_wait(poller, earliest_deadline_ns);
+
+    IREE_TRACE_ZONE_END(z0);
+  }
+}
+
+// Thread entry point for the poller wait thread.
+// Runs the pump loop until exit is requested, then transitions to ZOMBIE and
+// notifies any thread blocked in iree_task_poller_await_exit.
+static int iree_task_poller_main(iree_task_poller_t* poller) {
+  IREE_TRACE_ZONE_BEGIN(thread_zone);
+
+  // Reset affinity (as it can change over time).
+  // TODO(benvanik): call this after waking in case CPU hotplugging happens.
+  iree_thread_request_affinity(poller->thread, poller->ideal_thread_affinity);
+
+  // Enter the running state immediately. Note that we could have been requested
+  // to exit while suspended/still starting up, so check that here before we
+  // mess with any data structures. The exchange makes the check atomic with
+  // the transition to RUNNING.
+  const bool should_run =
+      iree_atomic_exchange_int32(&poller->state, IREE_TASK_POLLER_STATE_RUNNING,
+                                 iree_memory_order_seq_cst) !=
+      IREE_TASK_POLLER_STATE_EXITING;
+  if (IREE_LIKELY(should_run)) {
+    // << work happens here >>
+    iree_task_poller_pump_until_exit(poller);
+  }
+
+  IREE_TRACE_ZONE_END(thread_zone);
+  iree_atomic_store_int32(&poller->state, IREE_TASK_POLLER_STATE_ZOMBIE,
+                          iree_memory_order_seq_cst);
+  iree_notification_post(&poller->state_notification, IREE_ALL_WAITERS);
+  return 0;
+}
diff --git a/runtime/src/iree/task/poller.h b/runtime/src/iree/task/poller.h
new file mode 100644
index 0000000..8618682
--- /dev/null
+++ b/runtime/src/iree/task/poller.h
@@ -0,0 +1,146 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_POLLER_H_
+#define IREE_TASK_POLLER_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/threading.h"
+#include "iree/base/internal/wait_handle.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/list.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_task_executor_t iree_task_executor_t;
+
+// Indicates the current state of a poller or, in the case of EXITING, the state
+// the poller should transition to.
+// Stored in iree_task_poller_t::state as an atomic int32.
+//
+// Transition graph:
+//   SUSPENDED -> RUNNING -> EXITING -> ZOMBIE
+//
+// NOTE: state values are ordered such that </> comparisons can be used; ensure
+// that for example all states after resuming are > SUSPENDED and all states
+// before exiting are < EXITING.
+typedef enum iree_task_poller_state_e {
+  // Wait thread has been created in a suspended state and must be resumed to
+  // wake for the first time.
+  IREE_TASK_POLLER_STATE_SUSPENDED = 0,
+  // Wait thread is running and servicing wait tasks.
+  IREE_TASK_POLLER_STATE_RUNNING = 1,
+  // Wait thread should exit (or is exiting) and will soon enter the zombie
+  // state.
+  IREE_TASK_POLLER_STATE_EXITING = 2,
+  // Wait thread has exited and entered a 🧟 state (waiting for join).
+  // The thread handle is still valid and must be destroyed.
+  IREE_TASK_POLLER_STATE_ZOMBIE = 3,
+} iree_task_poller_state_t;
+
+// Wait task poller with a dedicated thread for performing syscalls.
+// This keeps potentially-blocking syscalls off the worker threads and ensures
+// the lowest possible latency for wakes as the poller will always be kept in
+// the system wait queue.
+//
+// During coordination wait tasks are registered with the poller for handling.
+// The wait thread will wake, merge the newly-registered tasks into its lists,
+// and then enter the system multi-wait API to wait for either one or more waits
+// to resolve or the timeout to be hit (representing sleeps). Resolved waits
+// will cause the wait task to be resubmitted to the executor with a flag
+// indicating that they have completed waiting and can be retired. This ensures
+// that all task-related work (completion callbacks, etc) executes on the worker
+// threads and the poller can immediately return to the system for more waiting.
+typedef struct {
+  // Parent executor used to access the global work queue and submit wakes.
+  iree_task_executor_t* executor;
+
+  // Current state of the poller (iree_task_poller_state_t).
+  iree_atomic_int32_t state;
+  // Notification signaled when the wait thread changes state.
+  iree_notification_t state_notification;
+
+  // Ideal affinity for the wait thread. This can be used to keep the wait
+  // thread from contending with the processing threads. To allow the wait
+  // thread to run anywhere use iree_thread_affinity_set_any.
+  iree_thread_affinity_t ideal_thread_affinity;
+
+  // Thread handle of the wait thread. If the thread has exited the handle will
+  // remain valid so that the poller can query its state.
+  iree_thread_t* thread;
+
+  // Event used to force the wait thread to wake.
+  // This allows the wait thread to remain in a syscall but still be woken when
+  // new wait tasks arrive and need to be managed by the wait thread.
+  // Set from threads submitting tasks to the poller and reset after the wait
+  // thread has woken and processed them. All system waits have this event
+  // in the wait set.
+  iree_event_t wake_event;
+
+  // A LIFO mailbox used by coordinators to post wait tasks to the poller.
+  // This allows for submissions to add tasks without needing to synchronize
+  // with the wait thread; tasks are pushed to the mailbox and then merged with
+  // the full wait set by the wait thread the next time it wakes.
+  iree_atomic_task_slist_t mailbox_slist;
+
+  // A list of wait tasks with external handles that need to be waited on.
+  // Managed by the wait thread and must not be accessed from any other thread.
+  // This is the full set of waits actively being managed by the poller.
+  iree_task_list_t wait_list;
+
+  // Wait set containing wait handles from wait_list.
+  // Managed by the wait thread and must not be accessed from any other thread.
+  // This may only contain a subset of the wait_list in cases where some of
+  // the wait tasks do not have full system handles.
+  iree_wait_set_t* wait_set;
+} iree_task_poller_t;
+
+// Initializes |out_poller| with a new poller.
+// |executor| will be used to submit woken tasks for processing.
+// On failure the caller should still call iree_task_poller_deinitialize to
+// clean up any partially-initialized state.
+iree_status_t iree_task_poller_initialize(
+    iree_task_executor_t* executor,
+    iree_thread_affinity_t ideal_thread_affinity,
+    iree_task_poller_t* out_poller);
+
+// Requests that the poller wait thread begin exiting (if it hasn't already).
+// If the wait thread is in a syscall it will be woken as soon as possible.
+//
+// May be called from any thread. Any active waits will be aborted as possible.
+void iree_task_poller_request_exit(iree_task_poller_t* poller);
+
+// Blocks the caller until |poller| has exited.
+//
+// May be called from any thread.
+void iree_task_poller_await_exit(iree_task_poller_t* poller);
+
+// Deinitializes |poller| after the thread has exited.
+// The poller must be in the IREE_TASK_POLLER_STATE_ZOMBIE state.
+//
+// Expected shutdown sequence:
+//  - request_exit
+//  - await_exit
+//  - deinitialize
+void iree_task_poller_deinitialize(iree_task_poller_t* poller);
+
+// Enqueues |wait_tasks| on the poller and kicks the wait thread.
+// The task pointers will be retained by the poller and must remain valid.
+// |wait_tasks| is reset to empty on return.
+//
+// May be called from any thread. Waits may begin and complete prior to the
+// function returning.
+void iree_task_poller_enqueue(iree_task_poller_t* poller,
+                              iree_task_list_t* wait_tasks);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_POLLER_H_
diff --git a/runtime/src/iree/task/pool.c b/runtime/src/iree/task/pool.c
new file mode 100644
index 0000000..387bdbb
--- /dev/null
+++ b/runtime/src/iree/task/pool.c
@@ -0,0 +1,291 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/pool.h"
+
+#include <stdint.h>
+
+#include "iree/base/internal/math.h"
+#include "iree/base/tracing.h"
+
+// Minimum byte size of a block in bytes, including the tasks as well as the
+// allocation header. This is here to allow us to reduce the number of times
+// we go to the allocator and amortize the overhead of our block header.
+#define IREE_TASK_POOL_MIN_BLOCK_SIZE (4 * 1024)
+
+// Alignment for block allocations; roughly a (likely) page size.
+// Since many allocators after the small byte range (~thousands of bytes) will
+// round up this just prevents us from being 1 over the allocator block size and
+// wasting space in a larger bucket.
+#define IREE_TASK_POOL_BLOCK_ALIGNMENT (4 * 1024)
+
+// The minimum number of tasks that will be allocated when growth is needed.
+// The total number may be larger once rounded to meet block size and alignment
+// requirements. Note that we leave a bit of room here for the block header
+// such that we don't always allocate a nice round number + N bytes that then
+// bumps us into the next power of two bucket.
+#define IREE_TASK_POOL_MIN_GROWTH_CAPACITY (255)
+
+// Grows the task pool by at least |minimum_capacity| on top of its current
+// capacity. The actual number of tasks available may be rounded up to make the
+// allocated blocks more allocator-friendly sizes.
+//
+// As an optimization for on-demand growth cases an |out_task| can be specified
+// to receive a task without the need for acquiring one from the pool
+// immediately after the growth completes. This avoids a race condition where
+// another thread could snipe the tasks we just allocated for the caller prior
+// to the caller getting a chance to acquire one.
+static iree_status_t iree_task_pool_grow(iree_task_pool_t* pool,
+                                         iree_host_size_t minimum_capacity,
+                                         iree_task_t** out_task) {
+  if (IREE_UNLIKELY(!minimum_capacity)) return iree_ok_status();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate a new block of tasks. To try to prevent the allocator from
+  // fragmenting we try to always allocate blocks that are page-aligned and
+  // powers of two.
+  //
+  // Note that we pad out our header to iree_max_align_t bytes so that all tasks
+  // are aligned on the same boundaries as required by atomic operations.
+  iree_host_size_t header_size =
+      iree_host_align(sizeof(iree_task_allocation_header_t), iree_max_align_t);
+  iree_host_size_t pow2_block_size = iree_math_round_up_to_pow2_u64(
+      header_size + minimum_capacity * pool->task_size);
+  iree_host_size_t aligned_block_size =
+      iree_host_align(pow2_block_size, IREE_TASK_POOL_BLOCK_ALIGNMENT);
+  if (aligned_block_size < IREE_TASK_POOL_MIN_BLOCK_SIZE) {
+    aligned_block_size = IREE_TASK_POOL_MIN_BLOCK_SIZE;
+  }
+  iree_task_allocation_header_t* allocation = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(pool->allocator, aligned_block_size,
+                                (void**)&allocation));
+
+  // Insert the allocation into the tracking list. Nothing reads the list until
+  // the pool is trimmed/deinitialized so it's safe to do now prior to
+  // populating anything. It's all just empty data anyway.
+  iree_atomic_task_allocation_slist_push(&pool->allocations_slist, allocation);
+
+  // Since we may have rounded up the allocation we may have gotten more space
+  // for tasks than we were asked for. Ensure we actually make use of them.
+  iree_host_size_t actual_capacity =
+      (aligned_block_size - header_size) / pool->task_size;
+
+  // Stitch together the tasks by setting all next pointers.
+  // Since we are going to be touching all the pages the order here is important
+  // as once we insert these new tasks into the available_slist they'll be
+  // popped out head->tail. To ensure the head that gets popped first is still
+  // warm in cache we construct the list backwards, with the tail tasks being
+  // fine to be evicted.
+  //
+  // The nice thing about this walk is that it ensures that if there were any
+  // zero-fill-on-demand trickery going on the pages are all wired here vs.
+  // when the tasks are first acquired from the list where it'd be harder to
+  // track.
+  uintptr_t p = ((uintptr_t)allocation + aligned_block_size) - pool->task_size;
+  iree_task_t* head = (iree_task_t*)p;
+  iree_task_t* tail = head;
+  head->next_task = NULL;
+  head->pool = pool;
+  // The tail task was initialized above so the walk starts at the *second*
+  // task: step the pointer down first and then link. Starting the loop on the
+  // tail itself would overwrite tail->next_task with a self-pointer and
+  // corrupt the free list (an infinite cycle at the tail).
+  for (iree_host_size_t i = 1; i < actual_capacity; ++i) {
+    p -= pool->task_size;
+    iree_task_t* task = (iree_task_t*)p;
+    task->next_task = head;
+    task->pool = pool;
+    head = task;
+  }
+
+  // If the caller needs a task we can slice off the head to return prior to
+  // adding it to the slist where it may get stolen.
+  // NOTE(review): for a single-task block this leaves head == NULL and the
+  // concat below becomes a no-op while |tail| is handed out - assumes
+  // iree_atomic_task_slist_concat ignores NULL heads; confirm if block sizes
+  // ever shrink to one task.
+  if (out_task) {
+    *out_task = head;
+    head = head->next_task;
+  }
+
+  // Concatenate the list of new free tasks into the pool.
+  iree_atomic_task_slist_concat(&pool->available_slist, head, tail);
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+iree_status_t iree_task_pool_initialize(iree_allocator_t allocator,
+                                        iree_host_size_t task_size,
+                                        iree_host_size_t initial_capacity,
+                                        iree_task_pool_t* out_pool) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, task_size);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, initial_capacity);
+
+  // Record the allocation parameters used for all future block growth.
+  out_pool->allocator = allocator;
+  out_pool->task_size = task_size;
+
+  // Start with empty tracking lists; the grow below populates them.
+  iree_atomic_task_allocation_slist_initialize(&out_pool->allocations_slist);
+  iree_atomic_task_slist_initialize(&out_pool->available_slist);
+
+  // Optional initial allocation (a no-op when |initial_capacity| is 0).
+  iree_status_t grow_status =
+      iree_task_pool_grow(out_pool, initial_capacity, /*out_task=*/NULL);
+
+  IREE_TRACE_ZONE_END(z0);
+  return grow_status;
+}
+
+void iree_task_pool_deinitialize(iree_task_pool_t* pool) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Flush the allocation tracking list and free every block it references.
+  // The available task list only points into these blocks so it needs no
+  // per-task cleanup of its own.
+  iree_task_allocation_header_t* block = NULL;
+  if (iree_atomic_task_allocation_slist_flush(
+          &pool->allocations_slist,
+          IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &block, NULL)) {
+    while (block != NULL) {
+      iree_task_allocation_header_t* next_block =
+          iree_atomic_task_allocation_slist_get_next(block);
+      iree_allocator_free(pool->allocator, block);
+      block = next_block;
+    }
+  }
+
+  iree_atomic_task_allocation_slist_deinitialize(&pool->allocations_slist);
+  iree_atomic_task_slist_deinitialize(&pool->available_slist);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_pool_trim(iree_task_pool_t* pool) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // WARNING: only valid when no tasks acquired from this pool are still live;
+  // callers must have synchronized with the executor first.
+
+  // Drop all free-task references. These are just pointers into the
+  // allocation blocks freed below and need no individual teardown.
+  iree_task_t* discarded_tasks = NULL;
+  iree_atomic_task_slist_flush(&pool->available_slist,
+                               IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO,
+                               &discarded_tasks, /*tail=*/NULL);
+
+  // Free every allocation block that backed those tasks.
+  iree_task_allocation_header_t* block = NULL;
+  if (iree_atomic_task_allocation_slist_flush(
+          &pool->allocations_slist,
+          IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &block,
+          /*tail=*/NULL)) {
+    while (block != NULL) {
+      iree_task_allocation_header_t* next_block =
+          iree_atomic_task_allocation_slist_get_next(block);
+      iree_allocator_free(pool->allocator, block);
+      block = next_block;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_task_pool_acquire(iree_task_pool_t* pool,
+                                     iree_task_t** out_task) {
+  if (!pool) return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED);
+
+  // Fast path: pop a recycled task off the free list.
+  iree_task_t* free_task = iree_atomic_task_slist_pop(&pool->available_slist);
+  if (free_task != NULL) {
+    *out_task = free_task;
+    return iree_ok_status();
+  }
+
+  // Slow path: the free list was empty at the time we checked, so grow and
+  // hand the caller a task straight from the new block. Another thread may
+  // have released tasks in the meantime, but hitting empty once means we are
+  // at the pool's current limit and growth keeps future acquires on the fast
+  // path.
+  return iree_task_pool_grow(pool, IREE_TASK_POOL_MIN_GROWTH_CAPACITY,
+                             out_task);
+}
+
+iree_status_t iree_task_pool_acquire_many(iree_task_pool_t* pool,
+                                          iree_host_size_t count,
+                                          iree_task_list_t* out_list) {
+  if (!pool) return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED);
+
+  // If we acquire more than the requested count we need to give those leftovers
+  // back to the pool before we leave.
+  iree_task_list_t leftover_tasks;
+  iree_task_list_initialize(&leftover_tasks);
+  iree_task_list_initialize(out_list);
+
+  iree_status_t status = iree_ok_status();
+  while (count) {
+    // Flush the entire available list so we can start operating on it.
+    // This is where the potential race comes in: if another thread goes to
+    // acquire a task while we have the list local here it'll grow the list so
+    // it can meet its demand. That's still correct behavior but will result in
+    // potentially more wasted memory than if the other thread would have
+    // waited. Thankfully we save memory in so many other places that in the
+    // rare case there are multiple concurrent schedulers acquiring tasks it's
+    // not the end of the world.
+    iree_task_list_t acquired_tasks;
+    iree_task_list_initialize(&acquired_tasks);
+    if (iree_atomic_task_slist_flush(
+            &pool->available_slist,
+            IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO,
+            &acquired_tasks.head,
+            /*tail=*/NULL)) {
+      // Had some items in the pool; eat up to the requested count.
+      // Note that we may run out and need to allocate more or have gotten
+      // too many during the flush and need to track those leftovers.
+      //
+      // Instead of having the slist flush walk the list and give us a tail we
+      // do that here: we need to walk the list anyway to partition it.
+      //
+      // The head is itself the first acquired task and must be counted before
+      // walking; failing to do so hands callers count+1 tasks and can prevent
+      // |count| from ever reaching zero (endless growth).
+      iree_task_t* p = acquired_tasks.head;
+      acquired_tasks.tail = p;
+      --count;
+      while (count > 0) {
+        p = iree_atomic_task_slist_get_next(p);
+        if (!p) break;
+        acquired_tasks.tail = p;
+        --count;
+      }
+
+      // If we got everything we need then any flushed tasks we didn't use are
+      // detached into the leftover list to be returned to the pool below.
+      if (count == 0) {
+        iree_task_t* extra_head =
+            iree_atomic_task_slist_get_next(acquired_tasks.tail);
+        iree_atomic_task_slist_set_next(acquired_tasks.tail, NULL);
+        // Only walk for a tail when leftovers actually exist (the flush may
+        // have produced exactly |count| tasks).
+        if (extra_head != NULL) {
+          iree_task_list_t acquire_leftovers;
+          iree_task_list_initialize(&acquire_leftovers);
+          acquire_leftovers.head = extra_head;
+          p = extra_head;
+          iree_task_t* next;
+          while ((next = iree_atomic_task_slist_get_next(p))) p = next;
+          acquire_leftovers.tail = p;
+          iree_task_list_append(&leftover_tasks, &acquire_leftovers);
+        }
+      }
+
+      // Add the tasks we did acquire to our result list.
+      // NOTE: this is unmeasured but the intuition is that we want to put the
+      // tasks we just acquired at the head of the list so that they are warm
+      // upon return to the caller who will then be touching the head of the
+      // list immediately.
+      iree_task_list_prepend(out_list, &acquired_tasks);
+    }
+
+    // If we still need more tasks but ran out of ones in the flush list then we
+    // need to grow some more.
+    if (count > 0) {
+      status = iree_task_pool_grow(pool, count, /*out_task=*/NULL);
+      if (IREE_UNLIKELY(!iree_status_is_ok(status))) break;
+    }
+  }
+
+  // Return leftovers that we acquired but didn't need to the pool.
+  iree_atomic_task_slist_concat(&pool->available_slist, leftover_tasks.head,
+                                leftover_tasks.tail);
+
+  // Upon failure return any tasks we may have already acquired from the pool.
+  if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+    iree_atomic_task_slist_concat(&pool->available_slist, out_list->head,
+                                  out_list->tail);
+  }
+
+  return status;
+}
+
+// Returns |task| to |pool|'s free list for reuse by future acquires.
+void iree_task_pool_release(iree_task_pool_t* pool, iree_task_t* task) {
+  // No-op when there is no pool (mirrors the NULL handling in acquire).
+  if (!pool) return;
+  // Tasks may only be returned to the pool that allocated them; the pool
+  // pointer was stamped on each task when its block was grown.
+  IREE_ASSERT_EQ(task->pool, pool);
+  iree_atomic_task_slist_push(&pool->available_slist, task);
+}
diff --git a/runtime/src/iree/task/pool.h b/runtime/src/iree/task/pool.h
new file mode 100644
index 0000000..de9d5e9
--- /dev/null
+++ b/runtime/src/iree/task/pool.h
@@ -0,0 +1,115 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_POOL_H_
+#define IREE_TASK_POOL_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/task/list.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// An allocation of tasks in a task pool containing multiple tasks.
+// This struct is at the head of all task allocations made from the allocator.
+// It is used to form a linked list of all allocations made so that they can be
+// easily freed during pool teardown.
+typedef struct iree_task_allocation_header_t {
+  // Next allocation in the linked list of allocations.
+  iree_atomic_slist_intrusive_ptr_t* next;
+} iree_task_allocation_header_t;
+
+// An atomic approximately LIFO singly-linked list.
+IREE_TYPED_ATOMIC_SLIST_WRAPPER(iree_atomic_task_allocation,
+                                iree_task_allocation_header_t,
+                                offsetof(iree_task_allocation_header_t, next));
+
+// Shared thread-safe pool of iree_task_t structures of a particular size.
+// This can be used to quickly allocate blocks of tasks to be initialized by
+// task producers, enqueued, and then eventually recycled back to the pool.
+//
+// The lifetime of all tasks must be less than the pool they were acquired
+// from. Tasks acquired from one pool must not be released to another pool or
+// via any other mechanism.
+//
+// Pools can either be fixed-size with a maximum number of available tasks that
+// can be outstanding at any time or growable to allow the pool to be grown
+// unbounded after initialization.
+typedef struct iree_task_pool_t {
+  // Allocator used for allocating/freeing each allocation block.
+  iree_allocator_t allocator;
+
+  // Task size, in bytes.
+  iree_host_size_t task_size;
+
+  // NOTE: we don't track current usage count as that would introduce additional
+  // contention as tasks are acquired/released. If we end up finding a lot of
+  // memory idling here we can add a threshold over which we reclaim it, but the
+  // easiest (and most efficient) solution is to force the user to synchronize
+  // with the executor on a low memory event and use iree_task_pool_trim.
+
+  // Head of a linked list of all allocations made by the pool.
+  iree_atomic_task_allocation_slist_t allocations_slist;
+
+  // Linked list of free tasks used as a stack (LIFO).
+  // This is not a great structure for this as over time the tasks will get out
+  // of order and walking the linked list will incur cache misses. We offset
+  // that cost a bit by knowing that the time between walking the list to
+  // acquire tasks and when we initialize the tasks is short and that we would
+  // have triggered a cache miss anyway. In the future we can explore other
+  // approaches (such as small chunked linear lists) that better exploit spatial
+  // locality, if needed.
+  iree_atomic_task_slist_t available_slist;
+} iree_task_pool_t;
+
+// Initializes a task pool and optionally performs an initial task allocation
+// of |initial_capacity| tasks of |task_size| bytes each.
+iree_status_t iree_task_pool_initialize(iree_allocator_t allocator,
+                                        iree_host_size_t task_size,
+                                        iree_host_size_t initial_capacity,
+                                        iree_task_pool_t* out_pool);
+
+// Deinitializes a task pool and releases all task allocations back to the
+// allocator specified during initialization. All tasks must have already been
+// released back to the pool.
+void iree_task_pool_deinitialize(iree_task_pool_t* pool);
+
+// Attempts to trim unused allocations from the task pool.
+// Must not be called while any tasks that were acquired from this pool are
+// still live; callers must synchronize with the executor and ensure they aren't
+// pushing any more work during the trim operation.
+void iree_task_pool_trim(iree_task_pool_t* pool);
+
+// Acquires a task from the task pool. The returned task will have undefined
+// contents and must be initialized by the caller.
+// Returns IREE_STATUS_RESOURCE_EXHAUSTED when |pool| is NULL.
+iree_status_t iree_task_pool_acquire(iree_task_pool_t* pool,
+                                     iree_task_t** out_task);
+
+// Acquires a set of tasks from the task pool. The returned tasks will have
+// undefined contents besides their intrusive next pointers and must be
+// initialized by the caller.
+//
+// WARNING: this may cause growth during races if multiple threads are trying to
+// acquire at the same time. Our usage patterns here are such that this is never
+// the case, though, as all acquisition from the internal executor pools happens
+// with the coordination lock held.
+iree_status_t iree_task_pool_acquire_many(iree_task_pool_t* pool,
+                                          iree_host_size_t count,
+                                          iree_task_list_t* out_list);
+
+// Releases a task to the task pool.
+// Callers must ensure the task is no longer in use.
+void iree_task_pool_release(iree_task_pool_t* pool, iree_task_t* task);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_POOL_H_
diff --git a/runtime/src/iree/task/pool_test.cc b/runtime/src/iree/task/pool_test.cc
new file mode 100644
index 0000000..107b83b
--- /dev/null
+++ b/runtime/src/iree/task/pool_test.cc
@@ -0,0 +1,92 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/pool.h"
+
+#include <cstdint>
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+// Simple task type with a trailing payload used to exercise pool sizing.
+typedef struct iree_test_task_t {
+  iree_task_t base;
+  uint8_t payload[32];
+} iree_test_task_t;
+
+// Tests that a pool can be initialized with a preallocated capacity and torn
+// down again without any acquire/release traffic in between.
+TEST(PoolTest, Lifetime) {
+  iree_task_pool_t pool;
+  IREE_ASSERT_OK(iree_task_pool_initialize(
+      iree_allocator_system(), sizeof(iree_test_task_t), 32, &pool));
+  iree_task_pool_deinitialize(&pool);
+}
+
+// Tests acquiring beyond the initial capacity (forcing growth), releasing
+// everything, and acquiring again to exercise free-list recycling.
+TEST(PoolTest, AcquireRelease) {
+  // Start with 2 preallocated tasks so we can test both acquiring existing and
+  // growing to allocate new tasks.
+  iree_task_pool_t pool;
+  IREE_ASSERT_OK(iree_task_pool_initialize(iree_allocator_system(),
+                                           sizeof(iree_test_task_t), 2, &pool));
+
+  // Acquire 4 tasks (so we test both the initial size and allocated tasks).
+  iree_test_task_t* tasks[4] = {NULL, NULL, NULL, NULL};
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(tasks); ++i) {
+    IREE_ASSERT_OK(iree_task_pool_acquire(&pool, (iree_task_t**)&tasks[i]));
+    EXPECT_TRUE(tasks[i] != NULL);
+  }
+
+  // Release all tasks back to the pool.
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(tasks); ++i) {
+    iree_task_pool_release(&pool, (iree_task_t*)tasks[i]);
+  }
+
+  // Acquire all tasks again to make sure we put them back in correctly.
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(tasks); ++i) {
+    IREE_ASSERT_OK(iree_task_pool_acquire(&pool, (iree_task_t**)&tasks[i]));
+    EXPECT_TRUE(tasks[i] != NULL);
+  }
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(tasks); ++i) {
+    iree_task_pool_release(&pool, (iree_task_t*)tasks[i]);
+  }
+
+  iree_task_pool_deinitialize(&pool);
+}
+
+// Tests that trimming releases pool memory and that the pool can regrow and
+// serve acquires afterwards.
+TEST(PoolTest, Trim) {
+  // Start with 2 preallocated tasks so we can test both acquiring existing and
+  // growing to allocate new tasks.
+  iree_task_pool_t pool;
+  IREE_ASSERT_OK(iree_task_pool_initialize(iree_allocator_system(),
+                                           sizeof(iree_task_t), 2, &pool));
+
+  // Acquire and release some tasks.
+  // NOTE: the previous initializer listed only 4 NULLs for this 8-element
+  // array (the rest were implicitly zeroed); value-initialize all of them.
+  iree_test_task_t* tasks[8] = {NULL};
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(tasks); ++i) {
+    IREE_ASSERT_OK(iree_task_pool_acquire(&pool, (iree_task_t**)&tasks[i]));
+    EXPECT_TRUE(tasks[i] != NULL);
+  }
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(tasks); ++i) {
+    iree_task_pool_release(&pool, (iree_task_t*)tasks[i]);
+  }
+
+  // Trim to shrink the pool memory.
+  // NOTE: trimming is only supported when there are no outstanding tasks.
+  iree_task_pool_trim(&pool);
+
+  // Acquire again to make sure we can reallocate the pool.
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(tasks); ++i) {
+    IREE_ASSERT_OK(iree_task_pool_acquire(&pool, (iree_task_t**)&tasks[i]));
+    EXPECT_TRUE(tasks[i] != NULL);
+  }
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(tasks); ++i) {
+    iree_task_pool_release(&pool, (iree_task_t*)tasks[i]);
+  }
+
+  iree_task_pool_deinitialize(&pool);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/post_batch.c b/runtime/src/iree/task/post_batch.c
new file mode 100644
index 0000000..bd6c383
--- /dev/null
+++ b/runtime/src/iree/task/post_batch.c
@@ -0,0 +1,192 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/post_batch.h"
+
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/threading.h"
+#include "iree/base/tracing.h"
+#include "iree/task/executor_impl.h"
+#include "iree/task/queue.h"
+#include "iree/task/worker.h"
+
+void iree_task_post_batch_initialize(iree_task_executor_t* executor,
+                                     iree_task_worker_t* current_worker,
+                                     iree_task_post_batch_t* out_post_batch) {
+  out_post_batch->executor = executor;
+  out_post_batch->current_worker = current_worker;
+  out_post_batch->worker_pending_mask = 0;
+  // Zero the trailing per-worker pending list storage; one list per worker
+  // in the executor.
+  memset(out_post_batch->worker_pending_lifos, 0,
+         executor->worker_count *
+             sizeof(out_post_batch->worker_pending_lifos[0]));
+}
+
+iree_host_size_t iree_task_post_batch_worker_count(
+    const iree_task_post_batch_t* post_batch) {
+  // The batch mirrors the executor: one pending list per executor worker.
+  const iree_task_executor_t* executor = post_batch->executor;
+  return executor->worker_count;
+}
+
+static iree_host_size_t iree_task_post_batch_select_random_worker(
+    iree_task_post_batch_t* post_batch, iree_task_affinity_set_t affinity_set) {
+  // Restrict the requested affinity to workers that are actually live.
+  iree_task_affinity_set_t live_mask = iree_atomic_task_affinity_set_load(
+      &post_batch->executor->worker_live_mask, iree_memory_order_acquire);
+  iree_task_affinity_set_t candidate_mask = affinity_set & live_mask;
+
+  // No live worker satisfies the affinity; for now just bail to worker 0.
+  if (!candidate_mask) return 0;
+
+  // TODO(benvanik): rotate through workers here. Instead, if the affinity set
+  // has the current_worker allowed we just use that to avoid needing a
+  // cross-thread hop.
+  return iree_task_affinity_set_count_trailing_zeros(candidate_mask);
+}
+
+iree_host_size_t iree_task_post_batch_select_worker(
+    iree_task_post_batch_t* post_batch, iree_task_affinity_set_t affinity_set) {
+  // Posting from a worker: prefer keeping work local so long as we have not
+  // already queued something for that worker in this batch.
+  iree_task_worker_t* current_worker = post_batch->current_worker;
+  if (current_worker != NULL) {
+    iree_task_affinity_set_t self_bit = current_worker->worker_bit;
+    bool self_allowed = (affinity_set & self_bit) != 0;
+    bool self_pending = (post_batch->worker_pending_mask & self_bit) != 0;
+    if (self_allowed && !self_pending) {
+      return iree_task_affinity_set_count_trailing_zeros(self_bit);
+    }
+  }
+
+  // Prefer workers that are idle: even though they need to wake up they are
+  // guaranteed not to be working on something else and the wake latency should
+  // (hopefully) be less than waiting for a busy worker's queue to drain.
+  // Workers we already posted to in this batch won't stay idle, so exclude
+  // them from consideration.
+  iree_task_affinity_set_t worker_idle_mask =
+      iree_atomic_task_affinity_set_load(
+          &post_batch->executor->worker_idle_mask, iree_memory_order_relaxed);
+  iree_task_affinity_set_t idle_affinity_set =
+      affinity_set & (worker_idle_mask & ~post_batch->worker_pending_mask);
+  if (idle_affinity_set) {
+    return iree_task_post_batch_select_random_worker(post_batch,
+                                                     idle_affinity_set);
+  }
+
+  // Everyone allowed is busy; farm out at random. In the worst case work
+  // stealing will help balance things out on the backend.
+  return iree_task_post_batch_select_random_worker(post_batch, affinity_set);
+}
+
+void iree_task_post_batch_enqueue(iree_task_post_batch_t* post_batch,
+                                  iree_host_size_t worker_index,
+                                  iree_task_t* task) {
+  // Stash the task on the target worker's pending LIFO and flag that worker
+  // as having work so the submit pass knows to post to/wake it.
+  iree_task_list_t* pending_lifo =
+      &post_batch->worker_pending_lifos[worker_index];
+  iree_task_list_push_front(pending_lifo, task);
+  post_batch->worker_pending_mask |=
+      iree_task_affinity_for_worker(worker_index);
+}
+
+// Wakes each worker indicated in the |wake_mask|, if needed.
+static void iree_task_post_batch_wake_workers(
+    iree_task_post_batch_t* post_batch, iree_task_affinity_set_t wake_mask) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, iree_math_count_ones_u64(wake_mask));
+
+  iree_task_executor_t* executor = post_batch->executor;
+
+  // Wake workers that may be suspended. We fetch the set of workers we need to
+  // wake (hopefully none in the common case) and mark that we've woken them so
+  // that we don't double-resume.
+  iree_task_affinity_set_t resume_mask =
+      iree_atomic_task_affinity_set_fetch_and(&executor->worker_suspend_mask,
+                                              ~wake_mask,
+                                              iree_memory_order_acquire);
+  resume_mask &= wake_mask;
+  if (IREE_UNLIKELY(resume_mask)) {
+    int resume_count = iree_task_affinity_set_count_ones(resume_mask);
+    int worker_index = 0;
+    for (int i = 0; i < resume_count; ++i) {
+      // The offset is the bit position itself - no +1 - matching the wake loop
+      // below; adding 1 here would resume the wrong worker and skip set bits.
+      int offset = iree_task_affinity_set_count_trailing_zeros(resume_mask);
+      int resume_index = worker_index + offset;
+      worker_index += offset + 1;
+      resume_mask = iree_shr(resume_mask, offset + 1);
+      iree_thread_resume(executor->workers[resume_index].thread);
+    }
+  }
+
+  // TODO(#4016): use a FUTEX_WAKE_BITSET here to wake all of the workers that
+  // have pending work in a single syscall (vs. popcnt(worker_pending_mask)
+  // syscalls). This will reduce wake latency for workers later in the set;
+  // for example today worker[31] will wait until workers[0-30] have had their
+  // syscalls performed before it's even requested to wake. This also loses
+  // information the kernel could use to avoid core migration as it knows when N
+  // threads will be needed simultaneously and can hopefully perform any needed
+  // migrations prior to beginning execution.
+  int wake_count = iree_task_affinity_set_count_ones(wake_mask);
+  int worker_index = 0;
+  for (int i = 0; i < wake_count; ++i) {
+    int offset = iree_task_affinity_set_count_trailing_zeros(wake_mask);
+    int wake_index = worker_index + offset;
+    worker_index += offset + 1;
+    wake_mask = iree_shr(wake_mask, offset + 1);
+
+    // Wake workers if they are waiting - workers are the only thing that can
+    // wait on this notification so this should almost always be either free (an
+    // atomic load) if a particular worker isn't waiting or it's required to
+    // actually wake it and we can't avoid it.
+    iree_task_worker_t* worker = &executor->workers[wake_index];
+    iree_notification_post(&worker->wake_notification, 1);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Flushes every per-worker pending list gathered in |post_batch| out to the
+// target workers and wakes any that received cross-thread work.
+// Returns true if any tasks were posted.
+bool iree_task_post_batch_submit(iree_task_post_batch_t* post_batch) {
+  // Nothing was enqueued in this batch; skip tracing/wake overhead entirely.
+  if (!post_batch->worker_pending_mask) return false;
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Run through each worker that has a bit set in the pending mask and post
+  // the pending tasks.
+  iree_task_affinity_set_t worker_mask = post_batch->worker_pending_mask;
+  post_batch->worker_pending_mask = 0;
+  int worker_index = 0;
+  int post_count = iree_task_affinity_set_count_ones(worker_mask);
+  iree_task_affinity_set_t worker_wake_mask = 0;
+  for (int i = 0; i < post_count; ++i) {
+    // Walk set bits low-to-high: find the next pending worker, then shift the
+    // consumed bit (and the gap below it) out of the mask.
+    int offset = iree_task_affinity_set_count_trailing_zeros(worker_mask);
+    int target_index = worker_index + offset;
+    worker_index += offset + 1;
+    worker_mask = iree_shr(worker_mask, offset + 1);
+
+    iree_task_worker_t* worker = &post_batch->executor->workers[target_index];
+    iree_task_list_t* target_pending_lifo =
+        &post_batch->worker_pending_lifos[target_index];
+    if (worker == post_batch->current_worker) {
+      // Fast-path for posting to self; this happens when a worker plays the
+      // role of coordinator and we want to ensure we aren't doing a fully
+      // block-and-flush loop when we could just be popping the next new task
+      // off the list.
+      iree_task_queue_append_from_lifo_list_unsafe(&worker->local_task_queue,
+                                                   target_pending_lifo);
+    } else {
+      // Cross-thread post: hand the list to the worker's mailbox and remember
+      // to wake it below.
+      iree_task_worker_post_tasks(worker, target_pending_lifo);
+      worker_wake_mask |= iree_task_affinity_for_worker(target_index);
+    }
+  }
+
+  // Wake all workers that now have pending work. If a worker is not already
+  // waiting this will be cheap (no syscall). Self-posts need no wake.
+  if (worker_wake_mask != 0) {
+    iree_task_post_batch_wake_workers(post_batch, worker_wake_mask);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return post_count != 0;
+}
diff --git a/runtime/src/iree/task/post_batch.h b/runtime/src/iree/task/post_batch.h
new file mode 100644
index 0000000..470b7a9
--- /dev/null
+++ b/runtime/src/iree/task/post_batch.h
@@ -0,0 +1,73 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_POST_BATCH_H_
+#define IREE_TASK_POST_BATCH_H_
+
+#include <stdbool.h>
+
+#include "iree/base/config.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor.h"
+#include "iree/task/list.h"
+#include "iree/task/task.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_task_worker_t iree_task_worker_t;
+
+// Transient/stack-allocated structure for batching up tasks for posting to
+// worker mailboxes in single operations. This avoids the need to repeatedly
+// thrash caches during coordination as only during submission are the worker
+// mailboxes touched and only once per worker.
+typedef struct iree_task_post_batch_t {
+  iree_task_executor_t* executor;
+
+  // Local worker constructing the post batch.
+  // This is used to know when lighter-weight queuing can occur (no need to
+  // post across a mailbox channel to yourself!).
+  // May be NULL if not being posted from a worker (such as a submission).
+  iree_task_worker_t* current_worker;
+
+  // A bitmask of workers indicating which have pending tasks in their lists.
+  // Used to quickly scan the lists and perform the posts only when required.
+  iree_task_affinity_set_t worker_pending_mask;
+
+  // A per-worker LIFO task list waiting to be posted. Zero-length trailing
+  // array: storage (one entry per executor worker, indexed by worker index)
+  // is expected to be allocated immediately following the struct.
+  iree_task_list_t worker_pending_lifos[0];
+} iree_task_post_batch_t;
+
+// Initializes |out_post_batch| for posting tasks to the workers of |executor|.
+// |current_worker| may be NULL when not posting from a worker thread.
+void iree_task_post_batch_initialize(iree_task_executor_t* executor,
+                                     iree_task_worker_t* current_worker,
+                                     iree_task_post_batch_t* out_post_batch);
+
+// Returns the total number of workers that the post batch is targeting.
+iree_host_size_t iree_task_post_batch_worker_count(
+    const iree_task_post_batch_t* post_batch);
+
+// Selects a random worker from the given affinity set.
+iree_host_size_t iree_task_post_batch_select_worker(
+    iree_task_post_batch_t* post_batch, iree_task_affinity_set_t affinity_set);
+
+// Enqueues a task to the given worker. Note that the pending work list for
+// each worker is kept in LIFO order so that we can easily concatenate it with
+// the worker mailbox slist that's in LIFO order.
+void iree_task_post_batch_enqueue(iree_task_post_batch_t* post_batch,
+                                  iree_host_size_t worker_index,
+                                  iree_task_t* task);
+
+// Submits all pending tasks to their worker mailboxes and resets state.
+// Returns true if any tasks were posted to workers.
+bool iree_task_post_batch_submit(iree_task_post_batch_t* post_batch);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_POST_BATCH_H_
diff --git a/runtime/src/iree/task/queue.c b/runtime/src/iree/task/queue.c
new file mode 100644
index 0000000..823947b
--- /dev/null
+++ b/runtime/src/iree/task/queue.c
@@ -0,0 +1,90 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/queue.h"
+
+#include <stddef.h>
+#include <string.h>
+
+void iree_task_queue_initialize(iree_task_queue_t* out_queue) {
+  memset(out_queue, 0, sizeof(*out_queue));
+  iree_slim_mutex_initialize(&out_queue->mutex);
+  iree_task_list_initialize(&out_queue->list);
+}
+
+void iree_task_queue_deinitialize(iree_task_queue_t* queue) {
+  // Discard (rather than leak) any tasks still in the queue before tearing
+  // down the mutex.
+  iree_task_list_discard(&queue->list);
+  iree_slim_mutex_deinitialize(&queue->mutex);
+}
+
+bool iree_task_queue_is_empty(iree_task_queue_t* queue) {
+  // NOTE: the result may be stale by the time the caller observes it; the
+  // lock only guarantees a consistent read of the list itself.
+  iree_slim_mutex_lock(&queue->mutex);
+  bool is_empty = iree_task_list_is_empty(&queue->list);
+  iree_slim_mutex_unlock(&queue->mutex);
+  return is_empty;
+}
+
+void iree_task_queue_push_front(iree_task_queue_t* queue, iree_task_t* task) {
+  iree_slim_mutex_lock(&queue->mutex);
+  iree_task_list_push_front(&queue->list, task);
+  iree_slim_mutex_unlock(&queue->mutex);
+}
+
+void iree_task_queue_append_from_lifo_list_unsafe(iree_task_queue_t* queue,
+                                                  iree_task_list_t* list) {
+  // NOTE: reversing the list (LIFO->FIFO) outside of the lock to keep the
+  // critical section as short as possible.
+  iree_task_list_reverse(list);
+  iree_slim_mutex_lock(&queue->mutex);
+  iree_task_list_append(&queue->list, list);
+  iree_slim_mutex_unlock(&queue->mutex);
+}
+
+iree_task_t* iree_task_queue_flush_from_lifo_slist(
+    iree_task_queue_t* queue, iree_atomic_task_slist_t* source_slist) {
+  // Perform the flush and swap outside of the lock; acquiring the list is
+  // atomic and then we own it exclusively.
+  iree_task_list_t suffix;
+  iree_task_list_initialize(&suffix);
+  const bool did_flush = iree_atomic_task_slist_flush(
+      source_slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO,
+      &suffix.head, &suffix.tail);
+
+  // Append the tasks and pop off the front for return. Note that the
+  // returned task may be a pre-existing one if the queue was non-empty.
+  iree_slim_mutex_lock(&queue->mutex);
+  if (did_flush) iree_task_list_append(&queue->list, &suffix);
+  iree_task_t* next_task = iree_task_list_pop_front(&queue->list);
+  iree_slim_mutex_unlock(&queue->mutex);
+
+  return next_task;
+}
+
+iree_task_t* iree_task_queue_pop_front(iree_task_queue_t* queue) {
+  iree_slim_mutex_lock(&queue->mutex);
+  iree_task_t* next_task = iree_task_list_pop_front(&queue->list);
+  iree_slim_mutex_unlock(&queue->mutex);
+  return next_task;
+}
+
+iree_task_t* iree_task_queue_try_steal(iree_task_queue_t* source_queue,
+                                       iree_task_queue_t* target_queue,
+                                       iree_host_size_t max_tasks) {
+  // First attempt to steal up to max_tasks from the source queue.
+  iree_task_list_t stolen_tasks;
+  iree_task_list_initialize(&stolen_tasks);
+  iree_slim_mutex_lock(&source_queue->mutex);
+  iree_task_list_split(&source_queue->list, max_tasks, &stolen_tasks);
+  iree_slim_mutex_unlock(&source_queue->mutex);
+
+  // Add any stolen tasks to the target queue and pop off the head for return.
+  // The popped task may be a pre-existing target-queue task if the target was
+  // not empty. NOTE: the two mutexes are never held at the same time so a
+  // concurrent steal in the other direction cannot deadlock.
+  iree_task_t* next_task = NULL;
+  if (!iree_task_list_is_empty(&stolen_tasks)) {
+    iree_slim_mutex_lock(&target_queue->mutex);
+    iree_task_list_append(&target_queue->list, &stolen_tasks);
+    next_task = iree_task_list_pop_front(&target_queue->list);
+    iree_slim_mutex_unlock(&target_queue->mutex);
+  }
+  return next_task;
+}
diff --git a/runtime/src/iree/task/queue.h b/runtime/src/iree/task/queue.h
new file mode 100644
index 0000000..917b872
--- /dev/null
+++ b/runtime/src/iree/task/queue.h
@@ -0,0 +1,166 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_QUEUE_H_
+#define IREE_TASK_QUEUE_H_
+
+#include <stdbool.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/task/list.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// A simple work-stealing LIFO queue modeled on a Chase-Lev concurrent deque.
+// This is used by workers to maintain their thread-local working lists. The
+// workers keep the tasks they will process in FIFO order. They allow it to
+// empty and then refresh it with more tasks from the incoming worker mailbox.
+// The performance bias here is to the workers as they are >90% of the
+// accesses and the only other accesses are thieves that hopefully we can just
+// improve our distribution to vs. introducing a slowdown here.
+//
+// A futex is used to synchronize access; because the common case is that of
+// only the worker that owns the queue touching it for pushing and popping items
+// this puts us into the sweet-spot of uncontended lightweight exclusive locks.
+// Since futices are effectively just single machine words managed with atomic
+// ops we can avoid a lot of the traditional atomic tomfoolery one finds in
+// systems like these that originated prior to the introduction of futices while
+// also keeping the tiny overhead of the pure atomic solutions.
+//
+// We can also take advantage of the futex providing an actual exclusive region
+// such that our data structure can be whatever we want as opposed to needing to
+// be something that someone had figured out how to make atomic. For example,
+// common implementations of work-stealing queues are all bounded as unbounded
+// atomic deques are an unsolved problem in CS.
+//
+// Very rarely when another worker runs out of work it'll try to steal tasks
+// from nearby workers and use this queue type to do it: the assumption is that
+// it's better to take the last task the victim worker will get to so that in a
+// long list of tasks it remains chugging through the head of the list with good
+// cache locality. If we end up with a lot of theft, though, it's possible for
+// the cache benefits of the pop_back approach to the worker to outweigh the
+// cache pessimism for all thieves. Let's hope we can schedule deterministic-
+// enough tiles such that theft is rare!
+//
+// Our queue variant here is tuned for the use case we have: we exclusively
+// push in multiple tasks at a time (flushed from the mailbox) and exclusively
+// pop a single task a time (what to work on next). The stealing part is batched
+// so that when a remote worker has to perform a theft it takes a good chunk of
+// tasks in one go (hopefully roughly half) to reduce the total overhead when
+// there is high imbalance in workloads.
+//
+// Flushing from the mailbox slist (LIFO) to our list (FIFO) requires a full
+// walk of the incoming task linked list. This is generally fine as the number
+// of tasks in any given flush is low(ish) and by walking in reverse order to
+// then process forward the cache should be hot as the worker starts making its
+// way back through the tasks. As we walk forward we'll be using the task fields
+// for execution and retiring of tasks (notifying dependencies/etc) and the
+// intrusive next pointer sitting next to those should be in-cache when we need
+// to access it. This, combined with slab allocation of tasks in command buffers
+// to begin with gives us the (probabilistically) same characteristics of a flat
+// array walked with an index as is common in other work queues but with the
+// flexibility to reorder tasks as we see fit (theft, redistribution/rotation,
+// reprioritization, etc).
+//
+// Similar concepts, though implemented with atomics:
+//   "Dynamic Circular Work-Stealing Deque":
+//   http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.170.1097&rep=rep1&type=pdf
+//   "Correct and Efficient Work-Stealing for Weak Memory Models":
+//   https://fzn.fr/readings/ppopp13.pdf
+//   Motivating article:
+//   https://blog.molecular-matters.com/2015/08/24/job-system-2-0-lock-free-work-stealing-part-1-basics/
+//
+// Useful diagram from https://github.com/injinj/WSQ
+// Much of this implementation is inspired from that; though significant
+// reworking was required for our FIFO->LIFO->FIFO sandwich.
+//  +--------+ <- tasks[0]
+//  |  top   | <- stealers consume here: task = tasks[top++]
+//  |        |
+//  |   ||   |
+//  |        |
+//  |   vv   |
+//  | bottom | <- owner pushes here:    tasks[bottom++] = task
+//  |        |    owner consumes here:  task = tasks[--bottom]
+//  |        |
+//  +--------+ <- tasks[IREE_TASK_QUEUE_CAPACITY-1]
+//
+// Unlike that implementation, though, our task list is unbounded because we use
+// a linked list. To keep our options open, though, I've left the API of this
+// implementation compatible with classic atomic work-stealing queues. I'm
+// hopeful this will not need to be revisited for a while, though!
+//
+// Future improvement idea: have the owner of the queue maintain a theft point
+// skip list that makes it possible for thieves to quickly come in and slice
+// off batches of tasks at the tail of the queue. Since we are a singly-linked
+// list we can't easily just walk backward and we don't want to be introducing
+// cache line contention as thieves start touching the same tasks as the worker
+// is while processing.
+typedef struct iree_task_queue_t {
+  // Must be held when manipulating the queue. >90% accesses are by the owner.
+  iree_slim_mutex_t mutex;
+
+  // FIFO task list; the owning worker consumes from the front while thieves
+  // steal batches from the back.
+  iree_task_list_t list IREE_GUARDED_BY(mutex);
+} iree_task_queue_t;
+
+// Initializes a work-stealing task queue in-place.
+void iree_task_queue_initialize(iree_task_queue_t* out_queue);
+
+// Deinitializes a task queue and clears all references.
+// Must not be called while any other worker may be attempting to steal tasks.
+void iree_task_queue_deinitialize(iree_task_queue_t* queue);
+
+// Returns true if the queue is empty.
+// Note that due to races this may return both false-positives and -negatives.
+bool iree_task_queue_is_empty(iree_task_queue_t* queue);
+
+// Pushes a task to the front of the queue.
+// Always prefer the multi-push variants (prepend/append) when adding more than
+// one task to the queue. This is mostly useful for exceptional cases such as
+// when a task may yield and need to be reprocessed after the worker resumes.
+//
+// Must only be called from the owning worker's thread.
+void iree_task_queue_push_front(iree_task_queue_t* queue, iree_task_t* task);
+
+// Appends a LIFO |list| of tasks to the queue.
+//
+// Must only be called from the owning worker's thread.
+void iree_task_queue_append_from_lifo_list_unsafe(iree_task_queue_t* queue,
+                                                  iree_task_list_t* list);
+
+// Flushes the |source_slist| LIFO mailbox into the task queue in FIFO order.
+// Returns the first task in the queue upon success; the task may be
+// pre-existing or from the newly flushed tasks.
+//
+// Must only be called from the owning worker's thread.
+iree_task_t* iree_task_queue_flush_from_lifo_slist(
+    iree_task_queue_t* queue, iree_atomic_task_slist_t* source_slist);
+
+// Pops a task from the front of the queue if any are available.
+//
+// Must only be called from the owning worker's thread.
+iree_task_t* iree_task_queue_pop_front(iree_task_queue_t* queue);
+
+// Tries to steal up to |max_tasks| from the back of the queue.
+// Returns NULL if no tasks are available and otherwise up to |max_tasks| tasks
+// that were at the tail of the |source_queue| will be moved to the
+// |target_queue| and the first of the stolen tasks is returned.
+//
+// It's expected this is not called from the queue's owning worker, though it's
+// valid to do so.
+iree_task_t* iree_task_queue_try_steal(iree_task_queue_t* source_queue,
+                                       iree_task_queue_t* target_queue,
+                                       iree_host_size_t max_tasks);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_QUEUE_H_
diff --git a/runtime/src/iree/task/queue_test.cc b/runtime/src/iree/task/queue_test.cc
new file mode 100644
index 0000000..53342fd
--- /dev/null
+++ b/runtime/src/iree/task/queue_test.cc
@@ -0,0 +1,322 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/queue.h"
+
+#include "iree/testing/gtest.h"
+
+namespace {
+
+// Initialize/deinitialize with no tasks must not crash or leak.
+TEST(QueueTest, Lifetime) {
+  iree_task_queue_t queue;
+  iree_task_queue_initialize(&queue);
+  iree_task_queue_deinitialize(&queue);
+}
+
+// An empty queue reports empty and pop returns NULL.
+TEST(QueueTest, Empty) {
+  iree_task_queue_t queue;
+  iree_task_queue_initialize(&queue);
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  EXPECT_FALSE(iree_task_queue_pop_front(&queue));
+  iree_task_queue_deinitialize(&queue);
+}
+
+// push_front/pop_front behave as a stack: the last-pushed task pops first.
+TEST(QueueTest, PushPop) {
+  iree_task_queue_t queue;
+  iree_task_queue_initialize(&queue);
+
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  EXPECT_FALSE(iree_task_queue_pop_front(&queue));
+
+  iree_task_t task_a = {0};
+  iree_task_queue_push_front(&queue, &task_a);
+
+  EXPECT_FALSE(iree_task_queue_is_empty(&queue));
+
+  iree_task_t task_b = {0};
+  iree_task_queue_push_front(&queue, &task_b);
+
+  EXPECT_FALSE(iree_task_queue_is_empty(&queue));
+  EXPECT_EQ(&task_b, iree_task_queue_pop_front(&queue));
+
+  EXPECT_FALSE(iree_task_queue_is_empty(&queue));
+  EXPECT_EQ(&task_a, iree_task_queue_pop_front(&queue));
+
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  EXPECT_FALSE(iree_task_queue_pop_front(&queue));
+
+  iree_task_queue_deinitialize(&queue);
+}
+
+// Appending an empty LIFO list is a no-op: queue and list both stay empty.
+TEST(QueueTest, AppendListEmpty) {
+  iree_task_queue_t queue;
+  iree_task_queue_initialize(&queue);
+
+  iree_task_list_t list = {0};
+
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  iree_task_queue_append_from_lifo_list_unsafe(&queue, &list);
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  EXPECT_TRUE(iree_task_list_is_empty(&list));
+
+  iree_task_queue_deinitialize(&queue);
+}
+
+// Appending a single-task LIFO list moves the task into the queue and empties
+// the source list.
+TEST(QueueTest, AppendList1) {
+  iree_task_queue_t queue;
+  iree_task_queue_initialize(&queue);
+
+  iree_task_list_t list = {0};
+  iree_task_t task_a = {0};
+  iree_task_list_push_front(&list, &task_a);
+
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  iree_task_queue_append_from_lifo_list_unsafe(&queue, &list);
+  EXPECT_FALSE(iree_task_queue_is_empty(&queue));
+  EXPECT_TRUE(iree_task_list_is_empty(&list));
+
+  EXPECT_EQ(&task_a, iree_task_queue_pop_front(&queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+
+  iree_task_queue_deinitialize(&queue);
+}
+
+// Appending a multi-task LIFO list reverses it into FIFO queue order.
+TEST(QueueTest, AppendListOrdered) {
+  iree_task_queue_t queue;
+  iree_task_queue_initialize(&queue);
+
+  // Make a lifo list: b<-a.
+  iree_task_list_t list = {0};
+  iree_task_t task_a = {0};
+  iree_task_list_push_front(&list, &task_a);
+  iree_task_t task_b = {0};
+  iree_task_list_push_front(&list, &task_b);
+
+  // Append the list to the queue; it should swap LIFO->FIFO.
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  iree_task_queue_append_from_lifo_list_unsafe(&queue, &list);
+  EXPECT_FALSE(iree_task_queue_is_empty(&queue));
+  EXPECT_TRUE(iree_task_list_is_empty(&list));
+
+  // Pop list and ensure order: a->b.
+  EXPECT_EQ(&task_a, iree_task_queue_pop_front(&queue));
+  EXPECT_EQ(&task_b, iree_task_queue_pop_front(&queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+
+  iree_task_queue_deinitialize(&queue);
+}
+
+// Flushing an empty slist returns NULL and leaves the queue empty.
+TEST(QueueTest, FlushSlistEmpty) {
+  iree_task_queue_t queue;
+  iree_task_queue_initialize(&queue);
+
+  iree_atomic_task_slist_t slist;
+  iree_atomic_task_slist_initialize(&slist);
+
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  EXPECT_FALSE(iree_task_queue_flush_from_lifo_slist(&queue, &slist));
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+
+  iree_atomic_task_slist_deinitialize(&slist);
+
+  iree_task_queue_deinitialize(&queue);
+}
+
+// Flushing a single-task slist returns that task directly, leaving the queue
+// drained.
+TEST(QueueTest, FlushSlist1) {
+  iree_task_queue_t queue;
+  iree_task_queue_initialize(&queue);
+
+  iree_atomic_task_slist_t slist;
+  iree_atomic_task_slist_initialize(&slist);
+  iree_task_t task_a = {0};
+  iree_atomic_task_slist_push(&slist, &task_a);
+
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  EXPECT_EQ(&task_a, iree_task_queue_flush_from_lifo_slist(&queue, &slist));
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+
+  iree_atomic_task_slist_deinitialize(&slist);
+
+  iree_task_queue_deinitialize(&queue);
+}
+
+// Flushing reverses the LIFO slist into FIFO order and returns the first
+// task, leaving the rest queued in order.
+TEST(QueueTest, FlushSlistOrdered) {
+  iree_task_queue_t queue;
+  iree_task_queue_initialize(&queue);
+
+  // Make a lifo list: c<-b<-a.
+  iree_atomic_task_slist_t slist;
+  iree_atomic_task_slist_initialize(&slist);
+  iree_task_t task_a = {0};
+  iree_atomic_task_slist_push(&slist, &task_a);
+  iree_task_t task_b = {0};
+  iree_atomic_task_slist_push(&slist, &task_b);
+  iree_task_t task_c = {0};
+  iree_atomic_task_slist_push(&slist, &task_c);
+
+  // Flush the list to the queue; it should swap LIFO->FIFO and return the
+  // first task in the queue.
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+  EXPECT_EQ(&task_a, iree_task_queue_flush_from_lifo_slist(&queue, &slist));
+  EXPECT_FALSE(iree_task_queue_is_empty(&queue));
+
+  // Pop list and ensure order: [a->]b->c.
+  EXPECT_EQ(&task_b, iree_task_queue_pop_front(&queue));
+  EXPECT_EQ(&task_c, iree_task_queue_pop_front(&queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&queue));
+
+  iree_atomic_task_slist_deinitialize(&slist);
+
+  iree_task_queue_deinitialize(&queue);
+}
+
+// NOTE(review): despite the name the source is non-empty here; "Empty"
+// presumably refers to the empty *target* queue — confirm intent. The stolen
+// tail task (a) is returned directly rather than left in the target.
+TEST(QueueTest, TryStealEmpty) {
+  iree_task_queue_t source_queue;
+  iree_task_queue_initialize(&source_queue);
+  iree_task_queue_t target_queue;
+  iree_task_queue_initialize(&target_queue);
+
+  iree_task_t task_a = {0};
+  iree_task_queue_push_front(&source_queue, &task_a);
+  iree_task_t task_b = {0};
+  iree_task_queue_push_front(&source_queue, &task_b);
+  iree_task_t task_c = {0};
+  iree_task_queue_push_front(&source_queue, &task_c);
+
+  EXPECT_EQ(&task_a,
+            iree_task_queue_try_steal(&source_queue, &target_queue, 1));
+
+  iree_task_queue_deinitialize(&source_queue);
+  iree_task_queue_deinitialize(&target_queue);
+}
+
+// Stealing the only task drains the source; the task is returned, not queued.
+TEST(QueueTest, TryStealLast) {
+  iree_task_queue_t source_queue;
+  iree_task_queue_initialize(&source_queue);
+  iree_task_queue_t target_queue;
+  iree_task_queue_initialize(&target_queue);
+
+  iree_task_t task_a = {0};
+  iree_task_queue_push_front(&source_queue, &task_a);
+
+  EXPECT_EQ(&task_a,
+            iree_task_queue_try_steal(&source_queue, &target_queue, 100));
+  EXPECT_TRUE(iree_task_queue_is_empty(&target_queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&source_queue));
+
+  iree_task_queue_deinitialize(&source_queue);
+  iree_task_queue_deinitialize(&target_queue);
+}
+
+// Stealing 1 of 3 takes the tail task (c); a and b remain in the source.
+TEST(QueueTest, TrySteal1) {
+  iree_task_queue_t source_queue;
+  iree_task_queue_initialize(&source_queue);
+  iree_task_queue_t target_queue;
+  iree_task_queue_initialize(&target_queue);
+
+  iree_task_t task_a = {0};
+  iree_task_t task_b = {0};
+  iree_task_t task_c = {0};
+  iree_task_queue_push_front(&source_queue, &task_c);
+  iree_task_queue_push_front(&source_queue, &task_b);
+  iree_task_queue_push_front(&source_queue, &task_a);
+
+  EXPECT_EQ(&task_c,
+            iree_task_queue_try_steal(&source_queue, &target_queue, 1));
+  EXPECT_TRUE(iree_task_queue_is_empty(&target_queue));
+
+  EXPECT_EQ(&task_a, iree_task_queue_pop_front(&source_queue));
+  EXPECT_EQ(&task_b, iree_task_queue_pop_front(&source_queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&source_queue));
+
+  iree_task_queue_deinitialize(&source_queue);
+  iree_task_queue_deinitialize(&target_queue);
+}
+
+// When the target already has tasks the pre-existing front task is returned
+// and the stolen task queues up behind it.
+TEST(QueueTest, TryStealIntoExisting) {
+  iree_task_queue_t source_queue;
+  iree_task_queue_initialize(&source_queue);
+  iree_task_queue_t target_queue;
+  iree_task_queue_initialize(&target_queue);
+
+  iree_task_t task_a = {0};
+  iree_task_t task_b = {0};
+  iree_task_queue_push_front(&source_queue, &task_b);
+  iree_task_queue_push_front(&source_queue, &task_a);
+
+  iree_task_t task_existing = {0};
+  iree_task_queue_push_front(&target_queue, &task_existing);
+
+  EXPECT_EQ(&task_existing,
+            iree_task_queue_try_steal(&source_queue, &target_queue, 1));
+
+  EXPECT_EQ(&task_a, iree_task_queue_pop_front(&source_queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&source_queue));
+
+  EXPECT_EQ(&task_b, iree_task_queue_pop_front(&target_queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&target_queue));
+
+  iree_task_queue_deinitialize(&source_queue);
+  iree_task_queue_deinitialize(&target_queue);
+}
+
+// Stealing 2 of 4 takes the two tail tasks (c, d); c is returned and d stays
+// queued in the target.
+TEST(QueueTest, TryStealMany) {
+  iree_task_queue_t source_queue;
+  iree_task_queue_initialize(&source_queue);
+  iree_task_queue_t target_queue;
+  iree_task_queue_initialize(&target_queue);
+
+  iree_task_t task_a = {0};
+  iree_task_t task_b = {0};
+  iree_task_t task_c = {0};
+  iree_task_t task_d = {0};
+  iree_task_queue_push_front(&source_queue, &task_d);
+  iree_task_queue_push_front(&source_queue, &task_c);
+  iree_task_queue_push_front(&source_queue, &task_b);
+  iree_task_queue_push_front(&source_queue, &task_a);
+
+  EXPECT_EQ(&task_c,
+            iree_task_queue_try_steal(&source_queue, &target_queue, 2));
+  EXPECT_EQ(&task_d, iree_task_queue_pop_front(&target_queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&target_queue));
+
+  EXPECT_EQ(&task_a, iree_task_queue_pop_front(&source_queue));
+  EXPECT_EQ(&task_b, iree_task_queue_pop_front(&source_queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&source_queue));
+
+  iree_task_queue_deinitialize(&source_queue);
+  iree_task_queue_deinitialize(&target_queue);
+}
+
+// Even with an oversized max_tasks only the tail half of the source is stolen
+// (a and b remain) — consistent with the roughly-half stealing policy
+// described in queue.h.
+TEST(QueueTest, TryStealAll) {
+  iree_task_queue_t source_queue;
+  iree_task_queue_initialize(&source_queue);
+  iree_task_queue_t target_queue;
+  iree_task_queue_initialize(&target_queue);
+
+  iree_task_t task_a = {0};
+  iree_task_t task_b = {0};
+  iree_task_t task_c = {0};
+  iree_task_t task_d = {0};
+  iree_task_queue_push_front(&source_queue, &task_d);
+  iree_task_queue_push_front(&source_queue, &task_c);
+  iree_task_queue_push_front(&source_queue, &task_b);
+  iree_task_queue_push_front(&source_queue, &task_a);
+
+  EXPECT_EQ(&task_c,
+            iree_task_queue_try_steal(&source_queue, &target_queue, 1000));
+  EXPECT_EQ(&task_d, iree_task_queue_pop_front(&target_queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&target_queue));
+
+  EXPECT_EQ(&task_a, iree_task_queue_pop_front(&source_queue));
+  EXPECT_EQ(&task_b, iree_task_queue_pop_front(&source_queue));
+  EXPECT_TRUE(iree_task_queue_is_empty(&source_queue));
+
+  iree_task_queue_deinitialize(&source_queue);
+  iree_task_queue_deinitialize(&target_queue);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/scope.c b/runtime/src/iree/task/scope.c
new file mode 100644
index 0000000..ff0f34b
--- /dev/null
+++ b/runtime/src/iree/task/scope.c
@@ -0,0 +1,163 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/scope.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+
+void iree_task_scope_initialize(iree_string_view_t name,
+                                iree_task_scope_t* out_scope) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  memset(out_scope, 0, sizeof(*out_scope));
+
+  // Copy as much of the name as fits, always reserving room for the NUL.
+  iree_host_size_t name_length =
+      iree_min(name.size, IREE_ARRAYSIZE(out_scope->name) - 1);
+  memcpy(out_scope->name, name.data, name_length);
+  out_scope->name[name_length] = 0;
+
+  // TODO(benvanik): pick trace colors based on name hash.
+  IREE_TRACE(out_scope->task_trace_color = 0xFFFF0000u);
+
+  iree_slim_mutex_initialize(&out_scope->mutex);
+  iree_notification_initialize(&out_scope->idle_notification);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_scope_deinitialize(iree_task_scope_t* scope) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_ASSERT(
+      iree_task_scope_is_idle(scope),
+      "pending submissions must be aborted prior to deinitializing their "
+      "scope");
+
+  // Poison the name (0xCD) to make it easier to see if we were incorrectly
+  // using it even after the scope is deinitialized. Since scopes may be stack
+  // allocated we don't want to have anyone trying to access them (like tracy).
+  memset(scope->name, 0xCD, sizeof(scope->name));
+
+  // In most cases the status will have been consumed by the scope owner;
+  // free any permanent failure status that was never retrieved.
+  iree_status_t status = (iree_status_t)iree_atomic_exchange_intptr(
+      &scope->permanent_status, (intptr_t)NULL, iree_memory_order_acquire);
+  IREE_IGNORE_ERROR(status);
+
+  iree_notification_deinitialize(&scope->idle_notification);
+  iree_slim_mutex_deinitialize(&scope->mutex);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+iree_string_view_t iree_task_scope_name(iree_task_scope_t* scope) {
+  return iree_make_cstring_view(scope->name);
+}
+
+iree_task_dispatch_statistics_t iree_task_scope_consume_statistics(
+    iree_task_scope_t* scope) {
+  // NOTE(review): reads and resets the statistics without holding the mutex;
+  // presumably callers ensure no concurrent task activity — confirm.
+  iree_task_dispatch_statistics_t result = scope->dispatch_statistics;
+  memset(&scope->dispatch_statistics, 0, sizeof(scope->dispatch_statistics));
+  return result;
+}
+
+bool iree_task_scope_has_failed(iree_task_scope_t* scope) {
+  // A non-NULL permanent_status indicates a recorded failure.
+  return iree_atomic_load_intptr(&scope->permanent_status,
+                                 iree_memory_order_seq_cst) != 0;
+}
+
+iree_status_t iree_task_scope_consume_status(iree_task_scope_t* scope) {
+  iree_status_t old_status = iree_ok_status();
+  iree_status_t new_status = iree_ok_status();
+  // CAS loop: the first attempt assumes the scope is OK (swapping OK for OK
+  // is a no-op). On failure |old_status| receives the current failure; a
+  // payload-free status with the same code is then left behind so later
+  // queries still observe the failure while the caller takes ownership of
+  // the full status (with annotations/stack) for return.
+  while (!iree_atomic_compare_exchange_strong_intptr(
+      &scope->permanent_status, (intptr_t*)&old_status, (intptr_t)new_status,
+      iree_memory_order_seq_cst, iree_memory_order_seq_cst)) {
+    // Previous status was not OK; we have it now though and can try again.
+    new_status = iree_status_from_code(iree_status_code(old_status));
+  }
+  return old_status;
+}
+
+static void iree_task_scope_try_set_status(iree_task_scope_t* scope,
+                                           iree_status_t new_status) {
+  // Only failures are recorded; OK is dropped immediately.
+  if (IREE_UNLIKELY(iree_status_is_ok(new_status))) return;
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, "failed: ");
+  IREE_TRACE_ZONE_APPEND_TEXT(
+      z0, iree_status_code_string(iree_status_code(new_status)));
+
+  // First failure wins: only an OK -> failure transition is performed;
+  // subsequent failures are freed so the original cause is preserved.
+  iree_status_t old_status = iree_ok_status();
+  if (!iree_atomic_compare_exchange_strong_intptr(
+          &scope->permanent_status, (intptr_t*)&old_status,
+          (intptr_t)new_status, iree_memory_order_seq_cst,
+          iree_memory_order_seq_cst)) {
+    // Previous status was not OK; drop our new status.
+    IREE_IGNORE_ERROR(new_status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_scope_abort(iree_task_scope_t* scope) {
+  iree_status_t status =
+      iree_make_status(IREE_STATUS_ABORTED, "entire scope aborted by user");
+  iree_task_scope_try_set_status(scope, status);
+}
+
+// Takes ownership of |status|; it is freed if the scope has already failed.
+void iree_task_scope_fail(iree_task_scope_t* scope, iree_status_t status) {
+  iree_task_scope_try_set_status(scope, status);
+}
+
+void iree_task_scope_begin(iree_task_scope_t* scope) {
+  iree_slim_mutex_lock(&scope->mutex);
+  ++scope->pending_submissions;
+  iree_slim_mutex_unlock(&scope->mutex);
+}
+
+void iree_task_scope_end(iree_task_scope_t* scope) {
+  iree_slim_mutex_lock(&scope->mutex);
+  bool signal = (--scope->pending_submissions == 0);
+  iree_slim_mutex_unlock(&scope->mutex);
+  // The notification is posted outside of the lock.
+  if (signal) {
+    // All submissions have completed in this scope - notify any waiters.
+    iree_notification_post(&scope->idle_notification, IREE_ALL_WAITERS);
+  }
+}
+
+bool iree_task_scope_is_idle(iree_task_scope_t* scope) {
+  iree_slim_mutex_lock(&scope->mutex);
+  bool is_idle = scope->pending_submissions == 0;
+  iree_slim_mutex_unlock(&scope->mutex);
+  return is_idle;
+}
+
+iree_status_t iree_task_scope_wait_idle(iree_task_scope_t* scope,
+                                        iree_time_t deadline_ns) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_ok_status();
+  if (deadline_ns == IREE_TIME_INFINITE_PAST) {
+    // Polling for idle: never blocks, reports DEADLINE_EXCEEDED if busy.
+    if (iree_task_scope_is_idle(scope)) {
+      status = iree_ok_status();
+    } else {
+      status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+    }
+  } else {
+    // Wait for the scope to enter the idle state.
+    if (!iree_notification_await(&scope->idle_notification,
+                                 (iree_condition_fn_t)iree_task_scope_is_idle,
+                                 scope, iree_make_deadline(deadline_ns))) {
+      status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/task/scope.h b/runtime/src/iree/task/scope.h
new file mode 100644
index 0000000..2578f57
--- /dev/null
+++ b/runtime/src/iree/task/scope.h
@@ -0,0 +1,160 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_SCOPE_H_
+#define IREE_TASK_SCOPE_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/tracing.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// A loose way of grouping tasks within the task system.
+// Each scope represents a unique collection of tasks that have some related
+// properties - most often their producer - that need to carry along some
+// tracking information to act on all related tasks at once. They do not
+// indicate any particular ordering of tasks or how the tasks are to be treated
+// by executors.
+//
+// Scopes can be used to signal, propagate, and retrieve failure statuses. As
+// the executor processes tasks in an unordered fashion this is the only way to
+// perform cross-task operations such as "abort all of the tasks from this
+// producer" or "wait until all tasks from this producer finish." In addition
+// there are statistics that can be aggregated across all tasks attributed to
+// the scope that allows for an efficient roll-up of activity over specific
+// durations.
+//
+// Task producers can decide whether to create new scopes for each batch of
+// tasks they submit or reuse scopes for the lifetime of their subprocess. Scope
+// overhead is low and the only advantage of reusing them is that lifetime can
+// become easier to manage by tying them 1:1 with producers.
+//
+// Thread-safe; once created scopes are modified exclusively via atomic
+// operations.
+typedef struct iree_task_scope_t {
+  // Name used for logging and tracing.
+  // Fixed-size storage: longer names are truncated to fit (with one byte
+  // reserved for the NUL terminator).
+  char name[16];
+
+  // Base color used for tasks in this scope.
+  // The color will be modulated based on task type.
+  IREE_TRACE(uint32_t task_trace_color;)
+
+  // A permanent status code set when a task within the scope fails. All pending
+  // tasks will be aborted, though any in-flight tasks may continue executing
+  // to completion.
+  // Holds an iree_status_t bit-cast to intptr; ownership remains with the
+  // scope until iree_task_scope_consume_status transfers it out.
+  iree_atomic_intptr_t permanent_status;
+
+  // Dispatch statistics aggregated from all dispatches in this scope. Updated
+  // relatively infrequently and must not be used for task control as values
+  // are undefined in the case of failure and may tear.
+  iree_task_dispatch_statistics_t dispatch_statistics;
+
+  // A mutex used to guard the pending_submissions.
+  // We need a mutex here so that we can ensure proper ordering with respect to
+  // the pending_submissions changes and the idle_notification: if we were to
+  // decrement the pending_submissions to 0 ("going idle") there's a race that
+  // can happen where another thread may come in and observe that prior to the
+  // idle_notification being notified. If that thread happens to be destroying
+  // the scope then boom.
+  //
+  // Thankfully we insert fences fairly infrequently, the contention is low,
+  // and iree_slim_mutex_t is a futex so this isn't much more expensive than
+  // just having an atomic variable.
+  iree_slim_mutex_t mutex;
+
+  // A count of pending submissions within this scope. 0 indicates idle.
+  // Each submission has a fence that references this value and decrements it
+  // as it is reached indicating that all memory used by all tasks within that
+  // submission is available for reuse.
+  uint32_t pending_submissions;
+
+  // A notification signaled when the scope transitions to having no pending
+  // tasks or completes all pending tasks after a failure.
+  iree_notification_t idle_notification;
+} iree_task_scope_t;
+
+// Initializes a caller-allocated scope.
+// Callers must ensure the scope remains live for as long as there are any
+// tasks that may reference it.
+void iree_task_scope_initialize(iree_string_view_t name,
+                                iree_task_scope_t* out_scope);
+
+// Deinitializes a task scope.
+// No tasks may be pending and the scope must be idle.
+void iree_task_scope_deinitialize(iree_task_scope_t* scope);
+
+// Returns the name of the scope. Informational only and may be the empty
+// string.
+iree_string_view_t iree_task_scope_name(iree_task_scope_t* scope);
+
+// Returns and resets the statistics for the scope.
+// Statistics may experience tearing (non-atomic update across fields) if this
+// is performed while tasks are in-flight.
+iree_task_dispatch_statistics_t iree_task_scope_consume_statistics(
+    iree_task_scope_t* scope);
+
+// Returns true if the scope has failed.
+// iree_task_scope_consume_status can be used once to get the full status
+// describing the failure and subsequent calls will return the status code.
+bool iree_task_scope_has_failed(iree_task_scope_t* scope);
+
+// Returns the permanent scope failure status to the caller (transferring
+// ownership). The scope will remain in a failed state with the status code.
+iree_status_t iree_task_scope_consume_status(iree_task_scope_t* scope);
+
+// Marks the scope as having been aborted by the user with IREE_STATUS_ABORTED.
+// All pending tasks will be dropped though in-flight tasks may complete
+// execution. Callers must use iree_task_scope_wait_idle to ensure the scope
+// state synchronizes prior to deinitializing. If the scope has already been
+// aborted or failed with a permanent error then the operation is ignored and
+// the previous error status is preserved.
+void iree_task_scope_abort(iree_task_scope_t* scope);
+
+// Marks the scope as having encountered an error while processing a task.
+// The scope will be moved into a permanent failure state and all pending tasks
+// will be aborted. In-flight tasks may continue executing prior to
+// iree_task_scope_wait_idle returning true. If the scope has already been
+// marked as failing then the status is ignored.
+void iree_task_scope_fail(iree_task_scope_t* scope, iree_status_t status);
+
+// Notifies the scope that a new execution task assigned to the scope has begun.
+// The scope is considered active until it is notified execution has completed
+// with iree_task_scope_end.
+void iree_task_scope_begin(iree_task_scope_t* scope);
+
+// Notifies the scope that a previously begun execution task has completed.
+void iree_task_scope_end(iree_task_scope_t* scope);
+
+// Returns true if the scope has no pending or in-flight tasks.
+//
+// May race with other threads enqueuing work and be out of date immediately
+// upon return; callers are expected to use this only when it is safe.
+bool iree_task_scope_is_idle(iree_task_scope_t* scope);
+
+// Waits for the scope to become idle indicating that all pending and in-flight
+// tasks have completed. If the scope is aborted or marked for permanent failure
+// then the wait will only return after it is guaranteed no more tasks will ever
+// be issued by the task system.
+//
+// May race with other threads enqueuing work and be out of date immediately
+// upon return; callers must ensure this is used for command and control
+// decisions only when no other threads may be enqueuing more work.
+iree_status_t iree_task_scope_wait_idle(iree_task_scope_t* scope,
+                                        iree_time_t deadline_ns);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_SCOPE_H_
diff --git a/runtime/src/iree/task/scope_test.cc b/runtime/src/iree/task/scope_test.cc
new file mode 100644
index 0000000..72befd4
--- /dev/null
+++ b/runtime/src/iree/task/scope_test.cc
@@ -0,0 +1,248 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/scope.h"
+
+#include <chrono>
+#include <thread>
+
+#include "iree/task/submission.h"
+#include "iree/task/task_impl.h"
+#include "iree/testing/gtest.h"
+
+namespace {
+
+// Initializing then immediately deinitializing a scope must leave it idle;
+// deinitialize requires an idle scope per the scope.h contract.
+TEST(ScopeTest, Lifetime) {
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("scope_a"), &scope);
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  iree_task_scope_deinitialize(&scope);
+}
+
+// NOTE: the exact capacity (and whether we store the name at all) is an
+// implementation detail.
+// A 20-char name must come back truncated to the 15 chars that fit in the
+// scope's fixed-size name storage.
+TEST(ScopeTest, LongNameTruncation) {
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("01234567890123456789"),
+                             &scope);
+  EXPECT_TRUE(iree_string_view_equal(iree_make_cstring_view("012345678901234"),
+                                     iree_task_scope_name(&scope)));
+  iree_task_scope_deinitialize(&scope);
+}
+
+// Aborting a scope with no tasks transitions it to a sticky
+// IREE_STATUS_ABORTED state that survives repeated consume calls.
+TEST(ScopeTest, AbortEmpty) {
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("scope_a"), &scope);
+
+  // Current state is OK.
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  EXPECT_TRUE(iree_status_is_ok(iree_task_scope_consume_status(&scope)));
+
+  // Enter aborted state.
+  iree_task_scope_abort(&scope);
+  iree_status_t consumed_status = iree_task_scope_consume_status(&scope);
+  EXPECT_TRUE(iree_status_is_aborted(consumed_status));
+  iree_status_ignore(consumed_status);
+
+  // Ensure aborted state is sticky.
+  EXPECT_TRUE(iree_status_is_aborted(iree_task_scope_consume_status(&scope)));
+
+  iree_task_scope_deinitialize(&scope);
+}
+
+// Failing a scope with no tasks latches the provided status: consume returns
+// the full status once and the code stays sticky on later calls.
+TEST(ScopeTest, FailEmpty) {
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("scope_a"), &scope);
+
+  // Current state is OK.
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  EXPECT_TRUE(iree_status_is_ok(iree_task_scope_consume_status(&scope)));
+
+  // Enter failure state.
+  iree_task_scope_fail(&scope,
+                       iree_make_status(IREE_STATUS_DATA_LOSS, "whoops!"));
+  iree_status_t consumed_status = iree_task_scope_consume_status(&scope);
+  EXPECT_TRUE(iree_status_is_data_loss(consumed_status));
+  iree_status_ignore(consumed_status);
+
+  // Ensure failure state is sticky.
+  EXPECT_TRUE(iree_status_is_data_loss(iree_task_scope_consume_status(&scope)));
+
+  iree_task_scope_deinitialize(&scope);
+}
+
+// NOTE: only the first failure is recorded and made sticky; subsequent failure
+// calls are ignored.
+TEST(ScopeTest, FailAgain) {
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("scope_a"), &scope);
+
+  // Current state is OK.
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  EXPECT_TRUE(iree_status_is_ok(iree_task_scope_consume_status(&scope)));
+
+  // Enter initial failure state.
+  iree_task_scope_fail(&scope,
+                       iree_make_status(IREE_STATUS_DATA_LOSS, "whoops 1"));
+  iree_status_t consumed_status_a = iree_task_scope_consume_status(&scope);
+  EXPECT_TRUE(iree_status_is_data_loss(consumed_status_a));
+  iree_status_ignore(consumed_status_a);
+
+  // Ensure failure state is sticky.
+  EXPECT_TRUE(iree_status_is_data_loss(iree_task_scope_consume_status(&scope)));
+
+  // Try failing again - it should be ignored and correctly iree_status_free'd.
+  iree_task_scope_fail(
+      &scope, iree_make_status(IREE_STATUS_FAILED_PRECONDITION, "whoops 2"));
+  iree_status_t consumed_status_b = iree_task_scope_consume_status(&scope);
+  EXPECT_TRUE(iree_status_is_data_loss(consumed_status_b));
+  iree_status_ignore(consumed_status_b);
+
+  // Still the first failure status.
+  EXPECT_TRUE(iree_status_is_data_loss(iree_task_scope_consume_status(&scope)));
+
+  iree_task_scope_deinitialize(&scope);
+}
+
+// Waiting for idle on an already-idle scope must return OK immediately, even
+// with an infinite deadline.
+TEST(ScopeTest, WaitIdleWhenIdle) {
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("scope_a"), &scope);
+
+  // Current state is OK and idle.
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  EXPECT_TRUE(iree_status_is_ok(iree_task_scope_consume_status(&scope)));
+
+  // Wait until idle... which is now.
+  EXPECT_TRUE(iree_status_is_ok(
+      iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_FUTURE)));
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+
+  iree_task_scope_deinitialize(&scope);
+}
+
+// Polling for idle (deadline = IREE_TIME_INFINITE_PAST) on a busy scope must
+// return DEADLINE_EXCEEDED without blocking.
+TEST(ScopeTest, WaitIdleDeadlineExceeded) {
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("scope_a"), &scope);
+
+  // Current state is OK and idle.
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  EXPECT_TRUE(iree_status_is_ok(iree_task_scope_consume_status(&scope)));
+
+  // Enqueue a task to the scope so it is no longer idle.
+  iree_task_fence_t fence_task;
+  iree_task_fence_initialize(&scope, iree_wait_primitive_immediate(),
+                             &fence_task);
+  EXPECT_FALSE(iree_task_scope_is_idle(&scope));
+
+  // Poll, which should fail immediately because we have the outstanding task.
+  iree_status_t wait_status =
+      iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_PAST);
+  EXPECT_TRUE(iree_status_is_deadline_exceeded(wait_status));
+  EXPECT_FALSE(iree_task_scope_is_idle(&scope));
+
+  // Complete the task (required as part of the scope contract).
+  iree_task_submission_t pending_submission;
+  iree_task_submission_initialize(&pending_submission);
+  iree_task_fence_retire(&fence_task, &pending_submission);
+  EXPECT_TRUE(iree_task_submission_is_empty(&pending_submission));
+
+  iree_task_scope_deinitialize(&scope);
+}
+
+// A thread blocked in wait_idle must wake with OK once the scope's last
+// outstanding task (a fence here) retires.
+TEST(ScopeTest, WaitIdleSuccess) {
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("scope_a"), &scope);
+
+  // Current state is OK and idle.
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  EXPECT_TRUE(iree_status_is_ok(iree_task_scope_consume_status(&scope)));
+
+  // Enqueue a task to the scope so it is no longer idle.
+  iree_task_fence_t fence_task;
+  iree_task_fence_initialize(&scope, iree_wait_primitive_immediate(),
+                             &fence_task);
+  EXPECT_FALSE(iree_task_scope_is_idle(&scope));
+
+  // Spin up a thread to wait on the scope.
+  std::thread wait_thread([&]() {
+    EXPECT_FALSE(iree_task_scope_is_idle(&scope));
+    EXPECT_TRUE(iree_status_is_ok(
+        iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_FUTURE)));
+    EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  });
+
+  // Wait a moment for the thread to spin up.
+  // NOTE: this may flake. Need to see if there's a better way to do this.
+  std::this_thread::sleep_for(std::chrono::milliseconds(150));
+
+  // Complete the task.
+  iree_task_submission_t pending_submission;
+  iree_task_submission_initialize(&pending_submission);
+  iree_task_fence_retire(&fence_task, &pending_submission);
+  EXPECT_TRUE(iree_task_submission_is_empty(&pending_submission));
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+
+  // Join with the thread - this will hang if it didn't wake correctly.
+  wait_thread.join();
+
+  iree_task_scope_deinitialize(&scope);
+}
+
+// Failing a scope does not short-circuit idleness: the waiter only wakes once
+// all outstanding tasks complete, even after the failure status is set.
+TEST(ScopeTest, WaitIdleFailure) {
+  iree_task_scope_t scope;
+  iree_task_scope_initialize(iree_make_cstring_view("scope_a"), &scope);
+
+  // Current state is OK and idle.
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  EXPECT_TRUE(iree_status_is_ok(iree_task_scope_consume_status(&scope)));
+
+  // Enqueue a task to the scope so it is no longer idle.
+  iree_task_fence_t fence_task;
+  iree_task_fence_initialize(&scope, iree_wait_primitive_immediate(),
+                             &fence_task);
+  EXPECT_FALSE(iree_task_scope_is_idle(&scope));
+
+  // Spin up a thread to wait on the scope.
+  std::thread wait_thread([&]() {
+    EXPECT_FALSE(iree_task_scope_is_idle(&scope));
+    EXPECT_TRUE(iree_status_is_ok(
+        iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_FUTURE)));
+    EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+  });
+
+  // Wait a moment for the thread to spin up.
+  // NOTE: this may flake. Need to see if there's a better way to do this.
+  std::this_thread::sleep_for(std::chrono::milliseconds(150));
+
+  // Set the failure state.
+  iree_task_scope_fail(
+      &scope, iree_make_status(IREE_STATUS_FAILED_PRECONDITION, "whoops"));
+  EXPECT_FALSE(iree_task_scope_is_idle(&scope));
+
+  // Complete the task.
+  // Note that even if a scope fails we still must complete the tasks so it
+  // becomes idle. This ensures that if the scope state is used to control
+  // deallocation we don't go deallocating the tasks still in flight and waiting
+  // to gracefully fail.
+  iree_task_submission_t pending_submission;
+  iree_task_submission_initialize(&pending_submission);
+  iree_task_fence_retire(&fence_task, &pending_submission);
+  EXPECT_TRUE(iree_task_submission_is_empty(&pending_submission));
+  EXPECT_TRUE(iree_task_scope_is_idle(&scope));
+
+  // Join with the thread - this will hang if it didn't wake correctly.
+  wait_thread.join();
+
+  iree_task_scope_deinitialize(&scope);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/submission.c b/runtime/src/iree/task/submission.c
new file mode 100644
index 0000000..0e8f2d6
--- /dev/null
+++ b/runtime/src/iree/task/submission.c
@@ -0,0 +1,71 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/submission.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+
+// Starts with both task lists empty; the submission owns no tasks yet.
+void iree_task_submission_initialize(iree_task_submission_t* out_submission) {
+  iree_task_list_initialize(&out_submission->ready_list);
+  iree_task_list_initialize(&out_submission->waiting_list);
+}
+
+// Moves all tasks from |ready_slist| into the submission's ready_list; the
+// waiting_list is left empty.
+void iree_task_submission_initialize_from_lifo_slist(
+    iree_atomic_task_slist_t* ready_slist,
+    iree_task_submission_t* out_submission) {
+  // Flush from the LIFO ready list to the LIFO submission queue.
+  // We have to walk everything here to get the tail pointer, which could be
+  // improved by sourcing from something other than an slist.
+  iree_task_submission_initialize(out_submission);
+  iree_atomic_task_slist_flush(
+      ready_slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO,
+      &out_submission->ready_list.head, &out_submission->ready_list.tail);
+}
+
+// Drops the list references without touching the tasks themselves; any tasks
+// that were in the lists are not released here.
+void iree_task_submission_reset(iree_task_submission_t* submission) {
+  memset(&submission->ready_list, 0, sizeof(submission->ready_list));
+  memset(&submission->waiting_list, 0, sizeof(submission->waiting_list));
+}
+
+// Releases all tasks in both lists; intended for failure cleanup before the
+// submission is handed to a queue (see submission.h).
+void iree_task_submission_discard(iree_task_submission_t* submission) {
+  iree_task_list_discard(&submission->ready_list);
+  iree_task_list_discard(&submission->waiting_list);
+}
+
+// A submission is empty only when neither the ready list nor the waiting list
+// holds any tasks.
+bool iree_task_submission_is_empty(iree_task_submission_t* submission) {
+  if (!iree_task_list_is_empty(&submission->ready_list)) return false;
+  return iree_task_list_is_empty(&submission->waiting_list);
+}
+
+void iree_task_submission_enqueue(iree_task_submission_t* submission,
+                                  iree_task_t* task) {
+  IREE_ASSERT_TRUE(iree_task_is_ready(task),
+                   "must be a root task to be enqueued on a submission");
+  if (task->type == IREE_TASK_TYPE_WAIT &&
+      (task->flags & IREE_TASK_FLAG_WAIT_COMPLETED) == 0) {
+    // A wait that we know is unresolved and can immediately route to the
+    // waiting list. This avoids the need to try to schedule the wait when it's
+    // almost certain that the wait would not be satisfied.
+    iree_task_list_push_front(&submission->waiting_list, task);
+  } else {
+    // Task is ready to execute immediately.
+    iree_task_list_push_front(&submission->ready_list, task);
+  }
+}
+
+// Moves every task from |list| into the submission, leaving |list| empty.
+// The next pointer is captured before each enqueue because routing a task may
+// rewrite its links.
+void iree_task_submission_enqueue_list(iree_task_submission_t* submission,
+                                       iree_task_list_t* list) {
+  iree_task_t* head = list->head;
+  list->head = NULL;
+  list->tail = NULL;
+  for (iree_task_t* task = head; task != NULL;) {
+    iree_task_t* next_task = task->next_task;
+    iree_task_submission_enqueue(submission, task);
+    task = next_task;
+  }
+}
diff --git a/runtime/src/iree/task/submission.h b/runtime/src/iree/task/submission.h
new file mode 100644
index 0000000..9315dc6
--- /dev/null
+++ b/runtime/src/iree/task/submission.h
@@ -0,0 +1,103 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_SUBMISSION_H_
+#define IREE_TASK_SUBMISSION_H_
+
+#include <stdbool.h>
+
+#include "iree/base/api.h"
+#include "iree/task/list.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// A pending submission to a task queue made up of a DAG of tasks.
+// Tasks are executed when ready in the order they were enqueued while observing
+// all dependencies. This means that two tasks that have no dependencies may
+// execute out of order/overlap.
+//
+// By keeping track of which tasks are ready for execution (ready_list) upon
+// submission to a queue we avoid the need to walk the task list again and
+// instead only touch the waiting tasks during construction and as they are made
+// ready, avoiding needless work and cache thrashing.
+//
+// Waiting tasks (waiting_list) are those waiting on external dependencies such
+// as file descriptor wait handles. Because we track all of these the executor
+// can perform an efficient multi-wait across queues without needing to block
+// (or even check) every waiting task individually.
+//
+// Because we only track roots of the DAG to release all tasks in a submission
+// early (due to failure or shutdown) the DAG must be walked. Releasing just the
+// lists will only handle the roots and leave all the rest of the tasks
+// dangling.
+//
+// Thread-compatible; designed to be used from a single thread producing the
+// submission.
+typedef struct iree_task_submission_t {
+  // List of tasks that are ready for execution immediately. Upon submission to
+  // a queue the tasks will be passed on to the executor with no delay.
+  //
+  // Tasks are stored in LIFO order; this allows us to quickly concat them with
+  // incoming/mailbox slists that are naturally in LIFO order and that may
+  // contain tasks from prior submissions. Note that we are representing a
+  // ready list - meaning that all tasks are able to start simultaneously (in
+  // the best case where tasks <= workers); this means that the ordering
+  // requirements here are purely for performance and ease of debugging. In
+  // cases where tasks >> workers we could also see some benefits from the
+  // eventual FIFO order matching how the tasks were allocated.
+  iree_task_list_t ready_list;
+
+  // List of tasks that are waiting for execution on external dependencies.
+  // These are root tasks that have no internal task dependencies.
+  // Order is not important here; the assumption is that all waiting tasks are
+  // more of a set than an ordered list and that they can all be waited on as a
+  // multi-wait-any.
+  // Populated with IREE_TASK_TYPE_WAIT tasks whose waits are not yet resolved
+  // (see iree_task_submission_enqueue).
+  iree_task_list_t waiting_list;
+} iree_task_submission_t;
+
+// Initializes a task submission.
+void iree_task_submission_initialize(iree_task_submission_t* out_submission);
+
+// Flushes the given |ready_slist| and initializes the submission with all of
+// its tasks in LIFO order. All tasks in |ready_slist| are assumed to be
+// ready for execution immediately.
+void iree_task_submission_initialize_from_lifo_slist(
+    iree_atomic_task_slist_t* ready_slist,
+    iree_task_submission_t* out_submission);
+
+// Resets the submission by dropping the list references.
+void iree_task_submission_reset(iree_task_submission_t* submission);
+
+// Discards all pending tasks in the submission. This is only safe to call if
+// the submission has not yet been submitted to a queue for execution and should
+// be used for failure cleanup during submission construction.
+void iree_task_submission_discard(iree_task_submission_t* submission);
+
+// Returns true if the submission has no tasks.
+bool iree_task_submission_is_empty(iree_task_submission_t* submission);
+
+// Enqueues |task| to the pending |submission|.
+// The task will be checked to see whether it is immediately ready to execute
+// and placed in an appropriate list; all dependencies must be declared prior to
+// calling this method. After returning new tasks that depend on this task may
+// still be defined. The submission takes ownership of the |task|.
+void iree_task_submission_enqueue(iree_task_submission_t* submission,
+                                  iree_task_t* task);
+
+// Enqueues all tasks in |list| to the pending |submission|.
+// Ownership of the tasks transfers to the submission and the |list| will be
+// reset upon return. Ready tasks may execute in any order.
+void iree_task_submission_enqueue_list(iree_task_submission_t* submission,
+                                       iree_task_list_t* list);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_SUBMISSION_H_
diff --git a/runtime/src/iree/task/task.c b/runtime/src/iree/task/task.c
new file mode 100644
index 0000000..4f3593d
--- /dev/null
+++ b/runtime/src/iree/task/task.c
@@ -0,0 +1,826 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/task.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/post_batch.h"
+#include "iree/task/scope.h"
+#include "iree/task/submission.h"
+#include "iree/task/task_impl.h"
+#include "iree/task/tuning.h"
+
+//==============================================================================
+// Task bookkeeping
+//==============================================================================
+
+// Zeroes only the common task header and applies defaults; type-specific task
+// bodies are initialized by their own helpers.
+void iree_task_initialize(iree_task_type_t type, iree_task_scope_t* scope,
+                          iree_task_t* out_task) {
+  // NOTE: only clears the header, not the task body.
+  memset(out_task, 0, sizeof(*out_task));
+  out_task->scope = scope;
+  // Default: any worker may execute the task until the caller narrows it.
+  out_task->affinity_set = iree_task_affinity_for_any_worker();
+  out_task->type = type;
+}
+
+// Registers an optional callback that iree_task_cleanup invokes when the task
+// retires or is discarded.
+void iree_task_set_cleanup_fn(iree_task_t* task,
+                              iree_task_cleanup_fn_t cleanup_fn) {
+  task->cleanup_fn = cleanup_fn;
+}
+
+// Links |completion_task| to run after |task| by bumping its pending
+// dependency count; the matching decrement happens when |task| retires or is
+// discarded.
+void iree_task_set_completion_task(iree_task_t* task,
+                                   iree_task_t* completion_task) {
+  IREE_ASSERT(!task->completion_task);
+  task->completion_task = completion_task;
+  iree_atomic_fetch_add_int32(&completion_task->pending_dependency_count, 1,
+                              iree_memory_order_seq_cst);
+}
+
+// A task is ready to execute once its pending dependency count has reached
+// zero. Relaxed ordering matches the original: readiness may be observed
+// slightly stale.
+bool iree_task_is_ready(iree_task_t* task) {
+  const int32_t pending_count = iree_atomic_load_int32(
+      &task->pending_dependency_count, iree_memory_order_relaxed);
+  return pending_count <= 0;
+}
+
+// Attempts to publish |new_status| into |permanent_status| with
+// first-failure-wins semantics. On a successful CAS ownership of |new_status|
+// transfers to the slot; if another failure got there first the new status is
+// freed here. OK statuses are ignored entirely.
+static void iree_task_try_set_status(iree_atomic_intptr_t* permanent_status,
+                                     iree_status_t new_status) {
+  if (IREE_UNLIKELY(iree_status_is_ok(new_status))) return;
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, "failed: ");
+  IREE_TRACE_ZONE_APPEND_TEXT(
+      z0, iree_status_code_string(iree_status_code(new_status)));
+
+  // CAS from OK: succeeds only if no failure has been recorded yet.
+  iree_status_t old_status = iree_ok_status();
+  if (!iree_atomic_compare_exchange_strong_intptr(
+          permanent_status, (intptr_t*)&old_status, (intptr_t)new_status,
+          iree_memory_order_seq_cst, iree_memory_order_seq_cst)) {
+    // Previous status was not OK; drop our new status.
+    IREE_IGNORE_ERROR(new_status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Runs the task's optional cleanup callback and returns the task to its pool.
+// |status_code| tells the callback why the task is going away (OK on normal
+// retire, ABORTED on discard/failure).
+static void iree_task_cleanup(iree_task_t* task,
+                              iree_status_code_t status_code) {
+  // Call the (optional) cleanup function.
+  // NOTE: this may free the memory of the task itself!
+  // The pool and fn are loaded before invoking the callback for that reason.
+  iree_task_pool_t* pool = task->pool;
+  iree_task_cleanup_fn_t cleanup_fn = task->cleanup_fn;
+  if (cleanup_fn) {
+    cleanup_fn(task, status_code);
+  }
+
+  // Return the task to the pool it was allocated from.
+  // Some tasks are allocated as part of arenas/ringbuffers and won't have a
+  // pool as they'll be cleaned up as part of a larger operation.
+  if (pool) {
+    iree_task_pool_release(pool, task);
+  }
+}
+
+static void iree_task_barrier_discard(iree_task_barrier_t* task,
+                                      iree_task_list_t* discard_worklist);
+
+// Discards |task| without executing it. Dependents whose pending count drops
+// to zero are pushed onto |discard_worklist| so the caller can continue the
+// breadth-first walk without recursion.
+void iree_task_discard(iree_task_t* task, iree_task_list_t* discard_worklist) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // This models a BFS discard in our non-recursive approach.
+  // We must ensure that we only discard each task once and that we discard the
+  // tasks in the appropriate order: if we had a DAG of A -> B, C -> D we must
+  // discard respecting the same topological ordering.
+
+  IREE_ASSERT_EQ(0, iree_atomic_load_int32(&task->pending_dependency_count,
+                                           iree_memory_order_acquire));
+
+  // Almost all tasks will have a completion task; some may have additional
+  // dependent tasks (like barriers) that will be handled below.
+  const bool completion_task_ready =
+      task->completion_task &&
+      iree_atomic_fetch_sub_int32(
+          &task->completion_task->pending_dependency_count, 1,
+          iree_memory_order_acq_rel) == 1;
+  if (completion_task_ready) {
+    iree_task_list_push_back(discard_worklist, task->completion_task);
+  }
+
+  iree_task_scope_t* end_scope = NULL;
+  switch (task->type) {
+    default:
+    case IREE_TASK_TYPE_NOP:
+    case IREE_TASK_TYPE_CALL:
+      break;
+    case IREE_TASK_TYPE_BARRIER:
+      iree_task_barrier_discard((iree_task_barrier_t*)task, discard_worklist);
+      break;
+    case IREE_TASK_TYPE_FENCE:
+      // Defer ending the scope until after cleanup: ending it now could wake
+      // idle waiters while the task memory is still live.
+      end_scope = task->scope;  // need to clean up the task first
+      break;
+    case IREE_TASK_TYPE_WAIT:
+    case IREE_TASK_TYPE_DISPATCH:
+      break;
+  }
+
+  iree_task_cleanup(task, IREE_STATUS_ABORTED);
+  // NOTE: task is invalidated here and cannot be used!
+  task = NULL;
+
+  if (end_scope) {
+    iree_task_scope_end(end_scope);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static void iree_task_retire(iree_task_t* task,
+                             iree_task_submission_t* pending_submission,
+                             iree_status_t status) {
+  IREE_ASSERT_EQ(0, iree_atomic_load_int32(&task->pending_dependency_count,
+                                           iree_memory_order_acquire));
+
+  // Decrement the pending count on the completion task, if any.
+  iree_task_t* completion_task = task->completion_task;
+  task->completion_task = NULL;
+
+  if (iree_status_is_ok(status)) {
+    // Task completed successfully.
+    iree_task_cleanup(task, IREE_STATUS_OK);
+    bool completion_task_ready =
+        completion_task &&
+        iree_atomic_fetch_sub_int32(&completion_task->pending_dependency_count,
+                                    1, iree_memory_order_acq_rel) == 1;
+    if (completion_task_ready) {
+      // This was the last pending dependency and the completion task is ready
+      // to run.
+      iree_task_submission_enqueue(pending_submission, completion_task);
+    }
+  } else {
+    // Task failed: notify the scope.
+    iree_task_scope_t* scope = task->scope;
+    iree_task_scope_fail(scope, status);
+    status = iree_ok_status();  // consumed by the fail
+
+    // We need to carefully clean up the task: if we go discarding fences we'll
+    // end up waking waiters before we're done. To ensure this doesn't happen
+    // we retain the scope until we've finished cleaning things up.
+    iree_task_scope_begin(scope);
+    iree_task_cleanup(task, IREE_STATUS_ABORTED);
+
+    bool completion_task_ready =
+        completion_task &&
+        iree_atomic_fetch_sub_int32(&completion_task->pending_dependency_count,
+                                    1, iree_memory_order_acq_rel) == 1;
+    if (completion_task_ready) {
+      // This was the last pending dependency and we know that we can safely
+      // abort the completion task by discarding.
+      iree_task_list_t discard_worklist;
+      iree_task_list_initialize(&discard_worklist);
+      iree_task_discard(completion_task, &discard_worklist);
+      iree_task_list_discard(&discard_worklist);
+    } else if (completion_task) {
+      // One or more pending dependencies are not yet satisfied and the
+      // completion task must stay alive. We can mark it as aborted, though,
+      // so that it knows not to execute when it is ready to run.
+      // TODO(benvanik): make this atomic? we only ever add bits and it's safe
+      // for it to run if we got this far.
+      completion_task->flags |= IREE_TASK_FLAG_ABORTED;
+    }
+
+    // Unlock the scope; it may immediately be freed before this returns!
+    iree_task_scope_end(scope);
+  }
+
+  // NOTE: task is invalidated here and cannot be used!
+  task = NULL;
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_NOP
+//==============================================================================
+
+// Initializes |out_task| as a no-op task in |scope|; it performs no work and
+// exists purely as a graph node (see IREE_TASK_TYPE_NOP).
+void iree_task_nop_initialize(iree_task_scope_t* scope,
+                              iree_task_nop_t* out_task) {
+  iree_task_t* header = &out_task->header;
+  iree_task_initialize(IREE_TASK_TYPE_NOP, scope, header);
+}
+
+// Retires a completed no-op task, readying its completion task (if any).
+void iree_task_nop_retire(iree_task_nop_t* task,
+                          iree_task_submission_t* pending_submission) {
+  iree_task_retire(&task->header, pending_submission, iree_ok_status());
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_CALL
+//==============================================================================
+
+// Returns an XXBBGGRR color (red in the lowest bits).
+// Must not be 0 (tracy will ignore).
+static uint32_t iree_math_ptr_to_xrgb(const void* ptr) {
+  // This is just a simple hack to give us a unique(ish) per-pointer color.
+  // It's only to make it easier to distinguish which tiles are from the same
+  // dispatch.
+  uint64_t ptr64 = (uintptr_t)ptr;
+  // Fold the high 32 bits into the low 32 so all pointer entropy contributes.
+  // NOTE(review): this can legitimately produce 0 for some pointers; callers
+  // only use it for trace coloring where 0 falls back to the default color.
+  return (uint32_t)ptr64 ^ (uint32_t)(ptr64 >> 32);
+}
+
+// Initializes |out_task| as a call task in |scope| that will invoke |closure|
+// when executed. The sticky failure status slot starts cleared.
+void iree_task_call_initialize(iree_task_scope_t* scope,
+                               iree_task_call_closure_t closure,
+                               iree_task_call_t* out_task) {
+  iree_task_initialize(IREE_TASK_TYPE_CALL, scope, &out_task->header);
+  out_task->closure = closure;
+  // Release store publishes the fields above to whichever worker executes the
+  // task; keep this last.
+  iree_atomic_store_intptr(&out_task->status, 0, iree_memory_order_release);
+}
+
+// Executes a call task by invoking its closure (skipped if the task was
+// aborted) and retires it unless nested tasks it enqueued raised its pending
+// dependency count. Closure failures are stashed on the task and consumed at
+// retire time.
+void iree_task_call_execute(iree_task_call_t* task,
+                            iree_task_submission_t* pending_submission) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_SET_COLOR(z0,
+                            iree_math_ptr_to_xrgb(task->closure.user_context));
+
+  if (IREE_LIKELY(
+          !iree_any_bit_set(task->header.flags, IREE_TASK_FLAG_ABORTED))) {
+    // Execute the user callback.
+    // Note that this may enqueue more nested tasks, including tasks that
+    // prevent this task from retiring.
+    iree_status_t status = task->closure.fn(task->closure.user_context,
+                                            &task->header, pending_submission);
+    if (!iree_status_is_ok(status)) {
+      // Stash the failure status on the task.
+      // If there's still pending dependencies we won't be able to discard
+      // immediately and need to keep the status around until they all complete.
+      iree_task_try_set_status(&task->status, status);
+      status = iree_ok_status();  // consumed by try_set_status
+
+      // TODO(benvanik): discard pending_submission? As we may have pending work
+      // from multiple scopes it's dangerous to discard all. We could filter
+      // based on scope, though, and if we did that we (probably) wouldn't need
+      // to handle the permanent status on the task and could discard
+      // immediately.
+    }
+  }
+
+  // Check to see if there are no pending dependencies before retiring; the
+  // dependency count can go up if new nested tasks were enqueued.
+  if (iree_atomic_load_int32(&task->header.pending_dependency_count,
+                             iree_memory_order_acquire) == 0) {
+    // Exchange clears the sticky status so it is consumed exactly once.
+    iree_status_t status = (iree_status_t)iree_atomic_exchange_intptr(
+        &task->status, 0, iree_memory_order_seq_cst);
+    iree_task_retire(&task->header, pending_submission, status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_BARRIER
+//==============================================================================
+
+// Initializes |out_task| as a barrier in |scope| gating |dependent_task_count|
+// tasks in |dependent_tasks|; each dependent gains one pending dependency that
+// is released when the barrier retires.
+void iree_task_barrier_initialize(iree_task_scope_t* scope,
+                                  iree_host_size_t dependent_task_count,
+                                  iree_task_t* const* dependent_tasks,
+                                  iree_task_barrier_t* out_task) {
+  iree_task_initialize(IREE_TASK_TYPE_BARRIER, scope, &out_task->header);
+  out_task->dependent_task_count = dependent_task_count;
+  out_task->dependent_tasks = dependent_tasks;
+  for (iree_host_size_t i = 0; i < dependent_task_count; ++i) {
+    iree_atomic_fetch_add_int32(&dependent_tasks[i]->pending_dependency_count,
+                                1, iree_memory_order_relaxed);
+  }
+}
+
+// Initializes |out_task| as a barrier with no dependents; dependents may be
+// attached later with iree_task_barrier_set_dependent_tasks.
+void iree_task_barrier_initialize_empty(iree_task_scope_t* scope,
+                                        iree_task_barrier_t* out_task) {
+  // Equivalent to a full initialize with an empty dependent list.
+  iree_task_barrier_initialize(scope, 0, NULL, out_task);
+}
+
+// Replaces the dependent task list of an already-initialized barrier,
+// incrementing the pending dependency count of each new dependent.
+void iree_task_barrier_set_dependent_tasks(
+    iree_task_barrier_t* task, iree_host_size_t dependent_task_count,
+    iree_task_t* const* dependent_tasks) {
+  task->dependent_task_count = dependent_task_count;
+  task->dependent_tasks = dependent_tasks;
+  for (iree_host_size_t i = 0; i < dependent_task_count; ++i) {
+    iree_atomic_fetch_add_int32(&dependent_tasks[i]->pending_dependency_count,
+                                1, iree_memory_order_relaxed);
+  }
+}
+
+// Discards a barrier's dependents: drops one pending dependency from each and
+// pushes any that reach zero onto |discard_worklist| for later discarding.
+static void iree_task_barrier_discard(iree_task_barrier_t* task,
+                                      iree_task_list_t* discard_worklist) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Discard all of the tasks after the barrier.
+  // Note that we need to ensure we only enqueue them for discard after all of
+  // their dependencies have been met - otherwise we'll double-discard.
+  for (iree_host_size_t i = 0; i < task->dependent_task_count; ++i) {
+    iree_task_t* dependent_task = task->dependent_tasks[i];
+    const bool dependent_task_ready =
+        iree_atomic_fetch_sub_int32(&dependent_task->pending_dependency_count,
+                                    1, iree_memory_order_acq_rel) == 1;
+    if (dependent_task_ready) {
+      // The dependent task has retired and can now be discarded.
+      iree_task_list_push_back(discard_worklist, dependent_task);
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Retires a barrier: releases one pending dependency from each dependent,
+// readying any that reach zero, and then retires the barrier itself.
+void iree_task_barrier_retire(iree_task_barrier_t* task,
+                              iree_task_submission_t* pending_submission) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // NOTE: we walk in reverse so that we enqueue in LIFO order.
+  for (iree_host_size_t i = task->dependent_task_count; i > 0; --i) {
+    iree_task_t* dependent_task = task->dependent_tasks[i - 1];
+    const bool dependent_task_ready =
+        iree_atomic_fetch_sub_int32(&dependent_task->pending_dependency_count,
+                                    1, iree_memory_order_acq_rel) == 1;
+    if (dependent_task_ready) {
+      // Last outstanding dependency satisfied; the dependent may now run.
+      iree_task_submission_enqueue(pending_submission, dependent_task);
+    }
+  }
+
+  iree_task_retire(&task->header, pending_submission, iree_ok_status());
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_FENCE
+//==============================================================================
+
+// Initializes |out_task| as a fence in |scope| that sets |signal_handle| when
+// it retires. Begins the scope here; the matching iree_task_scope_end happens
+// when the fence retires (or is discarded).
+void iree_task_fence_initialize(iree_task_scope_t* scope,
+                                iree_wait_primitive_t signal_handle,
+                                iree_task_fence_t* out_task) {
+  iree_task_initialize(IREE_TASK_TYPE_FENCE, scope, &out_task->header);
+  out_task->signal_handle = signal_handle;
+  iree_task_scope_begin(scope);
+}
+
+// Retires a fence: signals its wait handle, retires the task, and only then
+// ends the scope begun in iree_task_fence_initialize.
+void iree_task_fence_retire(iree_task_fence_t* task,
+                            iree_task_submission_t* pending_submission) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Need to wait until after we clean up the task before ending the scope.
+  // This way anyone waiting on the scope to go idle will be able to ensure the
+  // scope is actually idle - otherwise it may try to free the task memory
+  // while we are still using it.
+  iree_task_scope_t* end_scope = task->header.scope;
+
+  // TODO(benvanik): better API that doesn't require wrapping or requiring that
+  // iree_event_t is an iree_wait_handle_t.
+  iree_wait_handle_t signal_handle = {
+      .type = task->signal_handle.type,
+      .value = task->signal_handle.value,
+  };
+  iree_event_set(&signal_handle);
+
+  iree_task_retire(&task->header, pending_submission, iree_ok_status());
+  // NOTE: |task| is invalid past this point; only the stashed scope remains.
+
+  if (end_scope) {
+    iree_task_scope_end(end_scope);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_WAIT
+//==============================================================================
+
+// Initializes |out_task| as a wait in |scope| on |wait_source| that fails if
+// not satisfied by |deadline_ns|. No cancellation flag is set; use
+// iree_task_wait_set_wait_any to participate in a wait-any group.
+void iree_task_wait_initialize(iree_task_scope_t* scope,
+                               iree_wait_source_t wait_source,
+                               iree_time_t deadline_ns,
+                               iree_task_wait_t* out_task) {
+  iree_task_initialize(IREE_TASK_TYPE_WAIT, scope, &out_task->header);
+  out_task->cancellation_flag = NULL;
+  out_task->deadline_ns = deadline_ns;
+  out_task->wait_source = wait_source;
+}
+
+// Initializes |out_task| as a pure time delay until |deadline_ns|.
+// NOTE: the deadline is carried by the delay wait source itself; the
+// task-level deadline is infinite so the delay is not treated as a timeout.
+void iree_task_wait_initialize_delay(iree_task_scope_t* scope,
+                                     iree_time_t deadline_ns,
+                                     iree_task_wait_t* out_task) {
+  iree_task_wait_initialize(scope, iree_wait_source_delay(deadline_ns),
+                            IREE_TIME_INFINITE_FUTURE, out_task);
+}
+
+// Marks |task| as part of a wait-any group sharing |cancellation_flag|; the
+// flag lets sibling waits be cancelled once any one is satisfied.
+// NOTE(review): the flag's exact signaling protocol is defined by the
+// executor, not visible here — confirm against the wait handling code.
+void iree_task_wait_set_wait_any(iree_task_wait_t* task,
+                                 iree_atomic_int32_t* cancellation_flag) {
+  task->header.flags |= IREE_TASK_FLAG_WAIT_ANY;
+  task->cancellation_flag = cancellation_flag;
+}
+
+// Retires a wait task with |status| (ok if the wait was satisfied), clearing
+// the completed flag so the task structure can be reused.
+void iree_task_wait_retire(iree_task_wait_t* task,
+                           iree_task_submission_t* pending_submission,
+                           iree_status_t status) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  task->header.flags &= ~IREE_TASK_FLAG_WAIT_COMPLETED;  // reset for future use
+
+  // TODO(benvanik): allow deinit'ing the wait handle (if transient/from the
+  // executor event pool).
+  iree_task_retire(&task->header, pending_submission, status);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_* utilities
+//==============================================================================
+
+// Returns an XXBBGGRR color (red in the lowest bits).
+// Must not be 0 (tracy will ignore).
+static uint32_t iree_task_tile_to_color(
+    const iree_task_tile_context_t* tile_context);
+
+#if defined(IREE_TASK_TRACING_PER_TILE_COLORS)
+
+// TODO(#4017): optimize this to compute entire slices at once and fold in the
+// work grid location code.
+// TODO(#4017): optimize this to compute entire slices at once and fold in the
+// work grid location code.
+// Converts an 8-bit HSV color (all channels 0-255) to packed XXRRGGBB,
+// guaranteed non-zero so tracy does not ignore it.
+static uint32_t iree_math_hsv_to_xrgb(const uint8_t h, const uint8_t s,
+                                      const uint8_t v) {
+  // NOTE: this is matching with tracy's TracyColor.cpp implementation so that
+  // our colors fit nicely in the UI.
+  // Hue is split into 6 sectors of 43 each; p/q/t are the standard HSV
+  // intermediate channel values computed in 8-bit fixed point.
+  const uint8_t reg = h / 43;
+  const uint8_t rem = (h - (reg * 43)) * 6;
+  const uint8_t p = (v * (255 - s)) >> 8;
+  const uint8_t q = (v * (255 - ((s * rem) >> 8))) >> 8;
+  const uint8_t t = (v * (255 - ((s * (255 - rem)) >> 8))) >> 8;
+
+  // clang-format off
+  uint8_t r, g, b;
+  switch (reg) {
+    case 0:  r = v; g = t; b = p; break;
+    case 1:  r = q; g = v; b = p; break;
+    case 2:  r = p; g = v; b = t; break;
+    case 3:  r = p; g = q; b = v; break;
+    case 4:  r = t; g = p; b = v; break;
+    default: r = v; g = p; b = q; break;
+  }
+  // clang-format on
+
+  uint32_t xrgb = (r << 16) | (g << 8) | b;
+  xrgb |= (xrgb ? 0 : 1);  // ensure never zero
+  return xrgb;
+}
+
+// Derives a per-tile trace color from the tile's workgroup xyz position so
+// tiles of the same dispatch shade into recognizable gradients.
+static uint32_t iree_task_tile_to_color(
+    const iree_task_tile_context_t* tile_context) {
+  // TODO(#4017): optimize such that it's always on when tracing is
+  // enabled by amortizing the cost across the entire slice.
+
+  // Picked to try to make it easy to see gradients from tiles along the same x,
+  // y, and z (in that order). x is the fastest changing dimension and as such
+  // should all have the same hue, while z is the slowest changing dimension and
+  // should have different hues.
+  uint8_t h = (tile_context->workgroup_xyz[1] /
+               (float)(tile_context->workgroup_count[1])) *
+              255;
+  // Scramble hue with the Fibonacci hash constant so adjacent rows differ.
+  h = (h * 11400714819323198485ull) & 0xFF;
+  uint8_t s = 100 - (tile_context->workgroup_xyz[2] /
+                     (float)(tile_context->workgroup_count[2])) *
+                        100;
+  uint8_t v = (tile_context->workgroup_xyz[0] /
+               (float)(tile_context->workgroup_count[0])) *
+                  50 +
+              50;
+  return iree_math_hsv_to_xrgb(h, s, v);
+}
+
+#else
+
+// Stub used when IREE_TASK_TRACING_PER_TILE_COLORS is disabled.
+static uint32_t iree_task_tile_to_color(
+    const iree_task_tile_context_t* tile_context) {
+  return 0;  // use default tracy colors
+}
+
+#endif  // IREE_TASK_TRACING_PER_TILE_COLORS
+
+// Accumulates |source| statistics into |target|.
+// Currently a no-op placeholder; see the TODO below.
+void iree_task_dispatch_statistics_merge(
+    const iree_task_dispatch_statistics_t* source,
+    iree_task_dispatch_statistics_t* target) {
+  // TODO(benvanik): statistics.
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH
+//==============================================================================
+
+// Shared initialization for direct and indirect dispatches: sets the closure
+// and workgroup size, zeroes local memory requirements/statistics, and (when
+// tracing) assigns a process-unique dispatch id. The workgroup count is left
+// for the caller to fill in.
+static void iree_task_dispatch_initialize_base(
+    iree_task_scope_t* scope, iree_task_dispatch_closure_t closure,
+    const uint32_t workgroup_size[3], iree_task_dispatch_t* out_task) {
+  iree_task_initialize(IREE_TASK_TYPE_DISPATCH, scope, &out_task->header);
+  out_task->closure = closure;
+  memcpy(out_task->workgroup_size, workgroup_size,
+         sizeof(out_task->workgroup_size));
+  out_task->local_memory_size = 0;
+  // Release store publishes the cleared sticky status slot.
+  iree_atomic_store_intptr(&out_task->status, 0, iree_memory_order_release);
+  memset(&out_task->statistics, 0, sizeof(out_task->statistics));
+
+  IREE_TRACE({
+    static iree_atomic_int64_t next_dispatch_id = IREE_ATOMIC_VAR_INIT(0);
+    out_task->dispatch_id = iree_atomic_fetch_add_int64(
+        &next_dispatch_id, 1ll, iree_memory_order_acq_rel);
+  });
+}
+
+// Initializes |out_task| as a direct dispatch with an inline workgroup count.
+void iree_task_dispatch_initialize(iree_task_scope_t* scope,
+                                   iree_task_dispatch_closure_t closure,
+                                   const uint32_t workgroup_size[3],
+                                   const uint32_t workgroup_count[3],
+                                   iree_task_dispatch_t* out_task) {
+  iree_task_dispatch_initialize_base(scope, closure, workgroup_size, out_task);
+  uint32_t* count_storage = out_task->workgroup_count.value;
+  memcpy(count_storage, workgroup_count,
+         sizeof(out_task->workgroup_count.value));
+}
+
+// Initializes |out_task| as an indirect dispatch whose workgroup count is
+// fetched from |workgroup_count_ptr| at issue time.
+void iree_task_dispatch_initialize_indirect(
+    iree_task_scope_t* scope, iree_task_dispatch_closure_t closure,
+    const uint32_t workgroup_size[3], const uint32_t* workgroup_count_ptr,
+    iree_task_dispatch_t* out_task) {
+  iree_task_dispatch_initialize_base(scope, closure, workgroup_size, out_task);
+  out_task->workgroup_count.ptr = workgroup_count_ptr;
+  out_task->header.flags |= IREE_TASK_FLAG_DISPATCH_INDIRECT;
+}
+
+// Issues |dispatch_task|: resolves the workgroup count (following the
+// indirection pointer for indirect dispatches), carves the tile grid into up
+// to worker_count shards allocated from |shard_task_pool|, and posts them to
+// workers via |post_batch|. The dispatch retires only after all shards
+// complete, or immediately if the grid is empty.
+void iree_task_dispatch_issue(iree_task_dispatch_t* dispatch_task,
+                              iree_task_pool_t* shard_task_pool,
+                              iree_task_submission_t* pending_submission,
+                              iree_task_post_batch_t* post_batch) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, dispatch_task->dispatch_id);
+
+  // Mark the dispatch as having been issued; the next time it retires it'll be
+  // because all work has completed.
+  dispatch_task->header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;
+
+  // Fetch the workgroup count (directly or indirectly).
+  if (dispatch_task->header.flags & IREE_TASK_FLAG_DISPATCH_INDIRECT) {
+    // By the task being ready to execute we know any dependencies on the
+    // indirection buffer have been satisfied and it's safe to read. We perform
+    // the indirection here and convert the dispatch to a direct one such that
+    // following code can read the value.
+    // TODO(benvanik): non-one-shot command buffers won't be able to do this as
+    // the intent is that they can be dynamic per execution.
+    const uint32_t* source_ptr = dispatch_task->workgroup_count.ptr;
+    memcpy(dispatch_task->workgroup_count.value, source_ptr,
+           sizeof(dispatch_task->workgroup_count.value));
+    // Clear the indirect bit now that the count is stored inline.
+    dispatch_task->header.flags ^= IREE_TASK_FLAG_DISPATCH_INDIRECT;
+  }
+  const uint32_t* workgroup_count = dispatch_task->workgroup_count.value;
+
+  IREE_TRACE({
+    char xyz_string[32];
+    int xyz_string_length =
+        snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
+                 workgroup_count[0], workgroup_count[1], workgroup_count[2]);
+    IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
+  });
+
+  // Setup the iteration space for shards to pull work from the complete grid.
+  iree_atomic_store_int32(&dispatch_task->tile_index, 0,
+                          iree_memory_order_relaxed);
+  dispatch_task->tile_count =
+      workgroup_count[0] * workgroup_count[1] * workgroup_count[2];
+
+  // Compute shard count - almost always worker_count unless we are a very small
+  // dispatch (1x1x1, etc).
+  iree_host_size_t worker_count = iree_task_post_batch_worker_count(post_batch);
+  iree_host_size_t shard_count =
+      iree_min(dispatch_task->tile_count, worker_count);
+
+  // Compute how many tiles we want each shard to reserve at a time from the
+  // larger grid. A higher number reduces overhead and improves locality while
+  // a lower number reduces maximum worst-case latency (coarser work stealing).
+  if (dispatch_task->tile_count <
+      worker_count * IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION) {
+    // Grid is small - allow it to be eagerly sliced up.
+    dispatch_task->tiles_per_reservation = 1;
+  } else {
+    dispatch_task->tiles_per_reservation =
+        IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION;
+  }
+
+  // Randomize starting worker.
+  iree_host_size_t worker_offset = iree_task_post_batch_select_worker(
+      post_batch, dispatch_task->header.affinity_set);
+  iree_host_size_t worker_index = worker_offset;
+
+  for (iree_host_size_t i = 0; i < shard_count; ++i) {
+    // Allocate and initialize the shard.
+    iree_task_dispatch_shard_t* shard_task =
+        iree_task_dispatch_shard_allocate(dispatch_task, shard_task_pool);
+
+    // Enqueue on the worker selected for the task.
+    iree_task_post_batch_enqueue(post_batch, worker_index % worker_count,
+                                 &shard_task->header);
+    ++worker_index;
+  }
+
+  // NOTE: the dispatch is not retired until all shards complete. Upon the last
+  // shard completing the lucky worker will retire the task inline and
+  // potentially queue up more ready tasks that follow.
+  //
+  // The gotcha here is that it's possible for there to be zero shards within
+  // a dispatch (if, for example, an indirect dispatch had its workgroup counts
+  // set to zero to prevent it from running). We check for that here.
+  if (shard_count == 0) {
+    iree_task_dispatch_retire(dispatch_task, pending_submission);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Retires |dispatch_task| after all of its shards have joined: folds the
+// aggregated statistics into the scope and consumes any failure status a
+// workgroup stashed during execution.
+void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
+                               iree_task_submission_t* pending_submission) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, dispatch_task->dispatch_id);
+
+  // TODO(benvanik): attach statistics to the tracy zone.
+
+  // Merge the statistics from the dispatch into the scope so we can track all
+  // of the work without tracking all the dispatches at a global level.
+  iree_task_dispatch_statistics_merge(
+      &dispatch_task->statistics,
+      &dispatch_task->header.scope->dispatch_statistics);
+
+  // Consume the status of the dispatch that may have been set from a workgroup
+  // and notify the scope. We need to do this here so that each shard retires
+  // before we discard any subsequent tasks: otherwise a failure of one shard
+  // would discard the shared dispatch task (and potentially everything) while
+  // other shards were still running. We also want to avoid fine-grained
+  // synchronization across shards that would occur by each checking to see if
+  // any other has hit an error; failure in a dispatch should be so exceedingly
+  // rare that allowing some shards to complete after one encounters an error is
+  // not a problem.
+  iree_status_t status = (iree_status_t)iree_atomic_exchange_intptr(
+      &dispatch_task->status, 0, iree_memory_order_seq_cst);
+
+  iree_task_retire(&dispatch_task->header, pending_submission, status);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SHARD
+//==============================================================================
+
+// Returns the dispatch that owns |task|. Shards reuse the completion_task
+// pointer (set in iree_task_dispatch_shard_initialize) as their parent link.
+static inline iree_task_dispatch_t* iree_task_dispatch_shard_parent(
+    iree_task_dispatch_shard_t* task) {
+  return (iree_task_dispatch_t*)task->header.completion_task;
+}
+
+// Initializes |out_task| as a shard of |dispatch_task| in the same scope.
+// Wiring the dispatch in as the completion task both delays the dispatch's
+// retirement until every shard completes and provides the parent link used by
+// iree_task_dispatch_shard_parent.
+void iree_task_dispatch_shard_initialize(iree_task_dispatch_t* dispatch_task,
+                                         iree_task_dispatch_shard_t* out_task) {
+  iree_task_initialize(IREE_TASK_TYPE_DISPATCH_SHARD,
+                       dispatch_task->header.scope, &out_task->header);
+  iree_task_set_completion_task(&out_task->header, &dispatch_task->header);
+}
+
+// Acquires a shard task for |dispatch_task| from |shard_task_pool| and
+// initializes it. Returns NULL (with the acquisition failure swallowed) if
+// the pool cannot supply a task.
+iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
+    iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool) {
+  iree_task_dispatch_shard_t* shard_task = NULL;
+  iree_status_t status =
+      iree_task_pool_acquire(shard_task_pool, (iree_task_t**)&shard_task);
+  if (!iree_status_is_ok(status)) {
+    iree_status_ignore(status);
+    return NULL;
+  }
+  iree_task_dispatch_shard_initialize(dispatch_task, shard_task);
+  // Remember the owning pool so cleanup can release the shard back to it.
+  shard_task->header.pool = shard_task_pool;
+  return shard_task;
+}
+
+// Executes one shard of a dispatch: repeatedly reserves batches of tiles from
+// the shared grid (via the dispatch's atomic tile_index), invokes the dispatch
+// closure per tile with a tile context carved from |worker_local_memory|, and
+// folds per-shard statistics back into the dispatch before retiring. Tile
+// failures are propagated to the dispatch's sticky status; the shard itself
+// always retires OK so all shards join before the dispatch handles the error.
+void iree_task_dispatch_shard_execute(
+    iree_task_dispatch_shard_t* task, iree_cpu_processor_id_t processor_id,
+    iree_byte_span_t worker_local_memory,
+    iree_task_submission_t* pending_submission) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_task_dispatch_t* dispatch_task = iree_task_dispatch_shard_parent(task);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, dispatch_task->dispatch_id);
+  IREE_TRACE_ZONE_SET_COLOR(
+      z0, iree_math_ptr_to_xrgb(dispatch_task->closure.user_context));
+
+  // Map only the requested amount of worker local memory into the tile context.
+  // This ensures that how much memory is used by some executions does not
+  // inadvertently leak over into other executions.
+  if (IREE_UNLIKELY(dispatch_task->local_memory_size >
+                    worker_local_memory.data_length)) {
+    iree_task_try_set_status(
+        &dispatch_task->status,
+        iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                         "dispatch requires %ub of local memory but only "
+                         "%zub is available per-worker",
+                         dispatch_task->local_memory_size,
+                         worker_local_memory.data_length));
+    iree_task_retire(&task->header, pending_submission, iree_ok_status());
+    IREE_TRACE_ZONE_END(z0);
+    return;
+  }
+  iree_byte_span_t local_memory = iree_make_byte_span(
+      worker_local_memory.data, dispatch_task->local_memory_size);
+
+  // Prepare context shared for all tiles in the shard.
+  iree_task_tile_context_t tile_context;
+  memcpy(&tile_context.workgroup_size, dispatch_task->workgroup_size,
+         sizeof(tile_context.workgroup_size));
+  memcpy(&tile_context.workgroup_count, dispatch_task->workgroup_count.value,
+         sizeof(tile_context.workgroup_count));
+  uint32_t workgroup_count_x = tile_context.workgroup_count[0];
+  uint32_t workgroup_count_y = tile_context.workgroup_count[1];
+  tile_context.local_memory = local_memory;
+
+  // We perform all our shard statistics work locally here and only push back to
+  // the dispatch at the end; this avoids contention from each shard trying to
+  // update the statistics together.
+  iree_task_dispatch_statistics_t shard_statistics;
+  memset(&shard_statistics, 0, sizeof(shard_statistics));
+  tile_context.statistics = &shard_statistics;
+
+  // Hint as to which processor we are running on.
+  tile_context.processor_id = processor_id;
+
+  // Loop over all tiles until they are all processed.
+  const uint32_t tile_count = dispatch_task->tile_count;
+  const uint32_t tiles_per_reservation = dispatch_task->tiles_per_reservation;
+  // Reserve a contiguous run of tile indices; other shards racing here is the
+  // work-stealing mechanism.
+  uint32_t tile_base = iree_atomic_fetch_add_int32(&dispatch_task->tile_index,
+                                                   tiles_per_reservation,
+                                                   iree_memory_order_relaxed);
+  while (tile_base < tile_count) {
+    const uint32_t tile_range =
+        iree_min(tile_base + tiles_per_reservation, tile_count);
+    for (uint32_t tile_index = tile_base; tile_index < tile_range;
+         ++tile_index) {
+      // TODO(benvanik): faster math here, especially knowing we pull off N
+      // sequential indices per reservation.
+      // Decompose the linear tile index into xyz (x fastest-varying).
+      uint32_t tile_i = tile_index;
+      tile_context.workgroup_xyz[0] = tile_i % workgroup_count_x;
+      tile_i /= workgroup_count_x;
+      tile_context.workgroup_xyz[1] = tile_i % workgroup_count_y;
+      tile_i /= workgroup_count_y;
+      tile_context.workgroup_xyz[2] = tile_i;
+
+      IREE_TRACE_ZONE_BEGIN_NAMED(z_tile,
+                                  "iree_task_dispatch_shard_execute_tile");
+      IREE_TRACE_ZONE_SET_COLOR(z_tile, iree_task_tile_to_color(&tile_context));
+
+      // NOTE: these are useful for debugging but dramatically increase our
+      // cost here; only enable if needed for tracking work distribution:
+      IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[0]);
+      IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[1]);
+      IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[2]);
+      // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, (uint64_t)task->closure.fn);
+
+      iree_status_t status =
+          dispatch_task->closure.fn(dispatch_task->closure.user_context,
+                                    &tile_context, pending_submission);
+
+      IREE_TRACE_ZONE_END(z_tile);
+
+      // If any tile fails we bail early from the loop. This doesn't match
+      // what an accelerator would do but saves some unneeded work.
+      // Note that other shards may have completed execution, be executing
+      // concurrently with this one, or still be pending - this does not
+      // have any influence on them and they may continue to execute even
+      // after we bail from here.
+      if (!iree_status_is_ok(status)) {
+        // Propagate failures to the dispatch task.
+        iree_task_try_set_status(&dispatch_task->status, status);
+        goto abort_shard;  // out of the while-for nest
+      }
+    }
+
+    // Try to grab the next slice of tiles.
+    tile_base = iree_atomic_fetch_add_int32(&dispatch_task->tile_index,
+                                            tiles_per_reservation,
+                                            iree_memory_order_relaxed);
+  }
+abort_shard:
+
+  // Push aggregate statistics up to the dispatch.
+  // Note that we may have partial information here if we errored out of the
+  // loop but that's still useful to know.
+  iree_task_dispatch_statistics_merge(&shard_statistics,
+                                      &dispatch_task->statistics);
+
+  // NOTE: even if an error was hit we retire OK - the error has already been
+  // propagated to the dispatch and it'll clean up after all shards are joined.
+  iree_task_retire(&task->header, pending_submission, iree_ok_status());
+  IREE_TRACE_ZONE_END(z0);
+}
diff --git a/runtime/src/iree/task/task.h b/runtime/src/iree/task/task.h
new file mode 100644
index 0000000..aeef180
--- /dev/null
+++ b/runtime/src/iree/task/task.h
@@ -0,0 +1,687 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_TASK_H_
+#define IREE_TASK_TASK_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomic_slist.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/base/internal/cpu.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/task/affinity_set.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_task_list_t iree_task_list_t;
+typedef struct iree_task_pool_t iree_task_pool_t;
+typedef struct iree_task_scope_t iree_task_scope_t;
+typedef struct iree_task_submission_t iree_task_submission_t;
+
+//==============================================================================
+// Task header for internal tracking
+//==============================================================================
+
+// Specifies the type of a task and how executors handle it.
+enum iree_task_type_bits_t {
+  // Task is a no-op (performs no work) and exists for flexibility.
+  IREE_TASK_TYPE_NOP = 0u,
+
+  // Task will synchronously call a function before continuing.
+  IREE_TASK_TYPE_CALL = 1u,
+
+  // Task exists only as a barrier to join/fork tasks and has no executable
+  // payload.
+  IREE_TASK_TYPE_BARRIER = 2u,
+
+  // Task is a fence indicating that a certain point in the task graph has been
+  // reached. All tasks prior to this fence (by way of happens-before
+  // dependencies) are guaranteed to have retired.
+  IREE_TASK_TYPE_FENCE = 3u,
+
+  // Task is a wait on an external wait handle (fd, HANDLE, etc).
+  // Executors will wait on the handle until it is signaled and meets the
+  // specified condition prior to readying the dependent tasks.
+  IREE_TASK_TYPE_WAIT = 4u,
+
+  // Task is a 3D grid dispatch of zero or more tiles.
+  // Dispatches are issued when ready by being split into one shard per
+  // worker that should process the dispatch.
+  //
+  // If IREE_TASK_FLAG_DISPATCH_INDIRECT is set then the dispatch reads the
+  // workgroup count from a buffer immediately prior to fan-out instead of using
+  // the values embedded in the task structure.
+  //
+  // After a dispatch has been issued the IREE_TASK_FLAG_DISPATCH_RETIRE flag is
+  // set to indicate that when the dispatch becomes ready again it will be after
+  // all shards have completed.
+  IREE_TASK_TYPE_DISPATCH = 5u,
+
+  // Task is one of potentially many shards processing a larger dispatch grid.
+  // Each shard may have a preference as to which parts of the grid it will
+  // focus on but is able to otherwise steal any available region directly
+  // from the shared dispatch coordination state. Shards retire once there are
+  // no more tiles remaining in the dispatch grid.
+  IREE_TASK_TYPE_DISPATCH_SHARD = 6u,
+};
+typedef uint8_t iree_task_type_t;
+
+enum iree_task_flag_bits_t {
+  IREE_TASK_FLAG_NONE = 0u,  // no flags set
+
+  // Indicates that a wait task is part of a wait-any operation and the
+  // cancellation flag should be latched by any wait that resolves.
+  IREE_TASK_FLAG_WAIT_ANY = 1u << 0,
+
+  // The wait handle of the wait task has been acquired and the task can be
+  // waited on with system APIs.
+  IREE_TASK_FLAG_WAIT_EXPORTED = 1u << 1,
+
+  // The wait handle the task is specified to wait on has resolved and the task
+  // can now be considered complete.
+  IREE_TASK_FLAG_WAIT_COMPLETED = 1u << 2,
+
+  // The workgroup count for the dispatch is provided by way of a pointer to a
+  // list of 3 uint32_t values that will be sampled immediately prior to
+  // issuing of the dispatch. The contents of the pointer can be safely modified
+  // up until the last dependency has completed and the dispatch is about to be
+  // issued.
+  IREE_TASK_FLAG_DISPATCH_INDIRECT = 1u << 3,
+
+  // The dispatch has been issued and the task is waiting for one or more
+  // shards to complete. After they complete the dispatch will be readied and
+  // can be retired.
+  //
+  // Though added by the executor after issuing a dispatch users can also set
+  // this to indicate that all dispatch shards for a particular dispatch have
+  // been statically scheduled. Executors will then skip issuing the dispatch
+  // and instead wait until all shards complete, enabling IREE_TASK_TYPE_BARRIER
+  // behavior but without an additional task as dispatches are still required
+  // to store information for shards.
+  IREE_TASK_FLAG_DISPATCH_RETIRE = 1u << 4,
+
+  // An error occurred at or before the task and it has been aborted.
+  // Aborted tasks may continue to execute if they're already in-flight but must
+  // not begin execution after the flag has been set.
+  //
+  // The actual error that occurred is routed to the parent task scope as it
+  // happens and may be available for querying before all tasks have been
+  // cleaned up.
+  IREE_TASK_FLAG_ABORTED = 1u << 5,
+};
+typedef uint16_t iree_task_flags_t;
+
+typedef struct iree_task_t iree_task_t;
+
+// A function called to clean up tasks.
+// Each task has its associated cleanup function called exactly once.
+// The provided |status_code| indicates the execution status of the task prior
+// to cleanup and will usually be IREE_STATUS_OK indicating the task was
+// successfully issued or IREE_STATUS_ABORTED if the task was discarded prior
+// to issuing.
+typedef void(IREE_API_PTR* iree_task_cleanup_fn_t)(
+    iree_task_t* task, iree_status_code_t status_code);
+
+// A task within the task system that runs on an executor.
+// Tasks have an iree_task_type_t that defines which parameters are valid and
+// how the executor is to treat the task. Dependency edges can be defined that
+// determine the execution order of tasks within the executors.
+struct iree_alignas(iree_max_align_t) iree_task_t {
+  // Intrusive pointer used to store tasks within iree_task_list_t and
+  // iree_atomic_task_list_t singly-linked lists. This must come first in the
+  // structure so that it is at the appropriate alignment.
+  iree_task_t* next_task;
+
+  // The scope this task is attributed to. Errors with the task will be
+  // propagated to the scope and errors in the scope will cause pending tasks to
+  // be skipped.
+  iree_task_scope_t* scope;
+
+  // Optional function to call to clean up the task on completion.
+  // Will be called after the task has retired or if the task fails to issue
+  // (dependency failed, etc).
+  iree_task_cleanup_fn_t cleanup_fn;
+
+  // Optional task that will be notified when the task completes.
+  // The task will have its pending_dependency_count decremented and will be
+  // readied for execution when the count reaches 0.
+  iree_task_t* completion_task;
+
+  // Specifies which workers will be used to execute this task.
+  // Forked tasks will inherit their parent task affinity (possibly with some
+  // task-dependent rules) to partition workloads across workers with knowledge
+  // of the specific work being performed. For example, some dispatches can be
+  // limited to run on certain microarchitectures that workers have affinity
+  // with at the OS scheduler level (such as little.BIG topologies).
+  iree_task_affinity_set_t affinity_set;
+
+  // Total number of tasks this task depends on that are still outstanding.
+  // Decremented each time one of those dependencies completes. The task is
+  // considered ready to execute when this value reaches 0.
+  iree_atomic_int32_t pending_dependency_count;
+
+  // Optional pool the task should be returned to after it has resolved. If the
+  // task was allocated as part of a larger data structure (embedded within
+  // an arena for example) then this can be NULL to prevent the task system
+  // from interfering.
+  iree_task_pool_t* pool;
+
+  // Specifies the type of the task and how the executor handles it.
+  iree_task_type_t type;
+
+  // Task-specific flag bits.
+  iree_task_flags_t flags;
+};
+static_assert(offsetof(iree_task_t, next_task) == 0,
+              "next_task intrusive pointer must be at offset 0");
+static_assert(sizeof(iree_task_t) <= 64,
+              "the task header greatly influences pool sizes due to alignment "
+              "requirements and should be kept tiny");
+
+// Initializes a task header with the given type.
+// Must be called on all tasks to ensure proper dependency tracking and list
+// state prior to enqueuing. Only the task header structure is initialized and
+// any additional data as part of the wrapping task type must be initialized by
+// the caller.
+void iree_task_initialize(iree_task_type_t type, iree_task_scope_t* scope,
+                          iree_task_t* out_task);
+
+// Sets the optional function called when the task completes (whether successful
+// or not). The cleanup function will receive a status indicating whether the
+// cleanup is from expected execution as the task retires (IREE_STATUS_OK)
+// or because it was aborted (IREE_STATUS_ABORTED).
+void iree_task_set_cleanup_fn(iree_task_t* task,
+                              iree_task_cleanup_fn_t cleanup_fn);
+
+// Sets up a dependency edge from |task| to |completion_task| such that when
+// |task| completes |completion_task| will be notified and have its
+// pending_dependency_count decremented.
+void iree_task_set_completion_task(iree_task_t* task,
+                                   iree_task_t* completion_task);
+
+// Returns true if the |task| is ready to execute immediately.
+// Though this is safe to call from any thread, the test may have
+// false-negatives (ready tasks are not returned as ready) due to cross-thread
+// synchronization latency. Note that tasks may yield themselves during
+// execution and switch from ready to waiting (such as when an indirect
+// dispatch needs to wait for all tiles to complete).
+bool iree_task_is_ready(iree_task_t* task);
+
+// Discards the task and any dependent tasks.
+// Any dependent tasks that need to be discarded will be added to
+// |discard_worklist| for the caller to continue discarding.
+void iree_task_discard(iree_task_t* task, iree_task_list_t* discard_worklist);
+
+//==============================================================================
+// IREE_TASK_TYPE_NOP
+//==============================================================================
+
+// Task is a no-op (performs no work) and exists for flexibility.
+// NOP tasks can be used to link together task lists from multiple threads
+// where it may otherwise not be ideal to have heavy-weight concurrency
+// structures. NOP tasks can also be useful for neutering another task type
+// after it has already been recorded into a list, such as when cancellations
+// occur.
+typedef iree_alignas(iree_max_align_t) struct {
+  // Task header: implementation detail, do not use.
+  iree_task_t header;
+} iree_task_nop_t;
+
+void iree_task_nop_initialize(iree_task_scope_t* scope,
+                              iree_task_nop_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_CALL
+//==============================================================================
+
+typedef iree_status_t(IREE_API_PTR* iree_task_call_closure_fn_t)(
+    void* user_context, iree_task_t* task,
+    iree_task_submission_t* pending_submission);
+
+// A function closure representing the function to call and its arguments.
+typedef struct iree_task_call_closure_t {
+  // Function invoked once when the call task executes.
+  iree_task_call_closure_fn_t fn;
+
+  // Opaque pointer to a user-provided data structure.
+  // No lifetime management is performed by the task system and it is required
+  // that users ensure that the memory referenced is live until after the task
+  // has completed.
+  void* user_context;
+
+  // TODO(benvanik): cleanup function? right now assume arg is never freed.
+} iree_task_call_closure_t;
+
+// Binds a function pointer and the arguments it should be called with.
+// If the arguments represent pointers they must remain live until the task
+// has completed execution.
+static inline iree_task_call_closure_t iree_task_make_call_closure(
+    iree_task_call_closure_fn_t fn, void* user_context) {
+  iree_task_call_closure_t closure = {fn, user_context};
+  return closure;
+}
+
+// A task that will synchronously call a function from the executor and wait
+// for it to complete before continuing.
+//
+// Memory referenced by closure arguments must be kept valid until the function
+// executes (in general with the same lifetime as the task itself).
+typedef iree_alignas(iree_max_align_t) struct {
+  // Task header: implementation detail, do not use.
+  iree_task_t header;
+
+  // Function closure to call when the task is executed.
+  iree_task_call_closure_t closure;
+
+  // Resulting status from the call available once all nested tasks have
+  // completed (or would have completed). It's possible for a call to nest
+  // additional work under it and then return a failure; to ensure we don't
+  // discard the root call while the nested tasks are still executing we set the
+  // status here and wait for the nested tasks to complete. We'll try not to
+  // issue work that was enqueued while the call was executing but it's possible
+  // for work to come from other angles and we need to err on the side of
+  // safety.
+  iree_atomic_intptr_t status;
+} iree_task_call_t;
+
+void iree_task_call_initialize(iree_task_scope_t* scope,
+                               iree_task_call_closure_t closure,
+                               iree_task_call_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_BARRIER
+//==============================================================================
+
+// A join point for fork/join-style scheduling.
+// References a set of dependent tasks that will be notified and possibly
+// readied when the barrier is reached.
+//
+// This allows for modeling one-to-many and many-to-many relationships. The base
+// task dependency system only models one-to-one and should be used if possible
+// to avoid the additional overhead of a barrier task both in memory and task
+// indirection/queuing.
+//
+// Example:
+//  * [A] -> Barrier -> [C, D]
+//  - A executes
+//  - Barrier is processed after A completes
+//  - C and D execute concurrently (in any order)
+//
+//  * [A, B] -> Barrier -> [C, D]
+//  - A and B execute concurrently (in any order)
+//  - Barrier is processed after both A and B complete
+//  - C and D execute concurrently
+//
+//  * [A] -> Barrier -> [B]
+//  - Don't do this and use the base task dependency instead; it'll work, but
+//    it's much better to avoid the additional barrier indirection when
+//    possible.
+typedef iree_alignas(iree_max_align_t) struct {
+  // Task header: implementation detail, do not use.
+  iree_task_t header;
+
+  // Number of valid tasks in the dependent_tasks list.
+  iree_host_size_t dependent_task_count;
+  // The [0, dependent_task_count) tasks that will be notified when the barrier
+  // is reached. Each task will have its pending_dependency_count decremented
+  // and when the count reaches 0 be added to the ready list.
+  iree_task_t* const* dependent_tasks;
+} iree_task_barrier_t;
+
+void iree_task_barrier_initialize(iree_task_scope_t* scope,
+                                  iree_host_size_t dependent_task_count,
+                                  iree_task_t* const* dependent_tasks,
+                                  iree_task_barrier_t* out_task);
+
+void iree_task_barrier_initialize_empty(iree_task_scope_t* scope,
+                                        iree_task_barrier_t* out_task);
+
+void iree_task_barrier_set_dependent_tasks(
+    iree_task_barrier_t* task, iree_host_size_t dependent_task_count,
+    iree_task_t* const* dependent_tasks);
+
+//==============================================================================
+// IREE_TASK_TYPE_FENCE
+//==============================================================================
+
+// A fence indicating that a certain point in the task graph has been reached.
+// All tasks prior to this fence (by way of happens-before dependencies) are
+// guaranteed to have retired.
+//
+// When all of the dependencies of a fence have retired, the fence will notify
+// the parent scope of the task by decrementing the pending_submissions count
+// and publishing an idle_notification if it was the last in-flight submission.
+//
+// An optional platform primitive may be provided to signal in a way determined
+// by the primitive type via iree_event_set.
+typedef iree_alignas(iree_max_align_t) struct {
+  // Task header: implementation detail, do not use.
+  iree_task_t header;
+
+  // An optional wait primitive to signal when the fence is hit.
+  // If iree_wait_primitive_immediate then no signal will be made.
+  iree_wait_primitive_t signal_handle;
+} iree_task_fence_t;
+
+// Initializes a fence in |out_task| that demarcates activity in a |scope|.
+// An optional unowned |signal_handle| can be provided that will be signaled
+// with iree_event_set when the fence is reached.
+void iree_task_fence_initialize(iree_task_scope_t* scope,
+                                iree_wait_primitive_t signal_handle,
+                                iree_task_fence_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_WAIT
+//==============================================================================
+
+// A task representing either a delay until a point in time or a wait on a wait
+// source external to the task system.
+//
+// Waits are modeled in the task graph to enable reducing the number of times a
+// full system wait is required by only beginning the wait when the task
+// dependencies have completed. Wait sources will be eagerly queried and
+// exported to wait handles when the task system would otherwise go idle. All
+// wait sources from all pending wait tasks will be accumulated into a wait set
+// and waited on in a single syscall.
+//
+// Waits will block the completion task until the wait resolves successfully or
+// the deadline is reached or exceeded.
+//
+// Sleeps (where wait_source is iree_wait_source_delay) will delay the
+// completion task until the delay time is reached or exceeded and will do so
+// without triggering an IREE_STATUS_DEADLINE_EXCEEDED.
+//
+// Wait-all behavior can be modeled with multiple wait tasks joined on one task;
+// all of the waits must successfully resolve prior to the completion task being
+// issued. If any wait fails then the scope is failed.
+//
+// Wait-any behavior can be modeled with multiple wait tasks joined on one task
+// as with wait-all but with each sharing a cancellation flag and having the
+// IREE_TASK_FLAG_WAIT_ANY bit set. If any wait successfully resolves or fails
+// the flag will be set to cancel all sibling waits. The cancellation flag must
+// be owned by the completion task to ensure that it is live for the lifetime of
+// all wait tasks sharing it. In more sophisticated scenarios the cancellation
+// flag may be owned by anything in the system that can guarantee the lifetime,
+// enabling cancellation actions from external code.
+//
+// Non-failing deadlines can be implemented with a wait-any on one or more wait
+// sources as well as on a delay task: if the delay task is resolved before any
+// of the other waits they will be cancelled and the completion task will be
+// issued without an IREE_STATUS_DEADLINE_EXCEEDED being emitted.
+typedef iree_alignas(iree_max_align_t) struct {
+  // Task header: implementation detail, do not use.
+  iree_task_t header;
+
+  // The wait source that the task is waiting on.
+  // May be iree_wait_source_immediate if the wait is neutered or
+  // iree_wait_source_delay if this is a delay (sleep).
+  iree_wait_source_t wait_source;
+
+  // Deadline for the wait; if this time elapses the wait will be failed with
+  // IREE_STATUS_DEADLINE_EXCEEDED. May be IREE_TIME_INFINITE_FUTURE to indicate
+  // that the wait has no deadline.
+  iree_time_t deadline_ns;
+
+  // Optional pointer to a shared cancellation flag.
+  // Set to non-zero to have the wait cancel and issue the completion task as if
+  // it had successfully waited. No error will be raised and the completion task
+  // will need to handle the wake. This is used to model wait-any behavior where
+  // multiple waits can be issued but if any one resolves all waits are silently
+  // cancelled.
+  //
+  // The flag memory must remain valid until all waits sharing it have retired.
+  // For a wait-any it would commonly be stored on the completion task to ensure
+  // that no wait tasks will be live when it is cleaned up.
+  //
+  // If omitted no cancellation behavior is enabled.
+  // If specified the wait task will check the flag prior to entering a system
+  // wait scope. Cancellation does not impact waits once the system is entered.
+  // If the IREE_TASK_FLAG_WAIT_ANY bit is set on the task the cancellation flag
+  // will be set to non-zero after it resolves in order to cancel the sibling
+  // waits in the wait-any operation.
+  iree_atomic_int32_t* cancellation_flag;
+} iree_task_wait_t;
+
+// Initializes |out_task| as a wait task on |wait_source|.
+// The wait will fail with IREE_STATUS_DEADLINE_EXCEEDED if |deadline_ns| is
+// exceeded prior to the wait resolving. If the wait fails (system error, etc)
+// the failure will be propagated to the |scope|.
+void iree_task_wait_initialize(iree_task_scope_t* scope,
+                               iree_wait_source_t wait_source,
+                               iree_time_t deadline_ns,
+                               iree_task_wait_t* out_task);
+
+// Initializes |out_task| as a delay until the given |deadline_ns| is reached or
+// exceeded. The completion task will be issued instead of failing with an
+// IREE_STATUS_DEADLINE_EXCEEDED.
+void iree_task_wait_initialize_delay(iree_task_scope_t* scope,
+                                     iree_time_t deadline_ns,
+                                     iree_task_wait_t* out_task);
+
+// Sets the wait |task| to a cooperative wait-any mode by marking the
+// IREE_TASK_FLAG_WAIT_ANY bit and storing the |cancellation_flag|.
+// The cancellation flag must be kept live until after the wait task has
+// retired.
+void iree_task_wait_set_wait_any(iree_task_wait_t* task,
+                                 iree_atomic_int32_t* cancellation_flag);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_* structures
+//==============================================================================
+
+// Statistics tracked across an entire dispatch operation.
+// Each tile contributes to these statistics as they execute to provide an
+// aggregate set of statistics that can be reported to tracing/user queries.
+//
+// We want to keep this structure relatively compact as it does add overhead.
+// If statistics are used purely for interactive tracing then they can be
+// piped directly to the tracing tool using IREE_TRACE_* macros. If the
+// statistics are programmatically queried for benchmarks or reporting then
+// they belong here where we can efficiently move them around.
+//
+// If we find ourselves with a lot of hardware-specific counters (vs more
+// generic ones like 'l2 cache misses' or 'ipc') then we can sprinkle in some
+// #ifdefs.
+typedef struct iree_task_dispatch_statistics_t {
+  // TODO(benvanik): statistics counters.
+  // NOTE: each of these increases the command buffer storage requirements; we
+  // should always guard these with IREE_STATISTICS_ENABLE.
+  iree_atomic_int32_t reserved;  // placeholder until counters are added
+} iree_task_dispatch_statistics_t;
+
+// Merges statistics from |source| to |target| atomically per-field.
+// As each field is updated independently and in a relaxed memory order it's
+// possible for statistics consumers to see a tear.
+void iree_task_dispatch_statistics_merge(
+    const iree_task_dispatch_statistics_t* source,
+    iree_task_dispatch_statistics_t* target);
+
+typedef struct iree_task_tile_storage_t {
+  // TODO(benvanik): coroutine storage.
+  // Ideally we'll be able to have a fixed coroutine storage size per dispatch
+  // (via @llvm.coro.size) such that we can preallocate all of the storage for
+  // a dispatch in one shot. If we need to do dynamic allocation we will need a
+  // ringbuffer or other kind of pool to allocate from on-demand.
+  uint32_t reserved;  // placeholder until coroutine storage is added
+} iree_task_tile_storage_t;
+
+// Per-tile context provided to each dispatch function invocation in the grid.
+// This information is unique to the tile being dispatched and may contain
+// specific state about the calling thread/fiber/etc.
+//
+// If tile execution is suspended by hitting a coroutine suspend point then the
+// coroutine state will be stored within the tile context until the tile is
+// resumed.
+typedef iree_alignas(iree_max_align_t) struct {
+  // Workgroup ID for the current invocation.
+  uint32_t workgroup_xyz[3];
+  // Workgroup size for each invocation.
+  uint32_t workgroup_size[3];
+  // Total workgroup count for the task. Can be used in conjunction with the
+  // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
+  uint32_t workgroup_count[3];
+  // TODO(benvanik): workgroup index to amortize calculating linear offsets.
+  // (like gl_GlobalInvocationID)
+
+  // Opaque ID of the processor executing the tile.
+  // May be slightly out of date or 0 if the processor could not be queried.
+  iree_cpu_processor_id_t processor_id;
+
+  // Tile-local memory that is pinned to each worker ensuring no cache
+  // thrashing. Aligned to at least the natural pointer size of the machine.
+  // Contents are (today) undefined upon entry.
+  iree_byte_span_t local_memory;
+
+  // Shared statistics counters for the dispatch shard.
+  iree_task_dispatch_statistics_t* statistics;
+} iree_task_tile_context_t;
+
+typedef struct iree_task_dispatch_t iree_task_dispatch_t;
+
+//==============================================================================
+// Dispatch function closures
+//==============================================================================
+
+typedef iree_status_t(IREE_API_PTR* iree_task_dispatch_closure_fn_t)(
+    void* user_context, const iree_task_tile_context_t* tile_context,
+    iree_task_submission_t* pending_submission);
+
+// A function closure representing the per-tile function and its arguments.
+typedef struct iree_task_dispatch_closure_t {
+  // Function called per tile invocation.
+  iree_task_dispatch_closure_fn_t fn;
+
+  // User-defined argument passed to task functions during invocation.
+  // Opaque pointer-sized values that could point to user data structures or
+  // contain embedded values. No lifetime management is performed by the task
+  // system and it is required that users ensure that the memory referenced is
+  // live until after the task has completed.
+  void* user_context;
+} iree_task_dispatch_closure_t;
+
+// Binds a function pointer and the arguments it should be called with.
+// If the arguments represent pointers they must remain live until the task
+// has completed execution.
+static inline iree_task_dispatch_closure_t iree_task_make_dispatch_closure(
+    iree_task_dispatch_closure_fn_t fn, void* user_context) {
+  iree_task_dispatch_closure_t closure = {fn, user_context};
+  return closure;
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH
+//==============================================================================
+
+// An execution request across a tiled grid.
+// Dispatches are fork points where zero or more dispatch shard tasks are
+// spawned and processed prior to joining again on the dispatch completion task.
+//
+// The total workgroup count defines the [x,y,z] extents of the dispatch grid.
+// The count may either be embedded directly into the dispatch or provided as a
+// pointer to the workgroup_count[3] that will be read immediately prior to
+// forking. If any dimension of the workgroup count is zero then the dispatch is
+// skipped and the completion task will be readied immediately.
+//
+// Example:
+//   dispatch([6, 1, 1])
+//     forked into shards based on affinity/scheduling parameters:
+//     -> dispatch_shard for core 0, processes [0-1, 1, 1]
+//     -> dispatch_shard for core 1, processes [2-3, 1, 1]
+//     -> dispatch_shard for core 2, processes [4-5, 1, 1]
+//   completion_task run after all shards complete
+typedef iree_alignas(iree_max_align_t) struct iree_task_dispatch_t {
+  // Task header: implementation detail, do not use.
+  iree_task_t header;
+
+  // Function closure to call per tile.
+  iree_task_dispatch_closure_t closure;
+
+  // Workgroup size for each invocation. Passed on to tiles without
+  // modification and not used for scheduling.
+  uint32_t workgroup_size[3];
+
+  // 3D workgroup count used to tile the dispatch.
+  // [1,1,1] specifies single invocation of the function. A value of 0 in
+  // any dimension will skip execution of the function.
+  union {
+    // Embedded immutable 3D workgroup count value.
+    uint32_t value[3];
+    // Pointer to the uint32_t[3] containing the 3D workgroup count.
+    // Sampled immediately prior to execution.
+    const uint32_t* ptr;
+  } workgroup_count;
+
+  // Optional transient shared memory size in bytes to allocate and pass into
+  // the iree_task_tile_context_t::local_memory of each invocation of the
+  // dispatch closure.
+  uint32_t local_memory_size;
+
+  // Resulting status from the dispatch available once all workgroups have
+  // completed (or would have completed). If multiple shards processing the
+  // workgroups hit an error the first will be taken and the rest ignored. A
+  // dispatch with a non-ok status will mark the parent task scope as failing
+  // when it retires.
+  iree_atomic_intptr_t status;
+
+  // Statistics storage used for aggregating counters across all shards.
+  iree_task_dispatch_statistics_t statistics;
+
+  // The total number of tiles in the dispatch bounding tile_index.
+  uint32_t tile_count;
+
+  // Maximum number of tiles to fetch per tile reservation from the grid.
+  // Bounded by IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION and a
+  // reasonable number chosen based on the tile and shard counts.
+  uint32_t tiles_per_reservation;
+
+  // The tail tile index; the next reservation will start from here.
+  // This is used by shards to slice off the work to perform in their inner
+  // loop. Ideally we'd have no destructive interference with other shared data
+  // in this structure but the shared parts (status/statistics) are updated once
+  // per shard instead of once per slice and are less of a concern.
+  iree_atomic_int32_t tile_index;
+
+  // Incrementing process-lifetime dispatch identifier.
+  IREE_TRACE(int64_t dispatch_id;)
+} iree_task_dispatch_t;
+
+void iree_task_dispatch_initialize(iree_task_scope_t* scope,
+                                   iree_task_dispatch_closure_t closure,
+                                   const uint32_t workgroup_size[3],
+                                   const uint32_t workgroup_count[3],
+                                   iree_task_dispatch_t* out_task);
+
+void iree_task_dispatch_initialize_indirect(
+    iree_task_scope_t* scope, iree_task_dispatch_closure_t closure,
+    const uint32_t workgroup_size[3], const uint32_t* workgroup_count_ptr,
+    iree_task_dispatch_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SHARD
+//==============================================================================
+
+typedef iree_alignas(iree_max_align_t) struct {
+  // Task header: implementation detail, do not use.
+  iree_task_t header;
+
+  // NOTE: the parent dispatch task this shard is applied to is stored in
+  // the header.completion_task field.
+} iree_task_dispatch_shard_t;
+
+void iree_task_dispatch_shard_initialize(iree_task_dispatch_t* dispatch_task,
+                                         iree_task_dispatch_shard_t* out_task);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_TASK_H_
diff --git a/runtime/src/iree/task/task_impl.h b/runtime/src/iree/task/task_impl.h
new file mode 100644
index 0000000..ee1b5a3
--- /dev/null
+++ b/runtime/src/iree/task/task_impl.h
@@ -0,0 +1,132 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_TASK_IMPL_H_
+#define IREE_TASK_TASK_IMPL_H_
+
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/post_batch.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//==============================================================================
+// IREE_TASK_TYPE_NOP
+//==============================================================================
+
+// Retires a no-op task.
+// No-op tasks don't *do* anything but must still be handled like any other
+// task in the system so dependent tasks are properly scheduled.
+void iree_task_nop_retire(iree_task_nop_t* task,
+                          iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_CALL
+//==============================================================================
+
+// Executes and retires a user call.
+// May block the caller for an indeterminate amount of time and should only be
+// called from threads owned by or donated to the executor.
+//
+// Errors are propagated to the parent scope.
+void iree_task_call_execute(iree_task_call_t* task,
+                            iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_BARRIER
+//==============================================================================
+
+// Retires a barrier task by notifying all dependent tasks.
+// May add zero or more tasks to the |pending_submission| if they are ready.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_barrier_retire(iree_task_barrier_t* task,
+                              iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_FENCE
+//==============================================================================
+
+// Retires a fence task by updating the state of its scope.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_fence_retire(iree_task_fence_t* task,
+                            iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_WAIT
+//==============================================================================
+
+// Returns true if the user-specified wait condition on the task is satisfied.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+bool iree_task_wait_check_condition(iree_task_wait_t* task);
+
+// Retires a wait when it has completed waiting (successfully or not).
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_wait_retire(iree_task_wait_t* task,
+                           iree_task_submission_t* pending_submission,
+                           iree_status_t status);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH
+//==============================================================================
+
+// Schedules a dispatch by forking out to zero or more shards that will be
+// executed on workers. The shards are allocated from an executor-owned pool
+// and are generally not user-visible - they'll just see their dispatch begin
+// execution prior to the shards and end execution after the last shard
+// finishes.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_dispatch_issue(iree_task_dispatch_t* dispatch_task,
+                              iree_task_pool_t* shard_task_pool,
+                              iree_task_submission_t* pending_submission,
+                              iree_task_post_batch_t* post_batch);
+
+// Retires a dispatch when all issued shards have completed executing.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
+                               iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SHARD
+//==============================================================================
+
+// Allocates a dispatch shard task from the shared executor task pool.
+// The shard will be released back to the pool when it has completed execution.
+iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
+    iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool);
+
+// Executes and retires a dispatch shard task.
+// May block the caller for an indeterminate amount of time and should only be
+// called from threads owned by or donated to the executor.
+//
+// |processor_id| is a guess as to which logical processor the shard is
+// executing on. It may be out of date or 0 if the processor could not be
+// queried.
+//
+// |worker_local_memory| is a block of memory exclusively available to the shard
+// during execution. Contents are undefined both before and after execution.
+//
+// Errors are propagated to the parent scope and the dispatch will fail once
+// all shards have completed.
+void iree_task_dispatch_shard_execute(
+    iree_task_dispatch_shard_t* task, iree_cpu_processor_id_t processor_id,
+    iree_byte_span_t worker_local_memory,
+    iree_task_submission_t* pending_submission);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_TASK_IMPL_H_
diff --git a/runtime/src/iree/task/task_test_barrier.cc b/runtime/src/iree/task/task_test_barrier.cc
new file mode 100644
index 0000000..135f63f
--- /dev/null
+++ b/runtime/src/iree/task/task_test_barrier.cc
@@ -0,0 +1,323 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <atomic>
+#include <cstdint>
+
+#include "iree/base/api.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+#include "iree/task/testing/task_test.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+using iree::Status;
+using iree::StatusCode;
+using iree::testing::status::StatusIs;
+
+class TaskBarrierTest : public TaskTest {};
+
+enum {
+  TASK_A = 1 << 0,
+  TASK_B = 1 << 1,
+  TASK_C = 1 << 2,
+  TASK_D = 1 << 3,
+};
+
+// Tracks which tasks executed; each task sets its TASK_* bit exactly once.
+struct TaskCtx {
+  std::atomic<uint32_t> tasks_called = {0};  // bitmask of TASK_* values
+};
+
+#define MAKE_CALL_TASK_CLOSURE(task_ctx, task_id, status_code) \
+  iree_task_make_call_closure(                                 \
+      [](void* user_context, iree_task_t* task,                \
+         iree_task_submission_t* pending_submission) {         \
+        IREE_TRACE_SCOPE();                                    \
+        auto* ctx = (TaskCtx*)user_context;                    \
+        EXPECT_EQ(0, (ctx->tasks_called & (task_id)));         \
+        ctx->tasks_called |= (task_id);                        \
+        return iree_status_from_code(status_code);             \
+      },                                                       \
+      (void*)task_ctx)
+
+// Issues a standalone barrier that has no dependent tasks:
+//  { barrier }
+TEST_F(TaskBarrierTest, IssueStandalone) {
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize_empty(&scope_, &barrier_task);
+  IREE_ASSERT_OK(
+      SubmitTasksAndWaitIdle(&barrier_task.header, &barrier_task.header));
+}
+
+// Issues a serialized two-task sequence through a barrier:
+//  { a | barrier | b }
+TEST_F(TaskBarrierTest, IssueSequence) {
+  IREE_TRACE_SCOPE();
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_OK),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
+
+  iree_task_t* dependent_tasks[1] = {&task_b.header};
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks),
+                               dependent_tasks, &barrier_task);
+  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_b.header));
+  EXPECT_EQ(TASK_A | TASK_B, task_ctx.tasks_called);
+}
+
+// Issues a serialized sequence where task A fails:
+//  { a | barrier | b }
+// B must not run: the barrier observes A's failure and aborts.
+TEST_F(TaskBarrierTest, IssueSequenceFailure) {
+  IREE_TRACE_SCOPE();
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_DATA_LOSS),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
+
+  iree_task_t* dependent_tasks[1] = {&task_b.header};
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks),
+                               dependent_tasks, &barrier_task);
+  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_b.header));
+  EXPECT_EQ(TASK_A, task_ctx.tasks_called);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+// Issues a deeply serialized sequence where task A fails:
+//  { a | barrier | b | barrier | c }
+// Neither B nor C should run: A's failure cascades through both barriers.
+TEST_F(TaskBarrierTest, IssueDeepSequenceFailure) {
+  IREE_TRACE_SCOPE();
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_DATA_LOSS),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
+  iree_task_call_t task_c;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
+
+  iree_task_t* dependent_tasks_0[1] = {&task_b.header};
+  iree_task_barrier_t barrier_task_0;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks_0),
+                               dependent_tasks_0, &barrier_task_0);
+  iree_task_set_completion_task(&task_a.header, &barrier_task_0.header);
+
+  iree_task_t* dependent_tasks_1[1] = {&task_c.header};
+  iree_task_barrier_t barrier_task_1;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks_1),
+                               dependent_tasks_1, &barrier_task_1);
+  iree_task_set_completion_task(&task_b.header, &barrier_task_1.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_c.header));
+  EXPECT_EQ(TASK_A, task_ctx.tasks_called);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+// Issues a three-way join through a barrier:
+//  { a, b, c | barrier | d }
+TEST_F(TaskBarrierTest, IssueJoin) {
+  IREE_TRACE_SCOPE();
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_OK),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
+  iree_task_call_t task_c;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
+  iree_task_call_t task_d;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D, IREE_STATUS_OK),
+      &task_d);
+
+  iree_task_t* dependent_tasks[1] = {&task_d.header};
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks),
+                               dependent_tasks, &barrier_task);
+  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+  iree_task_set_completion_task(&task_b.header, &barrier_task.header);
+  iree_task_set_completion_task(&task_c.header, &barrier_task.header);
+
+  iree_task_submission_t submission;
+  iree_task_submission_initialize(&submission);
+  iree_task_submission_enqueue(&submission, &task_a.header);
+  iree_task_submission_enqueue(&submission, &task_b.header);
+  iree_task_submission_enqueue(&submission, &task_c.header);
+  IREE_ASSERT_OK(SubmitAndWaitIdle(&submission, &task_d.header));
+  EXPECT_EQ(TASK_A | TASK_B | TASK_C | TASK_D, task_ctx.tasks_called);
+}
+
+// Issues a join where a dependent task B fails:
+//  { a, b, c | barrier | d }
+// A, B, and C all run; the barrier then fails so D must not run.
+TEST_F(TaskBarrierTest, IssueJoinFailure) {
+  IREE_TRACE_SCOPE();
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_OK),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_DATA_LOSS),
+      &task_b);
+  iree_task_call_t task_c;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
+  iree_task_call_t task_d;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D, IREE_STATUS_OK),
+      &task_d);
+
+  iree_task_t* dependent_tasks[1] = {&task_d.header};
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks),
+                               dependent_tasks, &barrier_task);
+  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+  iree_task_set_completion_task(&task_b.header, &barrier_task.header);
+  iree_task_set_completion_task(&task_c.header, &barrier_task.header);
+
+  iree_task_submission_t submission;
+  iree_task_submission_initialize(&submission);
+  iree_task_submission_enqueue(&submission, &task_a.header);
+  iree_task_submission_enqueue(&submission, &task_b.header);
+  iree_task_submission_enqueue(&submission, &task_c.header);
+  IREE_ASSERT_OK(SubmitAndWaitIdle(&submission, &task_d.header));
+  EXPECT_EQ(TASK_A | TASK_B | TASK_C, task_ctx.tasks_called);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+// Issues a one-to-many fork through a barrier:
+//  { a | barrier | b, c, d | nop }
+TEST_F(TaskBarrierTest, IssueFork) {
+  IREE_TRACE_SCOPE();
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_OK),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
+  iree_task_call_t task_c;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
+  iree_task_call_t task_d;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D, IREE_STATUS_OK),
+      &task_d);
+
+  iree_task_t* dependent_tasks[3] = {
+      &task_b.header,
+      &task_c.header,
+      &task_d.header,
+  };
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks),
+                               dependent_tasks, &barrier_task);
+  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+
+  // Nop tail task so the test has a single task to wait on.
+  iree_task_nop_t nop_task;
+  iree_task_nop_initialize(&scope_, &nop_task);
+  iree_task_set_completion_task(&task_b.header, &nop_task.header);
+  iree_task_set_completion_task(&task_c.header, &nop_task.header);
+  iree_task_set_completion_task(&task_d.header, &nop_task.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &nop_task.header));
+  EXPECT_EQ(TASK_A | TASK_B | TASK_C | TASK_D, task_ctx.tasks_called);
+}
+
+// Issues a fork where task A fails:
+//  { a (fails) | barrier | b, c, d | nop }
+// The barrier should fail and none of the subsequent tasks B, C, D should run.
+TEST_F(TaskBarrierTest, IssueForkFailure) {
+  IREE_TRACE_SCOPE();
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_DATA_LOSS),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
+  iree_task_call_t task_c;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
+  iree_task_call_t task_d;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D, IREE_STATUS_OK),
+      &task_d);
+
+  iree_task_t* dependent_tasks[3] = {
+      &task_b.header,
+      &task_c.header,
+      &task_d.header,
+  };
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks),
+                               dependent_tasks, &barrier_task);
+  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+
+  // Nop tail task so the test has a single task to wait on.
+  iree_task_nop_t nop_task;
+  iree_task_nop_initialize(&scope_, &nop_task);
+  iree_task_set_completion_task(&task_b.header, &nop_task.header);
+  iree_task_set_completion_task(&task_c.header, &nop_task.header);
+  iree_task_set_completion_task(&task_d.header, &nop_task.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &nop_task.header));
+  EXPECT_EQ(TASK_A, task_ctx.tasks_called);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/task_test_call.cc b/runtime/src/iree/task/task_test_call.cc
new file mode 100644
index 0000000..5572fac
--- /dev/null
+++ b/runtime/src/iree/task/task_test_call.cc
@@ -0,0 +1,333 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+
+#include "iree/base/api.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+#include "iree/task/testing/task_test.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+using iree::Status;
+using iree::StatusCode;
+using iree::testing::status::StatusIs;
+
+class TaskCallTest : public TaskTest {};
+
+// Tests issuing a single call task and waiting for it to run to completion.
+TEST_F(TaskCallTest, Issue) {
+  IREE_TRACE_SCOPE();
+
+  struct TestCtx {
+    int did_call = 0;  // incremented once per closure invocation
+  };
+  TestCtx ctx;
+
+  iree_task_call_t task;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  IREE_TRACE_SCOPE();
+                                  auto* ctx = (TestCtx*)user_context;
+                                  EXPECT_TRUE(NULL != ctx);
+                                  EXPECT_EQ(0, ctx->did_call);
+                                  ++ctx->did_call;
+                                  return iree_ok_status();
+                                },
+                                (void*)&ctx),
+                            &task);
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+  EXPECT_EQ(1, ctx.did_call);
+  IREE_EXPECT_OK(iree_task_scope_consume_status(&scope_));
+}
+
+// Tests issuing a single call whose closure returns a failure status.
+// The failure should be propagated back on the task scope.
+TEST_F(TaskCallTest, IssueFailure) {
+  IREE_TRACE_SCOPE();
+
+  struct TestCtx {
+    int did_call = 0;  // incremented once per closure invocation
+  };
+  TestCtx ctx;
+
+  // Call successfully issues but fails with some user error.
+  iree_task_call_t task;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  IREE_TRACE_SCOPE();
+                                  auto* ctx = (TestCtx*)user_context;
+                                  EXPECT_TRUE(NULL != ctx);
+                                  EXPECT_EQ(0, ctx->did_call);
+                                  ++ctx->did_call;
+                                  return iree_make_status(
+                                      IREE_STATUS_UNAUTHENTICATED, "whoops!");
+                                },
+                                (void*)&ctx),
+                            &task);
+
+  // The task should still be cleaned up, even if it fails.
+  static int did_cleanup = 0;
+  did_cleanup = 0;
+  iree_task_set_cleanup_fn(
+      &task.header, +[](iree_task_t* task, iree_status_code_t status_code) {
+        IREE_TRACE_SCOPE();
+        EXPECT_EQ(status_code, IREE_STATUS_ABORTED);
+        ++did_cleanup;
+      });
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+
+  // Expect both the call to have been made and the task cleaned up.
+  // The scope has the failure status.
+  EXPECT_EQ(1, ctx.did_call);
+  EXPECT_EQ(1, did_cleanup);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kUnauthenticated));
+}
+
+// Tests issuing two chained calls where the first one fails.
+// The failure should be propagated back on the task scope and the chained call
+// should be aborted.
+TEST_F(TaskCallTest, IssueFailureChained) {
+  IREE_TRACE_SCOPE();
+
+  struct TestCtx {
+    int did_call_a = 0;
+    int did_call_b = 0;
+  };
+  TestCtx ctx;
+
+  // First call that will fail.
+  iree_task_call_t task_a;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  IREE_TRACE_SCOPE();
+                                  auto* ctx = (TestCtx*)user_context;
+                                  EXPECT_TRUE(NULL != ctx);
+                                  EXPECT_EQ(0, ctx->did_call_a);
+                                  ++ctx->did_call_a;
+                                  // Force a failure.
+                                  return iree_make_status(
+                                      IREE_STATUS_UNAUTHENTICATED, "whoops!");
+                                },
+                                (void*)&ctx),
+                            &task_a);
+  static int did_cleanup_a = 0;
+  did_cleanup_a = 0;
+  iree_task_set_cleanup_fn(
+      &task_a.header, +[](iree_task_t* task, iree_status_code_t status_code) {
+        // Expect that the cleanup gets a signal indicating the task failed.
+        IREE_TRACE_SCOPE();
+        EXPECT_EQ(status_code, IREE_STATUS_ABORTED);
+        ++did_cleanup_a;
+      });
+
+  // Second call that will be aborted after the first fails.
+  iree_task_call_t task_b;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  // This should never get called!
+                                  IREE_TRACE_SCOPE();
+                                  auto* ctx = (TestCtx*)user_context;
+                                  EXPECT_TRUE(NULL != ctx);
+                                  EXPECT_EQ(0, ctx->did_call_b);
+                                  ++ctx->did_call_b;
+                                  return iree_ok_status();
+                                },
+                                (void*)&ctx),
+                            &task_b);
+  static int did_cleanup_b = 0;
+  did_cleanup_b = 0;
+  iree_task_set_cleanup_fn(
+      &task_b.header, +[](iree_task_t* task, iree_status_code_t status_code) {
+        // Expect that the cleanup gets a signal indicating the task failed.
+        IREE_TRACE_SCOPE();
+        EXPECT_EQ(status_code, IREE_STATUS_ABORTED);
+        ++did_cleanup_b;
+      });
+
+  // A -> B
+  iree_task_set_completion_task(&task_a.header, &task_b.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_b.header));
+
+  // Expect that A was called but B was not, and both were cleaned up.
+  EXPECT_EQ(1, ctx.did_call_a);
+  EXPECT_EQ(1, did_cleanup_a);
+  EXPECT_EQ(0, ctx.did_call_b);
+  EXPECT_EQ(1, did_cleanup_b);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kUnauthenticated));
+}
+
+// Issues task_a which then issues a nested task_b and waits for it to complete
+// before making further progress. This models dynamic parallelism:
+// http://developer.download.nvidia.com/GTC/PDF/GTC2012/PresentationPDF/S0338-GTC2012-CUDA-Programming-Model.pdf
+TEST_F(TaskCallTest, IssueNested) {
+  IREE_TRACE_SCOPE();
+
+  struct TestCtx {
+    std::atomic<int> did_call_a = {0};
+    std::atomic<int> did_call_b = {0};
+    std::atomic<bool> has_issued = {false};
+    iree_task_call_t task_b;
+  };
+  TestCtx ctx;
+
+  // task_a will get called twice: the first time it will schedule task_b and
+  // then it'll get called again when task_b completes. This is not the only way
+  // to do this: task_a could set it up so that a task_c ran after task_b
+  // completed instead of getting itself called twice. Both approaches have
+  // their uses.
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_,
+      iree_task_make_call_closure(
+          [](void* user_context, iree_task_t* task,
+             iree_task_submission_t* pending_submission) {
+            IREE_TRACE_SCOPE();
+            auto* ctx = (TestCtx*)user_context;
+            EXPECT_TRUE(NULL != ctx);
+
+            if (!ctx->has_issued) {
+              ctx->has_issued = true;
+              EXPECT_EQ(0, ctx->did_call_a);
+              ++ctx->did_call_a;
+              iree_task_call_initialize(
+                  task->scope,
+                  iree_task_make_call_closure(
+                      [](void* user_context, iree_task_t* task,
+                         iree_task_submission_t* pending_submission) {
+                        IREE_TRACE_SCOPE();
+                        auto* ctx = (TestCtx*)user_context;
+                        EXPECT_TRUE(NULL != ctx);
+                        EXPECT_EQ(0, ctx->did_call_b);
+                        ++ctx->did_call_b;
+                        return iree_ok_status();
+                      },
+                      user_context),
+                  &ctx->task_b);
+              iree_task_set_completion_task(&ctx->task_b.header, task);
+              iree_task_submission_enqueue(pending_submission,
+                                           &ctx->task_b.header);
+            } else {
+              EXPECT_EQ(1, ctx->did_call_a);
+              ++ctx->did_call_a;
+            }
+
+            return iree_ok_status();
+          },
+          (void*)&ctx),
+      &task_a);
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_a.header));
+  EXPECT_EQ(2, ctx.did_call_a);
+  EXPECT_EQ(1, ctx.did_call_b);
+  IREE_EXPECT_OK(iree_task_scope_consume_status(&scope_));
+}
+
+// Issues task_a which then issues a nested task_b and task_c; task_b fails and
+// task_c is still expected to complete before task_a observes the failure.
+// Sibling tasks don't abort each other and as such we are guaranteed that C
+// will run: A -> [B fail, C ok] -> A fail
+TEST_F(TaskCallTest, IssueNestedFailure) {
+  IREE_TRACE_SCOPE();
+
+  struct TestCtx {
+    std::atomic<int> did_call_a = {0};
+    std::atomic<int> did_call_b = {0};
+    std::atomic<int> did_call_c = {0};
+    std::atomic<bool> has_issued = {false};
+    iree_task_call_t task_b;
+    iree_task_call_t task_c;
+  };
+  TestCtx ctx;
+
+  // task_a will get called only once due to the error: the pre-nesting call
+  // will schedule task_b/task_c and then the expected call after the tasks
+  // complete will not be made as task_b fails.
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_,
+      iree_task_make_call_closure(
+          [](void* user_context, iree_task_t* task,
+             iree_task_submission_t* pending_submission) {
+            auto* ctx = (TestCtx*)user_context;
+            EXPECT_TRUE(NULL != ctx);
+
+            if (!ctx->has_issued) {
+              ctx->has_issued = true;
+              EXPECT_EQ(0, ctx->did_call_a);
+              ++ctx->did_call_a;
+
+              // task_b: (fails)
+              iree_task_call_initialize(
+                  task->scope,
+                  iree_task_make_call_closure(
+                      [](void* user_context, iree_task_t* task,
+                         iree_task_submission_t* pending_submission) {
+                        IREE_TRACE_SCOPE();
+                        auto* ctx = (TestCtx*)user_context;
+                        EXPECT_TRUE(NULL != ctx);
+                        EXPECT_EQ(0, ctx->did_call_b);
+                        ++ctx->did_call_b;
+                        return iree_make_status(IREE_STATUS_DATA_LOSS, "uh oh");
+                      },
+                      user_context),
+                  &ctx->task_b);
+              iree_task_set_completion_task(&ctx->task_b.header, task);
+              iree_task_submission_enqueue(pending_submission,
+                                           &ctx->task_b.header);
+
+              // task_c: (ok)
+              iree_task_call_initialize(
+                  task->scope,
+                  iree_task_make_call_closure(
+                      [](void* user_context, iree_task_t* task,
+                         iree_task_submission_t* pending_submission) {
+                        IREE_TRACE_SCOPE();
+                        auto* ctx = (TestCtx*)user_context;
+                        EXPECT_TRUE(NULL != ctx);
+                        EXPECT_EQ(0, ctx->did_call_c);
+                        ++ctx->did_call_c;
+                        return iree_ok_status();
+                      },
+                      user_context),
+                  &ctx->task_c);
+              iree_task_set_completion_task(&ctx->task_c.header, task);
+              iree_task_submission_enqueue(pending_submission,
+                                           &ctx->task_c.header);
+            } else {
+              EXPECT_EQ(1, ctx->did_call_a);
+              ++ctx->did_call_a;
+            }
+
+            return iree_ok_status();
+          },
+          (void*)&ctx),
+      &task_a);
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_a.header));
+  EXPECT_EQ(1, ctx.did_call_a);
+  EXPECT_EQ(1, ctx.did_call_b);
+  EXPECT_EQ(1, ctx.did_call_c);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/task_test_dispatch.cc b/runtime/src/iree/task/task_test_dispatch.cc
new file mode 100644
index 0000000..3324b6c
--- /dev/null
+++ b/runtime/src/iree/task/task_test_dispatch.cc
@@ -0,0 +1,217 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+
+#include "iree/base/api.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+#include "iree/task/testing/task_test.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+using iree::Status;
+using iree::StatusCode;
+using iree::testing::status::StatusIs;
+
+// Records how many times each workgroup of a 3D dispatch grid executes so
+// tests can verify full, exactly-once coverage of the grid.
+class GridCoverage {
+ public:
+  explicit GridCoverage(const uint32_t workgroup_count[3])
+      // NOTE: relies on member declaration order below; workgroup_count_ must
+      // be initialized before storage_ allocates based on it.
+      : workgroup_count_(workgroup_count[0] * workgroup_count[1] *
+                         workgroup_count[2]),
+        storage_(new iree_atomic_int32_t[workgroup_count_]) {
+    for (iree_host_size_t i = 0; i < workgroup_count_; ++i) {
+      storage_[i] = IREE_ATOMIC_VAR_INIT(0);
+    }
+  }
+
+  // Returns true iff every workgroup in the grid executed exactly once.
+  // Trivially true for an empty (zero-sized) grid.
+  bool Verify() {
+    fflush(stdout);
+    for (iree_host_size_t i = 0; i < workgroup_count_; ++i) {
+      if (iree_atomic_load_int32(&storage_[i], iree_memory_order_seq_cst) !=
+          1) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Dispatch tile closure: linearizes the tile's workgroup XYZ coordinate into
+  // a slot index and atomically bumps its execution counter.
+  // |user_context| is the GridCoverage instance passed at closure creation.
+  static iree_status_t Tile(void* user_context,
+                            const iree_task_tile_context_t* tile_context,
+                            iree_task_submission_t* pending_submission) {
+    GridCoverage* coverage = reinterpret_cast<GridCoverage*>(user_context);
+    uint32_t slot =
+        tile_context->workgroup_xyz[2] * (tile_context->workgroup_count[1] *
+                                          tile_context->workgroup_count[0]) +
+        tile_context->workgroup_xyz[1] * tile_context->workgroup_count[0] +
+        tile_context->workgroup_xyz[0];
+    iree_atomic_fetch_add_int32(&coverage->storage_[slot], 1,
+                                iree_memory_order_seq_cst);
+
+    // Useful when testing large grids:
+    // printf("%u, %u, %u\n", tile_context->workgroup_xyz[0],
+    //        tile_context->workgroup_xyz[1], tile_context->workgroup_xyz[2]);
+
+    return iree_ok_status();
+  }
+
+ private:
+  size_t workgroup_count_;  // total workgroups (x*y*z) in the grid
+  std::unique_ptr<iree_atomic_int32_t[]> storage_;  // one counter per workgroup
+};
+
+// Fixture with a helper that issues a single dispatch over the given grid and
+// then checks (via GridCoverage) that every workgroup ran exactly once.
+class TaskDispatchTest : public TaskTest {
+ public:
+  // Dispatches a |workgroup_count| grid with the given |dispatch_flags| OR'ed
+  // onto the task header and verifies full grid coverage.
+  void DispatchAndVerifyGrid(const uint32_t workgroup_size[3],
+                             const uint32_t workgroup_count[3],
+                             uint32_t dispatch_flags) {
+    IREE_TRACE_SCOPE();
+    GridCoverage coverage(workgroup_count);
+    iree_task_dispatch_t task;
+    iree_task_dispatch_initialize(
+        &scope_,
+        iree_task_make_dispatch_closure(GridCoverage::Tile, (void*)&coverage),
+        workgroup_size, workgroup_count, &task);
+    task.header.flags |= dispatch_flags;
+    IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+    EXPECT_TRUE(coverage.Verify());
+  }
+};
+
+// 0x0x0 grid: a dispatch with zero workgroups must still complete cleanly.
+TEST_F(TaskDispatchTest, Issue000) {
+  IREE_TRACE_SCOPE();
+  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
+  const uint32_t kWorkgroupCount[3] = {0, 0, 0};
+  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
+}
+
+// A zero extent on any one axis yields zero total workgroups.
+TEST_F(TaskDispatchTest, Issue120) {
+  IREE_TRACE_SCOPE();
+  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
+  const uint32_t kWorkgroupCount[3] = {1, 2, 0};
+  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
+}
+
+// Minimal 1x1x1 grid: exactly one workgroup runs.
+TEST_F(TaskDispatchTest, Issue111) {
+  IREE_TRACE_SCOPE();
+  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
+  const uint32_t kWorkgroupCount[3] = {1, 1, 1};
+  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
+}
+
+// Multi-dimensional 3x4x5 grid: all 60 workgroups must run exactly once.
+TEST_F(TaskDispatchTest, Issue345) {
+  IREE_TRACE_SCOPE();
+  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
+  const uint32_t kWorkgroupCount[3] = {3, 4, 5};
+  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
+}
+
+// Issues an indirect dispatch whose workgroup count is produced at runtime:
+// a preceding call task writes |indirect_workgroup_count| that the dispatch
+// then reads when it executes.
+TEST_F(TaskDispatchTest, IssueIndirect) {
+  IREE_TRACE_SCOPE();
+
+  // static so the capture-less lambda below can legally reference them.
+  static const uint32_t kWorkgroupSize[3] = {1, 1, 1};
+  static const uint32_t kWorkgroupCount[3] = {3, 4, 5};
+  uint32_t indirect_workgroup_count[3] = {0, 0, 0};
+  GridCoverage coverage(kWorkgroupCount);
+
+  // Call task that fills in the workgroup count the dispatch will consume.
+  iree_task_call_t calculate_task;
+  iree_task_call_initialize(
+      &scope_,
+      iree_task_make_call_closure(
+          [](void* user_context, iree_task_t* task,
+             iree_task_submission_t* pending_submission) {
+            IREE_TRACE_SCOPE();
+            uint32_t* indirect_workgroup_count_ptr = (uint32_t*)user_context;
+            for (size_t i = 0; i < IREE_ARRAYSIZE(kWorkgroupCount); ++i) {
+              indirect_workgroup_count_ptr[i] = kWorkgroupCount[i];
+            }
+            return iree_ok_status();
+          },
+          (void*)indirect_workgroup_count),
+      &calculate_task);
+
+  iree_task_dispatch_t dispatch_task;
+  iree_task_dispatch_initialize_indirect(
+      &scope_,
+      iree_task_make_dispatch_closure(GridCoverage::Tile, (void*)&coverage),
+      kWorkgroupSize, indirect_workgroup_count, &dispatch_task);
+  // Order the dispatch after the count has been written.
+  iree_task_set_completion_task(&calculate_task.header, &dispatch_task.header);
+
+  IREE_ASSERT_OK(
+      SubmitTasksAndWaitIdle(&calculate_task.header, &dispatch_task.header));
+  EXPECT_TRUE(coverage.Verify());
+}
+
+// Dispatches a 64x1x1 grid where a single tile (x==32) fails; the failure
+// must be captured on the scope status as DATA_LOSS.
+TEST_F(TaskDispatchTest, IssueFailure) {
+  IREE_TRACE_SCOPE();
+
+  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
+  const uint32_t kWorkgroupCount[3] = {64, 1, 1};
+
+  auto tile = [](void* user_context,
+                 const iree_task_tile_context_t* tile_context,
+                 iree_task_submission_t* pending_submission) -> iree_status_t {
+    IREE_TRACE_SCOPE();
+    return tile_context->workgroup_xyz[0] == 32
+               ? iree_make_status(IREE_STATUS_DATA_LOSS, "whoops!")
+               : iree_ok_status();
+  };
+
+  iree_task_dispatch_t task;
+  iree_task_dispatch_initialize(&scope_,
+                                iree_task_make_dispatch_closure(tile, NULL),
+                                kWorkgroupSize, kWorkgroupCount, &task);
+  // The submission itself retires OK; the failure surfaces on the scope.
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+// As IssueFailure but with a dependent call task chained after the dispatch;
+// once the dispatch fails the follow-on call must not run (did_call == 0).
+TEST_F(TaskDispatchTest, IssueFailureChained) {
+  IREE_TRACE_SCOPE();
+
+  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
+  const uint32_t kWorkgroupCount[3] = {64, 1, 1};
+
+  // One tile (x==32) fails with DATA_LOSS.
+  auto tile = [](void* user_context,
+                 const iree_task_tile_context_t* tile_context,
+                 iree_task_submission_t* pending_submission) -> iree_status_t {
+    return tile_context->workgroup_xyz[0] == 32
+               ? iree_make_status(IREE_STATUS_DATA_LOSS, "whoops!")
+               : iree_ok_status();
+  };
+
+  iree_task_dispatch_t dispatch_task;
+  iree_task_dispatch_initialize(
+      &scope_, iree_task_make_dispatch_closure(tile, NULL), kWorkgroupSize,
+      kWorkgroupCount, &dispatch_task);
+
+  // Follow-on call that records whether it ever executed.
+  int did_call = 0;
+  iree_task_call_t call_task;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  IREE_TRACE_SCOPE();
+                                  int* did_call_ptr = (int*)user_context;
+                                  ++(*did_call_ptr);
+                                  return iree_ok_status();
+                                },
+                                &did_call),
+                            &call_task);
+  iree_task_set_completion_task(&dispatch_task.header, &call_task.header);
+
+  IREE_ASSERT_OK(
+      SubmitTasksAndWaitIdle(&dispatch_task.header, &call_task.header));
+  EXPECT_EQ(0, did_call);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/task_test_fence.cc b/runtime/src/iree/task/task_test_fence.cc
new file mode 100644
index 0000000..0ed7828
--- /dev/null
+++ b/runtime/src/iree/task/task_test_fence.cc
@@ -0,0 +1,83 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/task.h"
+#include "iree/task/testing/task_test.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+using iree::Status;
+using iree::StatusCode;
+using iree::testing::status::StatusIs;
+
+// All state comes from the shared TaskTest fixture.
+class TaskFenceTest : public TaskTest {};
+
+// Tests a chain of fences A -> B -> C.
+// Each fence uses an immediate wait primitive (no external handle), so only
+// the completion ordering between them is exercised.
+TEST_F(TaskFenceTest, IssueChained) {
+  iree_task_fence_t task_a;
+  iree_task_fence_initialize(&scope_, iree_wait_primitive_immediate(), &task_a);
+
+  iree_task_fence_t task_b;
+  iree_task_fence_initialize(&scope_, iree_wait_primitive_immediate(), &task_b);
+  iree_task_set_completion_task(&task_a.header, &task_b.header);
+
+  iree_task_fence_t task_c;
+  iree_task_fence_initialize(&scope_, iree_wait_primitive_immediate(), &task_c);
+  iree_task_set_completion_task(&task_b.header, &task_c.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_c.header));
+}
+
+// Tests that failures propagate through fences; task B should not be called.
+// A fails -> fence -> B
+TEST_F(TaskFenceTest, IssueChainedFailure) {
+  IREE_TRACE_SCOPE();
+
+  // task_a records its invocation and then fails with DATA_LOSS.
+  int did_call_a = 0;
+  iree_task_call_t task_a;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  IREE_TRACE_SCOPE();
+                                  int* did_call_ptr = (int*)user_context;
+                                  ++(*did_call_ptr);
+                                  return iree_make_status(IREE_STATUS_DATA_LOSS,
+                                                          "whoops!");
+                                },
+                                &did_call_a),
+                            &task_a);
+
+  iree_task_fence_t fence_task;
+  iree_task_fence_initialize(&scope_, iree_wait_primitive_immediate(),
+                             &fence_task);
+  iree_task_set_completion_task(&task_a.header, &fence_task.header);
+
+  // task_b would record its invocation but must never run.
+  int did_call_b = 0;
+  iree_task_call_t task_b;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  IREE_TRACE_SCOPE();
+                                  int* did_call_ptr = (int*)user_context;
+                                  ++(*did_call_ptr);
+                                  return iree_ok_status();
+                                },
+                                &did_call_b),
+                            &task_b);
+  iree_task_set_completion_task(&fence_task.header, &task_b.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_b.header));
+  EXPECT_EQ(1, did_call_a);
+  EXPECT_EQ(0, did_call_b);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/task_test_nop.cc b/runtime/src/iree/task/task_test_nop.cc
new file mode 100644
index 0000000..8aeb539
--- /dev/null
+++ b/runtime/src/iree/task/task_test_nop.cc
@@ -0,0 +1,23 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/task.h"
+#include "iree/task/testing/task_test.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+// All state comes from the shared TaskTest fixture.
+class TaskNopTest : public TaskTest {};
+
+// A single nop task should be issued and retired without error.
+TEST_F(TaskNopTest, Issue) {
+  IREE_TRACE_SCOPE();
+  iree_task_nop_t task;
+  iree_task_nop_initialize(&scope_, &task);
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/task_test_wait.cc b/runtime/src/iree/task/task_test_wait.cc
new file mode 100644
index 0000000..907e2fa
--- /dev/null
+++ b/runtime/src/iree/task/task_test_wait.cc
@@ -0,0 +1,297 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <atomic>
+#include <chrono>
+#include <thread>
+
+#include "iree/task/task.h"
+#include "iree/task/testing/task_test.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+using iree::Status;
+using iree::StatusCode;
+using iree::testing::status::StatusIs;
+
+// NOTE: we intentionally perform most signaling to/from C++ std::threads.
+// This models a real application that may be passing in handles tied to custom
+// or system primitives unrelated to the task system.
+
+// All state comes from the shared TaskTest fixture.
+class TaskWaitTest : public TaskTest {};
+
+// Issues a wait task on a handle that has already been signaled.
+// The poller will query the status of the handle and immediately retire the
+// task.
+TEST_F(TaskWaitTest, IssueSignaled) {
+  IREE_TRACE_SCOPE();
+
+  // Event starts signaled so the wait should complete without blocking.
+  iree_event_t event;
+  iree_event_initialize(/*initial_state=*/true, &event);
+
+  iree_task_wait_t task;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event),
+                            IREE_TIME_INFINITE_FUTURE, &task);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+  IREE_EXPECT_OK(iree_task_scope_consume_status(&scope_));
+
+  iree_event_deinitialize(&event);
+}
+
+// Issues a wait task on an unsignaled handle such that the poller must wait.
+// We'll spin up a thread that sets it a short time in the future and ensure
+// that the poller woke and retired the task.
+TEST_F(TaskWaitTest, IssueUnsignaled) {
+  IREE_TRACE_SCOPE();
+
+  iree_event_t event;
+  iree_event_initialize(/*initial_state=*/false, &event);
+
+  iree_task_wait_t task;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event),
+                            IREE_TIME_INFINITE_FUTURE, &task);
+
+  // Spin up a thread that will signal the event after we start waiting on it.
+  std::atomic<bool> has_signaled = {false};
+  std::thread signal_thread([&]() {
+    IREE_TRACE_SCOPE();
+    std::this_thread::sleep_for(std::chrono::milliseconds(150));
+    EXPECT_FALSE(has_signaled);
+    has_signaled = true;
+    iree_event_set(&event);
+  });
+
+  // The wait must not complete until the thread above has signaled.
+  EXPECT_FALSE(has_signaled);
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+  EXPECT_TRUE(has_signaled);
+  IREE_EXPECT_OK(iree_task_scope_consume_status(&scope_));
+
+  signal_thread.join();
+  iree_event_deinitialize(&event);
+}
+
+// Issues a wait task on a handle that will never be signaled.
+// We set the deadline in the near future and ensure that the poller correctly
+// fails the wait with a DEADLINE_EXCEEDED.
+TEST_F(TaskWaitTest, IssueTimeout) {
+  IREE_TRACE_SCOPE();
+
+  iree_event_t event;
+  iree_event_initialize(/*initial_state=*/false, &event);
+
+  // ~150ms deadline; the event is never set.
+  iree_task_wait_t task;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event),
+                            iree_time_now() + (150 * 1000000), &task);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDeadlineExceeded));
+
+  iree_event_deinitialize(&event);
+}
+
+// Issues a delay task that should wait until the requested time.
+// NOTE: this kind of test can be flaky - if we have issues we can bump the
+// sleep time up.
+TEST_F(TaskWaitTest, IssueDelay) {
+  IREE_TRACE_SCOPE();
+
+  iree_time_t start_time_ns = iree_time_now();
+
+  // ~50ms delay relative to the start time.
+  iree_task_wait_t task;
+  iree_task_wait_initialize_delay(&scope_, start_time_ns + (50 * 1000000),
+                                  &task);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+  IREE_EXPECT_OK(iree_task_scope_consume_status(&scope_));
+
+  // Allow generous scheduling slop: only require ~half of the 50ms delay.
+  iree_time_t end_time_ns = iree_time_now();
+  EXPECT_GE(end_time_ns - start_time_ns, 25 * 1000000);
+}
+
+// Issues multiple waits that join on a single task. This models a wait-all.
+TEST_F(TaskWaitTest, WaitAll) {
+  IREE_TRACE_SCOPE();
+
+  iree_event_t event_a;
+  iree_event_initialize(/*initial_state=*/false, &event_a);
+  iree_task_wait_t task_a;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event_a),
+                            IREE_TIME_INFINITE_FUTURE, &task_a);
+
+  iree_event_t event_b;
+  iree_event_initialize(/*initial_state=*/false, &event_b);
+  iree_task_wait_t task_b;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event_b),
+                            IREE_TIME_INFINITE_FUTURE, &task_b);
+
+  // Barrier forks into both waits; the fence joins them back together.
+  iree_task_t* wait_tasks[] = {&task_a.header, &task_b.header};
+  iree_task_barrier_t barrier;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(wait_tasks), wait_tasks,
+                               &barrier);
+
+  iree_task_fence_t fence;
+  iree_task_fence_initialize(&scope_, iree_wait_primitive_immediate(), &fence);
+  iree_task_set_completion_task(&task_a.header, &fence.header);
+  iree_task_set_completion_task(&task_b.header, &fence.header);
+
+  // Spin up a thread that will signal the event after we start waiting on it.
+  // event_a is set first and event_b ~50ms later; the join must wait for both.
+  std::atomic<bool> has_signaled = {false};
+  std::thread signal_thread([&]() {
+    IREE_TRACE_SCOPE();
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    EXPECT_FALSE(has_signaled);
+    iree_event_set(&event_a);
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    has_signaled = true;
+    iree_event_set(&event_b);
+  });
+
+  EXPECT_FALSE(has_signaled);
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&barrier.header, &fence.header));
+  EXPECT_TRUE(has_signaled);
+  IREE_EXPECT_OK(iree_task_scope_consume_status(&scope_));
+
+  signal_thread.join();
+  iree_event_deinitialize(&event_a);
+  iree_event_deinitialize(&event_b);
+}
+
+// Issues multiple waits that join on a single task but where one times out.
+TEST_F(TaskWaitTest, WaitAllTimeout) {
+  IREE_TRACE_SCOPE();
+
+  // event_a is already signaled; event_b never fires and its ~50ms deadline
+  // elapses, failing the scope with DEADLINE_EXCEEDED.
+  iree_event_t event_a;
+  iree_event_initialize(/*initial_state=*/true, &event_a);
+  iree_task_wait_t task_a;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event_a),
+                            IREE_TIME_INFINITE_FUTURE, &task_a);
+
+  iree_event_t event_b;
+  iree_event_initialize(/*initial_state=*/false, &event_b);
+  iree_task_wait_t task_b;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event_b),
+                            iree_time_now() + (50 * 1000000), &task_b);
+
+  iree_task_t* wait_tasks[] = {&task_a.header, &task_b.header};
+  iree_task_barrier_t barrier;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(wait_tasks), wait_tasks,
+                               &barrier);
+
+  iree_task_fence_t fence;
+  iree_task_fence_initialize(&scope_, iree_wait_primitive_immediate(), &fence);
+  iree_task_set_completion_task(&task_a.header, &fence.header);
+  iree_task_set_completion_task(&task_b.header, &fence.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&barrier.header, &fence.header));
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDeadlineExceeded));
+
+  iree_event_deinitialize(&event_a);
+  iree_event_deinitialize(&event_b);
+}
+
+// Issues multiple waits that join on a single task in wait-any mode.
+// This means that if one wait finishes all other waits will be cancelled and
+// the completion task will continue.
+//
+// Here event_a is signaled but event_b is not.
+TEST_F(TaskWaitTest, WaitAny) {
+  IREE_TRACE_SCOPE();
+
+  // Flag shared between all waits in a group.
+  iree_atomic_int32_t cancellation_flag = IREE_ATOMIC_VAR_INIT(0);
+
+  iree_event_t event_a;
+  iree_event_initialize(/*initial_state=*/false, &event_a);
+  iree_task_wait_t task_a;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event_a),
+                            IREE_TIME_INFINITE_FUTURE, &task_a);
+  iree_task_wait_set_wait_any(&task_a, &cancellation_flag);
+
+  iree_event_t event_b;
+  iree_event_initialize(/*initial_state=*/false, &event_b);
+  iree_task_wait_t task_b;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event_b),
+                            IREE_TIME_INFINITE_FUTURE, &task_b);
+  iree_task_wait_set_wait_any(&task_b, &cancellation_flag);
+
+  iree_task_t* wait_tasks[] = {&task_a.header, &task_b.header};
+  iree_task_barrier_t barrier;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(wait_tasks), wait_tasks,
+                               &barrier);
+
+  iree_task_fence_t fence;
+  iree_task_fence_initialize(&scope_, iree_wait_primitive_immediate(), &fence);
+  iree_task_set_completion_task(&task_a.header, &fence.header);
+  iree_task_set_completion_task(&task_b.header, &fence.header);
+
+  // Spin up a thread that will signal the event after we start waiting on it.
+  std::atomic<bool> has_signaled = {false};
+  std::thread signal_thread([&]() {
+    IREE_TRACE_SCOPE();
+    // NOTE: we only signal event_a - event_b remains unsignaled.
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    EXPECT_FALSE(has_signaled);
+    has_signaled = true;
+    iree_event_set(&event_a);
+  });
+
+  // Completion requires only one of the two waits to resolve.
+  EXPECT_FALSE(has_signaled);
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&barrier.header, &fence.header));
+  EXPECT_TRUE(has_signaled);
+  IREE_EXPECT_OK(iree_task_scope_consume_status(&scope_));
+
+  signal_thread.join();
+  iree_event_deinitialize(&event_a);
+  iree_event_deinitialize(&event_b);
+}
+
+// Issues multiple waits that join on a single task in wait-any mode.
+// Here instead of signaling anything we cause event_a to timeout so that the
+// entire wait is cancelled.
+TEST_F(TaskWaitTest, WaitAnyTimeout) {
+  IREE_TRACE_SCOPE();
+
+  // Flag shared between all waits in a group.
+  iree_atomic_int32_t cancellation_flag = IREE_ATOMIC_VAR_INIT(0);
+
+  // task_a's ~50ms deadline elapses; no event is ever signaled.
+  iree_event_t event_a;
+  iree_event_initialize(/*initial_state=*/false, &event_a);
+  iree_task_wait_t task_a;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event_a),
+                            iree_time_now() + (50 * 1000000), &task_a);
+  iree_task_wait_set_wait_any(&task_a, &cancellation_flag);
+
+  iree_event_t event_b;
+  iree_event_initialize(/*initial_state=*/false, &event_b);
+  iree_task_wait_t task_b;
+  iree_task_wait_initialize(&scope_, iree_event_await(&event_b),
+                            IREE_TIME_INFINITE_FUTURE, &task_b);
+  iree_task_wait_set_wait_any(&task_b, &cancellation_flag);
+
+  iree_task_t* wait_tasks[] = {&task_a.header, &task_b.header};
+  iree_task_barrier_t barrier;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(wait_tasks), wait_tasks,
+                               &barrier);
+
+  iree_task_fence_t fence;
+  iree_task_fence_initialize(&scope_, iree_wait_primitive_immediate(), &fence);
+  iree_task_set_completion_task(&task_a.header, &fence.header);
+  iree_task_set_completion_task(&task_b.header, &fence.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&barrier.header, &fence.header));
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDeadlineExceeded));
+
+  iree_event_deinitialize(&event_a);
+  iree_event_deinitialize(&event_b);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/testing/BUILD b/runtime/src/iree/task/testing/BUILD
new file mode 100644
index 0000000..c355f6b
--- /dev/null
+++ b/runtime/src/iree/task/testing/BUILD
@@ -0,0 +1,33 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# gtest fixture base (executor + scope setup/teardown and submit helpers)
+# shared by the iree/task tests.
+iree_runtime_cc_library(
+    name = "task_test",
+    testonly = 1,
+    hdrs = ["task_test.h"],
+    deps = [
+        "//runtime/src/iree/task",
+        "//runtime/src/iree/testing:gtest",
+    ],
+)
+
+# Header-only helpers (scoped pool/scope allocation, task list ordering
+# checks) for the iree/task tests.
+iree_runtime_cc_library(
+    name = "test_util",
+    testonly = 1,
+    hdrs = ["test_util.h"],
+    deps = [
+        "//runtime/src/iree/task",
+        "//runtime/src/iree/testing:gtest",
+    ],
+)
diff --git a/runtime/src/iree/task/testing/CMakeLists.txt b/runtime/src/iree/task/testing/CMakeLists.txt
new file mode 100644
index 0000000..9dbd55d
--- /dev/null
+++ b/runtime/src/iree/task/testing/CMakeLists.txt
@@ -0,0 +1,37 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/task/testing/BUILD                                          #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    task_test
+  HDRS
+    "task_test.h"
+  DEPS
+    iree::task
+    iree::testing::gtest
+  TESTONLY
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    test_util
+  HDRS
+    "test_util.h"
+  DEPS
+    iree::task
+    iree::testing::gtest
+  TESTONLY
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/task/testing/task_test.h b/runtime/src/iree/task/testing/task_test.h
new file mode 100644
index 0000000..12068e6
--- /dev/null
+++ b/runtime/src/iree/task/testing/task_test.h
@@ -0,0 +1,77 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Test fixture utilities for iree/task tests: TaskTest owns an executor/scope
+// pair and provides helpers to submit task DAGs and block until the scope is
+// idle.
+
+#ifndef IREE_TASK_TESTING_TASK_TEST_H_
+#define IREE_TASK_TESTING_TASK_TEST_H_
+
+#include <memory>
+
+#include "iree/task/executor.h"
+#include "iree/task/scope.h"
+#include "iree/task/task.h"
+#include "iree/task/topology.h"
+#include "iree/testing/status_matchers.h"
+
+// gtest fixture that owns a task executor (8 topology groups, 64KB of
+// worker-local memory) and a task scope; tests build task DAGs against
+// |scope_| and use the Submit* helpers to run them to completion.
+class TaskTest : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    iree_task_topology_t topology;
+    iree_task_topology_initialize_from_group_count(8, &topology);
+    IREE_ASSERT_OK(
+        iree_task_executor_create(IREE_TASK_SCHEDULING_MODE_RESERVED, &topology,
+                                  /*worker_local_memory_size=*/(64 * 1024),
+                                  iree_allocator_system(), &executor_));
+    // The executor retains what it needs; the topology is only an input.
+    iree_task_topology_deinitialize(&topology);
+
+    iree_task_scope_initialize(iree_make_cstring_view("scope"), &scope_);
+  }
+
+  virtual void TearDown() {
+    iree_task_scope_deinitialize(&scope_);
+
+    iree_task_executor_release(executor_);
+  }
+
+  // Submits a sequence of tasks with |head_task| at the head and |tail_task| at
+  // the tail (they can be the same).
+  // Appends an executor fence after |tail_task| and then blocks until the
+  // scope goes idle, returning the wait status.
+  iree_status_t SubmitTasksAndWaitIdle(iree_task_t* head_task,
+                                       iree_task_t* tail_task) {
+    iree_task_fence_t* fence = NULL;
+    IREE_RETURN_IF_ERROR(
+        iree_task_executor_acquire_fence(executor_, &scope_, &fence));
+    iree_task_set_completion_task(tail_task, &fence->header);
+
+    iree_task_submission_t submission;
+    iree_task_submission_initialize(&submission);
+    iree_task_submission_enqueue(&submission, head_task);
+    iree_task_executor_submit(executor_, &submission);
+    iree_task_executor_flush(executor_);
+    return iree_task_scope_wait_idle(&scope_, IREE_TIME_INFINITE_FUTURE);
+  }
+
+  // Submits a DAG of tasks with |tail_task| at the tail (used just for idle
+  // detection).
+  iree_status_t SubmitAndWaitIdle(iree_task_submission_t* submission,
+                                  iree_task_t* tail_task) {
+    iree_task_fence_t* fence = NULL;
+    IREE_RETURN_IF_ERROR(
+        iree_task_executor_acquire_fence(executor_, &scope_, &fence));
+    iree_task_set_completion_task(tail_task, &fence->header);
+
+    iree_task_executor_submit(executor_, submission);
+    iree_task_executor_flush(executor_);
+    return iree_task_scope_wait_idle(&scope_, IREE_TIME_INFINITE_FUTURE);
+  }
+
+  iree_task_executor_t* executor_ = NULL;  // created in SetUp
+  iree_task_scope_t scope_;                // scope all test tasks run within
+};
+
+#endif  // IREE_TASK_TESTING_TASK_TEST_H_
diff --git a/runtime/src/iree/task/testing/test_util.h b/runtime/src/iree/task/testing/test_util.h
new file mode 100644
index 0000000..047882b
--- /dev/null
+++ b/runtime/src/iree/task/testing/test_util.h
@@ -0,0 +1,77 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Shared helpers for iree/task unit tests: scoped task pool/scope allocation
+// and simple task-list ordering checks (FIFO/LIFO based on stashed flag
+// values).
+
+#ifndef IREE_TASK_TESTING_TEST_UTIL_H_
+#define IREE_TASK_TESTING_TEST_UTIL_H_
+
+#include <memory>
+
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/scope.h"
+#include "iree/testing/status_matchers.h"
+
+// Owning pointer that deinitializes and deletes the pool on destruction.
+using TaskPoolPtr =
+    std::unique_ptr<iree_task_pool_t, void (*)(iree_task_pool_t*)>;
+// Allocates a task pool sized for nop tasks (capacity 1024) on the system
+// allocator; CHECK-fails if pool initialization fails.
+static inline TaskPoolPtr AllocateNopPool() {
+  iree_task_pool_t* pool = new iree_task_pool_t();
+  IREE_CHECK_OK(iree_task_pool_initialize(iree_allocator_system(),
+                                          sizeof(iree_task_nop_t), 1024, pool));
+  return {pool, [](iree_task_pool_t* pool) {
+            iree_task_pool_deinitialize(pool);
+            delete pool;
+          }};
+}
+
+// Owning pointer that deinitializes and deletes the scope on destruction.
+using TaskScopePtr =
+    std::unique_ptr<iree_task_scope_t, void (*)(iree_task_scope_t*)>;
+// Allocates and initializes a task scope with the given debug |name|.
+static inline TaskScopePtr AllocateScope(const char* name) {
+  iree_task_scope_t* scope = new iree_task_scope_t();
+  iree_task_scope_initialize(iree_make_cstring_view(name), scope);
+  return {scope, [](iree_task_scope_t* scope) {
+            iree_task_scope_deinitialize(scope);
+            delete scope;
+          }};
+}
+
+// Acquires a nop task from |pool| within |scope| and stashes |value| in
+// task->flags so the CheckListOrder* helpers below can recover the order in
+// which tasks were created.
+static inline iree_task_t* AcquireNopTask(TaskPoolPtr& pool,
+                                          TaskScopePtr& scope, uint16_t value) {
+  iree_task_t* task = NULL;
+  IREE_CHECK_OK(iree_task_pool_acquire(pool.get(), &task));
+  iree_task_initialize(IREE_TASK_TYPE_NOP, scope.get(), task);
+  task->flags = value;
+  return task;
+}
+
+// Returns true if the task->flags values (set via AcquireNopTask) strictly
+// increase from head to tail; empty lists trivially pass.
+static inline bool CheckListOrderFIFO(iree_task_list_t* list) {
+  iree_task_t* p = list->head;
+  if (!p) return true;
+  uint16_t value = p->flags;
+  p = p->next_task;
+  while (p) {
+    if (p->flags <= value) return false;
+    p = p->next_task;
+  }
+  return true;
+}
+
+// Returns true if the task->flags values (set via AcquireNopTask) strictly
+// decrease from head to tail; empty lists trivially pass.
+static inline bool CheckListOrderLIFO(iree_task_list_t* list) {
+  iree_task_t* p = list->head;
+  if (!p) return true;
+  uint16_t value = p->flags;
+  p = p->next_task;
+  while (p) {
+    if (p->flags >= value) return false;
+    p = p->next_task;
+  }
+  return true;
+}
+
+#endif  // IREE_TASK_TESTING_TEST_UTIL_H_
diff --git a/runtime/src/iree/task/topology.c b/runtime/src/iree/task/topology.c
new file mode 100644
index 0000000..57450b9
--- /dev/null
+++ b/runtime/src/iree/task/topology.c
@@ -0,0 +1,94 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/topology.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+// Zeroes |out_group| and gives it an "iree-worker-N" name, an any-thread
+// affinity, and a sharing mask covering all groups.
+// NOTE: the name buffer is 15 bytes; snprintf truncates to 14 visible
+// characters + NUL, which comfortably fits all uint8_t group indices.
+void iree_task_topology_group_initialize(
+    uint8_t group_index, iree_task_topology_group_t* out_group) {
+  memset(out_group, 0, sizeof(*out_group));
+  out_group->group_index = group_index;
+  snprintf(out_group->name, IREE_ARRAYSIZE(out_group->name), "iree-worker-%u",
+           group_index);
+  iree_thread_affinity_set_any(&out_group->ideal_thread_affinity);
+  out_group->constructive_sharing_mask = IREE_TASK_TOPOLOGY_GROUP_MASK_ALL;
+}
+
+// Zeroes the topology; group_count starts at 0 with no groups defined.
+void iree_task_topology_initialize(iree_task_topology_t* out_topology) {
+  IREE_ASSERT_ARGUMENT(out_topology);
+  memset(out_topology, 0, sizeof(*out_topology));
+}
+
+// No-op today: the topology owns no dynamic resources. Presumably kept in the
+// API so dynamic storage can be added later without changing callers.
+void iree_task_topology_deinitialize(iree_task_topology_t* topology) {
+  IREE_ASSERT_ARGUMENT(topology);
+}
+
+// Unimplemented: always fails with UNIMPLEMENTED; |out_topology| is untouched.
+iree_status_t iree_task_topology_parse(iree_string_view_t value,
+                                       iree_task_topology_t* out_topology) {
+  // TODO(benvanik): define a format that is generally useful alongside cpuinfo.
+  // Maybe colon-separated group-id values from thread affinities? Like:
+  //   0.0:0.2:0.4:0.8 to indicate cores 0,2,4,8 on group 0
+  //   0.0:0.1:1.0:1.1 to indicate cores 0,1 of both groups 0,1
+  // etc
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED);
+}
+
+// Unimplemented: always returns false without writing to |buffer| or
+// |out_buffer_length|.
+bool iree_task_topology_format(const iree_task_topology_t* topology,
+                               iree_host_size_t buffer_capacity, char* buffer,
+                               iree_host_size_t* out_buffer_length) {
+  // TODO(benvanik): formatting to match parsing.
+  return false;
+}
+
+// Fixed compile-time capacity of the inline groups[] array.
+iree_host_size_t iree_task_topology_group_capacity(
+    const iree_task_topology_t* topology) {
+  return IREE_ARRAYSIZE(topology->groups);
+}
+
+// Number of groups currently defined in the topology.
+iree_host_size_t iree_task_topology_group_count(
+    const iree_task_topology_t* topology) {
+  return topology->group_count;
+}
+
+// Bounds-checked accessor: returns NULL for any out-of-range index.
+const iree_task_topology_group_t* iree_task_topology_get_group(
+    const iree_task_topology_t* topology, iree_host_size_t group_index) {
+  if (group_index >= topology->group_count) return NULL;
+  return &topology->groups[group_index];
+}
+
+// Copies |group| into the next open slot and reassigns the copy's group_index
+// to its position in the topology (the caller-provided index is discarded).
+// Fails with RESOURCE_EXHAUSTED once the fixed capacity is reached.
+iree_status_t iree_task_topology_push_group(
+    iree_task_topology_t* topology, const iree_task_topology_group_t* group) {
+  if (topology->group_count + 1 > IREE_ARRAYSIZE(topology->groups)) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "group capacity exceeded");
+  }
+  iree_task_topology_group_t* dst_group =
+      &topology->groups[topology->group_count];
+  memcpy(dst_group, group, sizeof(*group));
+  dst_group->group_index = topology->group_count++;
+  return iree_ok_status();
+}
+
+// Initializes |out_topology| with |group_count| default-initialized groups.
+// Groups get sequential indices, "iree-worker-N" names, and any-thread
+// affinity (see iree_task_topology_group_initialize).
+void iree_task_topology_initialize_from_group_count(
+    iree_host_size_t group_count, iree_task_topology_t* out_topology) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, group_count);
+
+  // Clamp to the fixed-size storage capacity: without this a caller-provided
+  // count larger than IREE_TASK_EXECUTOR_MAX_WORKER_COUNT would write past
+  // the end of the inline groups[] array.
+  if (group_count > IREE_ARRAYSIZE(out_topology->groups)) {
+    group_count = IREE_ARRAYSIZE(out_topology->groups);
+  }
+
+  iree_task_topology_initialize(out_topology);
+  for (iree_host_size_t i = 0; i < group_count; ++i) {
+    iree_task_topology_group_t* group = &out_topology->groups[i];
+    iree_task_topology_group_initialize(i, group);
+  }
+  out_topology->group_count = group_count;
+
+  IREE_TRACE_ZONE_END(z0);
+}
diff --git a/runtime/src/iree/task/topology.h b/runtime/src/iree/task/topology.h
new file mode 100644
index 0000000..ca02352
--- /dev/null
+++ b/runtime/src/iree/task/topology.h
@@ -0,0 +1,133 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_TOPOLOGY_H_
+#define IREE_TASK_TOPOLOGY_H_
+
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/threading.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// A bitmask indicating which other groups from 0 to N may constructively share
+// caches. For example, a value of 0b1100 indicates that group 2 and 3 share.
+typedef uint64_t iree_task_topology_group_mask_t;
+
+#define IREE_TASK_TOPOLOGY_GROUP_MASK_ALL UINT64_MAX
+// Number of bits in a group mask and thus the maximum addressable groups.
+#define IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT \
+  (sizeof(iree_task_topology_group_mask_t) * 8)
+
+// Information about a particular group within the topology.
+// Groups may be of varying levels of granularity even within the same topology
+// based on how the topology is defined.
+typedef struct iree_task_topology_group_t {
+  // Group index within the topology matching a particular bit in
+  // iree_task_topology_group_mask_t.
+  uint8_t group_index;
+
+  // A name assigned to executor workers used for logging/tracing.
+  // Holds 14 visible characters + NUL; longer names are truncated (see
+  // iree_task_topology_group_initialize).
+  char name[15];
+
+  // Processor index in the cpuinfo set.
+  uint32_t processor_index;
+
+  // Ideal thread affinity for threads within this group.
+  // All threads within the group share the same affinity and this is what
+  // allows us to model Simultaneous Multi-Threading (SMT) (aka hyperthreading).
+  iree_thread_affinity_t ideal_thread_affinity;
+
+  // A bitmask of other group indices that share some level of the cache
+  // hierarchy. Workers of this group are more likely to constructively share
+  // some cache levels higher up with these other groups. For example, if the
+  // workers in a group all share an L2 cache then the groups indicated here may
+  // all share the same L3 cache.
+  iree_task_topology_group_mask_t constructive_sharing_mask;
+} iree_task_topology_group_t;
+
+// Initializes |out_group| with a |group_index| derived name.
+void iree_task_topology_group_initialize(uint8_t group_index,
+                                         iree_task_topology_group_t* out_group);
+
+// Task system topology information used to define the workers within an
+// executor.
+//
+// Topologies are used to statically configure task executors by defining the
+// total number of workers in the worker pool and how those workers map to
+// hardware compute resources.
+//
+// Users can allocate topologies, populate them with zero or more groups, and
+// then pass them to the executor to construct the desired configuration. To
+// ease testing and debugging topologies can be formatted as string values and
+// round tripped through flags, though obviously the value of such encodings are
+// machine-dependent.
+//
+// Several helper constructors are available that query the machine topology
+// and attempt to derive some (hopefully) useful task system topology from it.
+// We can add the more common heuristics over time to the core and leave the
+// edge cases for applications to construct.
+typedef struct iree_task_topology_t {
+  // Number of valid entries in groups[].
+  iree_host_size_t group_count;
+  // Inline storage; capacity is fixed at compile time.
+  iree_task_topology_group_t groups[IREE_TASK_EXECUTOR_MAX_WORKER_COUNT];
+} iree_task_topology_t;
+
+// Initializes an empty task topology.
+void iree_task_topology_initialize(iree_task_topology_t* out_topology);
+
+// Deinitializes a topology structure.
+void iree_task_topology_deinitialize(iree_task_topology_t* topology);
+
+// Parses a serialized topology in string form.
+// NOTE: currently unimplemented; always returns UNIMPLEMENTED (topology.c).
+iree_status_t iree_task_topology_parse(iree_string_view_t value,
+                                       iree_task_topology_t* out_topology);
+
+// Formats the topology as a string value that can be parsed with
+// iree_task_topology_parse.
+// NOTE: currently unimplemented; always returns false (topology.c).
+bool iree_task_topology_format(const iree_task_topology_t* topology,
+                               iree_host_size_t buffer_capacity, char* buffer,
+                               iree_host_size_t* out_buffer_length);
+
+// Returns the group capacity in the topology structure.
+iree_host_size_t iree_task_topology_group_capacity(
+    const iree_task_topology_t* topology);
+
+// Returns the total group count defined by the topology.
+iree_host_size_t iree_task_topology_group_count(
+    const iree_task_topology_t* topology);
+
+// Returns the group information for the given group index.
+// Returns NULL if |group_index| is out of range.
+const iree_task_topology_group_t* iree_task_topology_get_group(
+    const iree_task_topology_t* topology, iree_host_size_t group_index);
+
+// Pushes a new group onto the topology set.
+// The provided group data will be copied into the topology structure and its
+// group_index is reassigned to the slot it lands in.
+iree_status_t iree_task_topology_push_group(
+    iree_task_topology_t* topology, const iree_task_topology_group_t* group);
+
+// Initializes a topology with the specified number of groups.
+// 0 is a valid value, indicating that only donated threads will be used to
+// perform work. Groups will have no specific affinity and rely on the OS
+// scheduler to ensure they are distributed in a meaningful way; this generally
+// works out as threads created within a process are usually rotated across
+// preferred processors by default.
+void iree_task_topology_initialize_from_group_count(
+    iree_host_size_t group_count, iree_task_topology_t* out_topology);
+
+// Initializes a topology with one group for each physical core in the machine.
+void iree_task_topology_initialize_from_physical_cores(
+    iree_host_size_t max_core_count, iree_task_topology_t* out_topology);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_TOPOLOGY_H_
diff --git a/runtime/src/iree/task/topology_cpuinfo.c b/runtime/src/iree/task/topology_cpuinfo.c
new file mode 100644
index 0000000..ca91263
--- /dev/null
+++ b/runtime/src/iree/task/topology_cpuinfo.c
@@ -0,0 +1,256 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdio.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+#include "iree/task/topology.h"
+
+// Initializes |out_topology| with a standardized behavior when cpuinfo is not
+// available (unsupported arch, failed to query, etc).
+// NOTE(review): |max_group_count| only feeds the trace zone today; the
+// fallback always produces exactly one group.
+static void iree_task_topology_initialize_fallback(
+    iree_host_size_t max_group_count, iree_task_topology_t* out_topology) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, max_group_count);
+  // TODO(benvanik): implement our own query... but that seems not so great.
+  // For now we default to a single group: if a user wants more then they can
+  // either get cpuinfo working for their platform or manually construct the
+  // topology themselves.
+  iree_host_size_t group_count = 1;
+  iree_task_topology_initialize_from_group_count(group_count, out_topology);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+#if defined(IREE_TASK_CPUINFO_DISABLED)
+
+// cpuinfo compiled out: always use the single-group fallback topology.
+void iree_task_topology_initialize_from_physical_cores(
+    iree_host_size_t max_core_count, iree_task_topology_t* out_topology) {
+  iree_task_topology_initialize_fallback(max_core_count, out_topology);
+}
+
+#else
+
+#include <cpuinfo.h>
+
+// Returns true if cpuinfo initialized successfully and reports at least one
+// core; false means callers must fall back to the default topology.
+static bool iree_task_topology_is_cpuinfo_available() {
+  return cpuinfo_initialize() && cpuinfo_get_cores_count() > 0;
+}
+
+// Returns the core of the calling thread or NULL if not supported.
+// We wrap this here because cpuinfo only returns non-NULL on linux.
+// NOTE(review): the Windows path calls GetCurrentProcessorNumberEx; windows.h
+// is presumably available via transitive includes — confirm.
+static const struct cpuinfo_core* iree_task_topology_get_current_core() {
+  const struct cpuinfo_core* current_core = cpuinfo_get_current_core();
+#if defined(IREE_PLATFORM_WINDOWS)
+  // TODO(benvanik): upstream into cpuinfo.
+  if (current_core == NULL) {
+    PROCESSOR_NUMBER processor_number;
+    GetCurrentProcessorNumberEx(&processor_number);
+    // Map the (group, number) pair back into cpuinfo's flat processor space.
+    uint32_t processor_id =
+        cpuinfo_get_package(processor_number.Group)->processor_start +
+        processor_number.Number;
+    current_core = cpuinfo_get_processor(processor_id)->core;
+  }
+#endif  // IREE_PLATFORM_WINDOWS
+  return current_core;
+}
+
+// Returns |core_id| rotated by the calling base core ID.
+// On many systems the kernel will have already assigned a randomized starting
+// core for thread distribution and we can just reuse that.
+static uint32_t iree_task_topology_rotate_from_base_core(uint32_t core_id) {
+  const struct cpuinfo_core* current_core =
+      iree_task_topology_get_current_core();
+  if (!current_core) {
+    return core_id;  // don't modify if we don't know
+  }
+  // Start one past the caller's core so workers avoid the calling thread's
+  // core first; wrap modulo the total core count.
+  uint32_t next_core_id =
+      (current_core->core_id + 1) % cpuinfo_get_cores_count();
+  return (next_core_id + core_id) % cpuinfo_get_cores_count();
+}
+
+// Sets a platform-specific iree_thread_affinity_t based on the cpuinfo
+// processor.
+// NOTE(review): uses memset but this file does not include <string.h>
+// directly; presumably pulled in transitively via iree/base/api.h — confirm.
+static void iree_task_topology_set_affinity_from_processor(
+    const struct cpuinfo_processor* processor,
+    iree_thread_affinity_t* out_affinity) {
+  memset(out_affinity, 0, sizeof(*out_affinity));
+  out_affinity->specified = 1;
+
+  // Special bit to indicate that (if required) we want the entire core.
+  if (processor->core->processor_count > 1) {
+    out_affinity->smt = 1;
+  }
+
+  // cpuinfo #ifdefs the fields we need to extract the right platform IDs.
+  // We purposefully use the same exact macros they do there so that we don't
+  // have to worry about skew.
+
+#if defined(__MACH__) && defined(__APPLE__)
+  // TODO(benvanik): run on darwin to see how the l2 caches map. We ideally want
+  // a unique affinity ID per L2 cache.
+  // For now, we just use some random pointer bytes. It's just a tag used by
+  // the kernel to distribute the threads so the exact bits don't matter as long
+  // as they are unique per group we want isolated.
+  out_affinity->id = (uint32_t)(uintptr_t)processor;
+#elif defined(__linux__)
+  out_affinity->id = processor->linux_id;
+#elif defined(_WIN32) || defined(__CYGWIN__)
+  out_affinity->group = processor->windows_group_id;
+  out_affinity->id = processor->windows_processor_id;
+#else
+  // WASM? Unused today.
+  out_affinity->specified = 0;
+#endif  // cpuinfo-like platform field
+}
+
+// Returns a bitset with all *processors* that share the same |cache|.
+// Processors with indices beyond the group-mask bit width are silently
+// dropped; a NULL |cache| yields an empty mask.
+static uint64_t iree_task_topology_calculate_cache_bits(
+    const struct cpuinfo_cache* cache) {
+  if (!cache) return 0;
+  uint64_t mask = 0;
+  for (uint32_t processor_i = 0; processor_i < cache->processor_count;
+       ++processor_i) {
+    uint32_t i = cache->processor_start + processor_i;
+    if (i < IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT) {
+      mask |= 1ull << i;
+    }
+  }
+  return mask;
+}
+
+// Constructs a constructive sharing mask for all *processors* that share the
+// same cache as the specified |processor|.
+// Union of the L1i/L1d/L2 sharer sets; L3 is deliberately excluded for now.
+static uint64_t iree_task_topology_calculate_constructive_sharing_mask(
+    const struct cpuinfo_processor* processor) {
+  uint64_t mask = 0;
+  mask |= iree_task_topology_calculate_cache_bits(processor->cache.l1i);
+  mask |= iree_task_topology_calculate_cache_bits(processor->cache.l1d);
+  mask |= iree_task_topology_calculate_cache_bits(processor->cache.l2);
+  // TODO(benvanik): include L3 here too (for systems that have it)? Or use L3
+  // info purely for distribution and focus the group mask on lower-latency
+  // caches?
+  return mask;
+}
+
+// Populates |out_group| with the information from |core|.
+static void iree_task_topology_group_initialize_from_core(
+    uint32_t group_index, const struct cpuinfo_core* core,
+    iree_task_topology_group_t* out_group) {
+  iree_task_topology_group_initialize(group_index, out_group);
+
+  // Guess: always pick the first processor in a core.
+  // When pinning to threads we'll take into account whether the core is SMT
+  // and use all threads anyway so this alignment is just helpful for debugging.
+  uint32_t processor_i = core->processor_start;
+  out_group->processor_index = processor_i;
+
+  const struct cpuinfo_processor* processor =
+      cpuinfo_get_processor(processor_i);
+  iree_task_topology_set_affinity_from_processor(
+      processor, &out_group->ideal_thread_affinity);
+}
+
+// Fixes constructive_sharing_mask values such that they represent other chosen
+// topology groups instead of processor indices. We do this so that code using
+// the topology groups doesn't need to know anything about which physical
+// processor IDs a particular group is mapped to.
+static void iree_task_topology_fixup_constructive_sharing_masks(
+    iree_task_topology_t* topology) {
+  // O(n^2), but n is always <= 64 (and often <= 8).
+  for (iree_host_size_t i = 0; i < topology->group_count; ++i) {
+    iree_task_topology_group_t* group = &topology->groups[i];
+
+    // Compute the processors that we can constructively share with.
+    uint64_t constructive_sharing_mask =
+        iree_task_topology_calculate_constructive_sharing_mask(
+            cpuinfo_get_processor(group->processor_index));
+
+    iree_task_topology_group_mask_t group_mask = 0;
+    for (iree_host_size_t j = 0; j < topology->group_count; ++j) {
+      if (i == j) continue;
+      const iree_task_topology_group_t* other_group = &topology->groups[j];
+      // NOTE(review): rotl is used as a shift here; for indices < 64 the two
+      // are identical while indices >= 64 wrap instead of being UB — confirm
+      // the wrapping is intended for processor indices beyond 64.
+      uint64_t group_processor_bits =
+          iree_math_rotl_u64(1ull, other_group->processor_index);
+      if (constructive_sharing_mask & group_processor_bits) {
+        group_mask |= iree_math_rotl_u64(1ull, other_group->group_index);
+      }
+    }
+
+    group->constructive_sharing_mask = group_mask;
+  }
+}
+
+// Matches all cores (|user_data| ignored).
+static bool iree_task_topology_core_filter_all(const struct cpuinfo_core* core,
+                                               uintptr_t user_data) {
+  return true;
+}
+
+// Returns true if the given |core| passes the filter and should be included.
+// |user_data| is the value passed alongside the filter function.
+typedef bool (*iree_task_topology_core_filter_t)(
+    const struct cpuinfo_core* core, uintptr_t user_data);
+
+// Initializes a topology with one group for each core that matches |filter_fn|.
+//
+// If cpuinfo is not available this falls back to the same behavior as
+// iree_task_topology_initialize_from_physical_cores.
+static void iree_task_topology_initialize_from_physical_cores_with_filter(
+    iree_task_topology_core_filter_t filter_fn, uintptr_t filter_fn_data,
+    iree_host_size_t max_core_count, iree_task_topology_t* out_topology) {
+  max_core_count = iree_min(max_core_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
+  if (!iree_task_topology_is_cpuinfo_available()) {
+    iree_task_topology_initialize_fallback(max_core_count, out_topology);
+    return;
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, max_core_count);
+
+  // Count cores that match the filter.
+  iree_host_size_t core_count = 0;
+  for (uint32_t i = 0; i < cpuinfo_get_cores_count(); i++) {
+    const struct cpuinfo_core* core = cpuinfo_get_core(i);
+    if (filter_fn(core, filter_fn_data)) ++core_count;
+  }
+  core_count = iree_min(core_count, max_core_count);
+
+  iree_task_topology_initialize(out_topology);
+
+  // Build each core up to the max allowed.
+  // TODO(benvanik): if our group_count <= core_count/2 then distribute better;
+  // for now we just do a straight-line through (cores 0-N) when instead we may
+  // want to take advantage of L3 cache info (half of groups on one L3 cache,
+  // half of groups on another, etc).
+  out_topology->group_count = core_count;
+  // NOTE: core_i may exceed the physical core count; the rotation helper wraps
+  // modulo the core count so every core is eventually visited. The loop
+  // terminates because core_count never exceeds the number of matching cores.
+  for (uint32_t core_i = 0, group_i = 0; group_i < out_topology->group_count;
+       ++core_i) {
+    // Rotate the core ID so that we avoid setting the affinity to the calling
+    // thread which we assume is something the user has plans for and doesn't
+    // want to have our workers stealing their time.
+    const struct cpuinfo_core* core =
+        cpuinfo_get_core(iree_task_topology_rotate_from_base_core(core_i));
+    if (filter_fn(core, filter_fn_data)) {
+      iree_task_topology_group_initialize_from_core(
+          group_i, core, &out_topology->groups[group_i]);
+      ++group_i;
+    }
+  }
+
+  iree_task_topology_fixup_constructive_sharing_masks(out_topology);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Public entry point: one group per physical core, no filtering.
+void iree_task_topology_initialize_from_physical_cores(
+    iree_host_size_t max_core_count, iree_task_topology_t* out_topology) {
+  iree_task_topology_initialize_from_physical_cores_with_filter(
+      iree_task_topology_core_filter_all, 0, max_core_count, out_topology);
+}
+
+#endif  // IREE_TASK_CPUINFO_DISABLED
diff --git a/runtime/src/iree/task/topology_test.cc b/runtime/src/iree/task/topology_test.cc
new file mode 100644
index 0000000..446e824
--- /dev/null
+++ b/runtime/src/iree/task/topology_test.cc
@@ -0,0 +1,146 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/topology.h"
+
+#include <cstddef>
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+using namespace iree::testing::status;
+
+// Initialize/deinitialize round-trip: capacity is non-zero, count starts at 0.
+TEST(TopologyTest, Lifetime) {
+  iree_task_topology_t topology;
+  iree_task_topology_initialize(&topology);
+  EXPECT_GT(iree_task_topology_group_capacity(&topology), 0);
+  EXPECT_EQ(0, iree_task_topology_group_count(&topology));
+  iree_task_topology_deinitialize(&topology);
+}
+
+// Group lookups on an empty topology return NULL for any index.
+TEST(TopologyTest, Empty) {
+  iree_task_topology_t topology;
+  iree_task_topology_initialize(&topology);
+
+  EXPECT_EQ(0, iree_task_topology_group_count(&topology));
+  EXPECT_EQ(NULL, iree_task_topology_get_group(&topology, 0));
+  EXPECT_EQ(NULL, iree_task_topology_get_group(&topology, 100));
+
+  iree_task_topology_deinitialize(&topology);
+}
+
+TEST(TopologyTest, Parsing) {
+  // TODO(benvanik): implement parsing.
+}
+
+TEST(TopologyTest, Formatting) {
+  // TODO(benvanik): implement formatting.
+}
+
+// Pushing groups assigns sequential indices and increments the count.
+TEST(TopologyTest, Construction) {
+  iree_task_topology_t topology;
+  iree_task_topology_initialize(&topology);
+
+  EXPECT_EQ(0, iree_task_topology_group_count(&topology));
+
+  for (iree_host_size_t i = 0; i < 8; ++i) {
+    iree_task_topology_group_t group;
+    iree_task_topology_group_initialize(i, &group);
+    IREE_EXPECT_OK(iree_task_topology_push_group(&topology, &group));
+    EXPECT_EQ(i + 1, iree_task_topology_group_count(&topology));
+  }
+  EXPECT_EQ(8, iree_task_topology_group_count(&topology));
+
+  for (iree_host_size_t i = 0; i < 8; ++i) {
+    const iree_task_topology_group_t* group =
+        iree_task_topology_get_group(&topology, i);
+    EXPECT_EQ(i, group->group_index);
+  }
+
+  iree_task_topology_deinitialize(&topology);
+}
+
+// Filling to capacity succeeds; one more push fails with RESOURCE_EXHAUSTED
+// and leaves the existing groups intact.
+TEST(TopologyTest, MaxCapacity) {
+  iree_task_topology_t topology;
+  iree_task_topology_initialize(&topology);
+
+  EXPECT_EQ(0, iree_task_topology_group_count(&topology));
+
+  // Fill up to capacity.
+  for (iree_host_size_t i = 0; i < iree_task_topology_group_capacity(&topology);
+       ++i) {
+    iree_task_topology_group_t group;
+    iree_task_topology_group_initialize(i, &group);
+    IREE_EXPECT_OK(iree_task_topology_push_group(&topology, &group));
+    EXPECT_EQ(i + 1, iree_task_topology_group_count(&topology));
+  }
+  EXPECT_EQ(iree_task_topology_group_capacity(&topology),
+            iree_task_topology_group_count(&topology));
+
+  // Try adding one more - it should fail because we are at capacity.
+  iree_task_topology_group_t extra_group;
+  iree_task_topology_group_initialize(UINT8_MAX, &extra_group);
+  iree_status_t status = iree_task_topology_push_group(&topology, &extra_group);
+  EXPECT_TRUE(iree_status_is_resource_exhausted(status));
+  iree_status_ignore(status);
+
+  // Spot-check the first 8 groups to confirm the valid ones we added above
+  // are untouched by the failed push.
+  for (iree_host_size_t i = 0; i < 8; ++i) {
+    const iree_task_topology_group_t* group =
+        iree_task_topology_get_group(&topology, i);
+    EXPECT_EQ(i, group->group_index);
+  }
+
+  iree_task_topology_deinitialize(&topology);
+}
+
+// Constructing from a fixed group count yields exactly that many groups with
+// sequential indices.
+TEST(TopologyTest, FromGroupCount) {
+  static constexpr iree_host_size_t kGroupCount = 4;
+  iree_task_topology_t topology;
+  // NOTE(review): initialize_from_group_count also initializes; this extra
+  // initialize call is redundant but harmless.
+  iree_task_topology_initialize(&topology);
+
+  iree_task_topology_initialize_from_group_count(kGroupCount, &topology);
+  EXPECT_LE(iree_task_topology_group_count(&topology),
+            iree_task_topology_group_capacity(&topology));
+  EXPECT_EQ(iree_task_topology_group_count(&topology), kGroupCount);
+  for (iree_host_size_t i = 0; i < kGroupCount; ++i) {
+    const iree_task_topology_group_t* group =
+        iree_task_topology_get_group(&topology, i);
+    EXPECT_EQ(i, group->group_index);
+  }
+
+  iree_task_topology_deinitialize(&topology);
+}
+
+// Verifies only that the |topology| is usable.
+// If we actually checked the contents here then we'd just be validating that
+// cpuinfo was working and the tests would become machine-dependent.
+static void EnsureTopologyValid(iree_host_size_t max_group_count,
+                                iree_task_topology_t* topology) {
+  EXPECT_LE(iree_task_topology_group_count(topology),
+            iree_task_topology_group_capacity(topology));
+  EXPECT_LE(iree_task_topology_group_count(topology), max_group_count);
+  EXPECT_GE(iree_task_topology_group_count(topology), 1);
+  for (iree_host_size_t i = 0; i < iree_task_topology_group_count(topology);
+       ++i) {
+    const iree_task_topology_group_t* group =
+        iree_task_topology_get_group(topology, i);
+    EXPECT_EQ(i, group->group_index);
+  }
+}
+
+// Machine-dependent construction; only basic invariants are checked.
+TEST(TopologyTest, FromPhysicalCores) {
+  static constexpr iree_host_size_t kMaxGroupCount = 4;
+  iree_task_topology_t topology;
+  iree_task_topology_initialize(&topology);
+  iree_task_topology_initialize_from_physical_cores(kMaxGroupCount, &topology);
+  EnsureTopologyValid(kMaxGroupCount, &topology);
+  iree_task_topology_deinitialize(&topology);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/task/tuning.h b/runtime/src/iree/task/tuning.h
new file mode 100644
index 0000000..dbe4bbf
--- /dev/null
+++ b/runtime/src/iree/task/tuning.h
@@ -0,0 +1,105 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_TUNING_H_
+#define IREE_TASK_TUNING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Compile-time tuning knobs for the task system. Each value trades memory,
+// latency, and/or throughput; see the per-macro comments before changing.
+
+// Maximum number of workers that an executor can manage.
+// A 64 worker hard limit is based on us using uint64_t as a bitmask to select
+// workers. It's easy to go smaller (just use fewer bits) if it's known that
+// only <64 will ever be used (such as for devices with 2 cores).
+#define IREE_TASK_EXECUTOR_MAX_WORKER_COUNT (64)
+
+// Initial number of shard tasks that are allocated in the executor pool.
+// Increasing this number will decrease initial allocation storms in cases of
+// extremely wide concurrency regions (many dispatches running at the same time)
+// at the cost of a higher minimum memory consumption.
+#define IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER (4)
+
+// Maximum number of events retained by the executor event pool.
+#define IREE_TASK_EXECUTOR_EVENT_POOL_CAPACITY 64
+
+// Maximum number of simultaneous waits an executor may perform as part of a
+// wait-any operation. A larger value may enable better wake coalescing by the
+// kernel. This is only a count limiting wait tasks that have been scheduled and
+// been promoted to the root executor waiting list. There may be any number of
+// waits deeper in the pipeline so long as they don't all become ready
+// simultaneously.
+//
+// Realistically, though, if we have more than 64 outstanding **root** waits
+// it's hard to reason about if/when the executor queue could make forward
+// progress and indicates a possible error in task assignment.
+//
+// Also, the underlying iree_wait_set_t may not support more than 64 handles on
+// certain platforms without emulation. Trying to keep us on the fast-path
+// with a reasonable number seems fine for now until we have a need for more.
+//
+// NOTE: we reserve 1 wait handle for our own internal use. This allows us to
+// wake the coordination worker when new work is submitted from external
+// sources.
+#define IREE_TASK_EXECUTOR_MAX_OUTSTANDING_WAITS (64 - 1)
+
+// Amount of time that can remain in a delay task while still retiring.
+// This prevents additional system sleeps when the remaining time before the
+// deadline is less than the granularity the system is likely able to sleep for.
+// Some platforms may have as much as 10-15ms of potential slop and sleeping for
+// 1ms may result in 10-15ms.
+#define IREE_TASK_EXECUTOR_DELAY_SLOP_NS (1 /*ms*/ * 1000000)
+
+// Allows for dividing the total number of attempts that a worker will make to
+// steal tasks from other workers. By default all other workers will be
+// attempted while setting this to 2, for example, will try for only half of
+// the available workers.
+#define IREE_TASK_EXECUTOR_MAX_THEFT_ATTEMPTS_DIVISOR (1)
+
+// Maximum number of tasks that will be stolen in one go from another worker.
+//
+// Too few tasks will cause additional overhead as the worker repeatedly sips
+// away tasks and when it does get tasks it may suffer spatial locality cache
+// issues as it is effectively walking backwards in memory to both touch the
+// tasks and - a much larger impact - running tasks that themselves are walking
+// orders of magnitude more memory backwards.
+//
+// Too many tasks will cause additional latency on workers that may interfere
+// with higher level scheduling; for example, if a worker runs out of tasks and
+// immediately steals 8000 of them from another worker it's going to take until
+// those 8000 complete before any work that arrives specifically for the worker
+// is able to start processing.
+//
+// In real-time systems too few tasks is better (slightly more work for much
+// lower variance in execution) while in batch mode systems too many tasks is
+// better (as latencies don't matter so long as throughput is maximized).
+#define IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT \
+  IREE_TASK_EXECUTOR_MAX_WORKER_COUNT
+
+// Number of tiles that will be batched into a single reservation from the grid.
+// This is a maximum; if there are fewer tiles that would otherwise allow for
+// maximum parallelism then this may be ignored.
+//
+// The more tiles reserved at a time the higher the chance for latency to
+// increase as many reserved tiles are held up on one worker while another may
+// have otherwise been able to steal them and help finish them sooner.
+//
+// The fewer tiles reserved at a time the higher the chance for cache-locality
+// destroying behavior where multiple workers all stomp on the same cache lines
+// (as say worker 0 and worker 1 both fight over sequential tiles adjacent in
+// memory).
+#define IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION (8)
+
+// Whether to enable per-tile colors for each tile tracing zone based on the
+// tile grid xyz. Not cheap and can be disabled to reduce tracing overhead.
+// TODO(#4017): make per-tile color tracing fast enough to always have on.
+#define IREE_TASK_TRACING_PER_TILE_COLORS 1
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_TUNING_H_
diff --git a/runtime/src/iree/task/worker.c b/runtime/src/iree/task/worker.c
new file mode 100644
index 0000000..d12e9a2
--- /dev/null
+++ b/runtime/src/iree/task/worker.c
@@ -0,0 +1,386 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/task/worker.h"
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "iree/base/internal/fpu_state.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/tracing.h"
+#include "iree/task/executor_impl.h"
+#include "iree/task/post_batch.h"
+#include "iree/task/submission.h"
+#include "iree/task/task_impl.h"
+#include "iree/task/tuning.h"
+
+static int iree_task_worker_main(iree_task_worker_t* worker);
+
+iree_status_t iree_task_worker_initialize(
+    iree_task_executor_t* executor, iree_host_size_t worker_index,
+    const iree_task_topology_group_t* topology_group,
+    iree_byte_span_t local_memory, iree_prng_splitmix64_state_t* seed_prng,
+    iree_task_worker_t* out_worker) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Plain (non-atomic) field setup is safe here: the worker thread is not
+  // created until the end of this function so nothing else can be touching
+  // |out_worker| yet.
+  out_worker->executor = executor;
+  out_worker->worker_bit = iree_task_affinity_for_worker(worker_index);
+  out_worker->ideal_thread_affinity = topology_group->ideal_thread_affinity;
+  out_worker->constructive_sharing_mask =
+      topology_group->constructive_sharing_mask;
+  out_worker->max_theft_attempts =
+      executor->worker_count / IREE_TASK_EXECUTOR_MAX_THEFT_ATTEMPTS_DIVISOR;
+  // Fork a per-worker PRNG stream off the shared seed so workers don't all
+  // pick the same theft victims in the same order.
+  iree_prng_minilcg128_initialize(iree_prng_splitmix64_next(seed_prng),
+                                  &out_worker->theft_prng);
+  out_worker->local_memory = local_memory;
+  out_worker->processor_id = 0;
+  out_worker->processor_tag = 0;
+
+  iree_task_worker_state_t initial_state = IREE_TASK_WORKER_STATE_RUNNING;
+  if (executor->scheduling_mode &
+      IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP) {
+    // User is favoring startup latency vs. initial scheduling latency. Our
+    // thread will be created suspended and not first scheduled until work
+    // arrives for it, (almost) ensuring no context switches and 10x+ lower
+    // blocking startup time.
+    initial_state = IREE_TASK_WORKER_STATE_SUSPENDED;
+  }
+  iree_atomic_store_int32(&out_worker->state, initial_state,
+                          iree_memory_order_seq_cst);
+
+  iree_notification_initialize(&out_worker->wake_notification);
+  iree_notification_initialize(&out_worker->state_notification);
+  iree_atomic_task_slist_initialize(&out_worker->mailbox_slist);
+  iree_task_queue_initialize(&out_worker->local_task_queue);
+
+  iree_thread_create_params_t thread_params;
+  memset(&thread_params, 0, sizeof(thread_params));
+  thread_params.name = iree_make_cstring_view(topology_group->name);
+  thread_params.create_suspended =
+      initial_state == IREE_TASK_WORKER_STATE_SUSPENDED;
+  thread_params.priority_class = IREE_THREAD_PRIORITY_CLASS_NORMAL;
+  thread_params.initial_affinity = out_worker->ideal_thread_affinity;
+
+  // NOTE: if the thread creation fails we'll bail here and let the caller
+  // cleanup by calling deinitialize (which is safe because we zero init
+  // everything).
+  // NOTE(review): this function itself does not memset |out_worker|;
+  // presumably the caller zero-initializes the storage - confirm.
+  iree_status_t status = iree_thread_create(
+      (iree_thread_entry_t)iree_task_worker_main, out_worker, thread_params,
+      executor->allocator, &out_worker->thread);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+void iree_task_worker_request_exit(iree_task_worker_t* worker) {
+  // No-op if the thread was never created (or initialize failed early).
+  if (!worker->thread) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // If the thread is already in the exiting/zombie state we don't need to do
+  // anything.
+  iree_task_worker_state_t prev_state =
+      (iree_task_worker_state_t)iree_atomic_exchange_int32(
+          &worker->state, IREE_TASK_WORKER_STATE_EXITING,
+          iree_memory_order_acq_rel);
+  switch (prev_state) {
+    case IREE_TASK_WORKER_STATE_SUSPENDED:
+      // Worker was suspended; resume it so that it can exit itself.
+      iree_thread_resume(worker->thread);
+      break;
+    case IREE_TASK_WORKER_STATE_ZOMBIE:
+      // Worker already exited; reset state to ZOMBIE.
+      // The unconditional exchange above clobbered the terminal ZOMBIE state
+      // with EXITING; restore it so await_exit/deinitialize still observe the
+      // worker as fully exited.
+      iree_atomic_store_int32(&worker->state, IREE_TASK_WORKER_STATE_ZOMBIE,
+                              iree_memory_order_seq_cst);
+      break;
+    default:
+      // Worker now set to EXITING and should exit soon.
+      break;
+  }
+
+  // Kick the worker in case it is waiting for work.
+  iree_notification_post(&worker->wake_notification, 1);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns true if |worker| has fully exited and is waiting to be joined
+// (torn down via iree_task_worker_deinitialize).
+static bool iree_task_worker_is_zombie(iree_task_worker_t* worker) {
+  const iree_task_worker_state_t current_state =
+      (iree_task_worker_state_t)iree_atomic_load_int32(
+          &worker->state, iree_memory_order_seq_cst);
+  return current_state == IREE_TASK_WORKER_STATE_ZOMBIE;
+}
+
+void iree_task_worker_await_exit(iree_task_worker_t* worker) {
+  // No thread was ever created; nothing to wait on.
+  if (!worker->thread) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Request exit (idempotent if already requested) and then block until the
+  // worker thread publishes the terminal ZOMBIE state.
+  iree_task_worker_request_exit(worker);
+  iree_notification_await(&worker->state_notification,
+                          (iree_condition_fn_t)iree_task_worker_is_zombie,
+                          worker, iree_infinite_timeout());
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_worker_deinitialize(iree_task_worker_t* worker) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Must have called request_exit/await_exit.
+  IREE_ASSERT_TRUE(iree_task_worker_is_zombie(worker));
+
+  iree_thread_release(worker->thread);
+  worker->thread = NULL;
+
+  // Release unfinished tasks by flushing the mailbox (which if we're here can't
+  // get anything more posted to it) and then discarding everything we still
+  // have a reference to.
+  iree_atomic_task_slist_discard(&worker->mailbox_slist);
+  iree_task_list_discard(&worker->local_task_queue.list);
+
+  // Tear down resources in (roughly) the reverse order of initialize.
+  iree_notification_deinitialize(&worker->wake_notification);
+  iree_notification_deinitialize(&worker->state_notification);
+  iree_atomic_task_slist_deinitialize(&worker->mailbox_slist);
+  iree_task_queue_deinitialize(&worker->local_task_queue);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_worker_post_tasks(iree_task_worker_t* worker,
+                                 iree_task_list_t* list) {
+  // Move the list into the mailbox. Note that the mailbox is LIFO and this list
+  // is concatenated with its current order preserved (which should be LIFO).
+  iree_atomic_task_slist_concat(&worker->mailbox_slist, list->head, list->tail);
+  // Ownership of the tasks has transferred to the mailbox; reset the caller's
+  // list so it cannot be used to reach the now-enqueued tasks.
+  memset(list, 0, sizeof(*list));
+}
+
+iree_task_t* iree_task_worker_try_steal_task(iree_task_worker_t* worker,
+                                             iree_task_queue_t* target_queue,
+                                             iree_host_size_t max_tasks) {
+  // Try to grab up to |max_tasks| from the worker's local FIFO; if more than
+  // one task is stolen then the first is returned and the remainder are moved
+  // into |target_queue|.
+  //
+  // FIX: previously this ignored the |max_tasks| argument and always passed
+  // the compile-time IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT cap, silently
+  // overriding the caller-specified limit documented in the header.
+  iree_task_t* task = iree_task_queue_try_steal(
+      &worker->local_task_queue, target_queue, max_tasks);
+  if (task) return task;
+
+  // The victim's local queue was empty; fall back to popping a single task
+  // straight out of its incoming mailbox.
+  task = iree_atomic_task_slist_pop(&worker->mailbox_slist);
+  if (task) return task;
+
+  // Nothing available to steal from this worker.
+  return NULL;
+}
+
+// Runs a single |task| on |worker|, dispatching on the task type.
+// Only task types that are scheduled to workers are handled; all others must
+// be handled by the coordinator during scheduling.
+static void iree_task_worker_execute(
+    iree_task_worker_t* worker, iree_task_t* task,
+    iree_task_submission_t* pending_submission) {
+  // Executing the task retires it and may ready dependent tasks; those are
+  // gathered into |pending_submission| for the executor to schedule the next
+  // time the coordinator runs.
+  //
+  // TODO(benvanik): think a bit more about this timing; this ensures we have
+  // BFS behavior at the cost of the additional merge overhead - it's probably
+  // worth it?
+  // TODO(benvanik): handle partial tasks and re-queuing.
+  if (task->type == IREE_TASK_TYPE_CALL) {
+    iree_task_call_execute((iree_task_call_t*)task, pending_submission);
+  } else if (task->type == IREE_TASK_TYPE_DISPATCH_SHARD) {
+    iree_task_dispatch_shard_execute(
+        (iree_task_dispatch_shard_t*)task, worker->processor_id,
+        worker->local_memory, pending_submission);
+  } else {
+    IREE_ASSERT_UNREACHABLE("incorrect task type for worker execution");
+  }
+
+  // NOTE: task is invalidated above and must not be used!
+  task = NULL;
+}
+
+// Pumps the worker thread once, processing a single task.
+// Returns true if pumping should continue as there are more tasks remaining or
+// false if the caller should wait for more tasks to be posted.
+static bool iree_task_worker_pump_once(
+    iree_task_worker_t* worker, iree_task_submission_t* pending_submission) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Check the local work queue for any work we know we should start
+  // processing immediately. Other workers may try to steal some of this work
+  // if we take too long.
+  iree_task_t* task = iree_task_queue_pop_front(&worker->local_task_queue);
+
+  // Check the mailbox to see if we have incoming work that has been posted.
+  // We try to greedily move it to our local work list so that we can work
+  // with the full thread-local pending task list.
+  if (!task) {
+    // NOTE: there's a potential for theft pessimization if the queue runs too
+    // low and there's nothing there when a thief goes to grab some tasks. A
+    // standout there would indicate that we weren't scheduling very well in the
+    // first place (large uneven workloads for various workers, bad distribution
+    // in the face of heterogenous multi-core architectures where some workers
+    // complete tasks faster than others, etc).
+    task = iree_task_queue_flush_from_lifo_slist(&worker->local_task_queue,
+                                                 &worker->mailbox_slist);
+  }
+
+  // If we ran out of work assigned to this specific worker try to steal some
+  // from other workers that we hopefully share some of the cache hierarchy
+  // with. Their tasks will be moved from their local queue into ours and the
+  // first task in the queue is popped off and returned.
+  if (!task) {
+    task = iree_task_executor_try_steal_task(
+        worker->executor, worker->constructive_sharing_mask,
+        worker->max_theft_attempts, &worker->theft_prng,
+        &worker->local_task_queue);
+  }
+
+  // No tasks to run; let the caller know we want to wait for more.
+  if (!task) {
+    IREE_TRACE_ZONE_END(z0);
+    return false;
+  }
+
+  // Execute the task (may call out to arbitrary user code and may submit more
+  // tasks for execution).
+  iree_task_worker_execute(worker, task, pending_submission);
+
+  IREE_TRACE_ZONE_END(z0);
+  return true;  // try again
+}
+
+// Updates the cached processor ID field in the worker.
+// The query can be semi-expensive (see the processor_id/processor_tag notes in
+// worker.h) so the opaque tag is used to reduce the cost of repeated queries.
+static void iree_task_worker_update_processor_id(iree_task_worker_t* worker) {
+  iree_cpu_requery_processor_id(&worker->processor_tag, &worker->processor_id);
+}
+
+// Alternates between pumping ready tasks in the worker queue and waiting
+// for more tasks to arrive. Only returns when the worker has been asked by
+// the executor to exit.
+static void iree_task_worker_pump_until_exit(iree_task_worker_t* worker) {
+  // Initial processor ID assignment. We normally refresh this upon waking from
+  // a wait but it's possible that there's already work pending and we want to
+  // be able to process it with the proper processor ID immediately.
+  iree_task_worker_update_processor_id(worker);
+
+  // Pump the thread loop to process more tasks.
+  while (true) {
+    // If we fail to find any work to do we'll wait at the end of this loop.
+    // In order not to miss any work that is enqueued after we've already
+    // checked a particular source we use an interruptable wait token that
+    // will prevent the wait from happening if anyone touches the data
+    // structures we use.
+    iree_wait_token_t wait_token =
+        iree_notification_prepare_wait(&worker->wake_notification);
+    // Clear our bit in the executor idle mask while we look for and process
+    // work; it is set again below once all sources come up empty.
+    iree_atomic_task_affinity_set_fetch_and(&worker->executor->worker_idle_mask,
+                                            ~worker->worker_bit,
+                                            iree_memory_order_seq_cst);
+
+    // Check state to see if we've been asked to exit.
+    if (iree_atomic_load_int32(&worker->state, iree_memory_order_seq_cst) ==
+        IREE_TASK_WORKER_STATE_EXITING) {
+      // Thread exit requested - cancel pumping.
+      iree_notification_cancel_wait(&worker->wake_notification);
+      // TODO(benvanik): complete tasks before exiting?
+      break;
+    }
+
+    // TODO(benvanik): we could try to update the processor ID here before we
+    // begin a new batch of work - assuming it's not too expensive.
+
+    iree_task_submission_t pending_submission;
+    iree_task_submission_initialize(&pending_submission);
+
+    // Drain all available work; pump_once returns false once the local queue,
+    // mailbox, and theft attempts all come up empty.
+    while (iree_task_worker_pump_once(worker, &pending_submission)) {
+      // All work done ^, which will return false when the worker should wait.
+    }
+
+    // Hand any tasks readied during execution back to the executor and note
+    // that the schedule changed so we re-check for work instead of sleeping.
+    bool schedule_dirty = false;
+    if (!iree_task_submission_is_empty(&pending_submission)) {
+      iree_task_executor_merge_submission(worker->executor,
+                                          &pending_submission);
+      schedule_dirty = true;
+    }
+
+    // We've finished all the work we have scheduled so set our idle flag.
+    // This ensures that if any other thread comes in and wants to give us
+    // work we will properly coordinate/wake below.
+    iree_atomic_task_affinity_set_fetch_or(&worker->executor->worker_idle_mask,
+                                           worker->worker_bit,
+                                           iree_memory_order_seq_cst);
+
+    // When we encounter a complete lack of work we can self-nominate to check
+    // the global work queue and distribute work to other threads. Only one
+    // coordinator can be running at a time so we also ensure that if another
+    // is doing its work we gracefully wait for it. It's fine to block in here
+    // as the next thing we'd have done is go idle anyway.
+
+    // First self-nominate; this *may* do something or just be ignored (if
+    // another worker is already coordinating).
+    iree_task_executor_coordinate(worker->executor, worker);
+
+    // If nothing has been enqueued since we started this loop (so even
+    // coordination didn't find anything) we go idle. Otherwise we fall
+    // through and try the loop again.
+    if (schedule_dirty ||
+        !iree_task_queue_is_empty(&worker->local_task_queue)) {
+      // Have more work to do; loop around to try another pump.
+      iree_notification_cancel_wait(&worker->wake_notification);
+    } else {
+      IREE_TRACE_ZONE_BEGIN_NAMED(z_wait,
+                                  "iree_task_worker_main_pump_wake_wait");
+      iree_notification_commit_wait(&worker->wake_notification, wait_token,
+                                    IREE_TIME_INFINITE_FUTURE);
+      IREE_TRACE_ZONE_END(z_wait);
+
+      // Woke from a wait - query the processor ID in case we migrated during
+      // the sleep.
+      iree_task_worker_update_processor_id(worker);
+    }
+
+    // Wait completed.
+    // Jump back up and try pumping any tasks that arrived.
+    continue;
+  }
+}
+
+// Thread entry point for each worker.
+// Runs until an exit request is observed, then publishes the terminal ZOMBIE
+// state and wakes anyone blocked in iree_task_worker_await_exit.
+static int iree_task_worker_main(iree_task_worker_t* worker) {
+  IREE_TRACE_ZONE_BEGIN(thread_zone);
+
+  // We cannot rely on the global process settings for FPU state.
+  // Be explicit here on what we need.
+  iree_fpu_state_push(IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO);
+
+  // Reset affinity (as it can change over time).
+  // TODO(benvanik): call this after waking in case CPU hotplugging happens.
+  iree_thread_request_affinity(worker->thread, worker->ideal_thread_affinity);
+
+  // Enter the running state immediately. Note that we could have been requested
+  // to exit while suspended/still starting up, so check that here before we
+  // mess with any data structures.
+  const bool should_run =
+      iree_atomic_exchange_int32(&worker->state, IREE_TASK_WORKER_STATE_RUNNING,
+                                 iree_memory_order_seq_cst) !=
+      IREE_TASK_WORKER_STATE_EXITING;
+  if (IREE_LIKELY(should_run)) {
+    // << work happens here >>
+    iree_task_worker_pump_until_exit(worker);
+  }
+
+  IREE_TRACE_ZONE_END(thread_zone);
+  // Publish the terminal state and wake all await_exit waiters.
+  iree_atomic_store_int32(&worker->state, IREE_TASK_WORKER_STATE_ZOMBIE,
+                          iree_memory_order_seq_cst);
+  iree_notification_post(&worker->state_notification, IREE_ALL_WAITERS);
+  return 0;
+}
diff --git a/runtime/src/iree/task/worker.h b/runtime/src/iree/task/worker.h
new file mode 100644
index 0000000..6a7fc31
--- /dev/null
+++ b/runtime/src/iree/task/worker.h
@@ -0,0 +1,205 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TASK_WORKER_H_
+#define IREE_TASK_WORKER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/prng.h"
+#include "iree/base/internal/synchronization.h"
+#include "iree/base/internal/threading.h"
+#include "iree/base/tracing.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor.h"
+#include "iree/task/list.h"
+#include "iree/task/queue.h"
+#include "iree/task/task.h"
+#include "iree/task/topology.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Indicates the current state of a worker or, in the case of EXITING, the state
+// the worker should transition to.
+//
+// Transition graph:
+//   SUSPENDED -> RUNNING (IDLE<->PROCESSING) -> EXITING -> ZOMBIE
+//
+// NOTE: state values are ordered such that </> comparisons can be used; ensure
+// that for example all states after resuming are > SUSPENDED and all states
+// before exiting are < EXITING.
+typedef enum iree_task_worker_state_e {
+  // Worker has been created in a suspended state and must be resumed to wake.
+  IREE_TASK_WORKER_STATE_SUSPENDED = 0,
+  // Worker is idle or actively processing tasks (either its own or others).
+  IREE_TASK_WORKER_STATE_RUNNING = 1,
+  // Worker should exit (or is exiting) and will soon enter the zombie state.
+  // Coordinators can request workers to exit by setting their state to this and
+  // then waking. See iree_task_worker_request_exit.
+  IREE_TASK_WORKER_STATE_EXITING = 2,
+  // Worker has exited and entered a 🧟 state (waiting for join).
+  // The thread handle is still valid and must be destroyed.
+  IREE_TASK_WORKER_STATE_ZOMBIE = 3,
+} iree_task_worker_state_t;
+
+// A worker within the executor pool.
+//
+// NOTE: fields in here are touched from multiple threads with lock-free
+// techniques. The alignment of the entire iree_task_worker_t as well as the
+// alignment and padding between particular fields is carefully (though perhaps
+// not yet correctly) selected; see the 'LAYOUT' comments below.
+typedef struct iree_task_worker_t {
+  // A LIFO mailbox used by coordinators to post tasks to this worker.
+  // As workers self-nominate to be coordinators and fan out dispatch shards
+  // they can directly emplace those shards into the workers that should execute
+  // them based on the work distribution policy. When workers go to look for
+  // more work after their local queue empties they will flush this list and
+  // move all of the tasks into their local queue and restart processing.
+  // LAYOUT: must be 64b away from local_task_queue.
+  iree_atomic_task_slist_t mailbox_slist;
+
+  // Current state of the worker (iree_task_worker_state_t).
+  // LAYOUT: frequent access; next to wake_notification as they are always
+  //         accessed together.
+  iree_atomic_int32_t state;
+
+  // Notification signaled when the worker should wake (if it is idle).
+  // LAYOUT: next to state for similar access patterns; when posting other
+  //         threads will touch mailbox_slist and then send a wake
+  //         notification.
+  iree_notification_t wake_notification;
+
+  // Notification signaled when the worker changes any state.
+  iree_notification_t state_notification;
+
+  // Parent executor that can be used to access the global work queue or task
+  // pool. Executors always outlive the workers they own.
+  iree_task_executor_t* executor;
+
+  // Bit the worker represents in the various worker bitsets.
+  iree_task_affinity_set_t worker_bit;
+
+  // Ideal thread affinity for the worker thread.
+  iree_thread_affinity_t ideal_thread_affinity;
+
+  // A bitmask of other group indices that share some level of the cache
+  // hierarchy. Workers of this group are more likely to constructively share
+  // some cache levels higher up with these other groups. For example, if the
+  // workers in a group all share an L2 cache then the groups indicated here may
+  // all share the same L3 cache.
+  iree_task_affinity_set_t constructive_sharing_mask;
+
+  // Maximum number of attempts to make when trying to steal tasks from other
+  // workers. This could be 64 (try stealing from all workers) or just a handful
+  // (try stealing from these 3 other cores that share your L3 cache).
+  uint32_t max_theft_attempts;
+
+  // Rotation counter for work stealing (ensures we don't favor one victim).
+  // Only ever touched by the worker thread as it steals work.
+  iree_prng_minilcg128_state_t theft_prng;
+
+  // Thread handle of the worker. If the thread has exited the handle will
+  // remain valid so that the executor can query its state.
+  iree_thread_t* thread;
+
+  // Guess at the current processor ID.
+  // This is updated infrequently as it can be semi-expensive to determine
+  // (on some platforms at least 1 syscall involved). We always update it upon
+  // waking as idle waits are the most likely place the worker will be migrated
+  // across processors.
+  iree_cpu_processor_id_t processor_id;
+  // An opaque tag used to reduce the cost of processor ID queries.
+  iree_cpu_processor_tag_t processor_tag;
+
+  // Destructive interference padding between the mailbox and local task queue
+  // to ensure that the worker - who is pounding on local_task_queue - doesn't
+  // contend with submissions or coordinators dropping new tasks in the mailbox.
+  //
+  // Today we don't need this, however on 32-bit systems or if we adjust the
+  // size of iree_task_affinity_t/iree_task_affinity_set_t/etc we may need to
+  // add it back.
+  //
+  // NOTE: due to the layout requirements of this structure (to avoid cache
+  // interference) this is the only place padding should be added.
+  // uint8_t _padding[8];
+
+  // Pointer to local memory available for use exclusively by the worker.
+  // The base address should be aligned to avoid false sharing with other
+  // workers.
+  iree_byte_span_t local_memory;
+
+  // Worker-local FIFO queue containing the tasks that will be processed by the
+  // worker. This queue supports work-stealing by other workers if they run out
+  // of work of their own.
+  // LAYOUT: must be 64b away from mailbox_slist.
+  iree_task_queue_t local_task_queue;
+} iree_task_worker_t;
+// Compile-time enforcement of the LAYOUT notes above.
+// NOTE(review): both asserts use the *constructive* interference size constant;
+// confirm this is the intended constant for enforcing destructive-interference
+// separation between mailbox_slist and local_task_queue.
+static_assert(offsetof(iree_task_worker_t, mailbox_slist) +
+                      sizeof(iree_atomic_task_slist_t) <
+                  iree_hardware_constructive_interference_size,
+              "mailbox_slist must be in the first cache line");
+static_assert(offsetof(iree_task_worker_t, local_task_queue) >=
+                  iree_hardware_constructive_interference_size,
+              "local_task_queue must be separated from mailbox_slist by "
+              "at least a cache line");
+
+// Initializes a worker by creating its thread and configuring it for receiving
+// tasks. Where supported the worker will be created in a suspended state so
+// that we aren't creating a thundering herd on startup:
+// https://en.wikipedia.org/wiki/Thundering_herd_problem
+iree_status_t iree_task_worker_initialize(
+    iree_task_executor_t* executor, iree_host_size_t worker_index,
+    const iree_task_topology_group_t* topology_group,
+    iree_byte_span_t local_memory, iree_prng_splitmix64_state_t* seed_prng,
+    iree_task_worker_t* out_worker);
+
+// Requests that the worker begin exiting (if it hasn't already).
+// If the worker is actively processing tasks it will wait until it has
+// completed all it can and is about to go idle prior to exiting.
+//
+// May be called from any thread (including the worker thread).
+void iree_task_worker_request_exit(iree_task_worker_t* worker);
+
+// Blocks the caller until |worker| has exited.
+//
+// May be called from any thread.
+void iree_task_worker_await_exit(iree_task_worker_t* worker);
+
+// Deinitializes a worker that has successfully exited.
+// The worker must be in the IREE_TASK_WORKER_STATE_ZOMBIE state.
+//
+// Expected shutdown sequence:
+//  - request_exit on all workers
+//  - await_exit on all workers
+//  - deinitialize all workers
+void iree_task_worker_deinitialize(iree_task_worker_t* worker);
+
+// Posts a FIFO list of tasks to the worker mailbox. The target worker takes
+// ownership of the tasks and will be woken if it is currently idle.
+//
+// May be called from any thread (including the worker thread).
+void iree_task_worker_post_tasks(iree_task_worker_t* worker,
+                                 iree_task_list_t* list);
+
+// Tries to steal up to |max_tasks| from the back of the queue.
+// Returns NULL if no tasks are available and otherwise up to |max_tasks| tasks
+// that were at the tail of the worker FIFO will be moved to the |target_queue|
+// and the first of the stolen tasks is returned. While tasks from the FIFO
+// are preferred this may also steal tasks from the mailbox.
+iree_task_t* iree_task_worker_try_steal_task(iree_task_worker_t* worker,
+                                             iree_task_queue_t* target_queue,
+                                             iree_host_size_t max_tasks);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TASK_WORKER_H_
diff --git a/runtime/src/iree/testing/BUILD b/runtime/src/iree/testing/BUILD
new file mode 100644
index 0000000..31be851
--- /dev/null
+++ b/runtime/src/iree/testing/BUILD
@@ -0,0 +1,65 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Testing utilities for IREE.
+
+load("//iree:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# C-API benchmark shim (benchmark.h) backed by the Google Benchmark library.
+iree_runtime_cc_library(
+    name = "benchmark",
+    srcs = [
+        "benchmark_full.cc",
+    ],
+    hdrs = [
+        "benchmark.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:tracing",
+        "@com_google_benchmark//:benchmark",
+    ],
+)
+
+# main() entry point for benchmark binaries; links the IREE flags library.
+iree_runtime_cc_library(
+    name = "benchmark_main",
+    testonly = True,
+    srcs = ["benchmark_main.c"],
+    deps = [
+        ":benchmark",
+        "//runtime/src/iree/base/internal:flags",
+    ],
+)
+
+# googletest wrapper headers plus IREE status matchers for C++ tests.
+iree_runtime_cc_library(
+    name = "gtest",
+    testonly = True,
+    hdrs = [
+        "gtest.h",
+        "status_matchers.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base:cc",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# main() entry point for gtest binaries; links the IREE flags library.
+iree_runtime_cc_library(
+    name = "gtest_main",
+    testonly = True,
+    srcs = ["gtest_main.cc"],
+    tags = ["keep_dep"],
+    deps = [
+        ":gtest",
+        "//runtime/src/iree/base/internal:flags",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/runtime/src/iree/testing/CMakeLists.txt b/runtime/src/iree/testing/CMakeLists.txt
new file mode 100644
index 0000000..be9935c
--- /dev/null
+++ b/runtime/src/iree/testing/CMakeLists.txt
@@ -0,0 +1,80 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Doesn't use bazel_to_cmake because of special logic for the benchmark library.
+
+iree_add_all_subdirs()
+
+if(${IREE_ENABLE_THREADING})
+  # Threading available: back the benchmark.h shim with the full Google
+  # Benchmark library.
+  iree_cc_library(
+    NAME
+      benchmark
+    HDRS
+      "benchmark.h"
+    SRCS
+      "benchmark_full.cc"
+    DEPS
+      benchmark
+      iree::base
+      iree::base::tracing
+    PUBLIC
+  )
+else()
+  # No threading (e.g. bare-metal configurations): same header, no-op
+  # implementation.
+  iree_cc_library(
+    NAME
+      benchmark
+    HDRS
+      "benchmark.h"
+    SRCS
+      "benchmark_nop.c"
+    DEPS
+      iree::base
+      iree::base::tracing
+    PUBLIC
+  )
+endif()
+
+# main() entry point for benchmark binaries; links the IREE flags library.
+iree_cc_library(
+  NAME
+    benchmark_main
+  SRCS
+    "benchmark_main.c"
+  DEPS
+    ::benchmark
+    iree::base::internal::flags
+  TESTONLY
+  PUBLIC
+)
+
+# googletest/gmock wrapper headers plus IREE status matchers for C++ tests.
+iree_cc_library(
+  NAME
+    gtest
+  HDRS
+    "gtest.h"
+    "status_matchers.h"
+  DEPS
+    gmock
+    gtest
+    iree::base::cc
+  TESTONLY
+  PUBLIC
+)
+
+# main() entry point for gtest binaries; links the IREE flags library.
+iree_cc_library(
+  NAME
+    gtest_main
+  SRCS
+    "gtest_main.cc"
+  DEPS
+    ::gtest
+    gmock
+    gtest
+    iree::base::internal::flags
+  TESTONLY
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/testing/benchmark.h b/runtime/src/iree/testing/benchmark.h
new file mode 100644
index 0000000..cc258d5
--- /dev/null
+++ b/runtime/src/iree/testing/benchmark.h
@@ -0,0 +1,147 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TESTING_BENCHMARK_H_
+#define IREE_TESTING_BENCHMARK_H_
+
+// This is a C API shim for a benchmark-like interface.
+// The intent is that we can write benchmarks that are portable to bare-metal
+// systems and use some simple tooling while also allowing them to run on
+// the full benchmark library with all its useful reporting and statistics.
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_benchmark_state_t
+//===----------------------------------------------------------------------===//
+
+// Benchmark state manipulator.
+// Passed to each benchmark during execution to control the benchmark state
+// or append information beyond just timing.
+typedef struct iree_benchmark_state_t {
+  // Internal implementation handle.
+  void* impl;
+
+  // Allocator that can be used for host allocations required during benchmark
+  // execution.
+  iree_allocator_t host_allocator;
+} iree_benchmark_state_t;
+
+// Returns a range argument with the given ordinal.
+int64_t iree_benchmark_get_range(iree_benchmark_state_t* state,
+                                 iree_host_size_t ordinal);
+
+// Returns true while the benchmark should keep running its step loop.
+//
+// Usage:
+//  while (iree_benchmark_keep_running(state, 1000)) {
+//    // process 1000 elements
+//  }
+bool iree_benchmark_keep_running(iree_benchmark_state_t* state,
+                                 uint64_t batch_count);
+
+// Reports that the currently executing benchmark cannot be run.
+// Callers should return after calling as further benchmark-related calls may
+// fail.
+void iree_benchmark_skip(iree_benchmark_state_t* state, const char* message);
+
+// Suspends the benchmark timer until iree_benchmark_resume_timing is called.
+// This can be used to guard per-step code that is required to initialize the
+// work but not something that needs to be accounted for in the benchmark
+// timing. Introduces non-trivial overhead: only use this ~once per step when
+// then going on to perform large amounts of batch work in the step.
+void iree_benchmark_pause_timing(iree_benchmark_state_t* state);
+
+// Resumes the benchmark timer after a prior iree_benchmark_pause_timing.
+void iree_benchmark_resume_timing(iree_benchmark_state_t* state);
+
+// Sets a label string that will be displayed alongside the report line from the
+// currently executing benchmark.
+void iree_benchmark_set_label(iree_benchmark_state_t* state, const char* label);
+
+// Adds a 'bytes/s' label with the given value.
+//
+// REQUIRES: must only be called outside of the benchmark step loop.
+void iree_benchmark_set_bytes_processed(iree_benchmark_state_t* state,
+                                        int64_t bytes);
+
+// Adds an `items/s` label with the given value.
+//
+// REQUIRES: must only be called outside of the benchmark step loop.
+void iree_benchmark_set_items_processed(iree_benchmark_state_t* state,
+                                        int64_t items);
+
+//===----------------------------------------------------------------------===//
+// iree_benchmark_def_t
+//===----------------------------------------------------------------------===//
+
+enum iree_benchmark_flag_bits_t {
+  IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME = 1u << 0,
+
+  IREE_BENCHMARK_FLAG_USE_REAL_TIME = 1u << 1,
+  IREE_BENCHMARK_FLAG_USE_MANUAL_TIME = 1u << 2,
+};
+typedef uint32_t iree_benchmark_flags_t;
+
+typedef enum iree_benchmark_unit_e {
+  IREE_BENCHMARK_UNIT_MILLISECOND = 0,
+  IREE_BENCHMARK_UNIT_MICROSECOND,
+  IREE_BENCHMARK_UNIT_NANOSECOND,
+} iree_benchmark_unit_t;
+
+typedef struct iree_benchmark_def_t iree_benchmark_def_t;
+
+// A benchmark case definition.
+struct iree_benchmark_def_t {
+  // IREE_BENCHMARK_FLAG_* bitmask controlling benchmark behavior and reporting.
+  iree_benchmark_flags_t flags;
+
+  // Time unit used in display.
+  iree_benchmark_unit_t time_unit;  // MILLISECOND by default
+
+  // Optional minimum duration the benchmark should run for in nanoseconds.
+  iree_duration_t minimum_duration_ns;  // 0 if unspecified to autodetect
+  // Optional iteration count the benchmark should run for.
+  uint64_t iteration_count;  // 0 if unspecified to autodetect
+
+  // TODO(benvanik): add range arguments.
+
+  // Runs the benchmark to completion.
+  // Implementations must call iree_benchmark_keep_running in a loop until it
+  // returns false.
+  iree_status_t (*run)(const iree_benchmark_def_t* benchmark_def,
+                       iree_benchmark_state_t* benchmark_state);
+
+  // User-defined data accessible in the run function.
+  const void* user_data;
+};
+
+// Registers a benchmark with the given definition.
+void iree_benchmark_register(iree_string_view_t name,
+                             const iree_benchmark_def_t* benchmark_def);
+
+//===----------------------------------------------------------------------===//
+// Benchmark infra management
+//===----------------------------------------------------------------------===//
+
+// Initializes the benchmark framework.
+// Must be called before any other iree_benchmark_* functions.
+void iree_benchmark_initialize(int* argc, char** argv);
+
+// Runs all registered benchmarks specified by the command line flags.
+// Must be called after iree_benchmark_initialize and zero or more benchmarks
+// have been registered with iree_benchmark_register.
+void iree_benchmark_run_specified(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_TESTING_BENCHMARK_H_
diff --git a/runtime/src/iree/testing/benchmark_full.cc b/runtime/src/iree/testing/benchmark_full.cc
new file mode 100644
index 0000000..c01abf0
--- /dev/null
+++ b/runtime/src/iree/testing/benchmark_full.cc
@@ -0,0 +1,190 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <utility>
+
+#include "benchmark/benchmark.h"
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/testing/benchmark.h"
+
+//===----------------------------------------------------------------------===//
+// iree_benchmark_state_t
+//===----------------------------------------------------------------------===//
+
+benchmark::State& GetBenchmarkState(iree_benchmark_state_t* state) {
+  return *(benchmark::State*)state->impl;
+}
+
+int64_t iree_benchmark_get_range(iree_benchmark_state_t* state,
+                                 iree_host_size_t ordinal) {
+  auto& s = GetBenchmarkState(state);
+  return s.range(ordinal);
+}
+
+bool iree_benchmark_keep_running(iree_benchmark_state_t* state,
+                                 uint64_t batch_count) {
+  auto& s = GetBenchmarkState(state);
+  return s.KeepRunningBatch(batch_count);
+}
+
+void iree_benchmark_skip(iree_benchmark_state_t* state, const char* message) {
+  auto& s = GetBenchmarkState(state);
+  s.SkipWithError(message);
+}
+
+void iree_benchmark_pause_timing(iree_benchmark_state_t* state) {
+  auto& s = GetBenchmarkState(state);
+  s.PauseTiming();
+}
+
+void iree_benchmark_resume_timing(iree_benchmark_state_t* state) {
+  auto& s = GetBenchmarkState(state);
+  s.ResumeTiming();
+}
+
+void iree_benchmark_set_label(iree_benchmark_state_t* state,
+                              const char* label) {
+  auto& s = GetBenchmarkState(state);
+  s.SetLabel(label);
+}
+
+void iree_benchmark_set_bytes_processed(iree_benchmark_state_t* state,
+                                        int64_t bytes) {
+  auto& s = GetBenchmarkState(state);
+  s.SetBytesProcessed(bytes);
+}
+
+void iree_benchmark_set_items_processed(iree_benchmark_state_t* state,
+                                        int64_t items) {
+  auto& s = GetBenchmarkState(state);
+  s.SetItemsProcessed(items);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_benchmark_def_t
+//===----------------------------------------------------------------------===//
+
+static std::string StatusToString(iree_status_t status) {
+  if (iree_status_is_ok(status)) {
+    return "OK";
+  }
+  iree_host_size_t buffer_length = 0;
+  if (IREE_UNLIKELY(!iree_status_format(status, /*buffer_capacity=*/0,
+                                        /*buffer=*/NULL, &buffer_length))) {
+    return "<!>";
+  }
+  std::string result(buffer_length, '\0');
+  if (IREE_UNLIKELY(!iree_status_format(status, result.size() + 1,
+                                        const_cast<char*>(result.data()),
+                                        &buffer_length))) {
+    return "<!>";
+  }
+  return result;
+}
+
+static void iree_benchmark_run(const char* benchmark_name,
+                               const iree_benchmark_def_t* benchmark_def,
+                               benchmark::State& benchmark_state) {
+  IREE_TRACE_SCOPE_DYNAMIC(benchmark_name);
+  IREE_TRACE_FRAME_MARK();
+
+  iree_benchmark_state_t state;
+  memset(&state, 0, sizeof(state));
+  state.impl = &benchmark_state;
+  state.host_allocator = iree_allocator_system();
+
+  iree_status_t status = benchmark_def->run(benchmark_def, &state);
+  if (!iree_status_is_ok(status)) {
+    auto status_str = StatusToString(status);
+    iree_status_ignore(status);
+    benchmark_state.SkipWithError(status_str.c_str());
+  }
+}
+
+void iree_benchmark_register(iree_string_view_t name,
+                             const iree_benchmark_def_t* benchmark_def) {
+  std::string name_str(name.data, name.size);
+  std::string prefixed_str = "BM_" + name_str;
+  iree_benchmark_def_t cloned_def = *benchmark_def;
+  auto* instance = benchmark::RegisterBenchmark(
+      prefixed_str.c_str(),
+      [name_str, cloned_def](benchmark::State& state) -> void {
+        iree_benchmark_run(name_str.c_str(), &cloned_def, state);
+      });
+
+  if (iree_all_bits_set(benchmark_def->flags,
+                        IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME)) {
+    instance->MeasureProcessCPUTime();
+  }
+  if (iree_all_bits_set(benchmark_def->flags,
+                        IREE_BENCHMARK_FLAG_USE_REAL_TIME)) {
+    instance->UseRealTime();
+  }
+  if (iree_all_bits_set(benchmark_def->flags,
+                        IREE_BENCHMARK_FLAG_USE_MANUAL_TIME)) {
+    instance->UseManualTime();
+  }
+
+  if (benchmark_def->minimum_duration_ns != 0) {
+    instance->MinTime((double)benchmark_def->minimum_duration_ns / 1e9);
+  } else if (benchmark_def->iteration_count != 0) {
+    instance->Iterations(benchmark_def->iteration_count);
+  }
+
+  switch (benchmark_def->time_unit) {
+    default:
+    case IREE_BENCHMARK_UNIT_MILLISECOND:
+      instance->Unit(benchmark::kMillisecond);
+      break;
+    case IREE_BENCHMARK_UNIT_MICROSECOND:
+      instance->Unit(benchmark::kMicrosecond);
+      break;
+    case IREE_BENCHMARK_UNIT_NANOSECOND:
+      instance->Unit(benchmark::kNanosecond);
+      break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Benchmark infra management
+//===----------------------------------------------------------------------===//
+
+void iree_benchmark_initialize(int* argc, char** argv) {
+  benchmark::Initialize(argc, argv);
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  // clang-format off
+  fprintf(stderr,
+"\x1b[31m"
+"===----------------------------------------------------------------------===\n"
+"\n"
+"         ██     ██  █████  ██████  ███    ██ ██ ███    ██  ██████\n"
+"         ██     ██ ██   ██ ██   ██ ████   ██ ██ ████   ██ ██\n"
+"         ██  █  ██ ███████ ██████  ██ ██  ██ ██ ██ ██  ██ ██   ███\n"
+"         ██ ███ ██ ██   ██ ██   ██ ██  ██ ██ ██ ██  ██ ██ ██    ██\n"
+"          ███ ███  ██   ██ ██   ██ ██   ████ ██ ██   ████  ██████\n"
+"\n"
+"===----------------------------------------------------------------------===\n"
+"\n"
+"Tracing is enabled and will skew your results!\n"
+"The timings involved here can be an order of magnitude off due to the tracing\n"
+"time sampling, recording, and instrumentation overhead. Disable tracing with\n"
+"IREE_ENABLE_RUNTIME_TRACING=OFF and rebuild.\n"
+"\x1b[0m"
+"\n"
+  );
+  fflush(stderr);
+  // clang-format on
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+}
+
+void iree_benchmark_run_specified(void) { benchmark::RunSpecifiedBenchmarks(); }
diff --git a/runtime/src/iree/testing/benchmark_main.c b/runtime/src/iree/testing/benchmark_main.c
new file mode 100644
index 0000000..860f4a6
--- /dev/null
+++ b/runtime/src/iree/testing/benchmark_main.c
@@ -0,0 +1,18 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/flags.h"
+#include "iree/testing/benchmark.h"
+
+int main(int argc, char** argv) {
+  // Pass through flags to benchmark (allowing --help to fall through).
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK |
+                               IREE_FLAGS_PARSE_MODE_CONTINUE_AFTER_HELP,
+                           &argc, &argv);
+  iree_benchmark_initialize(&argc, argv);
+  iree_benchmark_run_specified();
+  return 0;
+}
diff --git a/runtime/src/iree/testing/benchmark_nop.c b/runtime/src/iree/testing/benchmark_nop.c
new file mode 100644
index 0000000..65272e7
--- /dev/null
+++ b/runtime/src/iree/testing/benchmark_nop.c
@@ -0,0 +1,41 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/testing/benchmark.h"
+
+int64_t iree_benchmark_get_range(iree_benchmark_state_t* state,
+                                 iree_host_size_t ordinal) {
+  return 0;
+}
+
+bool iree_benchmark_keep_running(iree_benchmark_state_t* state,
+                                 uint64_t batch_count) {
+  return false;
+}
+
+void iree_benchmark_skip(iree_benchmark_state_t* state, const char* message) {}
+
+void iree_benchmark_pause_timing(iree_benchmark_state_t* state) {}
+
+void iree_benchmark_resume_timing(iree_benchmark_state_t* state) {}
+
+void iree_benchmark_set_label(iree_benchmark_state_t* state,
+                              const char* label) {}
+
+void iree_benchmark_set_bytes_processed(iree_benchmark_state_t* state,
+                                        int64_t bytes) {}
+
+void iree_benchmark_set_items_processed(iree_benchmark_state_t* state,
+                                        int64_t items) {}
+
+void iree_benchmark_register(iree_string_view_t name,
+                             const iree_benchmark_def_t* benchmark_def) {}
+
+void iree_benchmark_initialize(int* argc, char** argv) {}
+
+void iree_benchmark_run_specified(void) {}
diff --git a/runtime/src/iree/testing/gtest.h b/runtime/src/iree/testing/gtest.h
new file mode 100644
index 0000000..fbd6dc4
--- /dev/null
+++ b/runtime/src/iree/testing/gtest.h
@@ -0,0 +1,17 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TESTING_GTEST_H_
+#define IREE_TESTING_GTEST_H_
+
+#include "gmock/gmock-matchers.h"   // IWYU pragma: export
+#include "gmock/gmock.h"            // IWYU pragma: export
+#include "gtest/gtest-message.h"    // IWYU pragma: export
+#include "gtest/gtest-spi.h"        // IWYU pragma: export
+#include "gtest/gtest-test-part.h"  // IWYU pragma: export
+#include "gtest/gtest.h"            // IWYU pragma: export
+
+#endif  // IREE_TESTING_GTEST_H_
diff --git a/runtime/src/iree/testing/gtest_main.cc b/runtime/src/iree/testing/gtest_main.cc
new file mode 100644
index 0000000..801aac8
--- /dev/null
+++ b/runtime/src/iree/testing/gtest_main.cc
@@ -0,0 +1,18 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/internal/flags.h"
+#include "iree/testing/gtest.h"
+
+extern "C" int main(int argc, char** argv) {
+  // Pass through flags to gtest (allowing --help to fall through).
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK |
+                               IREE_FLAGS_PARSE_MODE_CONTINUE_AFTER_HELP,
+                           &argc, &argv);
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/runtime/src/iree/testing/status_matchers.h b/runtime/src/iree/testing/status_matchers.h
new file mode 100644
index 0000000..1697e4c
--- /dev/null
+++ b/runtime/src/iree/testing/status_matchers.h
@@ -0,0 +1,369 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TESTING_STATUS_MATCHERS_H_
+#define IREE_TESTING_STATUS_MATCHERS_H_
+
+#include <memory>
+#include <string>
+
+#include "iree/base/status_cc.h"  // IWYU pragma: export
+#include "iree/testing/gtest.h"
+
+namespace iree {
+
+namespace internal {
+
+// Implements a gMock matcher that checks that an iree::StatusOr<T> has an OK
+// status and that the contained T value matches another matcher.
+template <typename T>
+class IsOkAndHoldsMatcher
+    : public ::testing::MatcherInterface<const StatusOr<T> &> {
+ public:
+  template <typename MatcherT>
+  IsOkAndHoldsMatcher(MatcherT &&value_matcher)
+      : value_matcher_(::testing::SafeMatcherCast<const T &>(value_matcher)) {}
+
+  // From testing::MatcherInterface.
+  void DescribeTo(std::ostream *os) const override {
+    *os << "is OK and contains a value that ";
+    value_matcher_.DescribeTo(os);
+  }
+
+  // From testing::MatcherInterface.
+  void DescribeNegationTo(std::ostream *os) const override {
+    *os << "is not OK or contains a value that ";
+    value_matcher_.DescribeNegationTo(os);
+  }
+
+  // From testing::MatcherInterface.
+  bool MatchAndExplain(
+      const StatusOr<T> &status_or,
+      ::testing::MatchResultListener *listener) const override {
+    if (!status_or.ok()) {
+      *listener << "which is not OK";
+      return false;
+    }
+
+    ::testing::StringMatchResultListener value_listener;
+    bool is_a_match =
+        value_matcher_.MatchAndExplain(status_or.value(), &value_listener);
+    std::string value_explanation = value_listener.str();
+    if (!value_explanation.empty()) {
+      *listener << "which contains a value " << value_explanation;
+    }
+
+    return is_a_match;
+  }
+
+ private:
+  const ::testing::Matcher<const T &> value_matcher_;
+};
+
+// A polymorphic IsOkAndHolds() matcher.
+//
+// IsOkAndHolds() returns a matcher that can be used to process an IsOkAndHolds
+// expectation. However, the value type T is not provided when IsOkAndHolds() is
+// invoked. The value type is only inferable when the gUnit framework invokes
+// the matcher with a value. Consequently, the IsOkAndHolds() function must
+// return an object that is implicitly convertible to a matcher for StatusOr<T>.
+// gUnit refers to such an object as a polymorphic matcher, since it can be used
+// to match with more than one type of value.
+template <typename ValueMatcherT>
+class IsOkAndHoldsGenerator {
+ public:
+  explicit IsOkAndHoldsGenerator(ValueMatcherT value_matcher)
+      : value_matcher_(std::move(value_matcher)) {}
+
+  template <typename T>
+  operator ::testing::Matcher<const StatusOr<T> &>() const {
+    return ::testing::MakeMatcher(new IsOkAndHoldsMatcher<T>(value_matcher_));
+  }
+
+ private:
+  const ValueMatcherT value_matcher_;
+};
+
+// Implements a gMock matcher for checking error-code expectations on
+// iree::Status and iree::StatusOr objects.
+template <typename Enum, typename Matchee>
+class StatusMatcher : public ::testing::MatcherInterface<Matchee> {
+ public:
+  StatusMatcher(Enum code, std::string message)
+      : code_(code), message_(std::move(message)) {}
+
+  // From testing::MatcherInterface.
+  //
+  // Describes the expected error code.
+  void DescribeTo(std::ostream *os) const override {
+    *os << "error code " << StatusCodeToString(code_);
+    if (!message_.empty()) {
+      *os << "::'" << message_ << "'";
+    }
+  }
+
+  // From testing::MatcherInterface.
+  //
+  // Tests whether |matchee| has an error code that meets this matcher's
+  // expectation. If an error message string is specified in this matcher, it
+  // also tests that |matchee| has an error message that matches that
+  // expectation.
+  bool MatchAndExplain(
+      Matchee &matchee,
+      ::testing::MatchResultListener *listener) const override {
+    if (GetCode(matchee) != code_) {
+      *listener << "whose error code is "
+                << StatusCodeToString(GetCode(matchee)) << ": "
+                << GetMessage(matchee);
+      return false;
+    }
+    if (!message_.empty() && GetMessage(matchee) != message_) {
+      *listener << "whose error message is '" << GetMessage(matchee) << "'";
+      return false;
+    }
+    return true;
+  }
+
+ private:
+  template <typename T>
+  StatusCode GetCode(const T &matchee) const {
+    return GetCode(matchee.status());
+  }
+
+  StatusCode GetCode(const iree_status_code_t &status_code) const {
+    return static_cast<StatusCode>(status_code);
+  }
+
+  StatusCode GetCode(const iree_status_t &status) const {
+    return static_cast<StatusCode>(iree_status_code(status));
+  }
+
+  StatusCode GetCode(const Status &status) const { return status.code(); }
+
+  template <typename T>
+  std::string GetMessage(const T &matchee) const {
+    return GetMessage(matchee.status());
+  }
+
+  std::string GetMessage(const iree_status_t &status) const {
+    return Status::ToString(status);
+  }
+
+  std::string GetMessage(const Status &status) const {
+    return status.ToString();
+  }
+
+  // Expected error code.
+  const Enum code_;
+
+  // Expected error message (empty if none expected and verified).
+  const std::string message_;
+};
+
+// StatusMatcherGenerator is an intermediate object returned by
+// iree::testing::status::StatusIs().
+// It implements implicit type-cast operators to supported matcher types:
+// Matcher<const Status &> and Matcher<const StatusOr<T> &>. These typecast
+// operators create gMock matchers that test OK expectations on a status
+// container.
+template <typename Enum>
+class StatusIsMatcherGenerator {
+ public:
+  StatusIsMatcherGenerator(Enum code, std::string message)
+      : code_(code), message_(std::move(message)) {}
+
+  operator ::testing::Matcher<const StatusCode &>() const {
+    return ::testing::MakeMatcher(
+        new internal::StatusMatcher<Enum, const StatusCode &>(code_, message_));
+  }
+
+  operator ::testing::Matcher<const iree_status_t &>() const {
+    return ::testing::MakeMatcher(
+        new internal::StatusMatcher<Enum, const iree_status_t &>(code_,
+                                                                 message_));
+  }
+
+  operator ::testing::Matcher<const Status &>() const {
+    return ::testing::MakeMatcher(
+        new internal::StatusMatcher<Enum, const Status &>(code_, message_));
+  }
+
+  template <class T>
+  operator ::testing::Matcher<const StatusOr<T> &>() const {
+    return ::testing::MakeMatcher(
+        new internal::StatusMatcher<Enum, const StatusOr<T> &>(code_,
+                                                               message_));
+  }
+
+ private:
+  // Expected error code.
+  const Enum code_;
+
+  // Expected error message (empty if none expected and verified).
+  const std::string message_;
+};
+
+// Implements a gMock matcher that checks whether a status container (e.g.
+// iree::Status or iree::StatusOr<T>) has an OK status.
+template <class T>
+class IsOkMatcherImpl : public ::testing::MatcherInterface<T> {
+ public:
+  IsOkMatcherImpl() = default;
+
+  // From testing::MatcherInterface.
+  //
+  // Describes the OK expectation.
+  void DescribeTo(std::ostream *os) const override { *os << "is OK"; }
+
+  // From testing::MatcherInterface.
+  //
+  // Describes the negative OK expectation.
+  void DescribeNegationTo(std::ostream *os) const override {
+    *os << "is not OK";
+  }
+
+  // From testing::MatcherInterface.
+  //
+  // Tests whether |status_container|'s OK value meets this matcher's
+  // expectation.
+  bool MatchAndExplain(
+      const T &status_container,
+      ::testing::MatchResultListener *listener) const override {
+    if (!::iree::IsOk(status_container)) {
+      *listener << "which is not OK";
+      return false;
+    }
+    return true;
+  }
+};
+
+// IsOkMatcherGenerator is an intermediate object returned by
+// iree::testing::status::IsOk().
+// It implements implicit type-cast operators to supported matcher types:
+// Matcher<const Status &> and Matcher<const StatusOr<T> &>. These typecast
+// operators create gMock matchers that test OK expectations on a status
+// container.
+class IsOkMatcherGenerator {
+ public:
+  operator ::testing::Matcher<const iree_status_t &>() const {
+    return ::testing::MakeMatcher(
+        new internal::IsOkMatcherImpl<const iree_status_t &>());
+  }
+
+  operator ::testing::Matcher<const Status &>() const {
+    return ::testing::MakeMatcher(
+        new internal::IsOkMatcherImpl<const Status &>());
+  }
+
+  template <class T>
+  operator ::testing::Matcher<const StatusOr<T> &>() const {
+    return ::testing::MakeMatcher(
+        new internal::IsOkMatcherImpl<const StatusOr<T> &>());
+  }
+};
+
+}  // namespace internal
+
+namespace testing {
+namespace status {
+
+// Returns a gMock matcher that expects an iree::StatusOr<T> object to have an
+// OK status and for the contained T object to match |value_matcher|.
+//
+// Example:
+//
+//     StatusOr<string> raven_speech_result = raven.Speak();
+//     EXPECT_THAT(raven_speech_result, IsOkAndHolds(HasSubstr("nevermore")));
+//
+// If foo is an object of type T and foo_result is an object of type
+// StatusOr<T>, you can write:
+//
+//     EXPECT_THAT(foo_result, IsOkAndHolds(foo));
+//
+// instead of:
+//
+//     EXPECT_THAT(foo_result, IsOkAndHolds(Eq(foo)));
+template <typename ValueMatcherT>
+internal::IsOkAndHoldsGenerator<ValueMatcherT> IsOkAndHolds(
+    ValueMatcherT value_matcher) {
+  return internal::IsOkAndHoldsGenerator<ValueMatcherT>(value_matcher);
+}
+
+// Returns a gMock matcher that expects an iree::Status object to have the
+// given |code|.
+template <typename Enum>
+internal::StatusIsMatcherGenerator<Enum> StatusIs(Enum code) {
+  return internal::StatusIsMatcherGenerator<Enum>(code, "");
+}
+
+// Returns a gMock matcher that expects an iree::Status object to have the
+// given |code| and |message|.
+template <typename Enum>
+internal::StatusIsMatcherGenerator<Enum> StatusIs(Enum code,
+                                                  std::string message) {
+  return internal::StatusIsMatcherGenerator<Enum>(code, std::move(message));
+}
+
+// Returns an internal::IsOkMatcherGenerator, which may be typecast to a
+// Matcher<iree::Status> or Matcher<iree::StatusOr<T>>. These gMock
+// matchers test that a given status container has an OK status.
+inline internal::IsOkMatcherGenerator IsOk() {
+  return internal::IsOkMatcherGenerator();
+}
+
+}  // namespace status
+}  // namespace testing
+
+// Macros for testing the results of functions that return iree::Status or
+// iree::StatusOr<T> (for any type T).
+#define IREE_EXPECT_OK(rexpr) \
+  EXPECT_THAT(rexpr, ::iree::testing::status::StatusIs(::iree::StatusCode::kOk))
+#define IREE_ASSERT_OK(rexpr) \
+  ASSERT_THAT(rexpr, ::iree::testing::status::StatusIs(::iree::StatusCode::kOk))
+#define IREE_EXPECT_STATUS_IS(expected_code, expr)     \
+  EXPECT_THAT(expr, ::iree::testing::status::StatusIs( \
+                        static_cast<::iree::StatusCode>(expected_code)))
+
+// Executes an expression that returns an iree::StatusOr<T>, and assigns the
+// contained variable to lhs if the error code is OK.
+// If the Status is non-OK, generates a test failure and returns from the
+// current function, which must have a void return type.
+//
+// Example: Assigning to an existing value
+//   IREE_ASSERT_OK_AND_ASSIGN(ValueType value, MaybeGetValue(arg));
+//
+// The value assignment example might expand into:
+//   StatusOr<ValueType> status_or_value = MaybeGetValue(arg);
+//   IREE_ASSERT_OK(status_or_value.status());
+//   ValueType value = status_or_value.value();
+#define IREE_ASSERT_OK_AND_ASSIGN(lhs, rexpr)                             \
+  IREE_ASSERT_OK_AND_ASSIGN_IMPL(                                         \
+      IREE_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, \
+      rexpr);
+
+#define IREE_ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr) \
+  auto statusor = (rexpr);                                   \
+  IREE_ASSERT_OK(statusor.status());                         \
+  lhs = std::move(statusor.value())
+#define IREE_STATUS_MACROS_CONCAT_NAME(x, y) \
+  IREE_STATUS_MACROS_CONCAT_IMPL(x, y)
+#define IREE_STATUS_MACROS_CONCAT_IMPL(x, y) x##y
+
+// Implements the PrintTo() method for iree::StatusOr<T>. This method is
+// used by gUnit to print iree::StatusOr<T> objects for debugging. The
+// implementation relies on gUnit for printing values of T when a
+// iree::StatusOr<T> object is OK and contains a value.
+template <typename T>
+void PrintTo(const StatusOr<T> &statusor, std::ostream *os) {
+  if (!statusor.ok()) {
+    *os << statusor.status();
+  } else {
+    *os << "OK: " << ::testing::PrintToString(statusor.value());
+  }
+}
+
+}  // namespace iree
+
+#endif  // IREE_TESTING_STATUS_MATCHERS_H_
diff --git a/runtime/src/iree/testing/vulkan/CMakeLists.txt b/runtime/src/iree/testing/vulkan/CMakeLists.txt
new file mode 100644
index 0000000..1dd197f
--- /dev/null
+++ b/runtime/src/iree/testing/vulkan/CMakeLists.txt
@@ -0,0 +1,70 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+if(NOT "${IREE_HAL_DRIVER_VULKAN}" OR NOT "${IREE_BUILD_SAMPLES}")
+  return()
+endif()
+
+# This target statically links against Vulkan.
+# One way to achieve this is by installing the Vulkan SDK from
+# https://vulkan.lunarg.com/.
+# If Vulkan is not found, omit the target.
+include(FindVulkan)
+if(NOT Vulkan_FOUND)
+  message(VERBOSE "Could not find Vulkan, disabling Vulkan GUI programs")
+  return()
+endif()
+
+# vcpkg install imgui[vulkan-binding,sdl2-binding]
+find_package(imgui CONFIG QUIET)
+if(NOT imgui_FOUND)
+  message(VERBOSE "Could not find Dear ImGui, disabling Vulkan GUI programs")
+  return()
+endif()
+
+# vcpkg install sdl2[vulkan]
+find_package(SDL2 CONFIG QUIET)
+if(NOT SDL2_FOUND)
+  message(VERBOSE "Could not find SDL2, disabling Vulkan GUI programs")
+  return()
+endif()
+
+iree_cc_library(
+  NAME
+    vulkan_gui_util
+  HDRS
+    "vulkan_gui_util.h"
+  SRCS
+    "vulkan_gui_util.cc"
+  DEPS
+    imgui::imgui
+    iree::base
+    iree::base::logging
+    iree::hal::vulkan
+    SDL2::SDL2
+    Vulkan::Vulkan
+)
+
+iree_cc_binary(
+  NAME
+    iree-run-module-vulkan-gui
+  SRCS
+    "iree-run-module-vulkan-gui-main.cc"
+  DEPS
+    ::vulkan_gui_util
+    iree::base::cc
+    iree::base::internal::file_io
+    iree::base::internal::flags
+    iree::base::internal::main
+    iree::base::tracing
+    iree::hal::vulkan::registration
+    iree::modules::hal
+    iree::tools::utils::vm_util
+    iree::vm
+    iree::vm::bytecode_module
+  LINKOPTS
+    "${IREE_TARGET_GUI_LINKOPTS}"
+)
diff --git a/runtime/src/iree/testing/vulkan/iree-run-module-vulkan-gui-main.cc b/runtime/src/iree/testing/vulkan/iree-run-module-vulkan-gui-main.cc
new file mode 100644
index 0000000..bfb8816
--- /dev/null
+++ b/runtime/src/iree/testing/vulkan/iree-run-module-vulkan-gui-main.cc
@@ -0,0 +1,436 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Vulkan GUI utility functions
+// Order matters here: we need to pull in this first to make sure Vulkan API
+// prototypes are defined so that we can statically link against them.
+#include "iree/testing/vulkan/vulkan_gui_util.h"
+
+// Other dependencies (helpers, etc.)
+#include "iree/base/internal/file_io.h"
+#include "iree/base/internal/flags.h"
+#include "iree/base/internal/main.h"
+#include "iree/base/status_cc.h"
+#include "iree/hal/vulkan/registration/driver_module.h"
+#include "iree/modules/hal/module.h"
+#include "iree/tools/utils/vm_util.h"
+#include "iree/vm/api.h"
+#include "iree/vm/bytecode_module.h"
+
+// Flags selecting the module to load and the entry function to invoke.
+IREE_FLAG(string, module_file, "-",
+          "File containing the module to load that contains the entry "
+          "function. Defaults to stdin.");
+
+IREE_FLAG(string, entry_function, "",
+          "Name of a function contained in the module specified by input_file "
+          "to run.");
+
+// Flag parse callback: appends each --function_input= occurrence, verbatim,
+// to the std::vector<std::string> pointed to by |storage|.
+static iree_status_t parse_function_input(iree_string_view_t flag_name,
+                                          void* storage,
+                                          iree_string_view_t value) {
+  auto* inputs = static_cast<std::vector<std::string>*>(storage);
+  inputs->emplace_back(value.data, value.size);
+  return iree_ok_status();
+}
+// Flag print callback: echoes the accumulated --function_input values in a
+// form that could be passed back on the command line.
+static void print_function_input(iree_string_view_t flag_name, void* storage,
+                                 FILE* file) {
+  auto* inputs = static_cast<std::vector<std::string>*>(storage);
+  if (inputs->empty()) {
+    fprintf(file, "# --%.*s=\n", (int)flag_name.size, flag_name.data);
+    return;
+  }
+  for (const std::string& input : *inputs) {
+    fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data,
+            input.c_str());
+  }
+}
+// Accumulated --function_input= values, in the order they appeared on the
+// command line; parsed/printed by the callbacks above.
+static std::vector<std::string> FLAG_function_inputs;
+IREE_FLAG_CALLBACK(
+    parse_function_input, print_function_input, &FLAG_function_inputs,
+    function_input,
+    "An input value or buffer of the format:\n"
+    "  [shape]xtype=[value]\n"
+    "  2x2xi32=1 2 3 4\n"
+    "Optionally, brackets may be used to separate the element values:\n"
+    "  2x2xi32=[[1 2][3 4]]\n"
+    "Each occurrence of the flag indicates an input in the order they were\n"
+    "specified on the command line.");
+
+// Vulkan state shared between setup, the ImGui render loop, and IREE device
+// wrapping below. Populated by SetupVulkan() in iree_main().
+static VkAllocationCallbacks* g_Allocator = NULL;
+static VkInstance g_Instance = VK_NULL_HANDLE;
+static VkPhysicalDevice g_PhysicalDevice = VK_NULL_HANDLE;
+static VkDevice g_Device = VK_NULL_HANDLE;
+static uint32_t g_QueueFamily = (uint32_t)-1;  // -1 == not yet selected.
+static VkQueue g_Queue = VK_NULL_HANDLE;
+static VkPipelineCache g_PipelineCache = VK_NULL_HANDLE;
+static VkDescriptorPool g_DescriptorPool = VK_NULL_HANDLE;
+
+// Main window swapchain state plus pending-resize bookkeeping that is
+// consumed at the top of each render-loop iteration.
+static ImGui_ImplVulkanH_Window g_MainWindowData;
+static uint32_t g_MinImageCount = 2;
+static bool g_SwapChainRebuild = false;
+static int g_SwapChainResizeWidth = 0;
+static int g_SwapChainResizeHeight = 0;
+
+namespace iree {
+namespace {
+
+// Aborts the process (via IREE_LOG(FATAL)) on any non-success VkResult.
+void check_vk_result(VkResult err) {
+  if (err == 0) return;
+  IREE_LOG(FATAL) << "VkResult: " << err;
+}
+
+// Destroys the descriptor pool, device, and instance created by SetupVulkan(),
+// in reverse creation order.
+void CleanupVulkan() {
+  vkDestroyDescriptorPool(g_Device, g_DescriptorPool, g_Allocator);
+
+  vkDestroyDevice(g_Device, g_Allocator);
+  vkDestroyInstance(g_Instance, g_Allocator);
+}
+
+// Tears down the swapchain/framebuffer state held in g_MainWindowData.
+void CleanupVulkanWindow() {
+  ImGui_ImplVulkanH_DestroyWindow(g_Instance, g_Device, &g_MainWindowData,
+                                  g_Allocator);
+}
+
+// Reads the bytecode module selected by --module_file into |out_contents|.
+// The default value "-" reads the whole module from stdin instead.
+iree_status_t GetModuleContentsFromFlags(iree_file_contents_t** out_contents) {
+  IREE_TRACE_SCOPE0("GetModuleContentsFromFlags");
+  const std::string module_file = FLAG_module_file;
+  if (module_file == "-") {
+    return iree_stdin_read_contents(iree_allocator_system(), out_contents);
+  }
+  return iree_file_read_contents(module_file.c_str(), iree_allocator_system(),
+                                 out_contents);
+}
+
+// Runs the current IREE bytecode module and renders its result to a window
+// using ImGui.
+//
+// Invokes |function| (displayed as |function_name|) on |context| with
+// |function_inputs|, formats all outputs into text, and draws them — along
+// with a framerate counter — into an auto-resizing ImGui window titled
+// |window_title|. Returns a failing Status if list creation, invocation, or
+// output printing fails.
+//
+// NOTE(review): |device| is currently unused by this function.
+Status RunModuleAndUpdateImGuiWindow(
+    iree_hal_device_t* device, iree_vm_context_t* context,
+    iree_vm_function_t function, const std::string& function_name,
+    const vm::ref<iree_vm_list_t>& function_inputs,
+    const std::string& window_title) {
+  // Fresh outputs list for this frame's invocation.
+  vm::ref<iree_vm_list_t> outputs;
+  IREE_RETURN_IF_ERROR(iree_vm_list_create(/*element_type=*/nullptr, 16,
+                                           iree_allocator_system(), &outputs));
+
+  IREE_LOG(INFO) << "EXEC @" << function_name;
+  IREE_RETURN_IF_ERROR(iree_vm_invoke(
+      context, function, IREE_VM_INVOCATION_FLAG_NONE, /*policy=*/nullptr,
+      function_inputs.get(), outputs.get(), iree_allocator_system()));
+
+  // Format all outputs into one string for display.
+  std::ostringstream oss;
+  IREE_RETURN_IF_ERROR(PrintVariantList(outputs.get(), &oss));
+
+  // The formatted text is all we need; release the outputs before drawing.
+  outputs.reset();
+
+  ImGui::Begin(window_title.c_str(), /*p_open=*/nullptr,
+               ImGuiWindowFlags_AlwaysAutoResize);
+
+  ImGui::Text("Entry function:");
+  ImGui::Text("%s", function_name.c_str());
+  ImGui::Separator();
+
+  ImGui::Text("Invocation result:");
+  ImGui::Text("%s", oss.str().c_str());
+  ImGui::Separator();
+
+  // Framerate counter.
+  ImGui::Text("Application average %.3f ms/frame (%.1f FPS)",
+              1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate);
+
+  ImGui::End();
+  return OkStatus();
+}
+}  // namespace
+
+// Application entry point (invoked by iree/base/internal/main).
+//
+// Creates an SDL window and a Vulkan instance/device that are shared between
+// Dear ImGui rendering and the IREE Vulkan HAL, loads the bytecode module
+// selected by --module_file, then repeatedly invokes --entry_function once
+// per frame and displays the results until the window is closed.
+extern "C" int iree_main(int argc, char** argv) {
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
+  IREE_CHECK_OK(iree_hal_vulkan_driver_module_register(
+      iree_hal_driver_registry_default()));
+
+  // --------------------------------------------------------------------------
+  // Create a window.
+  if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) != 0) {
+    IREE_LOG(FATAL) << "Failed to initialize SDL";
+    return 1;
+  }
+
+  // Setup window
+  SDL_WindowFlags window_flags = (SDL_WindowFlags)(  //
+      SDL_WINDOW_VULKAN | SDL_WINDOW_RESIZABLE | SDL_WINDOW_ALLOW_HIGHDPI);
+  SDL_Window* window = SDL_CreateWindow(
+      "IREE Samples - Vulkan Inference GUI", SDL_WINDOWPOS_CENTERED,
+      SDL_WINDOWPOS_CENTERED, 1280, 720, window_flags);
+  if (!window) {
+    IREE_LOG(FATAL) << "Failed to create SDL window";
+    return 1;
+  }
+
+  // Setup Vulkan
+  iree_hal_vulkan_features_t iree_vulkan_features =
+      static_cast<iree_hal_vulkan_features_t>(
+          IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS |
+          IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
+  std::vector<const char*> layers = GetInstanceLayers(iree_vulkan_features);
+  std::vector<const char*> extensions =
+      GetInstanceExtensions(window, iree_vulkan_features);
+  SetupVulkan(iree_vulkan_features, layers.data(), layers.size(),
+              extensions.data(), extensions.size(), g_Allocator, &g_Instance,
+              &g_QueueFamily, &g_PhysicalDevice, &g_Queue, &g_Device,
+              &g_DescriptorPool);
+
+  // Create Window Surface
+  VkSurfaceKHR surface;
+  VkResult err;
+  if (SDL_Vulkan_CreateSurface(window, g_Instance, &surface) == 0) {
+    printf("Failed to create Vulkan surface.\n");
+    return 1;
+  }
+
+  // Create Framebuffers
+  int w, h;
+  SDL_GetWindowSize(window, &w, &h);
+  ImGui_ImplVulkanH_Window* wd = &g_MainWindowData;
+  SetupVulkanWindow(wd, g_Allocator, g_Instance, g_QueueFamily,
+                    g_PhysicalDevice, g_Device, surface, w, h, g_MinImageCount);
+
+  // Setup Dear ImGui context
+  IMGUI_CHECKVERSION();
+  ImGui::CreateContext();
+  ImGuiIO& io = ImGui::GetIO();
+  (void)io;
+
+  ImGui::StyleColorsDark();
+
+  // Setup Platform/Renderer bindings
+  ImGui_ImplSDL2_InitForVulkan(window);
+  ImGui_ImplVulkan_InitInfo init_info = {};
+  init_info.Instance = g_Instance;
+  init_info.PhysicalDevice = g_PhysicalDevice;
+  init_info.Device = g_Device;
+  init_info.QueueFamily = g_QueueFamily;
+  init_info.Queue = g_Queue;
+  init_info.PipelineCache = g_PipelineCache;
+  init_info.DescriptorPool = g_DescriptorPool;
+  init_info.Allocator = g_Allocator;
+  init_info.MinImageCount = g_MinImageCount;
+  init_info.ImageCount = wd->ImageCount;
+  init_info.CheckVkResultFn = check_vk_result;
+  ImGui_ImplVulkan_Init(&init_info, wd->RenderPass);
+
+  // Upload Fonts
+  {
+    // Use any command queue
+    VkCommandPool command_pool = wd->Frames[wd->FrameIndex].CommandPool;
+    VkCommandBuffer command_buffer = wd->Frames[wd->FrameIndex].CommandBuffer;
+
+    err = vkResetCommandPool(g_Device, command_pool, 0);
+    check_vk_result(err);
+    VkCommandBufferBeginInfo begin_info = {};
+    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    begin_info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+    err = vkBeginCommandBuffer(command_buffer, &begin_info);
+    check_vk_result(err);
+
+    ImGui_ImplVulkan_CreateFontsTexture(command_buffer);
+
+    VkSubmitInfo end_info = {};
+    end_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    end_info.commandBufferCount = 1;
+    end_info.pCommandBuffers = &command_buffer;
+    err = vkEndCommandBuffer(command_buffer);
+    check_vk_result(err);
+    err = vkQueueSubmit(g_Queue, 1, &end_info, VK_NULL_HANDLE);
+    check_vk_result(err);
+
+    err = vkDeviceWaitIdle(g_Device);
+    check_vk_result(err);
+    ImGui_ImplVulkan_DestroyFontUploadObjects();
+  }
+  // --------------------------------------------------------------------------
+
+  // --------------------------------------------------------------------------
+  // Setup IREE.
+
+  // Check API version.
+  iree_api_version_t actual_version;
+  iree_status_t status =
+      iree_api_version_check(IREE_API_VERSION_LATEST, &actual_version);
+  if (iree_status_is_ok(status)) {
+    IREE_LOG(INFO) << "IREE runtime API version " << actual_version;
+  } else {
+    IREE_LOG(FATAL) << "Unsupported runtime API version " << actual_version;
+  }
+
+  // Register HAL module types.
+  IREE_CHECK_OK(iree_hal_module_register_types());
+
+  // Create a runtime Instance.
+  iree_vm_instance_t* iree_instance = nullptr;
+  IREE_CHECK_OK(
+      iree_vm_instance_create(iree_allocator_system(), &iree_instance));
+
+  // Create IREE Vulkan Driver and Device, sharing our VkInstance/VkDevice.
+  IREE_LOG(INFO) << "Creating Vulkan driver/device";
+  // Load symbols from our static `vkGetInstanceProcAddr` for IREE to use.
+  iree_hal_vulkan_syms_t* iree_vk_syms = nullptr;
+  IREE_CHECK_OK(iree_hal_vulkan_syms_create(
+      reinterpret_cast<void*>(&vkGetInstanceProcAddr), iree_allocator_system(),
+      &iree_vk_syms));
+  // Create the driver sharing our VkInstance.
+  iree_hal_driver_t* iree_vk_driver = nullptr;
+  iree_string_view_t driver_identifier = iree_make_cstring_view("vulkan");
+  // NOTE(review): only api_version and requested_features are assigned below;
+  // driver_options.device_options (used for iree_hal_vulkan_wrap_device)
+  // remains uninitialized — confirm whether an explicit initializer is needed.
+  iree_hal_vulkan_driver_options_t driver_options;
+  driver_options.api_version = VK_API_VERSION_1_2;
+  driver_options.requested_features = static_cast<iree_hal_vulkan_features_t>(
+      IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS);
+  IREE_CHECK_OK(iree_hal_vulkan_driver_create_using_instance(
+      driver_identifier, &driver_options, iree_vk_syms, g_Instance,
+      iree_allocator_system(), &iree_vk_driver));
+  // Create a device sharing our VkDevice and queue. This makes capturing with
+  // vendor tools easier because we will have sync compute residing in the
+  // rendered frame.
+  iree_string_view_t device_identifier = iree_make_cstring_view("vulkan");
+  iree_hal_vulkan_queue_set_t compute_queue_set;
+  compute_queue_set.queue_family_index = g_QueueFamily;
+  compute_queue_set.queue_indices = 1 << 0;
+  iree_hal_vulkan_queue_set_t transfer_queue_set;
+  // NOTE(review): queue_family_index is never set on transfer_queue_set;
+  // queue_indices == 0 presumably means "no dedicated transfer queues" —
+  // confirm the family index is ignored in that case.
+  transfer_queue_set.queue_indices = 0;
+  iree_hal_device_t* iree_vk_device = nullptr;
+  IREE_CHECK_OK(iree_hal_vulkan_wrap_device(
+      device_identifier, &driver_options.device_options, iree_vk_syms,
+      g_Instance, g_PhysicalDevice, g_Device, &compute_queue_set,
+      &transfer_queue_set, iree_allocator_system(), &iree_vk_device));
+  // Create a HAL module using the HAL device.
+  iree_vm_module_t* hal_module = nullptr;
+  IREE_CHECK_OK(iree_hal_module_create(iree_vk_device, iree_allocator_system(),
+                                       &hal_module));
+
+  // Load bytecode module from embedded data.
+  IREE_LOG(INFO) << "Loading IREE bytecode module...";
+  iree_file_contents_t* flatbuffer_contents = NULL;
+  IREE_CHECK_OK(iree::GetModuleContentsFromFlags(&flatbuffer_contents));
+  iree_vm_module_t* bytecode_module = nullptr;
+  IREE_CHECK_OK(iree_vm_bytecode_module_create(
+      flatbuffer_contents->const_buffer,
+      iree_file_contents_deallocator(flatbuffer_contents),
+      iree_allocator_system(), &bytecode_module));
+
+  // Allocate a context that will hold the module state across invocations.
+  iree_vm_context_t* iree_context = nullptr;
+  std::vector<iree_vm_module_t*> modules = {hal_module, bytecode_module};
+  IREE_CHECK_OK(iree_vm_context_create_with_modules(
+      iree_instance, IREE_VM_CONTEXT_FLAG_NONE, modules.data(), modules.size(),
+      iree_allocator_system(), &iree_context));
+  IREE_LOG(INFO) << "Context with modules is ready for use";
+
+  // Lookup the entry point function.
+  std::string entry_function = FLAG_entry_function;
+  iree_vm_function_t main_function;
+  IREE_CHECK_OK(bytecode_module->lookup_function(
+      bytecode_module->self, IREE_VM_FUNCTION_LINKAGE_EXPORT,
+      iree_string_view_t{entry_function.data(), entry_function.size()},
+      &main_function));
+  iree_string_view_t main_function_name = iree_vm_function_name(&main_function);
+  IREE_LOG(INFO) << "Resolved main function named '"
+                 << std::string(main_function_name.data,
+                                main_function_name.size)
+                 << "'";
+
+  // Parse the --function_input flags into a VM list, once, up front; the same
+  // inputs are reused for the invocation performed each frame.
+  vm::ref<iree_vm_list_t> main_function_inputs;
+  IREE_CHECK_OK(ParseToVariantList(
+      iree_hal_device_allocator(iree_vk_device),
+      iree::span<const std::string>{FLAG_function_inputs.data(),
+                                    FLAG_function_inputs.size()},
+      &main_function_inputs));
+
+  const std::string window_title = std::string(FLAG_module_file);
+  // --------------------------------------------------------------------------
+
+  // --------------------------------------------------------------------------
+  // Main loop.
+  bool done = false;
+  while (!done) {
+    SDL_Event event;
+
+    while (SDL_PollEvent(&event)) {
+      ImGui_ImplSDL2_ProcessEvent(&event);
+      // (The SDL_QUIT test previously appeared twice; once is sufficient.)
+      if (event.type == SDL_QUIT) done = true;
+      if (event.type == SDL_WINDOWEVENT &&
+          event.window.event == SDL_WINDOWEVENT_RESIZED &&
+          event.window.windowID == SDL_GetWindowID(window)) {
+        // Defer the actual swapchain rebuild to the top of the render pass.
+        g_SwapChainResizeWidth = (int)event.window.data1;
+        g_SwapChainResizeHeight = (int)event.window.data2;
+        g_SwapChainRebuild = true;
+      }
+    }
+
+    if (g_SwapChainRebuild) {
+      g_SwapChainRebuild = false;
+      ImGui_ImplVulkan_SetMinImageCount(g_MinImageCount);
+      ImGui_ImplVulkanH_CreateOrResizeWindow(
+          g_Instance, g_PhysicalDevice, g_Device, &g_MainWindowData,
+          g_QueueFamily, g_Allocator, g_SwapChainResizeWidth,
+          g_SwapChainResizeHeight, g_MinImageCount);
+      g_MainWindowData.FrameIndex = 0;
+    }
+
+    // Start the Dear ImGui frame
+    ImGui_ImplVulkan_NewFrame();
+    ImGui_ImplSDL2_NewFrame(window);
+    ImGui::NewFrame();
+
+    // Custom window: invoke the module and display its outputs.
+    auto status = RunModuleAndUpdateImGuiWindow(
+        iree_vk_device, iree_context, main_function, entry_function,
+        main_function_inputs, window_title);
+    if (!status.ok()) {
+      IREE_LOG(FATAL) << status;
+      done = true;
+      continue;
+    }
+
+    // Rendering
+    ImGui::Render();
+    RenderFrame(wd, g_Device, g_Queue);
+
+    PresentFrame(wd, g_Queue);
+  }
+  // --------------------------------------------------------------------------
+
+  // --------------------------------------------------------------------------
+  // Cleanup: release IREE objects first, then ImGui/Vulkan/SDL state.
+  iree_vm_ref_release(main_function_inputs);
+
+  iree_vm_module_release(hal_module);
+  iree_vm_module_release(bytecode_module);
+  iree_vm_context_release(iree_context);
+  iree_hal_device_release(iree_vk_device);
+  iree_hal_driver_release(iree_vk_driver);
+  iree_hal_vulkan_syms_release(iree_vk_syms);
+  iree_vm_instance_release(iree_instance);
+
+  // Drain the GPU before destroying anything it may still be using.
+  err = vkDeviceWaitIdle(g_Device);
+  check_vk_result(err);
+  ImGui_ImplVulkan_Shutdown();
+  ImGui_ImplSDL2_Shutdown();
+  ImGui::DestroyContext();
+
+  CleanupVulkanWindow();
+  CleanupVulkan();
+
+  SDL_DestroyWindow(window);
+  SDL_Quit();
+  // --------------------------------------------------------------------------
+
+  return 0;
+}
+
+}  // namespace iree
diff --git a/runtime/src/iree/testing/vulkan/vulkan_gui_util.cc b/runtime/src/iree/testing/vulkan/vulkan_gui_util.cc
new file mode 100644
index 0000000..7569d94
--- /dev/null
+++ b/runtime/src/iree/testing/vulkan/vulkan_gui_util.cc
@@ -0,0 +1,426 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/testing/vulkan/vulkan_gui_util.h"
+
+#include <cstring>
+#include <set>
+
+#include "iree/base/api.h"
+#include "iree/base/logging.h"
+
+namespace iree {
+
+namespace {
+
+// Aborts the process (via IREE_LOG(FATAL)) on any non-success VkResult.
+void check_vk_result(VkResult err) {
+  if (err == 0) return;
+  IREE_LOG(FATAL) << "VkResult: " << err;
+}
+
+// Returns the names of the Vulkan layers used for the given IREE
+// |extensibility_set| and |features|.
+// Returns the names of the Vulkan layers used for the given IREE
+// |extensibility_set| and |features|.
+std::vector<const char*> GetIreeLayers(
+    iree_hal_vulkan_extensibility_set_t extensibility_set,
+    iree_hal_vulkan_features_t features) {
+  // Two-phase query: first just the count, then the layer name strings.
+  iree_host_size_t count = 0;
+  iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
+                                          /*string_capacity=*/0,
+                                          /*out_string_values=*/NULL, &count);
+  std::vector<const char*> layer_names(count);
+  iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
+                                          layer_names.size(),
+                                          layer_names.data(), &count);
+  return layer_names;
+}
+
+// Returns the names of the Vulkan extensions used for the given IREE
+// |extensibility_set| and |features|.
+// Returns the names of the Vulkan extensions used for the given IREE
+// |extensibility_set| and |features|.
+std::vector<const char*> GetIreeExtensions(
+    iree_hal_vulkan_extensibility_set_t extensibility_set,
+    iree_hal_vulkan_features_t features) {
+  // Two-phase query: first just the count, then the extension name strings.
+  iree_host_size_t count = 0;
+  iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
+                                          /*string_capacity=*/0,
+                                          /*out_string_values=*/NULL, &count);
+  std::vector<const char*> extension_names(count);
+  iree_hal_vulkan_query_extensibility_set(features, extensibility_set,
+                                          extension_names.size(),
+                                          extension_names.data(), &count);
+  return extension_names;
+}
+
+// Returns the names of the Vulkan extensions used for the given IREE
+// |vulkan_features|.
+// Returns the names of the Vulkan device extensions used for the given IREE
+// |vulkan_features|.
+//
+// Always includes VK_KHR_swapchain (required for presenting to the window)
+// and all IREE-required device extensions; IREE-optional extensions are
+// added only when |physical_device| actually advertises them.
+std::vector<const char*> GetDeviceExtensions(
+    VkPhysicalDevice physical_device,
+    iree_hal_vulkan_features_t vulkan_features) {
+  std::vector<const char*> iree_required_extensions = GetIreeExtensions(
+      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_REQUIRED,
+      vulkan_features);
+  std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
+      IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL,
+      vulkan_features);
+
+  uint32_t extension_count = 0;
+  check_vk_result(vkEnumerateDeviceExtensionProperties(
+      physical_device, nullptr, &extension_count, nullptr));
+  std::vector<VkExtensionProperties> extension_properties(extension_count);
+  check_vk_result(vkEnumerateDeviceExtensionProperties(
+      physical_device, nullptr, &extension_count, extension_properties.data()));
+
+  // Merge the extension lists, deduplicating by string value. (The previous
+  // std::set<const char*> compared pointers, so equal names coming from
+  // different arrays were not actually merged.)
+  std::vector<const char*> extensions;
+  auto push_unique = [&extensions](const char* name) {
+    for (const char* existing : extensions) {
+      if (strcmp(existing, name) == 0) return;
+    }
+    extensions.push_back(name);
+  };
+  push_unique("VK_KHR_swapchain");
+  for (const char* required_extension : iree_required_extensions) {
+    push_unique(required_extension);
+  }
+  for (const char* optional_extension : iree_optional_extensions) {
+    // Only enable optional extensions that the device actually supports.
+    for (uint32_t j = 0; j < extension_count; ++j) {
+      if (strcmp(optional_extension, extension_properties[j].extensionName) ==
+          0) {
+        push_unique(optional_extension);
+        break;
+      }
+    }
+  }
+  return extensions;
+}
+
+}  // namespace
+
+// Returns the Vulkan instance layers to enable for |vulkan_features|:
+// all IREE-required layers (fatal error if any is missing from the ICD)
+// plus whichever IREE-optional layers the ICD provides.
+std::vector<const char*> GetInstanceLayers(
+    iree_hal_vulkan_features_t vulkan_features) {
+  // Query the layers that IREE wants / needs.
+  std::vector<const char*> required_layers = GetIreeLayers(
+      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_REQUIRED, vulkan_features);
+  std::vector<const char*> optional_layers = GetIreeLayers(
+      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_LAYERS_OPTIONAL, vulkan_features);
+
+  // Query the layers that are available on the Vulkan ICD.
+  uint32_t available_count = 0;
+  check_vk_result(
+      vkEnumerateInstanceLayerProperties(&available_count, NULL));
+  std::vector<VkLayerProperties> available_layers(available_count);
+  check_vk_result(vkEnumerateInstanceLayerProperties(
+      &available_count, available_layers.data()));
+
+  auto is_available = [&available_layers](const char* layer_name) {
+    for (const VkLayerProperties& properties : available_layers) {
+      if (std::strcmp(layer_name, properties.layerName) == 0) return true;
+    }
+    return false;
+  };
+
+  // Required layers must be present; optional ones are taken if available.
+  std::vector<const char*> layers;
+  for (const char* layer_name : required_layers) {
+    if (is_available(layer_name)) {
+      layers.push_back(layer_name);
+    } else {
+      IREE_LOG(FATAL) << "Required layer " << layer_name << " not available";
+    }
+  }
+  for (const char* layer_name : optional_layers) {
+    if (is_available(layer_name)) layers.push_back(layer_name);
+  }
+
+  return layers;
+}
+
+// Returns the Vulkan instance extensions needed by both SDL (for surface
+// creation on |window|) and IREE (for |vulkan_features|), without duplicates.
+std::vector<const char*> GetInstanceExtensions(
+    SDL_Window* window, iree_hal_vulkan_features_t vulkan_features) {
+  // Ask SDL for its list of required instance extensions.
+  uint32_t sdl_extensions_count = 0;
+  SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count, NULL);
+  std::vector<const char*> sdl_extensions(sdl_extensions_count);
+  SDL_Vulkan_GetInstanceExtensions(window, &sdl_extensions_count,
+                                   sdl_extensions.data());
+
+  std::vector<const char*> iree_required_extensions = GetIreeExtensions(
+      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_REQUIRED,
+      vulkan_features);
+  std::vector<const char*> iree_optional_extensions = GetIreeExtensions(
+      IREE_HAL_VULKAN_EXTENSIBILITY_INSTANCE_EXTENSIONS_OPTIONAL,
+      vulkan_features);
+
+  // Merge the lists, deduplicating by string value. (The previous
+  // pointer-keyed std::set would not merge equal names coming from different
+  // arrays, e.g. a surface extension reported by both SDL and IREE.)
+  std::vector<const char*> extensions;
+  auto push_unique = [&extensions](const char* name) {
+    for (const char* existing : extensions) {
+      if (strcmp(existing, name) == 0) return;
+    }
+    extensions.push_back(name);
+  };
+  for (const char* name : sdl_extensions) push_unique(name);
+  for (const char* name : iree_required_extensions) push_unique(name);
+  for (const char* name : iree_optional_extensions) push_unique(name);
+  return extensions;
+}
+
+// Initializes the Vulkan environment with the given |vulkan_features| and
+// layers/extensions; see the header for the full contract. Any Vulkan error
+// aborts via check_vk_result()/IM_ASSERT.
+void SetupVulkan(iree_hal_vulkan_features_t vulkan_features,
+                 const char** instance_layers, uint32_t instance_layers_count,
+                 const char** instance_extensions,
+                 uint32_t instance_extensions_count,
+                 const VkAllocationCallbacks* allocator, VkInstance* instance,
+                 uint32_t* queue_family_index,
+                 VkPhysicalDevice* physical_device, VkQueue* queue,
+                 VkDevice* device, VkDescriptorPool* descriptor_pool) {
+  VkResult err;
+
+  // Create Vulkan Instance
+  {
+    VkInstanceCreateInfo create_info = {};
+    create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+    create_info.enabledLayerCount = instance_layers_count;
+    create_info.ppEnabledLayerNames = instance_layers;
+    create_info.enabledExtensionCount = instance_extensions_count;
+    create_info.ppEnabledExtensionNames = instance_extensions;
+    err = vkCreateInstance(&create_info, allocator, instance);
+    check_vk_result(err);
+  }
+
+  // Select GPU
+  {
+    uint32_t gpu_count = 0;
+    err = vkEnumeratePhysicalDevices(*instance, &gpu_count, NULL);
+    check_vk_result(err);
+    IM_ASSERT(gpu_count > 0);
+
+    // std::vector instead of raw malloc/free: no manual cleanup needed.
+    std::vector<VkPhysicalDevice> gpus(gpu_count);
+    err = vkEnumeratePhysicalDevices(*instance, &gpu_count, gpus.data());
+    check_vk_result(err);
+
+    // Use the first reported GPU for simplicity.
+    *physical_device = gpus[0];
+
+    VkPhysicalDeviceProperties properties;
+    vkGetPhysicalDeviceProperties(*physical_device, &properties);
+    IREE_LOG(INFO) << "Selected Vulkan device: " << properties.deviceName;
+  }
+
+  // Select queue family. We want a single queue with graphics and compute for
+  // simplicity, but we could also discover and use separate queues for each.
+  {
+    uint32_t count = 0;
+    vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count, NULL);
+    std::vector<VkQueueFamilyProperties> queue_families(count);
+    vkGetPhysicalDeviceQueueFamilyProperties(*physical_device, &count,
+                                             queue_families.data());
+    for (uint32_t i = 0; i < count; i++) {
+      if (queue_families[i].queueFlags &
+          (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT)) {
+        *queue_family_index = i;
+        break;
+      }
+    }
+    IM_ASSERT(*queue_family_index != (uint32_t)-1);
+  }
+
+  // Create Logical Device (with 1 queue)
+  {
+    std::vector<const char*> device_extensions =
+        GetDeviceExtensions(*physical_device, vulkan_features);
+    const float queue_priority[] = {1.0f};
+    VkDeviceQueueCreateInfo queue_info = {};
+    queue_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+    queue_info.queueFamilyIndex = *queue_family_index;
+    queue_info.queueCount = 1;
+    queue_info.pQueuePriorities = queue_priority;
+    VkDeviceCreateInfo create_info = {};
+    create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+    create_info.queueCreateInfoCount = 1;
+    create_info.pQueueCreateInfos = &queue_info;
+    create_info.enabledExtensionCount =
+        static_cast<uint32_t>(device_extensions.size());
+    create_info.ppEnabledExtensionNames = device_extensions.data();
+
+    // Enable timeline semaphores by chaining the feature struct into the
+    // pNext chain of VkPhysicalDeviceFeatures2.
+    VkPhysicalDeviceFeatures2 features2;
+    memset(&features2, 0, sizeof(features2));
+    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+    create_info.pNext = &features2;
+    VkPhysicalDeviceTimelineSemaphoreFeatures semaphore_features;
+    memset(&semaphore_features, 0, sizeof(semaphore_features));
+    semaphore_features.sType =
+        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES;
+    semaphore_features.pNext = features2.pNext;
+    features2.pNext = &semaphore_features;
+    semaphore_features.timelineSemaphore = VK_TRUE;
+
+    err = vkCreateDevice(*physical_device, &create_info, allocator, device);
+    check_vk_result(err);
+    vkGetDeviceQueue(*device, *queue_family_index, 0, queue);
+  }
+
+  // Create Descriptor Pool
+  {
+    VkDescriptorPoolSize pool_sizes[] = {
+        {VK_DESCRIPTOR_TYPE_SAMPLER, 1000},
+        {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1000},
+        {VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1000},
+        {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1000},
+        {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1000},
+        {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, 1000},
+        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1000},
+        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1000},
+        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1000},
+        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC, 1000},
+        {VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, 1000}};
+    VkDescriptorPoolCreateInfo pool_info = {};
+    pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+    pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
+    pool_info.maxSets = 1000 * IREE_ARRAYSIZE(pool_sizes);
+    pool_info.poolSizeCount = (uint32_t)IREE_ARRAYSIZE(pool_sizes);
+    pool_info.pPoolSizes = pool_sizes;
+    err =
+        vkCreateDescriptorPool(*device, &pool_info, allocator, descriptor_pool);
+    check_vk_result(err);
+  }
+}
+
+// Configures |wd| for presenting to |surface|: picks a surface format and
+// present mode, then creates the swapchain, render pass, and framebuffers
+// for a |width|x|height| window via the ImGui Vulkan helper.
+void SetupVulkanWindow(ImGui_ImplVulkanH_Window* wd,
+                       const VkAllocationCallbacks* allocator,
+                       VkInstance instance, uint32_t queue_family_index,
+                       VkPhysicalDevice physical_device, VkDevice device,
+                       VkSurfaceKHR surface, int width, int height,
+                       uint32_t min_image_count) {
+  wd->Surface = surface;
+
+  // Check for WSI support
+  VkBool32 res;
+  vkGetPhysicalDeviceSurfaceSupportKHR(physical_device, queue_family_index,
+                                       wd->Surface, &res);
+  if (res != VK_TRUE) {
+    fprintf(stderr, "Error no WSI support on physical device 0\n");
+    exit(-1);
+  }
+
+  // Select Surface Format
+  const VkFormat requestSurfaceImageFormat[] = {
+      VK_FORMAT_B8G8R8A8_UNORM, VK_FORMAT_R8G8B8A8_UNORM,
+      VK_FORMAT_B8G8R8_UNORM, VK_FORMAT_R8G8B8_UNORM};
+  // NOTE(review): VK_COLORSPACE_SRGB_NONLINEAR_KHR appears to be the legacy
+  // spelling of VK_COLOR_SPACE_SRGB_NONLINEAR_KHR — confirm and modernize.
+  const VkColorSpaceKHR requestSurfaceColorSpace =
+      VK_COLORSPACE_SRGB_NONLINEAR_KHR;
+  wd->SurfaceFormat = ImGui_ImplVulkanH_SelectSurfaceFormat(
+      physical_device, wd->Surface, requestSurfaceImageFormat,
+      (size_t)IREE_ARRAYSIZE(requestSurfaceImageFormat),
+      requestSurfaceColorSpace);
+
+  // Select Present Mode
+#ifdef IMGUI_UNLIMITED_FRAME_RATE
+  VkPresentModeKHR present_modes[] = {VK_PRESENT_MODE_MAILBOX_KHR,
+                                      VK_PRESENT_MODE_IMMEDIATE_KHR,
+                                      VK_PRESENT_MODE_FIFO_KHR};
+#else
+  // FIFO (vsync) is the only mode guaranteed to be available.
+  VkPresentModeKHR present_modes[] = {VK_PRESENT_MODE_FIFO_KHR};
+#endif
+  wd->PresentMode = ImGui_ImplVulkanH_SelectPresentMode(
+      physical_device, wd->Surface, &present_modes[0],
+      IREE_ARRAYSIZE(present_modes));
+
+  // Create SwapChain, RenderPass, Framebuffer, etc.
+  IM_ASSERT(min_image_count >= 2);
+  ImGui_ImplVulkanH_CreateOrResizeWindow(instance, physical_device, device, wd,
+                                         queue_family_index, allocator, width,
+                                         height, min_image_count);
+
+  // Set clear color.
+  ImVec4 clear_color = ImVec4(0.45f, 0.55f, 0.60f, 1.00f);
+  memcpy(&wd->ClearValue.color.float32[0], &clear_color, 4 * sizeof(float));
+}
+
+// Records and submits one frame of ImGui draw data to |queue|.
+//
+// Acquires the next swapchain image, waits on and resets the frame's fence,
+// re-records the frame's command buffer with a single render pass containing
+// the current ImGui draw data, then submits it — waiting on the
+// image-acquired semaphore and signaling the render-complete semaphore that
+// PresentFrame() waits on.
+void RenderFrame(ImGui_ImplVulkanH_Window* wd, VkDevice device, VkQueue queue) {
+  VkResult err;
+
+  VkSemaphore image_acquired_semaphore =
+      wd->FrameSemaphores[wd->SemaphoreIndex].ImageAcquiredSemaphore;
+  VkSemaphore render_complete_semaphore =
+      wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;
+  // Acquire the next swapchain image; updates wd->FrameIndex.
+  err = vkAcquireNextImageKHR(device, wd->Swapchain, UINT64_MAX,
+                              image_acquired_semaphore, VK_NULL_HANDLE,
+                              &wd->FrameIndex);
+  check_vk_result(err);
+
+  ImGui_ImplVulkanH_Frame* fd = &wd->Frames[wd->FrameIndex];
+  // Wait until this frame's previous submission has retired before reusing
+  // its command pool/buffer.
+  {
+    err = vkWaitForFences(
+        device, 1, &fd->Fence, VK_TRUE,
+        UINT64_MAX);  // wait indefinitely instead of periodically checking
+    check_vk_result(err);
+
+    err = vkResetFences(device, 1, &fd->Fence);
+    check_vk_result(err);
+  }
+  // Begin a fresh one-time-submit command buffer for this frame.
+  {
+    err = vkResetCommandPool(device, fd->CommandPool, 0);
+    check_vk_result(err);
+    VkCommandBufferBeginInfo info = {};
+    info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    info.flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+    err = vkBeginCommandBuffer(fd->CommandBuffer, &info);
+    check_vk_result(err);
+  }
+  // Begin the render pass, clearing to wd->ClearValue.
+  {
+    VkRenderPassBeginInfo info = {};
+    info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
+    info.renderPass = wd->RenderPass;
+    info.framebuffer = fd->Framebuffer;
+    info.renderArea.extent.width = wd->Width;
+    info.renderArea.extent.height = wd->Height;
+    info.clearValueCount = 1;
+    info.pClearValues = &wd->ClearValue;
+    vkCmdBeginRenderPass(fd->CommandBuffer, &info, VK_SUBPASS_CONTENTS_INLINE);
+  }
+
+  // Record Imgui Draw Data and draw funcs into command buffer
+  ImGui_ImplVulkan_RenderDrawData(ImGui::GetDrawData(), fd->CommandBuffer);
+
+  // Submit command buffer
+  vkCmdEndRenderPass(fd->CommandBuffer);
+  {
+    VkPipelineStageFlags wait_stage =
+        VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
+    VkSubmitInfo info = {};
+    info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    info.waitSemaphoreCount = 1;
+    info.pWaitSemaphores = &image_acquired_semaphore;
+    info.pWaitDstStageMask = &wait_stage;
+    info.commandBufferCount = 1;
+    info.pCommandBuffers = &fd->CommandBuffer;
+    info.signalSemaphoreCount = 1;
+    info.pSignalSemaphores = &render_complete_semaphore;
+
+    err = vkEndCommandBuffer(fd->CommandBuffer);
+    check_vk_result(err);
+    // fd->Fence is signaled on completion; waited on next time this frame
+    // slot is reused above.
+    err = vkQueueSubmit(queue, 1, &info, fd->Fence);
+    check_vk_result(err);
+  }
+}
+
+// Presents the most recently rendered frame on |queue|, waiting on the
+// render-complete semaphore signaled by RenderFrame(), then advances the
+// semaphore index for the next frame.
+void PresentFrame(ImGui_ImplVulkanH_Window* wd, VkQueue queue) {
+  VkSemaphore render_complete_semaphore =
+      wd->FrameSemaphores[wd->SemaphoreIndex].RenderCompleteSemaphore;
+  VkPresentInfoKHR info = {};
+  info.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
+  info.waitSemaphoreCount = 1;
+  info.pWaitSemaphores = &render_complete_semaphore;
+  info.swapchainCount = 1;
+  info.pSwapchains = &wd->Swapchain;
+  info.pImageIndices = &wd->FrameIndex;
+  VkResult err = vkQueuePresentKHR(queue, &info);
+  check_vk_result(err);
+  wd->SemaphoreIndex =
+      (wd->SemaphoreIndex + 1) %
+      wd->ImageCount;  // Now we can use the next set of semaphores
+}
+
+}  // namespace iree
diff --git a/runtime/src/iree/testing/vulkan/vulkan_gui_util.h b/runtime/src/iree/testing/vulkan/vulkan_gui_util.h
new file mode 100644
index 0000000..2e7f158
--- /dev/null
+++ b/runtime/src/iree/testing/vulkan/vulkan_gui_util.h
@@ -0,0 +1,73 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_TESTING_VULKAN_VULKAN_GUI_UTIL_H_
+#define IREE_TESTING_VULKAN_VULKAN_GUI_UTIL_H_
+
+#include <SDL.h>
+#include <SDL_vulkan.h>
+#include <imgui.h>
+#include <imgui_impl_sdl.h>
+#include <imgui_impl_vulkan.h>
+#include <vulkan/vulkan.h>
+
+#include <vector>
+
+#include "iree/hal/vulkan/api.h"
+
+namespace iree {
+
+// Returns the names of the Vulkan instance layers needed for the given IREE
+// |vulkan_features|.
+std::vector<const char*> GetInstanceLayers(
+    iree_hal_vulkan_features_t vulkan_features);
+
+// Returns the names of the Vulkan instance extensions needed for the given IREE
+// |vulkan_features|.
+std::vector<const char*> GetInstanceExtensions(
+    SDL_Window* window, iree_hal_vulkan_features_t vulkan_features);
+
+// Initializes the Vulkan environment with the given |vulkan_features| and
+// layers/extensions, and writes various Vulkan handles. If errors occur, this
+// function asserts and aborts.
+//
+// This function creates Vulkan |instance|, selects a GPU and
+// |queue_family_index| with both graphics and compute bits, gets the
+// |physical_device|, creates a logical |device| from it, and creates a
+// |descriptor_pool|.
+void SetupVulkan(iree_hal_vulkan_features_t vulkan_features,
+                 const char** instance_layers, uint32_t instance_layers_count,
+                 const char** instance_extensions,
+                 uint32_t instance_extensions_count,
+                 const VkAllocationCallbacks* allocator, VkInstance* instance,
+                 uint32_t* queue_family_index,
+                 VkPhysicalDevice* physical_device, VkQueue* queue,
+                 VkDevice* device, VkDescriptorPool* descriptor_pool);
+
+// Sets up an ImGui Vulkan GUI window.
+//
+// This function creates surface, swapchain, framebuffer, and others in
+// preparation for rendering.
+void SetupVulkanWindow(ImGui_ImplVulkanH_Window* wd,
+                       const VkAllocationCallbacks* allocator,
+                       VkInstance instance, uint32_t queue_family_index,
+                       VkPhysicalDevice physical_device, VkDevice device,
+                       VkSurfaceKHR surface, int width, int height,
+                       uint32_t min_image_count);
+
+// Renders the next frame of the ImGui Vulkan GUI window.
+//
+// This function acquires next swapchain image, creates a command buffer
+// containing a render pass for the next frame, and finally submits to the
+// queue.
+void RenderFrame(ImGui_ImplVulkanH_Window* wd, VkDevice device, VkQueue queue);
+
+// Presents the next frame of the ImGui Vulkan GUI window.
+void PresentFrame(ImGui_ImplVulkanH_Window* wd, VkQueue queue);
+
+}  // namespace iree
+
+#endif  // IREE_TESTING_VULKAN_VULKAN_GUI_UTIL_H_
diff --git a/runtime/src/iree/vm/BUILD b/runtime/src/iree/vm/BUILD
new file mode 100644
index 0000000..34ededc
--- /dev/null
+++ b/runtime/src/iree/vm/BUILD
@@ -0,0 +1,335 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library", "iree_runtime_cc_test")
+load("//build_tools/bazel:iree_bytecode_module.bzl", "iree_bytecode_module")
+load("//build_tools/bazel:cc_binary_benchmark.bzl", "cc_binary_benchmark")
+# load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+#===------------------------------------------------------------------------===#
+# Public API
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "vm",
+    hdrs = [
+        "api.h",
+    ],
+    deps = [
+        ":impl",
+        "//runtime/src/iree/base",
+    ],
+)
+
+# TODO(benvanik): make these srcs and only expose an api_cc.h.
+iree_runtime_cc_library(
+    name = "cc",
+    hdrs = [
+        "native_module_cc.h",
+        "native_module_packing.h",
+        "ref_cc.h",
+    ],
+    deps = [
+        ":vm",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base/internal:span",
+    ],
+)
+
+#===------------------------------------------------------------------------===#
+# Implementation
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "impl",
+    srcs = [
+        "buffer.c",
+        "builtin_types.c",
+        "context.c",
+        "instance.c",
+        "invocation.c",
+        "list.c",
+        "module.c",
+        "native_module.c",
+        "ref.c",
+        "shims.c",
+        "stack.c",
+    ],
+    hdrs = [
+        "buffer.h",
+        "builtin_types.h",
+        "context.h",
+        "instance.h",
+        "invocation.h",
+        "list.h",
+        "module.h",
+        "native_module.h",
+        "ref.h",
+        "shims.h",
+        "stack.h",
+        "type_def.h",
+        "value.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "buffer_test",
+    srcs = ["buffer_test.cc"],
+    deps = [
+        ":cc",
+        ":impl",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "list_test",
+    srcs = ["list_test.cc"],
+    deps = [
+        ":cc",
+        ":impl",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "native_module_test",
+    srcs = ["native_module_test.cc"],
+    deps = [
+        ":cc",
+        ":impl",
+        ":native_module_test_hdrs",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "native_module_test_hdrs",
+    hdrs = [
+        "native_module_test.h",
+    ],
+    deps = [
+        ":impl",
+        "//runtime/src/iree/base",
+    ],
+)
+
+cc_binary_benchmark(
+    name = "native_module_benchmark",
+    srcs = ["native_module_benchmark.cc"],
+    deps = [
+        ":impl",
+        ":native_module_test_hdrs",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:logging",
+        "//runtime/src/iree/testing:benchmark_main",
+        "@com_google_benchmark//:benchmark",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "ref_test",
+    srcs = ["ref_test.cc"],
+    deps = [
+        ":cc",
+        ":impl",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+iree_runtime_cc_test(
+    name = "stack_test",
+    srcs = ["stack_test.cc"],
+    deps = [
+        ":impl",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+    ],
+)
+
+#===------------------------------------------------------------------------===#
+# Bytecode interpreter module
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "bytecode_module",
+    srcs = [
+        "bytecode_disasm.c",
+        "bytecode_disasm.h",
+        "bytecode_dispatch.c",
+        "bytecode_dispatch_util.h",
+        "bytecode_module.c",
+        "bytecode_module_impl.h",
+        "generated/bytecode_op_table.h",
+    ],
+    hdrs = [
+        "bytecode_module.h",
+    ],
+    deps = [
+        ":ops",
+        ":vm",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal/flatcc:parsing",
+        "//runtime/src/iree/schemas:bytecode_module_def_c_fbs",
+    ],
+)
+
+# TODO(#357): Add a script to update bytecode_op_table.h.
+# gentbl_cc_library(
+#     name = "bytecode_op_table_gen",
+#     tbl_outs = [
+#         (["-gen-iree-vm-op-table-defs"], "bytecode_op_table.h"),
+#     ],
+#     tblgen = "//iree/tools:iree-tblgen",
+#     td_file = "//iree/compiler/Dialect/VM/IR:VMOps.td",
+#     td_srcs = [
+#         "//iree/compiler/Dialect/Util/IR:td_files",
+#         "//iree/compiler/Dialect/VM/IR:td_files",
+#         "@llvm-project//mlir:OpBaseTdFiles",
+#         "@llvm-project//mlir:include/mlir/IR/SymbolInterfaces.td",
+#         "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td",
+#         "@llvm-project//mlir:include/mlir/Interfaces/ControlFlowInterfaces.td",
+#         "@llvm-project//mlir:SideEffectTdFiles",
+#     ],
+# )
+
+iree_cmake_extra_content(
+    content = """
+if(${IREE_BUILD_COMPILER})
+""",
+    inline = True,
+)
+
+iree_runtime_cc_test(
+    name = "bytecode_module_test",
+    srcs = [
+        "bytecode_dispatch_test.cc",
+        "bytecode_module_test.cc",
+    ],
+    tags = [
+        # TODO(benvanik): Fix type casting errors for --config=android_arm.
+        "notap",
+    ],
+    deps = [
+        ":bytecode_module",
+        ":vm",
+        "//runtime/src/iree/base:cc",
+        "//runtime/src/iree/base:logging",
+        "//runtime/src/iree/testing:gtest",
+        "//runtime/src/iree/testing:gtest_main",
+        "//runtime/src/iree/vm/test:all_bytecode_modules_c",
+    ],
+)
+
+cc_binary_benchmark(
+    name = "bytecode_module_benchmark",
+    testonly = True,
+    srcs = ["bytecode_module_benchmark.cc"],
+    deps = [
+        ":bytecode_module",
+        ":bytecode_module_benchmark_module_c",
+        ":vm",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:logging",
+        "//runtime/src/iree/testing:benchmark_main",
+        "@com_google_benchmark//:benchmark",
+    ],
+)
+
+iree_bytecode_module(
+    name = "bytecode_module_benchmark_module",
+    testonly = True,
+    src = "bytecode_module_benchmark.mlir",
+    c_identifier = "iree_vm_bytecode_module_benchmark_module",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+cc_binary_benchmark(
+    name = "bytecode_module_size_benchmark",
+    srcs = ["bytecode_module_size_benchmark.cc"],
+    deps = [
+        ":bytecode_module",
+        ":bytecode_module_size_benchmark_module_c",
+        ":vm",
+        "//runtime/src/iree/base",
+    ],
+)
+
+iree_bytecode_module(
+    name = "bytecode_module_size_benchmark_module",
+    testonly = True,
+    src = "bytecode_module_size_benchmark.mlir",
+    c_identifier = "iree_vm_bytecode_module_size_benchmark_module",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+""",
+    inline = True,
+)
+
+#===------------------------------------------------------------------------===#
+# Common VM op implementations
+#===------------------------------------------------------------------------===#
+
+iree_runtime_cc_library(
+    name = "ops",
+    hdrs = [
+        "ops.h",
+    ],
+    deps = [
+        "//runtime/src/iree/base",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "ops_emitc",
+    hdrs = [
+        "ops_emitc.h",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "shims_emitc",
+    hdrs = [
+        "shims_emitc.h",
+    ],
+    deps = [
+        ":impl",
+        "//runtime/src/iree/base:core_headers",
+    ],
+)
diff --git a/runtime/src/iree/vm/CMakeLists.txt b/runtime/src/iree/vm/CMakeLists.txt
new file mode 100644
index 0000000..ba43737
--- /dev/null
+++ b/runtime/src/iree/vm/CMakeLists.txt
@@ -0,0 +1,306 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/vm/BUILD                                                    #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    vm
+  HDRS
+    "api.h"
+  DEPS
+    ::impl
+    iree::base
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    cc
+  HDRS
+    "native_module_cc.h"
+    "native_module_packing.h"
+    "ref_cc.h"
+  DEPS
+    ::vm
+    iree::base
+    iree::base::cc
+    iree::base::core_headers
+    iree::base::internal::span
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    impl
+  HDRS
+    "buffer.h"
+    "builtin_types.h"
+    "context.h"
+    "instance.h"
+    "invocation.h"
+    "list.h"
+    "module.h"
+    "native_module.h"
+    "ref.h"
+    "shims.h"
+    "stack.h"
+    "type_def.h"
+    "value.h"
+  SRCS
+    "buffer.c"
+    "builtin_types.c"
+    "context.c"
+    "instance.c"
+    "invocation.c"
+    "list.c"
+    "module.c"
+    "native_module.c"
+    "ref.c"
+    "shims.c"
+    "stack.c"
+  DEPS
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_test(
+  NAME
+    buffer_test
+  SRCS
+    "buffer_test.cc"
+  DEPS
+    ::cc
+    ::impl
+    iree::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    list_test
+  SRCS
+    "list_test.cc"
+  DEPS
+    ::cc
+    ::impl
+    iree::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    native_module_test
+  SRCS
+    "native_module_test.cc"
+  DEPS
+    ::cc
+    ::impl
+    ::native_module_test_hdrs
+    iree::base
+    iree::base::cc
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    native_module_test_hdrs
+  HDRS
+    "native_module_test.h"
+  DEPS
+    ::impl
+    iree::base
+  PUBLIC
+)
+
+iree_cc_binary_benchmark(
+  NAME
+    native_module_benchmark
+  SRCS
+    "native_module_benchmark.cc"
+  DEPS
+    ::impl
+    ::native_module_test_hdrs
+    benchmark
+    iree::base
+    iree::base::logging
+    iree::testing::benchmark_main
+  TESTONLY
+)
+
+iree_cc_test(
+  NAME
+    ref_test
+  SRCS
+    "ref_test.cc"
+  DEPS
+    ::cc
+    ::impl
+    iree::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_test(
+  NAME
+    stack_test
+  SRCS
+    "stack_test.cc"
+  DEPS
+    ::impl
+    iree::base
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+iree_cc_library(
+  NAME
+    bytecode_module
+  HDRS
+    "bytecode_module.h"
+  SRCS
+    "bytecode_disasm.c"
+    "bytecode_disasm.h"
+    "bytecode_dispatch.c"
+    "bytecode_dispatch_util.h"
+    "bytecode_module.c"
+    "bytecode_module_impl.h"
+    "generated/bytecode_op_table.h"
+  DEPS
+    ::ops
+    ::vm
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::flatcc::parsing
+    iree::base::tracing
+    iree::schemas::bytecode_module_def_c_fbs
+  PUBLIC
+)
+
+if(${IREE_BUILD_COMPILER})
+
+iree_cc_test(
+  NAME
+    bytecode_module_test
+  SRCS
+    "bytecode_dispatch_test.cc"
+    "bytecode_module_test.cc"
+  DEPS
+    ::bytecode_module
+    ::vm
+    iree::base::cc
+    iree::base::logging
+    iree::testing::gtest
+    iree::testing::gtest_main
+    iree::vm::test::all_bytecode_modules_c
+  LABELS
+    "notap"
+)
+
+iree_cc_binary_benchmark(
+  NAME
+    bytecode_module_benchmark
+  SRCS
+    "bytecode_module_benchmark.cc"
+  DEPS
+    ::bytecode_module
+    ::bytecode_module_benchmark_module_c
+    ::vm
+    benchmark
+    iree::base
+    iree::base::logging
+    iree::testing::benchmark_main
+  TESTONLY
+)
+
+iree_bytecode_module(
+  NAME
+    bytecode_module_benchmark_module
+  SRC
+    "bytecode_module_benchmark.mlir"
+  C_IDENTIFIER
+    "iree_vm_bytecode_module_benchmark_module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  TESTONLY
+  PUBLIC
+)
+
+iree_cc_binary_benchmark(
+  NAME
+    bytecode_module_size_benchmark
+  SRCS
+    "bytecode_module_size_benchmark.cc"
+  DEPS
+    ::bytecode_module
+    ::bytecode_module_size_benchmark_module_c
+    ::vm
+    iree::base
+  TESTONLY
+)
+
+iree_bytecode_module(
+  NAME
+    bytecode_module_size_benchmark_module
+  SRC
+    "bytecode_module_size_benchmark.mlir"
+  C_IDENTIFIER
+    "iree_vm_bytecode_module_size_benchmark_module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  TESTONLY
+  PUBLIC
+)
+
+endif()
+
+iree_cc_library(
+  NAME
+    ops
+  HDRS
+    "ops.h"
+  DEPS
+    iree::base
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    ops_emitc
+  HDRS
+    "ops_emitc.h"
+  DEPS
+
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
+    shims_emitc
+  HDRS
+    "shims_emitc.h"
+  DEPS
+    ::impl
+    iree::base::core_headers
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/vm/api.h b/runtime/src/iree/vm/api.h
new file mode 100644
index 0000000..3f559f7
--- /dev/null
+++ b/runtime/src/iree/vm/api.h
@@ -0,0 +1,25 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_API_H_
+#define IREE_VM_API_H_
+
+#include "iree/base/api.h"
+#include "iree/vm/buffer.h"         // IWYU pragma: export
+#include "iree/vm/builtin_types.h"  // IWYU pragma: export
+#include "iree/vm/context.h"        // IWYU pragma: export
+#include "iree/vm/instance.h"       // IWYU pragma: export
+#include "iree/vm/invocation.h"     // IWYU pragma: export
+#include "iree/vm/list.h"           // IWYU pragma: export
+#include "iree/vm/module.h"         // IWYU pragma: export
+#include "iree/vm/native_module.h"  // IWYU pragma: export
+#include "iree/vm/ref.h"            // IWYU pragma: export
+#include "iree/vm/shims.h"          // IWYU pragma: export
+#include "iree/vm/stack.h"          // IWYU pragma: export
+#include "iree/vm/type_def.h"       // IWYU pragma: export
+#include "iree/vm/value.h"          // IWYU pragma: export
+
+#endif  // IREE_VM_API_H_
diff --git a/runtime/src/iree/vm/buffer.c b/runtime/src/iree/vm/buffer.c
new file mode 100644
index 0000000..d433a9f
--- /dev/null
+++ b/runtime/src/iree/vm/buffer.c
@@ -0,0 +1,309 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/buffer.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+
+static iree_vm_ref_type_descriptor_t iree_vm_buffer_descriptor = {0};
+
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_vm_buffer, iree_vm_buffer_t);
+
+static iree_status_t iree_vm_buffer_map(const iree_vm_buffer_t* buffer,
+                                        iree_host_size_t offset,
+                                        iree_host_size_t length,
+                                        iree_host_size_t alignment,
+                                        uint8_t** out_data,
+                                        iree_host_size_t* out_data_length) {
+  // Force alignment.
+  offset &= ~(alignment - 1);
+  length &= ~(alignment - 1);
+  const iree_host_size_t end = offset + length;
+  if (IREE_UNLIKELY(end > buffer->data.data_length)) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "out-of-bounds access detected (offset=%zu, "
+                            "length=%zu, alignment=%zu, buffer length=%zu)",
+                            offset, length, alignment,
+                            buffer->data.data_length);
+  }
+  *out_data = buffer->data.data + offset;
+  *out_data_length = length;
+  return iree_ok_status();
+}
+
+// Maps a subrange to a span of bytes within the |buffer| for read-only access.
+// |offset| and |length| must match the provided |alignment| (1, 2, 4, 8) and
+// will be rounded toward zero if they do not.
+static iree_status_t iree_vm_buffer_map_ro(const iree_vm_buffer_t* buffer,
+                                           iree_host_size_t offset,
+                                           iree_host_size_t length,
+                                           iree_host_size_t alignment,
+                                           iree_const_byte_span_t* out_span) {
+  // Always allowed regardless of access.
+  return iree_vm_buffer_map(buffer, offset, length, alignment,
+                            (uint8_t**)&out_span->data, &out_span->data_length);
+}
+
+// Maps a subrange to a span of bytes within the |buffer| for read/write access.
+// |offset| and |length| must match the provided |alignment| (1, 2, 4, 8) and
+// will be rounded toward zero if they do not.
+static iree_status_t iree_vm_buffer_map_rw(const iree_vm_buffer_t* buffer,
+                                           iree_host_size_t offset,
+                                           iree_host_size_t length,
+                                           iree_host_size_t alignment,
+                                           iree_byte_span_t* out_span) {
+  // Buffer requires mutable access.
+  if (IREE_UNLIKELY(
+          !iree_all_bits_set(buffer->access, IREE_VM_BUFFER_ACCESS_MUTABLE))) {
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "buffer is read-only and cannot be mapped for mutation");
+  }
+  return iree_vm_buffer_map(buffer, offset, length, alignment, &out_span->data,
+                            &out_span->data_length);
+}
+
+IREE_API_EXPORT void iree_vm_buffer_initialize(iree_vm_buffer_access_t access,
+                                               iree_byte_span_t data,
+                                               iree_allocator_t allocator,
+                                               iree_vm_buffer_t* out_buffer) {
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  iree_atomic_ref_count_init(&out_buffer->ref_object.counter);
+  out_buffer->access = access;
+  out_buffer->data = data;
+  out_buffer->allocator = allocator;
+}
+
+IREE_API_EXPORT void iree_vm_buffer_deinitialize(iree_vm_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  iree_atomic_ref_count_abort_if_uses(&buffer->ref_object.counter);
+  iree_allocator_free(buffer->allocator, buffer->data.data);
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_buffer_create(
+    iree_vm_buffer_access_t access, iree_host_size_t length,
+    iree_allocator_t allocator, iree_vm_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  *out_buffer = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // The actual buffer payload is prefixed with the buffer type so we need only
+  // a single allocation.
+  iree_host_size_t prefix_size = iree_sizeof_struct(**out_buffer);
+  iree_host_size_t total_size = prefix_size + length;
+
+  // Allocate combined [prefix | buffer] memory.
+  uint8_t* data_ptr = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(allocator, total_size, (void**)&data_ptr));
+
+  // Initialize the prefix buffer handle.
+  iree_vm_buffer_t* buffer = (iree_vm_buffer_t*)data_ptr;
+  memset(data_ptr, 0, prefix_size - sizeof(*buffer));  // padding
+  iree_byte_span_t target_span =
+      iree_make_byte_span(data_ptr + prefix_size, length);
+  iree_vm_buffer_initialize(access, target_span, allocator, buffer);
+
+  *out_buffer = buffer;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+static void iree_vm_buffer_destroy(void* ptr) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Buffers are stored as [prefix | data]; freeing the prefix is all we need
+  // to do to free it all.
+  iree_vm_buffer_t* buffer = (iree_vm_buffer_t*)ptr;
+  iree_allocator_free(buffer->allocator, buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+IREE_API_EXPORT void iree_vm_buffer_retain(iree_vm_buffer_t* buffer) {
+  iree_vm_ref_object_retain(buffer, &iree_vm_buffer_descriptor);
+}
+
+IREE_API_EXPORT void iree_vm_buffer_release(iree_vm_buffer_t* buffer) {
+  iree_vm_ref_object_release(buffer, &iree_vm_buffer_descriptor);
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_buffer_clone(
+    iree_vm_buffer_access_t access, const iree_vm_buffer_t* source_buffer,
+    iree_host_size_t source_offset, iree_host_size_t length,
+    iree_allocator_t allocator, iree_vm_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  *out_buffer = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Try to map the source buffer first; no use continuing if we can't read the
+  // data to clone.
+  iree_const_byte_span_t source_span;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_vm_buffer_map_ro(source_buffer, source_offset, length, 1,
+                                &source_span));
+
+  // The actual buffer payload is prefixed with the buffer type so we need only
+  // a single allocation.
+  iree_host_size_t prefix_size =
+      iree_host_align(sizeof(iree_vm_buffer_t), iree_max_align_t);
+  iree_host_size_t total_size = prefix_size + source_span.data_length;
+
+  // Allocate combined [prefix | buffer] memory.
+  // NOTE: we are allocating without initialization here as we will be writing
+  // over all of it.
+  uint8_t* data_ptr = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc_uninitialized(allocator, total_size,
+                                              (void**)&data_ptr));
+
+  // Initialize the prefix buffer handle.
+  iree_vm_buffer_t* buffer = (iree_vm_buffer_t*)data_ptr;
+  memset(data_ptr, 0, prefix_size - sizeof(*buffer));  // padding
+  iree_byte_span_t target_span =
+      iree_make_byte_span(data_ptr + prefix_size, length);
+  iree_vm_buffer_initialize(access, target_span, allocator, buffer);
+
+  // Copy the data from the source buffer.
+  memcpy(target_span.data, source_span.data, target_span.data_length);
+
+  *out_buffer = buffer;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_host_size_t
+iree_vm_buffer_length(const iree_vm_buffer_t* buffer) {
+  IREE_ASSERT_ARGUMENT(buffer);
+  return buffer->data.data_length;
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_buffer_copy_bytes(
+    const iree_vm_buffer_t* source_buffer, iree_host_size_t source_offset,
+    const iree_vm_buffer_t* target_buffer, iree_host_size_t target_offset,
+    iree_host_size_t length) {
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  iree_const_byte_span_t source_span;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_map_ro(source_buffer, source_offset,
+                                             length, 1, &source_span));
+  iree_byte_span_t target_span;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_map_rw(target_buffer, target_offset,
+                                             length, 1, &target_span));
+  memcpy(target_span.data, source_span.data, length);
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_buffer_compare_bytes(
+    const iree_vm_buffer_t* lhs_buffer, iree_host_size_t lhs_offset,
+    const iree_vm_buffer_t* rhs_buffer, iree_host_size_t rhs_offset,
+    iree_host_size_t length, bool* out_result) {
+  IREE_ASSERT_ARGUMENT(lhs_buffer);
+  IREE_ASSERT_ARGUMENT(rhs_buffer);
+  iree_const_byte_span_t lhs_span;
+  IREE_RETURN_IF_ERROR(
+      iree_vm_buffer_map_ro(lhs_buffer, lhs_offset, length, 1, &lhs_span));
+  iree_const_byte_span_t rhs_span;
+  IREE_RETURN_IF_ERROR(
+      iree_vm_buffer_map_ro(rhs_buffer, rhs_offset, length, 1, &rhs_span));
+  *out_result = memcmp(lhs_span.data, rhs_span.data, length) == 0;
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_buffer_fill_bytes(
+    const iree_vm_buffer_t* target_buffer, iree_host_size_t target_offset,
+    iree_host_size_t length, uint8_t value) {
+  return iree_vm_buffer_fill_elements(target_buffer, target_offset, length, 1,
+                                      &value);
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_buffer_fill_elements(
+    const iree_vm_buffer_t* target_buffer, iree_host_size_t target_offset,
+    iree_host_size_t element_count, iree_host_size_t element_length,
+    const void* value) {
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  iree_byte_span_t span;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_map_rw(target_buffer, target_offset,
+                                             element_count * element_length,
+                                             element_length, &span));
+  switch (element_length) {
+    case 1: {
+      const uint8_t pattern_value = *(const uint8_t*)value;
+      memset(span.data, pattern_value, span.data_length);
+    } break;
+    case 2: {
+      const uint16_t pattern_value = *(const uint16_t*)value;
+      uint16_t* target_ptr = (uint16_t*)span.data;
+      for (iree_host_size_t i = 0; i < element_count; ++i) {
+        target_ptr[i] = pattern_value;
+      }
+    } break;
+    case 4: {
+      const uint32_t pattern_value = *(const uint32_t*)value;
+      uint32_t* target_ptr = (uint32_t*)span.data;
+      for (iree_host_size_t i = 0; i < element_count; ++i) {
+        target_ptr[i] = pattern_value;
+      }
+    } break;
+    case 8: {
+      const uint64_t pattern_value = *(const uint64_t*)value;
+      uint64_t* target_ptr = (uint64_t*)span.data;
+      for (iree_host_size_t i = 0; i < element_count; ++i) {
+        target_ptr[i] = pattern_value;
+      }
+    } break;
+    default:
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "invalid element length %d; expected one of [1, 2, 4, 8]",
+          (int)element_length);
+  }
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_buffer_read_elements(
+    const iree_vm_buffer_t* source_buffer, iree_host_size_t source_offset,
+    void* target_ptr, iree_host_size_t element_count,
+    iree_host_size_t element_length) {
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  iree_const_byte_span_t source_span;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_map_ro(source_buffer, source_offset,
+                                             element_count * element_length,
+                                             element_length, &source_span));
+  memcpy(target_ptr, source_span.data, source_span.data_length);
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_buffer_write_elements(
+    const void* source_ptr, const iree_vm_buffer_t* target_buffer,
+    iree_host_size_t target_offset, iree_host_size_t element_count,
+    iree_host_size_t element_length) {
+  IREE_ASSERT_ARGUMENT(source_ptr);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  iree_byte_span_t target_span;
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_map_rw(target_buffer, target_offset,
+                                             element_count * element_length,
+                                             element_length, &target_span));
+  memcpy(target_span.data, source_ptr, target_span.data_length);
+  return iree_ok_status();
+}
+
+iree_status_t iree_vm_buffer_register_types(void) {
+  if (iree_vm_buffer_descriptor.type != IREE_VM_REF_TYPE_NULL) {
+    // Already registered.
+    return iree_ok_status();
+  }
+
+  iree_vm_buffer_descriptor.destroy = iree_vm_buffer_destroy;
+  iree_vm_buffer_descriptor.offsetof_counter =
+      offsetof(iree_vm_buffer_t, ref_object.counter);
+  iree_vm_buffer_descriptor.type_name = iree_make_cstring_view("vm.buffer");
+  return iree_vm_ref_register_type(&iree_vm_buffer_descriptor);
+}
diff --git a/runtime/src/iree/vm/buffer.h b/runtime/src/iree/vm/buffer.h
new file mode 100644
index 0000000..1667e49
--- /dev/null
+++ b/runtime/src/iree/vm/buffer.h
@@ -0,0 +1,191 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_BUFFER_H_
+#define IREE_VM_BUFFER_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/vm/ref.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Describes where a byte buffer originates from, what guarantees can be made
+// about its lifetime and ownership, and how it may be accessed.
+// Note that buffers may always be read.
enum iree_vm_buffer_access_bits_t {
  // The guest is allowed to write to the buffer.
  // If not specified the buffer is read-only.
  IREE_VM_BUFFER_ACCESS_MUTABLE = 1u << 0,

  // Buffer references memory in the module space (rodata or rwdata) that is
  // guaranteed to be live for the lifetime of the module.
  IREE_VM_BUFFER_ACCESS_ORIGIN_MODULE = 1u << 1,
  // Buffer references memory created by the guest module code. It has a
  // lifetime less than that of the module but is always tracked with proper
  // references (a handle existing to the memory implies it is valid).
  IREE_VM_BUFFER_ACCESS_ORIGIN_GUEST = 1u << 2,
  // Buffer references external host memory with an unknown lifetime.
  IREE_VM_BUFFER_ACCESS_ORIGIN_HOST = 1u << 3,
};
// Bitmask storage type for iree_vm_buffer_access_bits_t values.
typedef uint32_t iree_vm_buffer_access_t;
+
+// A simple byte range with options for ownership and wrapping semantics.
+// The access flags indicate what access is allowed from the VM.
+// Buffers are fixed-length and may only contain primitive values.
+// For resizable lists with mixed element types and ref objects use
+// iree_vm_list_t.
+//
+// Note that because buffers are just bags of bytes endianness issues are very
+// likely depending on usage. In general IREE takes the stance that
+// little-endian is all that is practically relevant nowadays and big-endian
+// targets will need their own modules compiled with such a setting. This is to
+// avoid the significant amount of work trying to ensure cross-endian
+// correctness in things like packed .rodata, cross-device switching (host in
+// a different endianness than HAL device), etc.
+//
+// For stack-allocated buffers setup with iree_vm_buffer_initialize the
+// allocator provided will be used to free the data when the buffer is
+// deinitialized. It may be iree_allocator_null to indicate the data is unowned.
+//
+// For heap-allocated buffers created with iree_vm_buffer_create/clone/etc the
+// allocator is used to free the entire iree_vm_buffer_t and the co-allocated
+// buffer data that lives after it in memory.
typedef struct iree_vm_buffer_t {
  iree_vm_ref_object_t ref_object;  // ref-counting bookkeeping (iree/vm/ref.h)
  iree_vm_buffer_access_t access;   // allowed access (mutable/origin bits)
  iree_byte_span_t data;            // user-visible byte storage
  iree_allocator_t allocator;       // used to free the data (see notes above)
} iree_vm_buffer_t;
+
+// Initializes a buffer in-place with the given byte contents.
+// This can be used to avoid buffer allocation overhead when wrapping existing
+// buffers for API interop but buffer lifetime must be observed carefully by
+// the caller.
+//
+// Some systems may assume that the data is aligned to at least the natural
+// word size of the machine. If possible align to iree_max_align_t.
+//
+// |data| will be freed with |allocator| when the buffer is deinitialized.
+// If the data is not owned then iree_allocator_null can be used to no-op the
+// free.
+//
+// |access| can be used to control who (guest, host, etc) and how (read/write)
+// the buffer may be accessed. If the allocation being wrapped has its own
+// access requirements (read-only, etc) the caller must specify those flags.
+IREE_API_EXPORT void iree_vm_buffer_initialize(iree_vm_buffer_access_t access,
+                                               iree_byte_span_t data,
+                                               iree_allocator_t allocator,
+                                               iree_vm_buffer_t* out_buffer);
+
+// Deinitializes a buffer previously initialized in-place with
+// iree_vm_buffer_initialize. Invalid to call on a buffer that was allocated
+// on the heap via iree_vm_buffer_create. Aborts if there are still references
+// remaining.
+IREE_API_EXPORT void iree_vm_buffer_deinitialize(iree_vm_buffer_t* buffer);
+
+// Creates a new zero-initialized buffer of the given byte |length|.
+// The underlying storage buffer may be allocated larger to ensure alignment.
+// The allocated data will be aligned to iree_max_align_t.
+//
+// |access| can be used to control who (guest, host, etc) and how (read/write)
+// the buffer may be accessed.
+IREE_API_EXPORT iree_status_t iree_vm_buffer_create(
+    iree_vm_buffer_access_t access, iree_host_size_t length,
+    iree_allocator_t allocator, iree_vm_buffer_t** out_buffer);
+
+// Retains the given |buffer| for the caller.
+IREE_API_EXPORT void iree_vm_buffer_retain(iree_vm_buffer_t* buffer);
+
+// Releases the given |buffer| from the caller.
+IREE_API_EXPORT void iree_vm_buffer_release(iree_vm_buffer_t* buffer);
+
+// Clones a range of bytes in |source| to a new buffer.
+// The allocated data will be aligned to iree_max_align_t.
+//
+// |access| can be used to control who (guest, host, etc) and how (read/write)
+// the buffer may be accessed. As this returns a newly allocated buffer the
+// new access may be more permissive than the source buffer.
+IREE_API_EXPORT iree_status_t iree_vm_buffer_clone(
+    iree_vm_buffer_access_t access, const iree_vm_buffer_t* source_buffer,
+    iree_host_size_t source_offset, iree_host_size_t length,
+    iree_allocator_t allocator, iree_vm_buffer_t** out_buffer);
+
+// Returns the user-visible length of the buffer in bytes.
+IREE_API_EXPORT iree_host_size_t
+iree_vm_buffer_length(const iree_vm_buffer_t* buffer);
+
+// Returns the underlying data storage for the buffer.
+// WARNING: this performs no validation of the access allowance on the buffer
+// and the caller is responsible for all range checking. Use with caution and
+// prefer the utility methods instead.
+IREE_API_EXPORT iree_byte_span_t
+iree_vm_buffer_data(const iree_vm_buffer_t* buffer);
+
+// Copies a byte range from |source_buffer| to |target_buffer|.
+IREE_API_EXPORT iree_status_t iree_vm_buffer_copy_bytes(
+    const iree_vm_buffer_t* source_buffer, iree_host_size_t source_offset,
+    const iree_vm_buffer_t* target_buffer, iree_host_size_t target_offset,
+    iree_host_size_t length);
+
+// Compares |lhs_buffer| to |rhs_buffer| for bitwise equality.
+// |out_result| will receive 1 if the byte ranges are equal and 0 otherwise.
+IREE_API_EXPORT iree_status_t iree_vm_buffer_compare_bytes(
+    const iree_vm_buffer_t* lhs_buffer, iree_host_size_t lhs_offset,
+    const iree_vm_buffer_t* rhs_buffer, iree_host_size_t rhs_offset,
+    iree_host_size_t length, bool* out_result);
+
+// Fills a byte range of |target_buffer| with the byte pattern.
+IREE_API_EXPORT iree_status_t iree_vm_buffer_fill_bytes(
+    const iree_vm_buffer_t* target_buffer, iree_host_size_t target_offset,
+    iree_host_size_t length, uint8_t value);
+
+// Fills an element range of |buffer| with the given pattern.
+// Only |pattern_length| values with 1, 2, 4, or 8 bytes are supported.
+// The |target_offset|, in bytes, must match the alignment of the pattern.
+IREE_API_EXPORT iree_status_t iree_vm_buffer_fill_elements(
+    const iree_vm_buffer_t* target_buffer, iree_host_size_t target_offset,
+    iree_host_size_t element_count, iree_host_size_t element_length,
+    const void* value);
+
+// Reads |element_count| elements each of |element_length| bytes from the
+// |source_buffer| into |out_target_ptr|. The |source_offset|, in bytes, must be
+// aligned to at least the |element_length|.
+// This routine performs checks on bounds, alignment, and access rights.
+IREE_API_EXPORT iree_status_t iree_vm_buffer_read_elements(
+    const iree_vm_buffer_t* source_buffer, iree_host_size_t source_offset,
+    void* target_ptr, iree_host_size_t element_count,
+    iree_host_size_t element_length);
+
+// Writes |element_count| elements each of |element_length| bytes to the
+// |target_buffer| from |source_ptr|. The |target_offset|, in bytes, must be
+// aligned to at least the |element_length|.
+// This routine performs checks on bounds, alignment, and access rights.
+IREE_API_EXPORT iree_status_t iree_vm_buffer_write_elements(
+    const void* source_ptr, const iree_vm_buffer_t* target_buffer,
+    iree_host_size_t target_offset, iree_host_size_t element_count,
+    iree_host_size_t element_length);
+
+// Returns the a string view referencing the given |value| buffer.
+// The returned view will only be valid for as long as the buffer is live.
+static inline iree_string_view_t iree_vm_buffer_as_string(
+    const iree_vm_buffer_t* value) {
+  return value ? iree_make_string_view((const char*)value->data.data,
+                                       value->data.data_length)
+               : iree_string_view_empty();
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_vm_buffer, iree_vm_buffer_t);
+
+#endif  // IREE_VM_BUFFER_H_
diff --git a/runtime/src/iree/vm/buffer_test.cc b/runtime/src/iree/vm/buffer_test.cc
new file mode 100644
index 0000000..f7b3029
--- /dev/null
+++ b/runtime/src/iree/vm/buffer_test.cc
@@ -0,0 +1,51 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/buffer.h"
+
+#include <cstddef>
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/vm/builtin_types.h"
+
+namespace {
+
// Test fixture that registers the builtin VM types (vm.buffer et al) once
// before any test in the suite runs; registration is safe to repeat.
class VMBufferTest : public ::testing::Test {
 protected:
  static void SetUpTestSuite() {
    IREE_CHECK_OK(iree_vm_register_builtin_types());
  }
};
+
// Tests that the data allocator is correctly called when using stack
// initialization of a buffer.
TEST_F(VMBufferTest, Initialize) {
  bool did_free = false;
  // Allocator whose control function flips |did_free| (threaded through
  // |self|) when it receives a FREE command; all other commands are no-ops.
  iree_allocator_t test_allocator = {
      /*.self=*/&did_free,
      /*.ctl=*/
      +[](void* self, iree_allocator_command_t command, const void* params,
          void** inout_ptr) {
        if (command == IREE_ALLOCATOR_COMMAND_FREE) {
          *(bool*)self = true;
        }
        return iree_ok_status();
      },
  };

  // Wrap a stack-resident array; the buffer takes the allocator for cleanup.
  uint32_t data[] = {0, 1, 2, 3};
  iree_vm_buffer_t buffer;
  iree_vm_buffer_initialize(
      IREE_VM_BUFFER_ACCESS_MUTABLE | IREE_VM_BUFFER_ACCESS_ORIGIN_HOST,
      iree_make_byte_span(data, sizeof(data)), test_allocator, &buffer);

  // The data must only be freed at deinitialize time, not before.
  ASSERT_FALSE(did_free);
  iree_vm_buffer_deinitialize(&buffer);
  ASSERT_TRUE(did_free);
}
+
+}  // namespace
diff --git a/runtime/src/iree/vm/builtin_types.c b/runtime/src/iree/vm/builtin_types.c
new file mode 100644
index 0000000..6e133ae
--- /dev/null
+++ b/runtime/src/iree/vm/builtin_types.c
@@ -0,0 +1,16 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/builtin_types.h"
+
+iree_status_t iree_vm_buffer_register_types(void);
+iree_status_t iree_vm_list_register_types(void);
+
+IREE_API_EXPORT iree_status_t iree_vm_register_builtin_types(void) {
+  IREE_RETURN_IF_ERROR(iree_vm_buffer_register_types());
+  IREE_RETURN_IF_ERROR(iree_vm_list_register_types());
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree/vm/builtin_types.h b/runtime/src/iree/vm/builtin_types.h
new file mode 100644
index 0000000..b3e6890
--- /dev/null
+++ b/runtime/src/iree/vm/builtin_types.h
@@ -0,0 +1,24 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_BUILTIN_TYPES_H_
+#define IREE_VM_BUILTIN_TYPES_H_
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Registers the builtin VM types. This must be called on startup. Safe to call
+// multiple times.
+IREE_API_EXPORT iree_status_t iree_vm_register_builtin_types(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_BUILTIN_TYPES_H_
diff --git a/runtime/src/iree/vm/bytecode_disasm.c b/runtime/src/iree/vm/bytecode_disasm.c
new file mode 100644
index 0000000..f44c230
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_disasm.c
@@ -0,0 +1,2249 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/bytecode_disasm.h"
+
+#include <inttypes.h>
+
+#include "iree/base/config.h"
+#include "iree/vm/ops.h"
+
// Opens a nested switch over the sub-opcode byte for a prefixed opcode:
// consumes the prefix byte (advancing pc) and dispatches on the next byte.
#define BEGIN_DISASM_PREFIX(op_name, ext) \
  case IREE_VM_OP_CORE_##op_name: {       \
    switch (bytecode_data[pc++]) {
// Closes a BEGIN_DISASM_PREFIX switch, failing on unknown sub-opcodes.
#define END_DISASM_PREFIX()                            \
  default:                                             \
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED, \
                            "unhandled ext opcode");   \
    }                                                  \
    break;                                             \
    }
// Case for a prefixed opcode whose dispatch extension is unsupported here.
#define UNHANDLED_DISASM_PREFIX(op_name, ext)                      \
  case IREE_VM_OP_CORE_##op_name: {                                \
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,             \
                            "unhandled dispatch extension " #ext); \
  }

// Case label for a single opcode within the active prefix scope.
#define DISASM_OP(ext, op_name) case IREE_VM_OP_##ext##_##op_name:
+
+#define VM_ParseConstI8(name) \
+  OP_I8(0);                   \
+  ++pc;
+#define VM_ParseConstI32(name) \
+  OP_I32(0);                   \
+  pc += 4;
+#define VM_ParseConstI64(name) \
+  OP_I64(0);                   \
+  pc += 8;
+#define VM_ParseConstF32(name) \
+  OP_F32(0);                   \
+  pc += 4;
+#define VM_ParseConstF64(name) \
+  OP_F64(0);                   \
+  pc += 8;
+#define VM_ParseOpcode(opcode) VM_ParseConstI8(#opcode)
+#define VM_ParseFuncAttr(name) VM_ParseConstI32(name)
+#define VM_ParseGlobalAttr(name) VM_ParseConstI32(name)
+#define VM_ParseRodataAttr(name) VM_ParseConstI32(name)
+#define VM_ParseType(name)             \
+  iree_vm_map_type(module, OP_I32(0)); \
+  pc += 4;
+#define VM_ParseTypeOf(name) VM_ParseType(name)
+#define VM_ParseIntAttr32(name) VM_ParseConstI32(name)
+#define VM_ParseIntAttr64(name) VM_ParseConstI64(name)
+#define VM_ParseFloatAttr32(name) VM_ParseConstF32(name)
+#define VM_ParseFloatAttr64(name) VM_ParseConstF64(name)
+#define VM_ParseStrAttr(name, out_str)                   \
+  (out_str)->size = (iree_host_size_t)OP_I16(0);         \
+  (out_str)->data = (const char*)&bytecode_data[pc + 2]; \
+  pc += 2 + (out_str)->size;
// Parses a branch target (block ordinal) encoded as a 4-byte constant.
// Fix: forward the macro's own parameter |block_name| rather than the
// undeclared token |name| — harmless today only because VM_ParseConstI32
// ignores its argument, but a latent break if that ever changes.
#define VM_ParseBranchTarget(block_name) VM_ParseConstI32(block_name)
+#define VM_ParseBranchOperands(operands_name) \
+  VM_DecBranchOperandsImpl(bytecode_data, &pc)
+#define VM_ParseOperandRegI32(name) \
+  OP_I16(0) & regs->i32_mask;       \
+  pc += kRegSize;
+#define VM_ParseOperandRegI64(name)  \
+  OP_I16(0) & (regs->i32_mask & ~1); \
+  pc += kRegSize;
+#define VM_ParseOperandRegF32(name) \
+  OP_I16(0) & regs->i32_mask;       \
+  pc += kRegSize;
+#define VM_ParseOperandRegF64(name)  \
+  OP_I16(0) & (regs->i32_mask & ~1); \
+  pc += kRegSize;
+#define VM_ParseOperandRegRef(name, out_is_move)                    \
+  OP_I16(0) & regs->ref_mask;                                       \
+  *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
+  pc += kRegSize;
+#define VM_ParseVariadicOperands(name) \
+  VM_DecVariadicOperandsImpl(bytecode_data, &pc)
+#define VM_ParseResultRegI32(name) \
+  OP_I16(0) & regs->i32_mask;      \
+  pc += kRegSize;
+#define VM_ParseResultRegI64(name)   \
+  OP_I16(0) & (regs->i32_mask & ~1); \
+  pc += kRegSize;
+#define VM_ParseResultRegF32(name) \
+  OP_I16(0) & regs->i32_mask;      \
+  pc += kRegSize;
+#define VM_ParseResultRegF64(name)   \
+  OP_I16(0) & (regs->i32_mask & ~1); \
+  pc += kRegSize;
+#define VM_ParseResultRegRef(name, out_is_move)                     \
+  OP_I16(0) & regs->ref_mask;                                       \
+  *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
+  pc += kRegSize;
+#define VM_ParseVariadicResults(name) VM_ParseVariadicOperands(name)
+
+#define EMIT_REG_NAME(reg)                \
+  if ((reg)&IREE_REF_REGISTER_TYPE_BIT) { \
+    EMIT_REF_REG_NAME(reg);               \
+  } else {                                \
+    EMIT_I32_REG_NAME(reg);               \
+  }
+#define EMIT_I32_REG_NAME(reg)                            \
+  IREE_RETURN_IF_ERROR(iree_string_builder_append_format( \
+      b, "%%i%u", ((reg)&IREE_I32_REGISTER_MASK)));
+#define EMIT_I64_REG_NAME(reg)                            \
+  IREE_RETURN_IF_ERROR(iree_string_builder_append_format( \
+      b, "%%i%u:%u", ((reg)&IREE_I32_REGISTER_MASK),      \
+      ((reg)&IREE_I32_REGISTER_MASK) + 1));
+#define EMIT_F32_REG_NAME(reg) EMIT_I32_REG_NAME(reg)
+#define EMIT_REF_REG_NAME(reg)                            \
+  IREE_RETURN_IF_ERROR(iree_string_builder_append_format( \
+      b, "%%r%u", ((reg)&IREE_REF_REGISTER_MASK)));
+
+#define EMIT_REG_VALUE(regs, reg)                                           \
+  if ((reg)&IREE_REF_REGISTER_TYPE_BIT) {                                   \
+    iree_vm_ref_t* ref = &(regs)->ref[(reg)&IREE_REF_REGISTER_MASK];        \
+    if (iree_vm_ref_is_null(ref)) {                                         \
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "null"));  \
+    } else {                                                                \
+      iree_string_view_t type_name = iree_vm_ref_type_name(ref->type);      \
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(               \
+          b, "!%.*s/0x%p", (int)type_name.size, type_name.data, ref->ptr)); \
+    }                                                                       \
+  } else {                                                                  \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_format(                 \
+        b, "%u", ((regs)->i32[(reg)&IREE_I32_REGISTER_MASK])));             \
+  }
+
+static iree_status_t iree_vm_bytecode_disasm_emit_type_name(
+    const iree_vm_type_def_t* type_def, iree_string_builder_t* b) {
+  if (iree_vm_type_def_is_value(type_def)) {
+    const char* type_name;
+    switch (type_def->value_type) {
+      case IREE_VM_VALUE_TYPE_I8:
+        type_name = "i8";
+        break;
+      case IREE_VM_VALUE_TYPE_I16:
+        type_name = "i16";
+        break;
+      case IREE_VM_VALUE_TYPE_I32:
+        type_name = "i32";
+        break;
+      case IREE_VM_VALUE_TYPE_I64:
+        type_name = "i64";
+        break;
+      case IREE_VM_VALUE_TYPE_F32:
+        type_name = "f32";
+        break;
+      case IREE_VM_VALUE_TYPE_F64:
+        type_name = "f64";
+        break;
+      default:
+        type_name = "unknown";
+        break;
+    }
+    return iree_string_builder_append_cstring(b, type_name);
+  } else if (iree_vm_type_def_is_ref(type_def)) {
+    iree_string_view_t type_name = iree_vm_ref_type_name(type_def->ref_type);
+    return iree_string_builder_append_format(b, "%.*s", (int)type_name.size,
+                                             type_name.data);
+  } else {
+    return iree_string_builder_append_cstring(b, "*");
+  }
+}
+#define EMIT_TYPE_NAME(type_def) \
+  iree_vm_bytecode_disasm_emit_type_name(type_def, b);
+
+static iree_status_t iree_vm_bytecode_disasm_emit_operand_list(
+    const iree_vm_registers_t* regs, const iree_vm_register_list_t* list,
+    iree_vm_bytecode_disasm_format_t format, iree_string_builder_t* b) {
+  bool include_values =
+      regs && (format & IREE_VM_BYTECODE_DISASM_FORMAT_INLINE_VALUES);
+  for (uint16_t i = 0; i < list->size; ++i) {
+    if (i > 0) {
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+    }
+    uint16_t reg = list->registers[i];
+    EMIT_REG_NAME(reg);
+    if (include_values) {
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "("));
+      EMIT_REG_VALUE(regs, reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+    }
+  }
+  return iree_ok_status();
+}
+#define EMIT_OPERAND_REG_LIST(reg_list) \
+  iree_vm_bytecode_disasm_emit_operand_list(regs, reg_list, format, b)
+static iree_status_t iree_vm_bytecode_disasm_emit_result_list(
+    const iree_vm_register_list_t* list,
+    iree_vm_bytecode_disasm_format_t format, iree_string_builder_t* b) {
+  for (uint16_t i = 0; i < list->size; ++i) {
+    if (i > 0) {
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+    }
+    uint16_t reg = list->registers[i];
+    EMIT_REG_NAME(reg);
+  }
+  return iree_ok_status();
+}
+#define EMIT_RESULT_REG_LIST(reg_list) \
+  iree_vm_bytecode_disasm_emit_result_list(reg_list, format, b)
// Appends "src->dst" register pairs from |remap_list| to |b|, optionally
// including each source register's current value in parentheses when |regs|
// is available and the format requests inline values.
static iree_status_t iree_vm_bytecode_disasm_emit_remap_list(
    const iree_vm_registers_t* regs,
    const iree_vm_register_remap_list_t* remap_list,
    iree_vm_bytecode_disasm_format_t format, iree_string_builder_t* b) {
  bool include_values =
      regs && (format & IREE_VM_BYTECODE_DISASM_FORMAT_INLINE_VALUES);
  for (uint16_t i = 0; i < remap_list->size; ++i) {
    if (i > 0) {
      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
    }
    EMIT_REG_NAME(remap_list->pairs[i].src_reg);
    if (include_values) {
      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "("));
      EMIT_REG_VALUE(regs, remap_list->pairs[i].src_reg);
      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
    }
    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "->"));
    EMIT_REG_NAME(remap_list->pairs[i].dst_reg);
  }
  return iree_ok_status();
}
+#define EMIT_REMAP_LIST(remap_list) \
+  iree_vm_bytecode_disasm_emit_remap_list(regs, remap_list, format, b)
+
+#define EMIT_OPTIONAL_VALUE_I32(expr)                                          \
+  if (regs && (format & IREE_VM_BYTECODE_DISASM_FORMAT_INLINE_VALUES)) {       \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_format(b, "(%" PRId32 ")", \
+                                                           (int32_t)(expr)));  \
+  }
+#define EMIT_OPTIONAL_VALUE_I64(expr)                                    \
+  if (regs && (format & IREE_VM_BYTECODE_DISASM_FORMAT_INLINE_VALUES)) { \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_format(              \
+        b, "(%" PRId64 ")", *(int64_t*)&(expr)));                        \
+  }
+#define EMIT_OPTIONAL_VALUE_F32(expr)                                    \
+  if (regs && (format & IREE_VM_BYTECODE_DISASM_FORMAT_INLINE_VALUES)) { \
+    IREE_RETURN_IF_ERROR(                                                \
+        iree_string_builder_append_format(b, "(%f)", *(float*)&(expr))); \
+  }
+#define EMIT_OPTIONAL_VALUE_F64(expr)                                     \
+  if (regs && (format & IREE_VM_BYTECODE_DISASM_FORMAT_INLINE_VALUES)) {  \
+    IREE_RETURN_IF_ERROR(                                                 \
+        iree_string_builder_append_format(b, "(%f)", *(double*)&(expr))); \
+  }
+#define EMIT_OPTIONAL_VALUE_REF(expr)                                         \
+  if (regs && (format & IREE_VM_BYTECODE_DISASM_FORMAT_INLINE_VALUES)) {      \
+    iree_vm_ref_t* ref = (expr);                                              \
+    if (iree_vm_ref_is_null(ref)) {                                           \
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "(null)"));  \
+    } else {                                                                  \
+      iree_string_view_t type_name = iree_vm_ref_type_name(ref->type);        \
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(                 \
+          b, "(!%.*s/0x%p)", (int)type_name.size, type_name.data, ref->ptr)); \
+    }                                                                         \
+  }
+
+#define DISASM_OP_CORE_UNARY_I32(op_name, op_mnemonic)                \
+  DISASM_OP(CORE, op_name) {                                          \
+    uint16_t operand_reg = VM_ParseOperandRegI32("operand");          \
+    uint16_t result_reg = VM_ParseResultRegI32("result");             \
+    EMIT_I32_REG_NAME(result_reg);                                    \
+    IREE_RETURN_IF_ERROR(                                             \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic)); \
+    EMIT_I32_REG_NAME(operand_reg);                                   \
+    EMIT_OPTIONAL_VALUE_I32(regs->i32[operand_reg]);                  \
+    break;                                                            \
+  }
+
+#define DISASM_OP_CORE_BINARY_I32(op_name, op_mnemonic)                \
+  DISASM_OP(CORE, op_name) {                                           \
+    uint16_t lhs_reg = VM_ParseOperandRegI32("lhs");                   \
+    uint16_t rhs_reg = VM_ParseOperandRegI32("rhs");                   \
+    uint16_t result_reg = VM_ParseResultRegI32("result");              \
+    EMIT_I32_REG_NAME(result_reg);                                     \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_I32_REG_NAME(lhs_reg);                                        \
+    EMIT_OPTIONAL_VALUE_I32(regs->i32[lhs_reg]);                       \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_I32_REG_NAME(rhs_reg);                                        \
+    EMIT_OPTIONAL_VALUE_I32(regs->i32[rhs_reg]);                       \
+    break;                                                             \
+  }
+
+#define DISASM_OP_CORE_TERNARY_I32(op_name, op_mnemonic)               \
+  DISASM_OP(CORE, op_name) {                                           \
+    uint16_t a_reg = VM_ParseOperandRegI32("a");                       \
+    uint16_t b_reg = VM_ParseOperandRegI32("b");                       \
+    uint16_t c_reg = VM_ParseOperandRegI32("c");                       \
+    uint16_t result_reg = VM_ParseResultRegI32("result");              \
+    EMIT_I32_REG_NAME(result_reg);                                     \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_I32_REG_NAME(a_reg);                                          \
+    EMIT_OPTIONAL_VALUE_I32(regs->i32[a_reg]);                         \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_I32_REG_NAME(b_reg);                                          \
+    EMIT_OPTIONAL_VALUE_I32(regs->i32[b_reg]);                         \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_I32_REG_NAME(c_reg);                                          \
+    EMIT_OPTIONAL_VALUE_I32(regs->i32[c_reg]);                         \
+    break;                                                             \
+  }
+
+#define DISASM_OP_EXT_I64_UNARY_I64(op_name, op_mnemonic)             \
+  DISASM_OP(EXT_I64, op_name) {                                       \
+    uint16_t operand_reg = VM_ParseOperandRegI64("operand");          \
+    uint16_t result_reg = VM_ParseResultRegI64("result");             \
+    EMIT_I64_REG_NAME(result_reg);                                    \
+    IREE_RETURN_IF_ERROR(                                             \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic)); \
+    EMIT_I64_REG_NAME(operand_reg);                                   \
+    EMIT_OPTIONAL_VALUE_I64(regs->i32[operand_reg]);                  \
+    break;                                                            \
+  }
+
+#define DISASM_OP_EXT_I64_BINARY_I64(op_name, op_mnemonic)             \
+  DISASM_OP(EXT_I64, op_name) {                                        \
+    uint16_t lhs_reg = VM_ParseOperandRegI64("lhs");                   \
+    uint16_t rhs_reg = VM_ParseOperandRegI64("rhs");                   \
+    uint16_t result_reg = VM_ParseResultRegI64("result");              \
+    EMIT_I64_REG_NAME(result_reg);                                     \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_I64_REG_NAME(lhs_reg);                                        \
+    EMIT_OPTIONAL_VALUE_I64(regs->i32[lhs_reg]);                       \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_I64_REG_NAME(rhs_reg);                                        \
+    EMIT_OPTIONAL_VALUE_I64(regs->i32[rhs_reg]);                       \
+    break;                                                             \
+  }
+
+#define DISASM_OP_EXT_I64_TERNARY_I64(op_name, op_mnemonic)            \
+  DISASM_OP(EXT_I64, op_name) {                                        \
+    uint16_t a_reg = VM_ParseOperandRegI64("a");                       \
+    uint16_t b_reg = VM_ParseOperandRegI64("b");                       \
+    uint16_t c_reg = VM_ParseOperandRegI64("c");                       \
+    uint16_t result_reg = VM_ParseResultRegI64("result");              \
+    EMIT_I64_REG_NAME(result_reg);                                     \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_I64_REG_NAME(a_reg);                                          \
+    EMIT_OPTIONAL_VALUE_I64(regs->i32[a_reg]);                         \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_I64_REG_NAME(b_reg);                                          \
+    EMIT_OPTIONAL_VALUE_I64(regs->i32[b_reg]);                         \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_I64_REG_NAME(c_reg);                                          \
+    EMIT_OPTIONAL_VALUE_I64(regs->i32[c_reg]);                         \
+    break;                                                             \
+  }
+
+#define DISASM_OP_EXT_F32_UNARY_F32(op_name, op_mnemonic)             \
+  DISASM_OP(EXT_F32, op_name) {                                       \
+    uint16_t operand_reg = VM_ParseOperandRegF32("operand");          \
+    uint16_t result_reg = VM_ParseResultRegF32("result");             \
+    EMIT_F32_REG_NAME(result_reg);                                    \
+    IREE_RETURN_IF_ERROR(                                             \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic)); \
+    EMIT_F32_REG_NAME(operand_reg);                                   \
+    EMIT_OPTIONAL_VALUE_F32(regs->i32[operand_reg]);                  \
+    break;                                                            \
+  }
+
+// Expands to a disassembler case arm for an f32 binary op, formatting
+// "<result> = <mnemonic> <lhs>, <rhs>" into string builder |b|.
+#define DISASM_OP_EXT_F32_BINARY_F32(op_name, op_mnemonic)             \
+  DISASM_OP(EXT_F32, op_name) {                                        \
+    uint16_t x_reg = VM_ParseOperandRegF32("lhs");                     \
+    uint16_t y_reg = VM_ParseOperandRegF32("rhs");                     \
+    uint16_t dst_reg = VM_ParseResultRegF32("result");                 \
+    EMIT_F32_REG_NAME(dst_reg);                                        \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_F32_REG_NAME(x_reg);                                          \
+    EMIT_OPTIONAL_VALUE_F32(regs->i32[x_reg]);                         \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_F32_REG_NAME(y_reg);                                          \
+    EMIT_OPTIONAL_VALUE_F32(regs->i32[y_reg]);                         \
+    break;                                                             \
+  }
+
+// Expands to a disassembler case arm for an f32 ternary op, formatting
+// "<result> = <mnemonic> <a>, <b>, <c>" into string builder |b|.
+#define DISASM_OP_EXT_F32_TERNARY_F32(op_name, op_mnemonic)            \
+  DISASM_OP(EXT_F32, op_name) {                                        \
+    uint16_t r0 = VM_ParseOperandRegF32("a");                          \
+    uint16_t r1 = VM_ParseOperandRegF32("b");                          \
+    uint16_t r2 = VM_ParseOperandRegF32("c");                          \
+    uint16_t dst_reg = VM_ParseResultRegF32("result");                 \
+    EMIT_F32_REG_NAME(dst_reg);                                        \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_F32_REG_NAME(r0);                                             \
+    EMIT_OPTIONAL_VALUE_F32(regs->i32[r0]);                            \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_F32_REG_NAME(r1);                                             \
+    EMIT_OPTIONAL_VALUE_F32(regs->i32[r1]);                            \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_F32_REG_NAME(r2);                                             \
+    EMIT_OPTIONAL_VALUE_F32(regs->i32[r2]);                            \
+    break;                                                             \
+  }
+
+iree_status_t iree_vm_bytecode_disasm_op(
+    iree_vm_bytecode_module_t* module,
+    iree_vm_bytecode_module_state_t* module_state, uint16_t function_ordinal,
+    iree_vm_source_offset_t pc, const iree_vm_registers_t* regs,
+    iree_vm_bytecode_disasm_format_t format, iree_string_builder_t* b) {
+  const uint8_t* IREE_RESTRICT bytecode_data =
+      module->bytecode_data.data +
+      module->function_descriptor_table[function_ordinal].bytecode_offset;
+
+  switch (bytecode_data[pc++]) {
+    //===------------------------------------------------------------------===//
+    // Globals
+    //===------------------------------------------------------------------===//
+
+    DISASM_OP(CORE, GlobalLoadI32) {
+      uint32_t byte_offset = VM_ParseGlobalAttr("global");
+      uint16_t value_reg = VM_ParseResultRegI32("value");
+      EMIT_I32_REG_NAME(value_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, " = vm.global.load.i32 .rwdata[%u]", byte_offset));
+      EMIT_OPTIONAL_VALUE_I32(
+          vm_global_load_i32(module_state->rwdata_storage.data, byte_offset));
+      break;
+    }
+
+    DISASM_OP(CORE, GlobalStoreI32) {
+      uint32_t byte_offset = VM_ParseGlobalAttr("global");
+      uint16_t value_reg = VM_ParseOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, "vm.global.store.i32 "));
+      EMIT_I32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, ", .rwdata[%u]", byte_offset));
+      break;
+    }
+
+    DISASM_OP(CORE, GlobalLoadIndirectI32) {
+      uint16_t byte_offset_reg = VM_ParseOperandRegI32("global");
+      uint16_t value_reg = VM_ParseResultRegI32("value");
+      EMIT_I32_REG_NAME(value_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(
+          b, " = vm.global.load.indirect.i32 .rwdata["));
+      EMIT_I32_REG_NAME(byte_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[byte_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "]"));
+      EMIT_OPTIONAL_VALUE_I32(vm_global_load_i32(
+          module_state->rwdata_storage.data, regs->i32[byte_offset_reg]));
+      break;
+    }
+
+    DISASM_OP(CORE, GlobalStoreIndirectI32) {
+      uint16_t byte_offset_reg = VM_ParseOperandRegI32("global");
+      uint16_t value_reg = VM_ParseOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(
+          b, "vm.global.store.indirect.i32 "));
+      EMIT_I32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", .rwdata["));
+      EMIT_I32_REG_NAME(byte_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[byte_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "]"));
+      break;
+    }
+
+    DISASM_OP(CORE, GlobalLoadRef) {
+      uint32_t global = VM_ParseGlobalAttr("global");
+      const iree_vm_type_def_t* type_def = VM_ParseTypeOf("value");
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("value", &result_is_move);
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, " = vm.global.load.ref .refs[%u]", global));
+      EMIT_OPTIONAL_VALUE_REF(&module_state->global_ref_table[global]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " : !"));
+      EMIT_TYPE_NAME(type_def);
+      break;
+    }
+
+    DISASM_OP(CORE, GlobalStoreRef) {
+      uint32_t global = VM_ParseGlobalAttr("global");
+      const iree_vm_type_def_t* type_def = VM_ParseTypeOf("value");
+      bool value_is_move;
+      uint16_t value_reg = VM_ParseOperandRegRef("value", &value_is_move);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.global.store.ref "));
+      EMIT_REF_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[value_reg]);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, ", .refs[%u] : !", global));
+      EMIT_TYPE_NAME(type_def);
+      break;
+    }
+
+    DISASM_OP(CORE, GlobalLoadIndirectRef) {
+      uint16_t global_reg = VM_ParseOperandRegI32("global");
+      const iree_vm_type_def_t* type_def = VM_ParseTypeOf("value");
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("value", &result_is_move);
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(
+          b, " = vm.global.load.indirect.ref .refs["));
+      EMIT_I32_REG_NAME(global_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[global_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "]"));
+      EMIT_OPTIONAL_VALUE_REF(
+          &module_state->global_ref_table[regs->i32[global_reg]]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " : !"));
+      EMIT_TYPE_NAME(type_def);
+      break;
+    }
+
+    DISASM_OP(CORE, GlobalStoreIndirectRef) {
+      uint16_t global_reg = VM_ParseOperandRegI32("global");
+      const iree_vm_type_def_t* type_def = VM_ParseTypeOf("value");
+      bool value_is_move;
+      uint16_t value_reg = VM_ParseOperandRegRef("value", &value_is_move);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, "vm.global.store.indirect.ref "));
+      EMIT_REF_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(b, ", .refs["));
+      EMIT_I32_REG_NAME(global_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[global_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(b, "] : !"));
+      EMIT_TYPE_NAME(type_def);
+      break;
+    }
+
+    //===------------------------------------------------------------------===//
+    // Constants
+    //===------------------------------------------------------------------===//
+
+    DISASM_OP(CORE, ConstI32) {
+      int32_t value = VM_ParseIntAttr32("value");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, " = vm.const.i32 %d  // 0x%08X", value, value));
+      break;
+    }
+
+    DISASM_OP(CORE, ConstI32Zero) {
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.const.i32.zero"));
+      break;
+    }
+
+    DISASM_OP(CORE, ConstRefZero) {
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("result", &result_is_move);
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.const.ref.zero"));
+      break;
+    }
+
+    DISASM_OP(CORE, ConstRefRodata) {
+      uint32_t rodata_ordinal = VM_ParseRodataAttr("rodata");
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("value", &result_is_move);
+      iree_vm_buffer_t* buffer =
+          &module_state->rodata_ref_table[rodata_ordinal];
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, " = vm.const.ref.rodata %u  // 0x%p %" PRIhsz "b", rodata_ordinal,
+          buffer->data.data, buffer->data.data_length));
+      break;
+    }
+
+    //===------------------------------------------------------------------===//
+    // Buffers
+    //===------------------------------------------------------------------===//
+
+    DISASM_OP(CORE, BufferAlloc) {
+      uint16_t length_reg = VM_ParseOperandRegI32("length");
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("result", &result_is_move);
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.alloc "));
+      EMIT_I32_REG_NAME(length_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[length_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, BufferClone) {
+      bool source_is_move;
+      uint16_t source_reg = VM_ParseOperandRegRef("source", &source_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("offset");
+      uint16_t length_reg = VM_ParseOperandRegI32("length");
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("result", &result_is_move);
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.clone "));
+      EMIT_REF_REG_NAME(source_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[source_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(length_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[length_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, BufferLength) {
+      bool buffer_is_move;
+      uint16_t buffer_reg = VM_ParseOperandRegRef("buffer", &buffer_is_move);
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.length "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, BufferCopy) {
+      bool source_buffer_is_move;
+      uint16_t source_buffer_reg =
+          VM_ParseOperandRegRef("source_buffer", &source_buffer_is_move);
+      uint16_t source_offset_reg = VM_ParseOperandRegI32("source_offset");
+      bool target_buffer_is_move;
+      uint16_t target_buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &target_buffer_is_move);
+      uint16_t target_offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t length_reg = VM_ParseOperandRegI32("length");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.copy "));
+      EMIT_REF_REG_NAME(source_buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[source_buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(source_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[source_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(target_buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[target_buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(target_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[target_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(length_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[length_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, BufferCompare) {
+      bool lhs_buffer_is_move;
+      uint16_t lhs_buffer_reg =
+          VM_ParseOperandRegRef("lhs_buffer", &lhs_buffer_is_move);
+      uint16_t lhs_offset_reg = VM_ParseOperandRegI32("lhs_offset");
+      bool rhs_buffer_is_move;
+      uint16_t rhs_buffer_reg =
+          VM_ParseOperandRegRef("rhs_buffer", &rhs_buffer_is_move);
+      uint16_t rhs_offset_reg = VM_ParseOperandRegI32("rhs_offset");
+      uint16_t length_reg = VM_ParseOperandRegI32("length");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.compare "));
+      EMIT_REF_REG_NAME(lhs_buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[lhs_buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(lhs_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[lhs_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(rhs_buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[rhs_buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(rhs_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[rhs_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(length_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[length_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, BufferFillI8) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t length_reg = VM_ParseOperandRegI32("length");
+      uint16_t value_reg = VM_ParseOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.fill.i8 "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(length_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[length_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I32((uint8_t)regs->i32[value_reg]);
+      break;
+    }
+    // Disassembles vm.buffer.fill.i16: a target buffer ref plus i32 offset,
+    // length, and fill-value operands.
+    DISASM_OP(CORE, BufferFillI16) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t length_reg = VM_ParseOperandRegI32("length");
+      uint16_t value_reg = VM_ParseOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.fill.i16 "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      // Displayed value is divided by the element size (matches the i16
+      // load/store ops); presumably bytes -> element count — TODO confirm.
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint16_t));
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      // Fix: length is an i32 register (VM_ParseOperandRegI32 above); the
+      // original used EMIT_REF_REG_NAME here, mislabeling it as a ref
+      // register. All sibling ops (BufferFillI8, BufferCopy, BufferCompare)
+      // emit length with the i32 form.
+      EMIT_I32_REG_NAME(length_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[length_reg] / sizeof(uint16_t));
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(value_reg);
+      // Only the low 16 bits participate in the fill; truncate for display.
+      EMIT_OPTIONAL_VALUE_I32((uint16_t)regs->i32[value_reg]);
+      break;
+    }
+    // Disassembles vm.buffer.fill.i32: a target buffer ref plus i32 offset,
+    // length, and fill-value operands.
+    DISASM_OP(CORE, BufferFillI32) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t length_reg = VM_ParseOperandRegI32("length");
+      uint16_t value_reg = VM_ParseOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.fill.i32 "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      // Displayed value is divided by the element size (matches the i32
+      // load/store ops); presumably bytes -> element count — TODO confirm.
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint32_t));
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      // Fix: length is an i32 register (VM_ParseOperandRegI32 above); the
+      // original used EMIT_REF_REG_NAME here, mislabeling it as a ref
+      // register. All sibling ops (BufferFillI8, BufferCopy, BufferCompare)
+      // emit length with the i32 form.
+      EMIT_I32_REG_NAME(length_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[length_reg] / sizeof(uint32_t));
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[value_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, BufferLoadI8U) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("source_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("source_offset");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.load.i8.u "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg]);
+      break;
+    }
+    DISASM_OP(CORE, BufferLoadI8S) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("source_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("source_offset");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.load.i8.s "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg]);
+      break;
+    }
+    DISASM_OP(CORE, BufferLoadI16U) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("source_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("source_offset");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.load.i16.u "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint16_t));
+      break;
+    }
+    DISASM_OP(CORE, BufferLoadI16S) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("source_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("source_offset");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.load.i16.s "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint16_t));
+      break;
+    }
+    DISASM_OP(CORE, BufferLoadI32) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("source_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("source_offset");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.load.i32 "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint32_t));
+      break;
+    }
+
+    DISASM_OP(CORE, BufferStoreI8) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t value_reg = VM_ParseOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.store.i8 "));
+      EMIT_I32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I32((uint8_t)regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg]);
+      break;
+    }
+    DISASM_OP(CORE, BufferStoreI16) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t value_reg = VM_ParseOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.store.i16 "));
+      EMIT_I32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I32((uint16_t)regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint16_t));
+      break;
+    }
+    DISASM_OP(CORE, BufferStoreI32) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t value_reg = VM_ParseOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.store.i32 "));
+      EMIT_I32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint32_t));
+      break;
+    }
+
+    //===------------------------------------------------------------------===//
+    // Lists
+    //===------------------------------------------------------------------===//
+
+    DISASM_OP(CORE, ListAlloc) {
+      const iree_vm_type_def_t* element_type_def =
+          VM_ParseTypeOf("element_type");
+      uint16_t initial_capacity_reg = VM_ParseOperandRegI32("initial_capacity");
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("result", &result_is_move);
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.list.alloc "));
+      EMIT_I32_REG_NAME(initial_capacity_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[initial_capacity_reg]);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " : !vm.list<"));
+      EMIT_TYPE_NAME(element_type_def);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ">"));
+      break;
+    }
+
+    DISASM_OP(CORE, ListReserve) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t minimum_capacity_reg = VM_ParseOperandRegI32("minimum_capacity");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.list.reserve "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(minimum_capacity_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[minimum_capacity_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, ListSize) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.list.size "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, ListResize) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t new_size_reg = VM_ParseOperandRegI32("new_size");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.list.resize "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(new_size_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[new_size_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, ListGetI32) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.list.get.i32 "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, ListSetI32) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      uint16_t raw_value_reg = VM_ParseOperandRegI32("raw_value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.list.set.i32 "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(raw_value_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[raw_value_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, ListGetRef) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      const iree_vm_type_def_t* type_def = VM_ParseTypeOf("result");
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("result", &result_is_move);
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.list.get.ref "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      EMIT_TYPE_NAME(type_def);
+      break;
+    }
+
+    DISASM_OP(CORE, ListSetRef) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      bool operand_is_move;
+      uint16_t operand_reg = VM_ParseOperandRegRef("value", &operand_is_move);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.list.set.ref "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[operand_reg]);
+      break;
+    }
+
+    //===------------------------------------------------------------------===//
+    // Conditional assignment
+    //===------------------------------------------------------------------===//
+
+    DISASM_OP(CORE, SelectI32) {
+      uint16_t condition_reg = VM_ParseOperandRegI32("condition");
+      uint16_t true_value_reg = VM_ParseOperandRegI32("true_value");
+      uint16_t false_value_reg = VM_ParseOperandRegI32("false_value");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.select.i32 "));
+      EMIT_I32_REG_NAME(condition_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[condition_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " ? "));
+      EMIT_I32_REG_NAME(true_value_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[true_value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " : "));
+      EMIT_I32_REG_NAME(false_value_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[false_value_reg]);
+      break;
+    }
+
+    DISASM_OP(CORE, SelectRef) {
+      uint16_t condition_reg = VM_ParseOperandRegI32("condition");
+      const iree_vm_type_def_t* type_def = VM_ParseTypeOf("true_value");
+      bool true_value_is_move;
+      uint16_t true_value_reg =
+          VM_ParseOperandRegRef("true_value", &true_value_is_move);
+      bool false_value_is_move;
+      uint16_t false_value_reg =
+          VM_ParseOperandRegRef("false_value", &false_value_is_move);
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("result", &result_is_move);
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.select.ref "));
+      EMIT_I32_REG_NAME(condition_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[condition_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " ? "));
+      EMIT_REF_REG_NAME(true_value_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[true_value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " : "));
+      EMIT_REF_REG_NAME(false_value_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[false_value_reg]);
+      EMIT_TYPE_NAME(type_def);
+      break;
+    }
+
+    // vm.switch.i32: result = values[index], or default_value when the index
+    // is outside the variadic value list. Rendered as
+    // "rD = vm.switch.i32 rIndex[v0, v1, ...] else <default>".
+    DISASM_OP(CORE, SwitchI32) {
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      int32_t default_value = VM_ParseIntAttr32("default_value");
+      const iree_vm_register_list_t* value_reg_list =
+          VM_ParseVariadicOperands("values");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.switch.i32 "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "["));
+      EMIT_OPERAND_REG_LIST(value_reg_list);
+      // default_value is signed (int32_t); use %d so negative defaults print
+      // correctly (SwitchI64 likewise prints with PRId64).
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, "] else %d", default_value));
+      break;
+    }
+
+    // vm.switch.ref: ref-typed variant of vm.switch.i32; both the default and
+    // the candidate values are ref registers. Move bits are decoded but not
+    // displayed.
+    DISASM_OP(CORE, SwitchRef) {
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      bool default_is_move;
+      uint16_t default_value_reg =
+          VM_ParseOperandRegRef("default_value", &default_is_move);
+      const iree_vm_register_list_t* value_reg_list =
+          VM_ParseVariadicOperands("values");
+      bool result_is_move;
+      uint16_t result_reg = VM_ParseResultRegRef("result", &result_is_move);
+      EMIT_REF_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.switch.ref "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "["));
+      EMIT_OPERAND_REG_LIST(value_reg_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "] else "));
+      EMIT_REF_REG_NAME(default_value_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[default_value_reg]);
+      break;
+    }
+
+    //===------------------------------------------------------------------===//
+    // Native integer arithmetic
+    //===------------------------------------------------------------------===//
+
+    DISASM_OP_CORE_BINARY_I32(AddI32, "vm.add.i32");
+    DISASM_OP_CORE_BINARY_I32(SubI32, "vm.sub.i32");
+    DISASM_OP_CORE_BINARY_I32(MulI32, "vm.mul.i32");
+    DISASM_OP_CORE_BINARY_I32(DivI32S, "vm.div.i32.s");
+    DISASM_OP_CORE_BINARY_I32(DivI32U, "vm.div.i32.u");
+    DISASM_OP_CORE_BINARY_I32(RemI32S, "vm.rem.i32.s");
+    DISASM_OP_CORE_BINARY_I32(RemI32U, "vm.rem.i32.u");
+    DISASM_OP_CORE_TERNARY_I32(FMAI32, "vm.fma.i32");
+    DISASM_OP_CORE_UNARY_I32(NotI32, "vm.not.i32");
+    DISASM_OP_CORE_BINARY_I32(AndI32, "vm.and.i32");
+    DISASM_OP_CORE_BINARY_I32(OrI32, "vm.or.i32");
+    DISASM_OP_CORE_BINARY_I32(XorI32, "vm.xor.i32");
+
+    //===------------------------------------------------------------------===//
+    // Casting and type conversion/emulation
+    //===------------------------------------------------------------------===//
+
+    DISASM_OP_CORE_UNARY_I32(TruncI32I8, "vm.trunc.i32.i8");
+    DISASM_OP_CORE_UNARY_I32(TruncI32I16, "vm.trunc.i32.i16");
+    DISASM_OP_CORE_UNARY_I32(ExtI8I32S, "vm.ext.i8.i32.s");
+    DISASM_OP_CORE_UNARY_I32(ExtI8I32U, "vm.ext.i8.i32.u");
+    DISASM_OP_CORE_UNARY_I32(ExtI16I32S, "vm.ext.i16.i32.s");
+    DISASM_OP_CORE_UNARY_I32(ExtI16I32U, "vm.ext.i16.i32.u");
+
+    //===------------------------------------------------------------------===//
+    // Native bitwise shifts and rotates
+    //===------------------------------------------------------------------===//
+
+// Emits a shift-style op as "rD = <op_mnemonic> rOperand, rAmount"; the shift
+// amount is always an i32 register regardless of the operand width.
+#define DISASM_OP_CORE_SHIFT_I32(op_name, op_mnemonic)                 \
+  DISASM_OP(CORE, op_name) {                                           \
+    uint16_t operand_reg = VM_ParseOperandRegI32("operand");           \
+    uint16_t amount_reg = VM_ParseOperandRegI32("amount");             \
+    uint16_t result_reg = VM_ParseResultRegI32("result");              \
+    EMIT_I32_REG_NAME(result_reg);                                     \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_I32_REG_NAME(operand_reg);                                    \
+    EMIT_OPTIONAL_VALUE_I32(regs->i32[operand_reg]);                   \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_I32_REG_NAME(amount_reg);                                     \
+    EMIT_OPTIONAL_VALUE_I32(regs->i32[amount_reg]);                    \
+    break;                                                             \
+  }
+
+    DISASM_OP_CORE_SHIFT_I32(ShlI32, "vm.shl.i32");
+    DISASM_OP_CORE_SHIFT_I32(ShrI32S, "vm.shr.i32.s");
+    DISASM_OP_CORE_SHIFT_I32(ShrI32U, "vm.shr.i32.u");
+
+    //===------------------------------------------------------------------===//
+    // Comparison ops
+    //===------------------------------------------------------------------===//
+
+    DISASM_OP_CORE_BINARY_I32(CmpEQI32, "vm.cmp.eq.i32");
+    DISASM_OP_CORE_BINARY_I32(CmpNEI32, "vm.cmp.ne.i32");
+    DISASM_OP_CORE_BINARY_I32(CmpLTI32S, "vm.cmp.lt.i32.s");
+    DISASM_OP_CORE_BINARY_I32(CmpLTI32U, "vm.cmp.lt.i32.u");
+    DISASM_OP_CORE_UNARY_I32(CmpNZI32, "vm.cmp.nz.i32");
+
+    // vm.cmp.eq.ref: i32 result of comparing two ref registers for identity.
+    // The move bits are parsed only to advance the decoder; they are unused.
+    DISASM_OP(CORE, CmpEQRef) {
+      bool lhs_is_move;
+      uint16_t lhs_reg = VM_ParseOperandRegRef("lhs", &lhs_is_move);
+      bool rhs_is_move;
+      uint16_t rhs_reg = VM_ParseOperandRegRef("rhs", &rhs_is_move);
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.cmp.eq.ref "));
+      EMIT_REF_REG_NAME(lhs_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[lhs_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(rhs_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[rhs_reg]);
+      break;
+    }
+    // vm.cmp.ne.ref: i32 result of comparing two ref registers for
+    // non-identity. Parse order matches the wire format: lhs, rhs, result;
+    // the move bits are decoded but not shown in the output.
+    DISASM_OP(CORE, CmpNERef) {
+      bool lhs_moved;
+      uint16_t lhs_ordinal = VM_ParseOperandRegRef("lhs", &lhs_moved);
+      bool rhs_moved;
+      uint16_t rhs_ordinal = VM_ParseOperandRegRef("rhs", &rhs_moved);
+      uint16_t dst_ordinal = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(dst_ordinal);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.cmp.ne.ref "));
+      EMIT_REF_REG_NAME(lhs_ordinal);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[lhs_ordinal]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(rhs_ordinal);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[rhs_ordinal]);
+      break;
+    }
+    // vm.cmp.nz.ref: i32 result testing whether a ref register is non-null.
+    // The move bit is decoded to keep the cursor in sync but is unused here.
+    DISASM_OP(CORE, CmpNZRef) {
+      bool src_moved;
+      uint16_t src_ordinal = VM_ParseOperandRegRef("operand", &src_moved);
+      uint16_t dst_ordinal = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(dst_ordinal);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.cmp.nz.ref "));
+      EMIT_REF_REG_NAME(src_ordinal);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[src_ordinal]);
+      break;
+    }
+
+    //===------------------------------------------------------------------===//
+    // Control flow
+    //===------------------------------------------------------------------===//
+
+    // vm.br: unconditional branch to the target block pc with a register
+    // remap list applied on the way.
+    DISASM_OP(CORE, Branch) {
+      int32_t block_pc = VM_ParseBranchTarget("dest");
+      const iree_vm_register_remap_list_t* remap_list =
+          VM_ParseBranchOperands("operands");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, "vm.br ^%08X(", block_pc));
+      EMIT_REMAP_LIST(remap_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+      break;
+    }
+
+    // vm.cond_br: two-way branch; each target carries its own remap list.
+    DISASM_OP(CORE, CondBranch) {
+      uint16_t condition_reg = VM_ParseOperandRegI32("condition");
+      int32_t true_block_pc = VM_ParseBranchTarget("true_dest");
+      const iree_vm_register_remap_list_t* true_remap_list =
+          VM_ParseBranchOperands("true_operands");
+      int32_t false_block_pc = VM_ParseBranchTarget("false_dest");
+      const iree_vm_register_remap_list_t* false_remap_list =
+          VM_ParseBranchOperands("false_operands");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.cond_br "));
+      EMIT_I32_REG_NAME(condition_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[condition_reg]);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, ", ^%08X(", true_block_pc));
+      EMIT_REMAP_LIST(true_remap_list);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, "), ^%08X(", false_block_pc));
+      EMIT_REMAP_LIST(false_remap_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+      break;
+    }
+
+    // vm.call: direct or imported call. Results (if any) are printed as an
+    // assignment list, then the resolved callee name, then the operand list.
+    DISASM_OP(CORE, Call) {
+      int32_t function_ordinal = VM_ParseFuncAttr("callee");
+      const iree_vm_register_list_t* src_reg_list =
+          VM_ParseVariadicOperands("operands");
+      const iree_vm_register_list_t* dst_reg_list =
+          VM_ParseVariadicResults("results");
+      if (dst_reg_list->size > 0) {
+        EMIT_RESULT_REG_LIST(dst_reg_list);
+        IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " = "));
+      }
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "vm.call @"));
+      // The ordinal MSB marks imports; the low 31 bits index the import table.
+      int is_import = (function_ordinal & 0x80000000u) != 0;
+      iree_vm_function_t function;
+      if (is_import) {
+        const iree_vm_bytecode_import_t* import =
+            &module_state->import_table[function_ordinal & 0x7FFFFFFFu];
+        function = import->function;
+      } else {
+        function.module = &module->interface;
+        function.linkage = IREE_VM_FUNCTION_LINKAGE_INTERNAL;
+        function.ordinal = function_ordinal;
+      }
+      if (function.module) {
+        iree_string_view_t module_name = iree_vm_module_name(function.module);
+        iree_string_view_t func_name = iree_vm_function_name(&function);
+        if (iree_string_view_is_empty(func_name)) {
+          // No symbol name available; fall back to module:ordinal.
+          IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+              b, "%.*s:%u", (int)module_name.size, module_name.data,
+              function.ordinal));
+        } else {
+          IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+              b, "%.*s.%.*s", (int)module_name.size, module_name.data,
+              (int)func_name.size, func_name.data));
+        }
+      } else {
+        // Import slot exists but was never bound to a module.
+        IREE_RETURN_IF_ERROR(
+            iree_string_builder_append_cstring(b, "{{UNRESOLVED}}"));
+      }
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "("));
+      EMIT_OPERAND_REG_LIST(src_reg_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+      break;
+    }
+
+    // vm.call.variadic: like vm.call but with per-segment operand counts.
+    // Segment sizes are decoded (to keep the cursor in sync) but not yet
+    // rendered. NOTE: unlike Call above this assumes the callee module is
+    // non-NULL; an unresolved import would pass NULL to iree_vm_module_name —
+    // mirroring Call's {{UNRESOLVED}} handling would be safer (follow-up).
+    DISASM_OP(CORE, CallVariadic) {
+      int32_t function_ordinal = VM_ParseFuncAttr("callee");
+      // TODO(benvanik): print segment sizes.
+      // const iree_vm_register_list_t* segment_size_list =
+      VM_ParseVariadicOperands("segment_sizes");
+      const iree_vm_register_list_t* src_reg_list =
+          VM_ParseVariadicOperands("operands");
+      const iree_vm_register_list_t* dst_reg_list =
+          VM_ParseVariadicResults("results");
+      if (dst_reg_list->size > 0) {
+        EMIT_RESULT_REG_LIST(dst_reg_list);
+        IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " = "));
+      }
+      // Fixed typo: the op mnemonic is "vm.call.variadic" (was "varadic").
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.call.variadic @"));
+      // The ordinal MSB marks imports; the low 31 bits index the import table.
+      int is_import = (function_ordinal & 0x80000000u) != 0;
+      iree_vm_function_t function;
+      if (is_import) {
+        const iree_vm_bytecode_import_t* import =
+            &module_state->import_table[function_ordinal & 0x7FFFFFFFu];
+        function = import->function;
+      } else {
+        function.module = &module->interface;
+        function.linkage = IREE_VM_FUNCTION_LINKAGE_INTERNAL;
+        function.ordinal = function_ordinal;
+      }
+      iree_string_view_t module_name = iree_vm_module_name(function.module);
+      iree_string_view_t func_name = iree_vm_function_name(&function);
+      if (iree_string_view_is_empty(func_name)) {
+        // No symbol name available; fall back to module:ordinal.
+        IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+            b, "%.*s:%u", (int)module_name.size, module_name.data,
+            function.ordinal));
+      } else {
+        IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+            b, "%.*s.%.*s", (int)module_name.size, module_name.data,
+            (int)func_name.size, func_name.data));
+      }
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "("));
+      EMIT_OPERAND_REG_LIST(src_reg_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+      break;
+    }
+
+    // vm.return: prints the (possibly empty) list of returned registers.
+    DISASM_OP(CORE, Return) {
+      const iree_vm_register_list_t* operand_list =
+          VM_ParseVariadicOperands("operands");
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "vm.return "));
+      EMIT_OPERAND_REG_LIST(operand_list);
+      break;
+    }
+
+    // vm.fail: aborts with a status code register and a constant message.
+    DISASM_OP(CORE, Fail) {
+      uint16_t status_code_reg = VM_ParseOperandRegI32("status");
+      iree_string_view_t message;
+      VM_ParseStrAttr("message", &message);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "vm.fail "));
+      EMIT_I32_REG_NAME(status_code_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[status_code_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, ", \"%.*s\"", (int)message.size, message.data));
+      break;
+    }
+
+    // vm.import.exists: queries whether an optional import was resolved at
+    // module load time. Emits the declared import name plus a trailing
+    // resolved/unresolved annotation.
+    DISASM_OP(CORE, ImportResolved) {
+      int32_t function_ordinal = VM_ParseFuncAttr("import");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      // Leading space before "=" matches every other result-producing op
+      // ("rN = vm...."); it was previously missing here.
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.import.exists @"));
+      // The ordinal MSB must be set for imports; anything else is malformed.
+      int is_import = (function_ordinal & 0x80000000u) != 0;
+      if (IREE_UNLIKELY(!is_import)) {
+        IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+            b, "{{INVALID ORDINAL %d}}", function_ordinal));
+        break;
+      }
+      uint32_t import_ordinal = function_ordinal & 0x7FFFFFFFu;
+      if (IREE_UNLIKELY(import_ordinal >= module_state->import_count)) {
+        IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+            b, "{{OUT OF RANGE ORDINAL %u}}", import_ordinal));
+        break;
+      }
+      iree_vm_function_t decl_function;
+      IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_ordinal(
+          &module->interface, IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL,
+          import_ordinal, &decl_function));
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_string(
+          b, iree_vm_function_name(&decl_function)));
+      const iree_vm_bytecode_import_t* import =
+          &module_state->import_table[import_ordinal];
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(
+          b, import->function.module != NULL ? " // (resolved)"
+                                             : " // (unresolved)"));
+      break;
+    }
+
+    //===------------------------------------------------------------------===//
+    // Async/fiber ops
+    //===------------------------------------------------------------------===//
+
+    // vm.yield: suspends the fiber and resumes at the target block.
+    // NOTE(review): this uses VM_DecBranchTarget while the other branch ops in
+    // this file use VM_ParseBranchTarget — confirm the two macros are
+    // equivalent here.
+    DISASM_OP(CORE, Yield) {
+      int32_t block_pc = VM_DecBranchTarget("dest");
+      const iree_vm_register_remap_list_t* remap_list =
+          VM_ParseBranchOperands("operands");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, "vm.yield ^%08X(", block_pc));
+      EMIT_REMAP_LIST(remap_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+      break;
+    }
+
+    //===------------------------------------------------------------------===//
+    // Debugging
+    //===------------------------------------------------------------------===//
+
+    // vm.trace: named trace event with an operand register list.
+    DISASM_OP(CORE, Trace) {
+      iree_string_view_t event_name;
+      VM_ParseStrAttr("event_name", &event_name);
+      const iree_vm_register_list_t* src_reg_list =
+          VM_ParseVariadicOperands("operands");
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, "vm.trace \"%.*s\"(", (int)event_name.size, event_name.data));
+      EMIT_OPERAND_REG_LIST(src_reg_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+      break;
+    }
+
+    // vm.print: named print event with an operand register list.
+    DISASM_OP(CORE, Print) {
+      iree_string_view_t event_name;
+      VM_ParseStrAttr("event_name", &event_name);
+      const iree_vm_register_list_t* src_reg_list =
+          VM_ParseVariadicOperands("operands");
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, "vm.print \"%.*s\"(", (int)event_name.size, event_name.data));
+      EMIT_OPERAND_REG_LIST(src_reg_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+      break;
+    }
+
+    // vm.break: debugger breakpoint that continues at the target block.
+    // NOTE(review): uses VM_DecBranchTarget like Yield above — confirm.
+    DISASM_OP(CORE, Break) {
+      int32_t block_pc = VM_DecBranchTarget("dest");
+      const iree_vm_register_remap_list_t* remap_list =
+          VM_ParseBranchOperands("operands");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, "vm.break ^%08X(", block_pc));
+      EMIT_REMAP_LIST(remap_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+      break;
+    }
+
+    // vm.cond_break: conditional variant of vm.break.
+    DISASM_OP(CORE, CondBreak) {
+      uint16_t condition_reg = VM_ParseOperandRegI32("condition");
+      int32_t block_pc = VM_ParseBranchTarget("dest");
+      const iree_vm_register_remap_list_t* remap_list =
+          VM_ParseBranchOperands("operands");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.cond_break "));
+      EMIT_I32_REG_NAME(condition_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[condition_reg]);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, ", ^%08X(", block_pc));
+      EMIT_REMAP_LIST(remap_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ")"));
+      break;
+    }
+
+    //===------------------------------------------------------------------===//
+    // Extension trampolines
+    //===------------------------------------------------------------------===//
+
+#if IREE_VM_EXT_I64_ENABLE
+    BEGIN_DISASM_PREFIX(PrefixExtI64, EXT_I64)
+
+    //===----------------------------------------------------------------===//
+    // ExtI64: Globals
+    //===----------------------------------------------------------------===//
+
+    // vm.global.load.i64: loads an i64 global from rwdata into an i64
+    // register.
+    DISASM_OP(EXT_I64, GlobalLoadI64) {
+      uint32_t byte_offset = VM_ParseGlobalAttr("global");
+      uint16_t value_reg = VM_ParseResultRegI64("value");
+      // value_reg is an i64 register (VM_ParseResultRegI64); emit it with the
+      // i64 naming macro as the other EXT_I64 ops do (was EMIT_I32_REG_NAME).
+      EMIT_I64_REG_NAME(value_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, " = vm.global.load.i64 .rwdata[%u]", byte_offset));
+      // NOTE(review): this indexes rwdata_storage.data byte-wise; presumably
+      // EMIT_OPTIONAL_VALUE_I64 only previews the leading byte here — confirm
+      // whether a full 8-byte read was intended.
+      EMIT_OPTIONAL_VALUE_I64(module_state->rwdata_storage.data[byte_offset]);
+      break;
+    }
+
+    // vm.global.store.i64: stores an i64 register into a global in rwdata.
+    DISASM_OP(EXT_I64, GlobalStoreI64) {
+      uint32_t byte_offset = VM_ParseGlobalAttr("global");
+      uint16_t value_reg = VM_ParseOperandRegI64("value");
+      // Constant string: use append_cstring like the rest of the file rather
+      // than routing a no-argument string through the printf-style formatter.
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.global.store.i64 "));
+      EMIT_I64_REG_NAME(value_reg);
+      // NOTE(review): i64 registers are read through the i32 register array
+      // here; presumably EMIT_OPTIONAL_VALUE_I64/register aliasing handles
+      // the width — confirm.
+      EMIT_OPTIONAL_VALUE_I64(regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, ", .rwdata[%u]", byte_offset));
+      break;
+    }
+
+    // vm.global.load.indirect.i64: loads an i64 global whose byte offset is
+    // taken from an i32 register at runtime.
+    DISASM_OP(EXT_I64, GlobalLoadIndirectI64) {
+      uint16_t byte_offset_reg = VM_ParseOperandRegI32("global");
+      uint16_t value_reg = VM_ParseResultRegI64("value");
+      EMIT_I64_REG_NAME(value_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(
+          b, " = vm.global.load.indirect.i64 .rwdata["));
+      EMIT_I32_REG_NAME(byte_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[byte_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "]"));
+      EMIT_OPTIONAL_VALUE_I64(
+          module_state->rwdata_storage.data[regs->i32[byte_offset_reg]]);
+      break;
+    }
+
+    // vm.global.store.indirect.i64: stores an i64 register at a byte offset
+    // taken from an i32 register at runtime.
+    DISASM_OP(EXT_I64, GlobalStoreIndirectI64) {
+      uint16_t byte_offset_reg = VM_ParseOperandRegI32("global");
+      uint16_t value_reg = VM_ParseOperandRegI64("value");
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(
+          b, "vm.global.store.indirect.i64 "));
+      EMIT_I64_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I64(regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", .rwdata["));
+      EMIT_I32_REG_NAME(byte_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[byte_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "]"));
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtI64: Constants
+    //===----------------------------------------------------------------===//
+
+    // vm.const.i64: immediate constant, shown in decimal with hex comment.
+    DISASM_OP(EXT_I64, ConstI64) {
+      int64_t value = VM_ParseIntAttr64("value");
+      uint16_t result_reg = VM_ParseResultRegI64("result");
+      EMIT_I64_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, " = vm.const.i64 %" PRId64 "  // 0x%08" PRIX64 "", value, value));
+      break;
+    }
+
+    // vm.const.i64.zero: dedicated zeroing op (no immediate payload).
+    DISASM_OP(EXT_I64, ConstI64Zero) {
+      uint16_t result_reg = VM_ParseResultRegI64("result");
+      EMIT_I64_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.const.i64.zero"));
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtI64: Lists
+    //===----------------------------------------------------------------===//
+
+    // vm.list.get.i64: result = list[index] as an i64 element.
+    DISASM_OP(EXT_I64, ListGetI64) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      uint16_t result_reg = VM_ParseResultRegI64("result");
+      EMIT_I64_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.list.get.i64 "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      break;
+    }
+
+    // vm.list.set.i64: list[index] = value (i64 element).
+    DISASM_OP(EXT_I64, ListSetI64) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      uint16_t value_reg = VM_ParseOperandRegI64("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.list.set.i64 "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I64_REG_NAME(value_reg);
+      // NOTE(review): i64 register values are read via the aliased i32
+      // register array throughout this section; presumably the EMIT macro
+      // handles the width — confirm.
+      EMIT_OPTIONAL_VALUE_I64(regs->i32[value_reg]);
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtI64: Conditional assignment
+    //===----------------------------------------------------------------===//
+
+    // vm.select.i64: result = condition ? true_value : false_value with i64
+    // value registers and an i32 condition register.
+    DISASM_OP(EXT_I64, SelectI64) {
+      uint16_t condition_reg = VM_ParseOperandRegI32("condition");
+      uint16_t true_value_reg = VM_ParseOperandRegI64("true_value");
+      uint16_t false_value_reg = VM_ParseOperandRegI64("false_value");
+      uint16_t result_reg = VM_ParseResultRegI64("result");
+      EMIT_I64_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.select.i64 "));
+      EMIT_I32_REG_NAME(condition_reg);
+      // The condition is an i32 register; preview it as i32 (matches
+      // SelectI32 — it was previously previewed with the i64 macro).
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[condition_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " ? "));
+      EMIT_I64_REG_NAME(true_value_reg);
+      EMIT_OPTIONAL_VALUE_I64(regs->i32[true_value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " : "));
+      EMIT_I64_REG_NAME(false_value_reg);
+      EMIT_OPTIONAL_VALUE_I64(regs->i32[false_value_reg]);
+      break;
+    }
+
+    // vm.switch.i64: i64 variant of vm.switch.i32; the default is printed
+    // with PRId64.
+    DISASM_OP(EXT_I64, SwitchI64) {
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      int64_t default_value = VM_ParseIntAttr64("default_value");
+      const iree_vm_register_list_t* value_reg_list =
+          VM_ParseVariadicOperands("values");
+      uint16_t result_reg = VM_ParseResultRegI64("result");
+      EMIT_I64_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.switch.i64 "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "["));
+      EMIT_OPERAND_REG_LIST(value_reg_list);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, "] else %" PRId64, default_value));
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtI64: Native integer arithmetic
+    //===----------------------------------------------------------------===//
+
+    DISASM_OP_EXT_I64_BINARY_I64(AddI64, "vm.add.i64");
+    DISASM_OP_EXT_I64_BINARY_I64(SubI64, "vm.sub.i64");
+    DISASM_OP_EXT_I64_BINARY_I64(MulI64, "vm.mul.i64");
+    DISASM_OP_EXT_I64_BINARY_I64(DivI64S, "vm.div.i64.s");
+    DISASM_OP_EXT_I64_BINARY_I64(DivI64U, "vm.div.i64.u");
+    DISASM_OP_EXT_I64_BINARY_I64(RemI64S, "vm.rem.i64.s");
+    DISASM_OP_EXT_I64_BINARY_I64(RemI64U, "vm.rem.i64.u");
+    DISASM_OP_EXT_I64_TERNARY_I64(FMAI64, "vm.fma.i64");
+    DISASM_OP_EXT_I64_UNARY_I64(NotI64, "vm.not.i64");
+    DISASM_OP_EXT_I64_BINARY_I64(AndI64, "vm.and.i64");
+    DISASM_OP_EXT_I64_BINARY_I64(OrI64, "vm.or.i64");
+    DISASM_OP_EXT_I64_BINARY_I64(XorI64, "vm.xor.i64");
+
+    //===----------------------------------------------------------------===//
+    // ExtI64: Casting and type conversion/emulation
+    //===----------------------------------------------------------------===//
+
+    // vm.trunc.i64.i32: narrows an i64 operand register into an i32 result.
+    DISASM_OP(EXT_I64, TruncI64I32) {
+      uint16_t operand_reg = VM_ParseOperandRegI64("operand");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.trunc.i64.i32 "));
+      EMIT_I64_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_I64(regs->i32[operand_reg]);
+      break;
+    }
+    // vm.ext.i32.i64.s: sign-extends an i32 operand register to an i64 result.
+    DISASM_OP(EXT_I64, ExtI32I64S) {
+      uint16_t operand_reg = VM_ParseOperandRegI32("operand");
+      uint16_t result_reg = VM_ParseResultRegI64("result");
+      EMIT_I64_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.ext.i32.i64.s "));
+      // The operand is an i32 register (VM_ParseOperandRegI32); emit its name
+      // and value preview as i32 (was emitted with the i64 macros).
+      EMIT_I32_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[operand_reg]);
+      break;
+    }
+    // vm.ext.i32.i64.u: zero-extends an i32 operand register to an i64 result.
+    DISASM_OP(EXT_I64, ExtI32I64U) {
+      uint16_t operand_reg = VM_ParseOperandRegI32("operand");
+      uint16_t result_reg = VM_ParseResultRegI64("result");
+      EMIT_I64_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.ext.i32.i64.u "));
+      // The operand is an i32 register (VM_ParseOperandRegI32); emit its name
+      // and value preview as i32 (was emitted with the i64 macros).
+      EMIT_I32_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[operand_reg]);
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtI64: Native bitwise shifts and rotates
+    //===----------------------------------------------------------------===//
+
+// Emits an i64 shift op as "rD = <op_mnemonic> rOperand, rAmount"; the shift
+// amount is an i32 register even for i64 operands.
+#define DISASM_OP_EXT_I64_SHIFT_I64(op_name, op_mnemonic)              \
+  DISASM_OP(EXT_I64, op_name) {                                        \
+    uint16_t operand_reg = VM_ParseOperandRegI64("operand");           \
+    uint16_t amount_reg = VM_ParseOperandRegI32("amount");             \
+    uint16_t result_reg = VM_ParseResultRegI64("result");              \
+    EMIT_I64_REG_NAME(result_reg);                                     \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_I64_REG_NAME(operand_reg);                                    \
+    EMIT_OPTIONAL_VALUE_I64(regs->i32[operand_reg]);                   \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_I32_REG_NAME(amount_reg);                                     \
+    EMIT_OPTIONAL_VALUE_I32(regs->i32[amount_reg]);                    \
+    break;                                                             \
+  }
+
+    DISASM_OP_EXT_I64_SHIFT_I64(ShlI64, "vm.shl.i64");
+    DISASM_OP_EXT_I64_SHIFT_I64(ShrI64S, "vm.shr.i64.s");
+    DISASM_OP_EXT_I64_SHIFT_I64(ShrI64U, "vm.shr.i64.u");
+
+    //===----------------------------------------------------------------===//
+    // ExtI64: Comparison ops
+    //===----------------------------------------------------------------===//
+
+// Emits an i64 comparison as "rD = <op_mnemonic> rLhs, rRhs" with an i32
+// result register.
+#define DISASM_OP_EXT_I64_CMP_I64(op_name, op_mnemonic)                \
+  DISASM_OP(EXT_I64, op_name) {                                        \
+    uint16_t lhs_reg = VM_ParseOperandRegI64("lhs");                   \
+    uint16_t rhs_reg = VM_ParseOperandRegI64("rhs");                   \
+    uint16_t result_reg = VM_ParseResultRegI32("result");              \
+    EMIT_I32_REG_NAME(result_reg);                                     \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_I64_REG_NAME(lhs_reg);                                        \
+    EMIT_OPTIONAL_VALUE_I64(regs->i32[lhs_reg]);                       \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_I64_REG_NAME(rhs_reg);                                        \
+    EMIT_OPTIONAL_VALUE_I64(regs->i32[rhs_reg]);                       \
+    break;                                                             \
+  }
+
+    DISASM_OP_EXT_I64_CMP_I64(CmpEQI64, "vm.cmp.eq.i64");
+    DISASM_OP_EXT_I64_CMP_I64(CmpNEI64, "vm.cmp.ne.i64");
+    DISASM_OP_EXT_I64_CMP_I64(CmpLTI64S, "vm.cmp.lt.i64.s");
+    DISASM_OP_EXT_I64_CMP_I64(CmpLTI64U, "vm.cmp.lt.i64.u");
+    // vm.cmp.nz.i64: i32 result testing an i64 operand for non-zero.
+    DISASM_OP(EXT_I64, CmpNZI64) {
+      uint16_t operand_reg = VM_ParseOperandRegI64("operand");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.cmp.nz.i64 "));
+      EMIT_I64_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_I64(regs->i32[operand_reg]);
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtI64: Buffers
+    //===----------------------------------------------------------------===//
+
+    // vm.buffer.fill.i64: fills [target_offset, target_offset+length) of the
+    // target buffer with an i64 value.
+    DISASM_OP(EXT_I64, BufferFillI64) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t length_reg = VM_ParseOperandRegI32("length");
+      uint16_t value_reg = VM_ParseOperandRegI64("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.fill.i64 "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      // NOTE(review): value previews divide the raw register by the element
+      // size (element-count display?) — confirm this matches the op encoding.
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint64_t));
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      // length_reg is an i32 operand register (VM_ParseOperandRegI32); it was
+      // incorrectly emitted with the ref-register naming macro.
+      EMIT_I32_REG_NAME(length_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[length_reg] / sizeof(uint64_t));
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I64_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I64(regs->i32[value_reg]);
+      break;
+    }
+
+    // vm.buffer.load.i64: loads an i64 element from the source buffer.
+    DISASM_OP(EXT_I64, BufferLoadI64) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("source_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("source_offset");
+      uint16_t result_reg = VM_ParseResultRegI64("result");
+      EMIT_I64_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.load.i64 "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      // NOTE(review): offset preview divides by the element size — presumably
+      // to show an element index rather than a byte offset; confirm.
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint64_t));
+      break;
+    }
+
+    // vm.buffer.store.i64: store an i64 value into a buffer.
+    DISASM_OP(EXT_I64, BufferStoreI64) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t value_reg = VM_ParseOperandRegI64("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.store.i64 "));
+      EMIT_I64_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_I64(regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      // Display the byte offset as an i64 element index, matching the
+      // divisor used by BufferFillI64/BufferLoadI64 (was sizeof(uint32_t)).
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint64_t));
+      break;
+    }
+
+    END_DISASM_PREFIX()
+#else
+    UNHANDLED_DISASM_PREFIX(PrefixExtI64, EXT_I64);
+#endif  // IREE_VM_EXT_I64_ENABLE
+
+#if IREE_VM_EXT_F32_ENABLE
+    BEGIN_DISASM_PREFIX(PrefixExtF32, EXT_F32)
+
+    //===----------------------------------------------------------------===//
+    // ExtF32: Globals
+    //===----------------------------------------------------------------===//
+
+    DISASM_OP(EXT_F32, GlobalLoadF32) {
+      uint32_t byte_offset = VM_ParseGlobalAttr("global");
+      uint16_t value_reg = VM_ParseResultRegF32("value");
+      EMIT_F32_REG_NAME(value_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          b, " = vm.global.load.f32 .rwdata[%u]", byte_offset));
+      EMIT_OPTIONAL_VALUE_F32(module_state->rwdata_storage.data[byte_offset]);
+      break;
+    }
+
+    DISASM_OP(EXT_F32, GlobalStoreF32) {
+      uint32_t byte_offset = VM_ParseGlobalAttr("global");
+      uint16_t value_reg = VM_ParseOperandRegF32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, "vm.global.store.f32 "));
+      EMIT_F32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, ", .rwdata[%u]", byte_offset));
+      break;
+    }
+
+    // vm.global.load.indirect.f32: load an f32 global at a dynamic offset.
+    DISASM_OP(EXT_F32, GlobalLoadIndirectF32) {
+      uint16_t byte_offset_reg = VM_ParseOperandRegI32("global");
+      // The result is an f32 value (it is emitted via EMIT_F32_REG_NAME
+      // below), so parse it as an f32 result register as GlobalLoadF32 does
+      // (was VM_ParseResultRegI32).
+      uint16_t value_reg = VM_ParseResultRegF32("value");
+      EMIT_F32_REG_NAME(value_reg);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(
+          b, " = vm.global.load.indirect.f32 .rwdata["));
+      EMIT_I32_REG_NAME(byte_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[byte_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "]"));
+      EMIT_OPTIONAL_VALUE_F32(
+          module_state->rwdata_storage.data[regs->i32[byte_offset_reg]]);
+      break;
+    }
+
+    DISASM_OP(EXT_F32, GlobalStoreIndirectF32) {
+      uint16_t byte_offset_reg = VM_ParseOperandRegI32("global");
+      uint16_t value_reg = VM_ParseOperandRegF32("value");
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(
+          b, "vm.global.store.indirect.f32 "));
+      EMIT_F32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", .rwdata["));
+      EMIT_I32_REG_NAME(byte_offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[byte_offset_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "]"));
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtF32: Constants
+    //===----------------------------------------------------------------===//
+
+    DISASM_OP(EXT_F32, ConstF32) {
+      float value = VM_ParseFloatAttr32("value");
+      uint16_t result_reg = VM_ParseResultRegF32("result");
+      EMIT_F32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, " = vm.const.f32 %f", value));
+      break;
+    }
+
+    DISASM_OP(EXT_F32, ConstF32Zero) {
+      uint16_t result_reg = VM_ParseResultRegF32("result");
+      EMIT_F32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.const.f32.zero"));
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtF32: Lists
+    //===----------------------------------------------------------------===//
+
+    DISASM_OP(EXT_F32, ListGetF32) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      uint16_t result_reg = VM_ParseResultRegF32("result");
+      EMIT_F32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.list.get.f32 "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      break;
+    }
+
+    DISASM_OP(EXT_F32, ListSetF32) {
+      bool list_is_move;
+      uint16_t list_reg = VM_ParseOperandRegRef("list", &list_is_move);
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      uint16_t raw_value_reg = VM_ParseOperandRegF32("raw_value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.list.set.f32 "));
+      EMIT_REF_REG_NAME(list_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[list_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_F32_REG_NAME(raw_value_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[raw_value_reg]);
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtF32: Conditional assignment
+    //===----------------------------------------------------------------===//
+
+    DISASM_OP(EXT_F32, SelectF32) {
+      uint16_t condition_reg = VM_ParseOperandRegI32("condition");
+      uint16_t true_value_reg = VM_ParseOperandRegF32("true_value");
+      uint16_t false_value_reg = VM_ParseOperandRegF32("false_value");
+      uint16_t result_reg = VM_ParseResultRegF32("result");
+      EMIT_F32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.select.f32 "));
+      EMIT_I32_REG_NAME(condition_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[condition_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " ? "));
+      EMIT_F32_REG_NAME(true_value_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[true_value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, " : "));
+      EMIT_F32_REG_NAME(false_value_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[false_value_reg]);
+      break;
+    }
+
+    DISASM_OP(EXT_F32, SwitchF32) {
+      uint16_t index_reg = VM_ParseOperandRegI32("index");
+      float default_value = VM_ParseFloatAttr32("default_value");
+      const iree_vm_register_list_t* value_reg_list =
+          VM_ParseVariadicOperands("values");
+      uint16_t result_reg = VM_ParseResultRegF32("result");
+      EMIT_F32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.switch.f32 "));
+      EMIT_I32_REG_NAME(index_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[index_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, "["));
+      EMIT_OPERAND_REG_LIST(value_reg_list);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_format(b, "] else %f", default_value));
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtF32: Native floating-point arithmetic
+    //===----------------------------------------------------------------===//
+
+    DISASM_OP_EXT_F32_BINARY_F32(AddF32, "vm.add.f32");
+    DISASM_OP_EXT_F32_BINARY_F32(SubF32, "vm.sub.f32");
+    DISASM_OP_EXT_F32_BINARY_F32(MulF32, "vm.mul.f32");
+    DISASM_OP_EXT_F32_BINARY_F32(DivF32, "vm.div.f32");
+    DISASM_OP_EXT_F32_BINARY_F32(RemF32, "vm.rem.f32");
+    DISASM_OP_EXT_F32_TERNARY_F32(FMAF32, "vm.fma.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(AbsF32, "vm.abs.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(NegF32, "vm.neg.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(CeilF32, "vm.ceil.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(FloorF32, "vm.floor.f32");
+
+    DISASM_OP_EXT_F32_UNARY_F32(AtanF32, "vm.atan.f32");
+    DISASM_OP_EXT_F32_BINARY_F32(Atan2F32, "vm.atan2.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(CosF32, "vm.cos.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(SinF32, "vm.sin.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(ExpF32, "vm.exp.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(Exp2F32, "vm.exp2.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(ExpM1F32, "vm.expm1.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(LogF32, "vm.log.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(Log10F32, "vm.log10.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(Log1pF32, "vm.log1p.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(Log2F32, "vm.log2.f32");
+    DISASM_OP_EXT_F32_BINARY_F32(PowF32, "vm.pow.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(RsqrtF32, "vm.rsqrt.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(SqrtF32, "vm.sqrt.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(TanhF32, "vm.tanh.f32");
+    DISASM_OP_EXT_F32_UNARY_F32(ErfF32, "vm.erf.f32");
+
+    //===----------------------------------------------------------------===//
+    // ExtF32: Casting and type conversion/emulation
+    //===----------------------------------------------------------------===//
+
+    DISASM_OP(EXT_F32, CastSI32F32) {
+      uint16_t operand_reg = VM_ParseOperandRegI32("operand");
+      uint16_t result_reg = VM_ParseResultRegF32("result");
+      EMIT_F32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.cast.si32.f32 "));
+      EMIT_I32_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[operand_reg]);
+      break;
+    }
+    DISASM_OP(EXT_F32, CastUI32F32) {
+      uint16_t operand_reg = VM_ParseOperandRegI32("operand");
+      uint16_t result_reg = VM_ParseResultRegF32("result");
+      EMIT_F32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.cast.ui32.f32 "));
+      EMIT_I32_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[operand_reg]);
+      break;
+    }
+    // vm.cast.f32.si32: cast an f32 value to a signed i32.
+    DISASM_OP(EXT_F32, CastF32SI32) {
+      uint16_t operand_reg = VM_ParseOperandRegF32("operand");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      // Mnemonic fixed: the op is vm.cast.f32.si32 (was garbled as
+      // "vm.cast.f32.sif32").
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.cast.f32.si32 "));
+      EMIT_F32_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[operand_reg]);
+      break;
+    }
+    // vm.cast.f32.ui32: cast an f32 value to an unsigned i32.
+    DISASM_OP(EXT_F32, CastF32UI32) {
+      uint16_t operand_reg = VM_ParseOperandRegF32("operand");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      // Mnemonic fixed: the op is vm.cast.f32.ui32 (was garbled as
+      // "vm.cast.f32.uif32").
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.cast.f32.ui32 "));
+      EMIT_F32_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[operand_reg]);
+      break;
+    }
+    DISASM_OP(EXT_F32, BitcastI32F32) {
+      uint16_t operand_reg = VM_ParseOperandRegI32("operand");
+      uint16_t result_reg = VM_ParseResultRegF32("result");
+      EMIT_F32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.bitcast.i32.f32 "));
+      EMIT_I32_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[operand_reg]);
+      break;
+    }
+    // vm.bitcast.f32.i32: reinterpret f32 bits as an i32 value.
+    DISASM_OP(EXT_F32, BitcastF32I32) {
+      uint16_t operand_reg = VM_ParseOperandRegF32("operand");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      // Mnemonic fixed: the op is vm.bitcast.f32.i32 (was garbled as
+      // "vm.bitcast.f32.if32"); mirrors BitcastI32F32 above.
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.bitcast.f32.i32 "));
+      EMIT_F32_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[operand_reg]);
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtF32: Comparison ops
+    //===----------------------------------------------------------------===//
+
+#define DISASM_OP_EXT_F32_CMP_F32(op_name, op_mnemonic)                \
+  DISASM_OP(EXT_F32, op_name) {                                        \
+    uint16_t lhs_reg = VM_ParseOperandRegF32("lhs");                   \
+    uint16_t rhs_reg = VM_ParseOperandRegF32("rhs");                   \
+    uint16_t result_reg = VM_ParseResultRegI32("result");              \
+    EMIT_I32_REG_NAME(result_reg);                                     \
+    IREE_RETURN_IF_ERROR(                                              \
+        iree_string_builder_append_format(b, " = %s ", op_mnemonic));  \
+    EMIT_F32_REG_NAME(lhs_reg);                                        \
+    EMIT_OPTIONAL_VALUE_F32(regs->i32[lhs_reg]);                       \
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", ")); \
+    EMIT_F32_REG_NAME(rhs_reg);                                        \
+    EMIT_OPTIONAL_VALUE_F32(regs->i32[rhs_reg]);                       \
+    break;                                                             \
+  }
+
+    DISASM_OP_EXT_F32_CMP_F32(CmpEQF32O, "vm.cmp.eq.f32.o");
+    DISASM_OP_EXT_F32_CMP_F32(CmpEQF32U, "vm.cmp.eq.f32.u");
+    DISASM_OP_EXT_F32_CMP_F32(CmpNEF32O, "vm.cmp.ne.f32.o");
+    DISASM_OP_EXT_F32_CMP_F32(CmpNEF32U, "vm.cmp.ne.f32.u");
+    DISASM_OP_EXT_F32_CMP_F32(CmpLTF32O, "vm.cmp.lt.f32.o");
+    DISASM_OP_EXT_F32_CMP_F32(CmpLTF32U, "vm.cmp.lt.f32.u");
+    DISASM_OP_EXT_F32_CMP_F32(CmpLTEF32O, "vm.cmp.lte.f32.o");
+    DISASM_OP_EXT_F32_CMP_F32(CmpLTEF32U, "vm.cmp.lte.f32.u");
+    DISASM_OP(EXT_F32, CmpNaNF32) {
+      uint16_t operand_reg = VM_ParseOperandRegF32("operand");
+      uint16_t result_reg = VM_ParseResultRegI32("result");
+      EMIT_I32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.cmp.nan.f32 "));
+      EMIT_F32_REG_NAME(operand_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[operand_reg]);
+      break;
+    }
+
+    //===----------------------------------------------------------------===//
+    // ExtF32: Buffers
+    //===----------------------------------------------------------------===//
+
+    // vm.buffer.fill.f32: fill a byte range of a buffer with an f32 value.
+    DISASM_OP(EXT_F32, BufferFillF32) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t length_reg = VM_ParseOperandRegI32("length");
+      uint16_t value_reg = VM_ParseOperandRegF32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.fill.f32 "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(float));
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      // |length_reg| is parsed as an i32 operand register above and its value
+      // is read from regs->i32, so emit it as an i32 register (was
+      // erroneously emitted as a ref register).
+      EMIT_I32_REG_NAME(length_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[length_reg] / sizeof(float));
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_F32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[value_reg]);
+      break;
+    }
+
+    DISASM_OP(EXT_F32, BufferLoadF32) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("source_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("source_offset");
+      uint16_t result_reg = VM_ParseResultRegF32("result");
+      EMIT_F32_REG_NAME(result_reg);
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, " = vm.buffer.load.f32 "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(float));
+      break;
+    }
+
+    DISASM_OP(EXT_F32, BufferStoreF32) {
+      bool buffer_is_move;
+      uint16_t buffer_reg =
+          VM_ParseOperandRegRef("target_buffer", &buffer_is_move);
+      uint16_t offset_reg = VM_ParseOperandRegI32("target_offset");
+      uint16_t value_reg = VM_ParseOperandRegF32("value");
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(b, "vm.buffer.store.f32 "));
+      EMIT_F32_REG_NAME(value_reg);
+      EMIT_OPTIONAL_VALUE_F32(regs->i32[value_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_REF_REG_NAME(buffer_reg);
+      EMIT_OPTIONAL_VALUE_REF(&regs->ref[buffer_reg]);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(b, ", "));
+      EMIT_I32_REG_NAME(offset_reg);
+      EMIT_OPTIONAL_VALUE_I32(regs->i32[offset_reg] / sizeof(uint32_t));
+      break;
+    }
+
+    END_DISASM_PREFIX()
+#else
+    UNHANDLED_DISASM_PREFIX(PrefixExtF32, EXT_F32)
+#endif  // IREE_VM_EXT_F32_ENABLE
+    UNHANDLED_DISASM_PREFIX(PrefixExtF64, EXT_F64)
+
+    default:
+      return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                              "unhandled core opcode");
+  }
+  return iree_ok_status();
+}
+
+// Disassembles the op at |pc| in |frame| and writes one trace line to |file|.
+// Optionally prefixes the line with a formatted source location when
+// IREE_VM_EXECUTION_TRACING_SRC_LOC_ENABLE is set.
+//
+// Fixes over the prior version: every error path now falls through to
+// iree_string_builder_deinitialize so the builder storage is never leaked
+// (IREE_RETURN_IF_ERROR/early returns previously skipped cleanup), and the
+// padding append's status is no longer silently dropped.
+iree_status_t iree_vm_bytecode_trace_disasm(iree_vm_stack_frame_t* frame,
+                                            iree_vm_source_offset_t pc,
+                                            const iree_vm_registers_t* regs,
+                                            FILE* file) {
+  iree_string_builder_t b;
+  iree_string_builder_initialize(iree_allocator_system(), &b);
+
+  // TODO(benvanik): ensure frame is in-sync before call or restore original.
+  // It's shady to manipulate the frame here but I know we expect the pc to be
+  // valid only on entry/exit from a function.
+  frame->pc = pc;
+
+#if IREE_VM_EXECUTION_TRACING_SRC_LOC_ENABLE
+  iree_vm_source_location_t source_location;
+  iree_status_t status = iree_vm_module_resolve_source_location(
+      frame->function.module, frame, &source_location);
+  if (iree_status_is_ok(status)) {
+    status = iree_vm_source_location_format(
+        &source_location, IREE_VM_SOURCE_LOCATION_FORMAT_FLAG_SINGLE_LINE, &b);
+  }
+  if (iree_status_is_ok(status)) {
+    // Pad out to keep alignment. This is just guesswork based on my machine.
+    static const iree_host_size_t pad_to = 80;
+    iree_host_size_t col = iree_string_builder_size(&b);
+    if (col < pad_to) {
+      status = iree_string_builder_append_format(&b, "%*s ",
+                                                 (int)(pad_to - col), "");
+    } else {
+      status = iree_string_builder_append_cstring(&b, " ");
+    }
+  } else if (iree_status_is_unavailable(status)) {
+    // Ignore failures when no source location is available; other errors
+    // propagate (after builder cleanup below).
+    status = iree_ok_status();
+  }
+#else
+  iree_status_t status = iree_ok_status();
+#endif  // IREE_VM_EXECUTION_TRACING_SRC_LOC_ENABLE
+
+  if (iree_status_is_ok(status)) {
+    // Prefix with [module.function+PC] so each trace line is addressable.
+    iree_string_view_t module_name =
+        iree_vm_module_name(frame->function.module);
+    status = iree_string_builder_append_format(
+        &b, "[%.*s", (int)module_name.size, module_name.data);
+  }
+  if (iree_status_is_ok(status)) {
+    iree_string_view_t function_name = iree_vm_function_name(&frame->function);
+    if (iree_string_view_is_empty(function_name)) {
+      // Unnamed functions are identified by ordinal only.
+      status = iree_string_builder_append_format(
+          &b, "@%u", (uint32_t)frame->function.ordinal);
+    } else {
+      status = iree_string_builder_append_format(
+          &b, ".%.*s", (int)function_name.size, function_name.data);
+    }
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_string_builder_append_format(&b, "+%08" PRIX64 "]    ", pc);
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_vm_bytecode_disasm_op(
+        (iree_vm_bytecode_module_t*)frame->function.module,
+        (iree_vm_bytecode_module_state_t*)frame->module_state,
+        frame->function.ordinal, pc, regs,
+        IREE_VM_BYTECODE_DISASM_FORMAT_INLINE_VALUES, &b);
+  }
+
+  if (iree_status_is_ok(status)) {
+    fprintf(file, "%.*s\n", (int)iree_string_builder_size(&b),
+            iree_string_builder_buffer(&b));
+  }
+
+  // Always deinitialize so error paths do not leak the builder's heap storage.
+  iree_string_builder_deinitialize(&b);
+  return status;
+}
diff --git a/runtime/src/iree/vm/bytecode_disasm.h b/runtime/src/iree/vm/bytecode_disasm.h
new file mode 100644
index 0000000..2c73025
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_disasm.h
@@ -0,0 +1,46 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_BYTECODE_DISASM_H_
+#define IREE_VM_BYTECODE_DISASM_H_
+
+#include <stdio.h>
+
+#include "iree/base/string_builder.h"
+#include "iree/vm/bytecode_dispatch_util.h"
+#include "iree/vm/bytecode_module_impl.h"
+#include "iree/vm/stack.h"
+
+// Controls how bytecode disassembly is formatted.
+typedef enum iree_vm_bytecode_disasm_format_e {
+  IREE_VM_BYTECODE_DISASM_FORMAT_DEFAULT = 0,
+  // Includes the input register values inline in the op text.
+  // Example: `%i0 <= ShrI32U %i2(5), %i3(6)`
+  IREE_VM_BYTECODE_DISASM_FORMAT_INLINE_VALUES = 1u << 0,
+} iree_vm_bytecode_disasm_format_t;
+
+// Disassembles the bytecode operation at |pc| using the provided module state.
+// Appends the disassembled op to |string_builder| in a format based on |format|.
+// If |regs| are available then values can be added using the format mode.
+//
+// Example: `%i0 <= ShrI32U %i2, %i3`
+//
+// WARNING: this does not currently perform any verification on the bytecode;
+// it's assumed all bytecode is valid. This is a debug tool: you shouldn't be
+// running this in production on untrusted inputs anyway.
+iree_status_t iree_vm_bytecode_disasm_op(
+    iree_vm_bytecode_module_t* module,
+    iree_vm_bytecode_module_state_t* module_state, uint16_t function_ordinal,
+    iree_vm_source_offset_t pc, const iree_vm_registers_t* regs,
+    iree_vm_bytecode_disasm_format_t format,
+    iree_string_builder_t* string_builder);
+
+iree_status_t iree_vm_bytecode_trace_disasm(iree_vm_stack_frame_t* frame,
+                                            iree_vm_source_offset_t pc,
+                                            const iree_vm_registers_t* regs,
+                                            FILE* file);
+
+#endif  // IREE_VM_BYTECODE_DISASM_H_
diff --git a/runtime/src/iree/vm/bytecode_dispatch.c b/runtime/src/iree/vm/bytecode_dispatch.c
new file mode 100644
index 0000000..2c5eb5f
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_dispatch.c
@@ -0,0 +1,2149 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/math.h"
+#include "iree/vm/api.h"
+#include "iree/vm/bytecode_disasm.h"
+#include "iree/vm/bytecode_dispatch_util.h"
+#include "iree/vm/bytecode_module_impl.h"
+#include "iree/vm/ops.h"
+
+//===----------------------------------------------------------------------===//
+// Register remapping utilities
+//===----------------------------------------------------------------------===//
+
+// Remaps registers from a source set to a destination set within the same
+// stack frame; effectively a conditional multi-mov sequence that avoids
+// encoding each conditional mov as its own bytecode op.
+//
+// The remap list must already be ordered so no swap hazards exist (such as
+// 0->1,1->0); the compiler's register allocator is responsible for ensuring
+// that whenever such a hazard could occur.
+static void iree_vm_bytecode_dispatch_remap_branch_registers(
+    const iree_vm_registers_t regs,
+    const iree_vm_register_remap_list_t* IREE_RESTRICT remap_list) {
+  for (int pair_index = 0; pair_index < remap_list->size; ++pair_index) {
+    // TODO(benvanik): change encoding to avoid this branching.
+    // Could write two arrays: one for prims and one for refs.
+    const uint16_t src_reg = remap_list->pairs[pair_index].src_reg;
+    const uint16_t dst_reg = remap_list->pairs[pair_index].dst_reg;
+    if (!(src_reg & IREE_REF_REGISTER_TYPE_BIT)) {
+      // Primitive register: plain 32-bit copy within the masked bank.
+      regs.i32[dst_reg & regs.i32_mask] = regs.i32[src_reg & regs.i32_mask];
+    } else {
+      // Ref register: retain-or-move based on the MOVE bit of the source.
+      iree_vm_ref_retain_or_move(src_reg & IREE_REF_REGISTER_MOVE_BIT,
+                                 &regs.ref[src_reg & regs.ref_mask],
+                                 &regs.ref[dst_reg & regs.ref_mask]);
+    }
+  }
+}
+
+// Releases (discards) each ref register in |reg_list| whose MOVE bit is set.
+// Lets dispatch eagerly drop resources it no longer needs, reducing memory
+// consumption when used effectively prior to yields/waits.
+static void iree_vm_bytecode_dispatch_discard_registers(
+    const iree_vm_registers_t regs,
+    const iree_vm_register_list_t* IREE_RESTRICT reg_list) {
+  // TODO(benvanik): change encoding to avoid this branching.
+  const uint32_t move_ref_bits =
+      IREE_REF_REGISTER_TYPE_BIT | IREE_REF_REGISTER_MOVE_BIT;
+  for (int ordinal = 0; ordinal < reg_list->size; ++ordinal) {
+    const uint16_t reg = reg_list->registers[ordinal];
+    // Only refs that are both ref-typed AND marked move get released here.
+    if ((reg & move_ref_bits) == move_ref_bits) {
+      iree_vm_ref_release(&regs.ref[reg & regs.ref_mask]);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Stack management
+//===----------------------------------------------------------------------===//
+
+// Computes the register pointers and bounds masks for |frame|'s storage.
+static iree_vm_registers_t iree_vm_bytecode_get_register_storage(
+    iree_vm_stack_frame_t* frame) {
+  const iree_vm_bytecode_frame_storage_t* stack_storage =
+      (iree_vm_bytecode_frame_storage_t*)iree_vm_stack_frame_storage(frame);
+
+  iree_vm_registers_t registers;
+  memset(&registers, 0, sizeof(registers));
+
+  // Masks indicate the valid bits of any register ordinal within the range
+  // allocated in the storage (e.g. 4 registers -> a 0b11 mask). A count of N
+  // yields an N-1 mask while a zero count must yield a zero mask rather than
+  // underflowing.
+  uint32_t i32_count = stack_storage->i32_register_count;
+  uint32_t ref_count = stack_storage->ref_register_count;
+  registers.i32_mask = (uint16_t)(i32_count ? i32_count - 1 : 0);
+  registers.ref_mask = (uint16_t)(ref_count ? ref_count - 1 : 0);
+
+  // Register storage immediately follows the stack storage header: first the
+  // i32 bank and then the ref bank, each at its recorded byte offset.
+  registers.i32 =
+      (int32_t*)((uintptr_t)stack_storage + stack_storage->i32_register_offset);
+  registers.ref = (iree_vm_ref_t*)((uintptr_t)stack_storage +
+                                   stack_storage->ref_register_offset);
+
+  return registers;
+}
+
+// Releases any refs still held in |frame|'s register storage upon frame exit.
+static void iree_vm_bytecode_stack_frame_cleanup(iree_vm_stack_frame_t* frame) {
+  iree_vm_registers_t regs = iree_vm_bytecode_get_register_storage(frame);
+  // TODO(benvanik): allow the VM to elide this when it's known that there are
+  // no more live registers.
+  // Walk ordinals 0..ref_mask inclusive (the mask is count-1 for a non-empty
+  // bank) and drop each still-populated ref.
+  for (uint16_t ordinal = 0;; ++ordinal) {
+    iree_vm_ref_t* ref = &regs.ref[ordinal];
+    if (ref->ptr != NULL) iree_vm_ref_release(ref);
+    if (ordinal == regs.ref_mask) break;
+  }
+}
+
+// Pushes a new bytecode stack frame for |function| onto |stack|.
+// Computes the callee frame size (header + i32 register bank + ref register
+// bank, each 16-byte aligned), allocates it via iree_vm_stack_function_enter,
+// and returns the new frame and its register storage pointers/masks.
+//
+// Fails with INVALID_ARGUMENT when the function ordinal is out of range and
+// RESOURCE_EXHAUSTED when the (pow2-rounded) register counts exceed the
+// register index mask capacity.
+static iree_status_t iree_vm_bytecode_function_enter(
+    iree_vm_stack_t* stack, const iree_vm_function_t function,
+    iree_vm_stack_frame_t** out_callee_frame,
+    iree_vm_registers_t* out_callee_registers) {
+  iree_vm_bytecode_module_t* module =
+      (iree_vm_bytecode_module_t*)function.module->self;
+  if (IREE_UNLIKELY(function.ordinal >= module->function_descriptor_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "import ordinal out of range");
+  }
+  const iree_vm_FunctionDescriptor_t* target_descriptor =
+      &module->function_descriptor_table[function.ordinal];
+
+  // We first compute the frame size of the callee and the masks we'll use to
+  // bounds check register access. This lets us allocate the entire frame
+  // (header, frame, and register storage) as a single pointer bump below.
+
+  // Round up register counts to the nearest power of 2 (if not already).
+  // This let's us use bit masks on register accesses to do bounds checking
+  // instead of more complex logic. The cost of these extra registers is only at
+  // worst 2x the required cost: so not large when thinking about the normal
+  // size of data used in an IREE app for tensors.
+  //
+  // Note that to allow the masking to work as a guard we need to ensure we at
+  // least allocate 1 register; this way an i32[reg & mask] will always point at
+  // valid memory even if mask == 0.
+  uint32_t i32_register_count = iree_math_round_up_to_pow2_u32(
+      VMMAX(1, target_descriptor->i32_register_count));
+  uint32_t ref_register_count = iree_math_round_up_to_pow2_u32(
+      VMMAX(1, target_descriptor->ref_register_count));
+  if (IREE_UNLIKELY(i32_register_count > IREE_I32_REGISTER_MASK) ||
+      IREE_UNLIKELY(ref_register_count > IREE_REF_REGISTER_MASK)) {
+    // Register count overflow. A valid compiler should never produce files that
+    // hit this.
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "register count overflow");
+  }
+
+  // We need to align the ref register start to the natural machine
+  // alignment in case the compiler is expecting that (it makes it easier to
+  // debug too).
+  iree_host_size_t header_size =
+      iree_host_align(sizeof(iree_vm_bytecode_frame_storage_t), 16);
+  iree_host_size_t i32_register_size =
+      iree_host_align(i32_register_count * sizeof(int32_t), 16);
+  iree_host_size_t ref_register_size =
+      iree_host_align(ref_register_count * sizeof(iree_vm_ref_t), 16);
+  iree_host_size_t frame_size =
+      header_size + i32_register_size + ref_register_size;
+
+  // Enter function and allocate stack frame storage.
+  // The cleanup callback releases any refs left in register storage on exit.
+  IREE_RETURN_IF_ERROR(iree_vm_stack_function_enter(
+      stack, &function, IREE_VM_STACK_FRAME_BYTECODE, frame_size,
+      iree_vm_bytecode_stack_frame_cleanup, out_callee_frame));
+
+  // Stash metadata and compute register pointers.
+  // The offsets recorded here are what get_register_storage uses to locate
+  // each register bank relative to the storage header.
+  iree_vm_bytecode_frame_storage_t* stack_storage =
+      (iree_vm_bytecode_frame_storage_t*)iree_vm_stack_frame_storage(
+          *out_callee_frame);
+  stack_storage->i32_register_count = i32_register_count;
+  stack_storage->ref_register_count = ref_register_count;
+  stack_storage->i32_register_offset = header_size;
+  stack_storage->ref_register_offset = header_size + i32_register_size;
+  *out_callee_registers =
+      iree_vm_bytecode_get_register_storage(*out_callee_frame);
+
+  return iree_ok_status();
+}
+
+// Enters an internal bytecode stack frame from an external caller.
+// A new |out_callee_frame| will be pushed to the stack with storage space for
+// the registers used by the function and |arguments| will be marshaled into the
+// ABI-defined registers.
+//
+// Note that callers are expected to have matched our expectations for
+// |arguments| and we don't validate that here.
+static iree_status_t iree_vm_bytecode_external_enter(
+    iree_vm_stack_t* stack, const iree_vm_function_t function,
+    iree_string_view_t cconv_arguments, iree_byte_span_t arguments,
+    iree_vm_stack_frame_t** out_callee_frame,
+    iree_vm_registers_t* out_callee_registers) {
+  // Enter the bytecode function and allocate registers.
+  IREE_RETURN_IF_ERROR(iree_vm_bytecode_function_enter(
+      stack, function, out_callee_frame, out_callee_registers));
+
+  // Marshal arguments from the ABI format to the VM registers.
+  // |p| walks the packed argument buffer in cconv order; i32/ref register
+  // ordinals are assigned sequentially within their respective banks.
+  iree_vm_registers_t callee_registers = *out_callee_registers;
+  uint16_t i32_reg = 0;
+  uint16_t ref_reg = 0;
+  const uint8_t* p = arguments.data;
+  for (iree_host_size_t i = 0; i < cconv_arguments.size; ++i) {
+    switch (cconv_arguments.data[i]) {
+      case IREE_VM_CCONV_TYPE_VOID:
+        break;
+      case IREE_VM_CCONV_TYPE_I32:
+      case IREE_VM_CCONV_TYPE_F32: {
+        // 32-bit primitives occupy a single i32 register slot.
+        uint16_t dst_reg = i32_reg++;
+        memcpy(&callee_registers.i32[dst_reg & callee_registers.i32_mask], p,
+               sizeof(int32_t));
+        p += sizeof(int32_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_I64:
+      case IREE_VM_CCONV_TYPE_F64: {
+        // 64-bit primitives consume two consecutive i32 register slots.
+        uint16_t dst_reg = i32_reg;
+        i32_reg += 2;
+        memcpy(&callee_registers.i32[dst_reg & callee_registers.i32_mask], p,
+               sizeof(int64_t));
+        p += sizeof(int64_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_REF: {
+        // Refs are moved (ownership transfers from the argument buffer to the
+        // callee register).
+        uint16_t dst_reg = ref_reg++;
+        iree_vm_ref_move(
+            (iree_vm_ref_t*)p,
+            &callee_registers.ref[dst_reg & callee_registers.ref_mask]);
+        p += sizeof(iree_vm_ref_t);
+      } break;
+    }
+  }
+
+  return iree_ok_status();
+}
+
+// Leaves an internal bytecode stack frame and returns to an external caller.
+// Registers will be marshaled from the |src_reg_list| to the |results| buffer.
+//
+// Note that callers are expected to have matched our expectations for
+// |results| and we don't validate that here.
+static iree_status_t iree_vm_bytecode_external_leave(
+    iree_vm_stack_t* stack, iree_vm_stack_frame_t* callee_frame,
+    const iree_vm_registers_t* IREE_RESTRICT callee_registers,
+    const iree_vm_register_list_t* IREE_RESTRICT src_reg_list,
+    iree_string_view_t cconv_results, iree_byte_span_t results) {
+  // Marshal results from registers to the ABI results buffer.
+  // |src_reg_list| supplies one source register per cconv result entry.
+  // NOTE(review): |callee_frame| is not referenced in this function.
+  uint8_t* p = results.data;
+  for (iree_host_size_t i = 0; i < cconv_results.size; ++i) {
+    uint16_t src_reg = src_reg_list->registers[i];
+    switch (cconv_results.data[i]) {
+      case IREE_VM_CCONV_TYPE_VOID:
+        break;
+      case IREE_VM_CCONV_TYPE_I32:
+      case IREE_VM_CCONV_TYPE_F32: {
+        memcpy(p, &callee_registers->i32[src_reg & callee_registers->i32_mask],
+               sizeof(int32_t));
+        p += sizeof(int32_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_I64:
+      case IREE_VM_CCONV_TYPE_F64: {
+        // Masking with (i32_mask & ~1) clamps the index to an even register
+        // so the 8-byte read stays inside the i32 register bank.
+        memcpy(
+            p,
+            &callee_registers->i32[src_reg & (callee_registers->i32_mask & ~1)],
+            sizeof(int64_t));
+        p += sizeof(int64_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_REF: {
+        // The MOVE bit encoded in the register index selects whether the
+        // callee's ref is transferred (move) or retained into the results.
+        iree_vm_ref_retain_or_move(
+            src_reg & IREE_REF_REGISTER_MOVE_BIT,
+            &callee_registers->ref[src_reg & callee_registers->ref_mask],
+            (iree_vm_ref_t*)p);
+        p += sizeof(iree_vm_ref_t);
+      } break;
+    }
+  }
+
+  // Leave and deallocate bytecode stack frame.
+  return iree_vm_stack_function_leave(stack);
+}
+
+// Enters an internal bytecode stack frame from a parent bytecode frame.
+// Registers in |src_reg_list| will be marshaled into the callee frame and the
+// |dst_reg_list| will be stashed for use when leaving the frame.
+static iree_status_t iree_vm_bytecode_internal_enter(
+    iree_vm_stack_t* stack, iree_vm_module_t* module, int32_t function_ordinal,
+    const iree_vm_register_list_t* IREE_RESTRICT src_reg_list,
+    const iree_vm_register_list_t* IREE_RESTRICT dst_reg_list,
+    iree_vm_stack_frame_t** out_callee_frame,
+    iree_vm_registers_t* out_callee_registers) {
+  // Stash the destination register list for result values on the caller.
+  // iree_vm_bytecode_internal_leave reads this back when the callee returns.
+  iree_vm_bytecode_frame_storage_t* caller_storage =
+      (iree_vm_bytecode_frame_storage_t*)iree_vm_stack_frame_storage(
+          iree_vm_stack_current_frame(stack));
+  caller_storage->return_registers = dst_reg_list;
+
+  // NOTE: after this call the caller registers may be invalid and need to be
+  // requeried.
+  iree_vm_function_t function;
+  function.module = module;
+  function.linkage = IREE_VM_FUNCTION_LINKAGE_INTERNAL;
+  function.ordinal = function_ordinal;
+  IREE_RETURN_IF_ERROR(iree_vm_bytecode_function_enter(
+      stack, function, out_callee_frame, out_callee_registers));
+
+  // Remaps argument/result registers from a source list in the caller/callee
+  // frame to the 0-N ABI registers in the callee/caller frame.
+  // This assumes that the destination stack frame registers are unused and ok
+  // to overwrite directly. Each bank begins left-aligned at 0 and increments
+  // per arg of its type.
+  iree_vm_registers_t src_regs =
+      iree_vm_bytecode_get_register_storage(iree_vm_stack_parent_frame(stack));
+  iree_vm_registers_t* dst_regs = out_callee_registers;
+  int i32_reg_offset = 0;
+  int ref_reg_offset = 0;
+  for (int i = 0; i < src_reg_list->size; ++i) {
+    // TODO(benvanik): change encoding to avoid this branching.
+    // Could write two arrays: one for prims and one for refs.
+    uint16_t src_reg = src_reg_list->registers[i];
+    if (src_reg & IREE_REF_REGISTER_TYPE_BIT) {
+      uint16_t dst_reg = ref_reg_offset++;
+      // Zero the destination slot first so retain_or_move does not interpret
+      // stale frame-storage bytes as a live ref — NOTE(review): presumably
+      // frame storage is not zero-initialized on enter; confirm against
+      // iree_vm_stack_function_enter.
+      memset(&dst_regs->ref[dst_reg & dst_regs->ref_mask], 0,
+             sizeof(iree_vm_ref_t));
+      iree_vm_ref_retain_or_move(src_reg & IREE_REF_REGISTER_MOVE_BIT,
+                                 &src_regs.ref[src_reg & src_regs.ref_mask],
+                                 &dst_regs->ref[dst_reg & dst_regs->ref_mask]);
+    } else {
+      uint16_t dst_reg = i32_reg_offset++;
+      dst_regs->i32[dst_reg & dst_regs->i32_mask] =
+          src_regs.i32[src_reg & src_regs.i32_mask];
+    }
+  }
+
+  return iree_ok_status();
+}
+
+// Leaves an internal bytecode stack frame and returns to the parent bytecode
+// frame. |src_reg_list| registers will be marshaled into the dst_reg_list
+// provided by the caller frame when entering.
+static iree_status_t iree_vm_bytecode_internal_leave(
+    iree_vm_stack_t* stack, iree_vm_stack_frame_t* callee_frame,
+    const iree_vm_registers_t callee_registers,
+    const iree_vm_register_list_t* IREE_RESTRICT src_reg_list,
+    iree_vm_stack_frame_t** out_caller_frame,
+    iree_vm_registers_t* out_caller_registers) {
+  // Remaps registers from source to destination across frames.
+  // Registers from the |src_regs| will be copied/moved to |dst_regs| with the
+  // mappings provided by |src_reg_list| and |dst_reg_list|. It's assumed that
+  // the mappings are matching by type and - in the case that they aren't -
+  // things will get weird (but not crash).
+  // NOTE(review): |callee_frame| is not referenced in this function.
+  *out_caller_frame = iree_vm_stack_parent_frame(stack);
+  iree_vm_bytecode_frame_storage_t* caller_storage =
+      (iree_vm_bytecode_frame_storage_t*)iree_vm_stack_frame_storage(
+          *out_caller_frame);
+  // The destination list was stashed by iree_vm_bytecode_internal_enter when
+  // the caller issued the call.
+  const iree_vm_register_list_t* dst_reg_list =
+      caller_storage->return_registers;
+  // VMCHECK is presumably a debug-only assert; the runtime check below keeps
+  // release builds safe as well — TODO(review): confirm VMCHECK semantics.
+  VMCHECK(src_reg_list->size <= dst_reg_list->size);
+  if (IREE_UNLIKELY(src_reg_list->size > dst_reg_list->size)) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "src/dst reg count mismatch on internal return");
+  }
+  iree_vm_registers_t caller_registers =
+      iree_vm_bytecode_get_register_storage(*out_caller_frame);
+  for (int i = 0; i < src_reg_list->size; ++i) {
+    // TODO(benvanik): change encoding to avoid this branching.
+    // Could write two arrays: one for prims and one for refs.
+    uint16_t src_reg = src_reg_list->registers[i];
+    uint16_t dst_reg = dst_reg_list->registers[i];
+    if (src_reg & IREE_REF_REGISTER_TYPE_BIT) {
+      // MOVE bit on the source register selects move vs. retain into the
+      // caller's ref register.
+      iree_vm_ref_retain_or_move(
+          src_reg & IREE_REF_REGISTER_MOVE_BIT,
+          &callee_registers.ref[src_reg & callee_registers.ref_mask],
+          &caller_registers.ref[dst_reg & caller_registers.ref_mask]);
+    } else {
+      caller_registers.i32[dst_reg & caller_registers.i32_mask] =
+          callee_registers.i32[src_reg & callee_registers.i32_mask];
+    }
+  }
+
+  // Leave and deallocate bytecode stack frame.
+  *out_caller_registers = caller_registers;
+  return iree_vm_stack_function_leave(stack);
+}
+
+// Populates the ABI argument buffer |storage| for an import call by marshaling
+// caller registers (|src_reg_list|) in the order dictated by the
+// |cconv_arguments| calling convention string. For variadic calls
+// |segment_size_list| supplies the element count of each span segment.
+static void iree_vm_bytecode_populate_import_cconv_arguments(
+    iree_string_view_t cconv_arguments,
+    const iree_vm_registers_t caller_registers,
+    const iree_vm_register_list_t* IREE_RESTRICT segment_size_list,
+    const iree_vm_register_list_t* IREE_RESTRICT src_reg_list,
+    iree_byte_span_t storage) {
+  uint8_t* IREE_RESTRICT p = storage.data;
+  // |i| walks the cconv string, |seg_i| tracks the segment index used for
+  // span counts, and |reg_i| consumes source registers in order.
+  for (iree_host_size_t i = 0, seg_i = 0, reg_i = 0; i < cconv_arguments.size;
+       ++i, ++seg_i) {
+    switch (cconv_arguments.data[i]) {
+      case IREE_VM_CCONV_TYPE_VOID:
+        break;
+      case IREE_VM_CCONV_TYPE_I32:
+      case IREE_VM_CCONV_TYPE_F32: {
+        memcpy(p,
+               &caller_registers.i32[src_reg_list->registers[reg_i++] &
+                                     caller_registers.i32_mask],
+               sizeof(int32_t));
+        p += sizeof(int32_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_I64:
+      case IREE_VM_CCONV_TYPE_F64: {
+        // 64-bit values: (i32_mask & ~1) clamps to an even register index so
+        // the 8-byte read stays inside the i32 register bank.
+        memcpy(p,
+               &caller_registers.i32[src_reg_list->registers[reg_i++] &
+                                     (caller_registers.i32_mask & ~1)],
+               sizeof(int64_t));
+        p += sizeof(int64_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_REF: {
+        uint16_t src_reg = src_reg_list->registers[reg_i++];
+        // Assign rather than retain/move — NOTE(review): presumably the
+        // caller's register keeps ownership, so the caller frame must stay
+        // alive for the duration of the import call; confirm
+        // iree_vm_ref_assign semantics.
+        iree_vm_ref_assign(
+            &caller_registers.ref[src_reg & caller_registers.ref_mask],
+            (iree_vm_ref_t*)p);
+        p += sizeof(iree_vm_ref_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_SPAN_START: {
+        VMCHECK(segment_size_list);
+        // The span's element count is written first as an i32 prefix.
+        int32_t span_count = segment_size_list->registers[seg_i];
+        memcpy(p, &span_count, sizeof(int32_t));
+        p += sizeof(int32_t);
+        if (!span_count) {
+          // No items; skip the span.
+          // Advance |i| to the SPAN_END marker; the outer loop's ++i then
+          // steps past it.
+          do {
+            ++i;
+          } while (i < cconv_arguments.size &&
+                   cconv_arguments.data[i] != IREE_VM_CCONV_TYPE_SPAN_END);
+          continue;
+        }
+        // Replay the span body once per element, consuming source registers
+        // on each pass; after the final pass |i| rests on the SPAN_END.
+        iree_host_size_t span_start_i = i + 1;
+        for (int32_t j = 0; j < span_count; ++j) {
+          for (i = span_start_i;
+               i < cconv_arguments.size &&
+               cconv_arguments.data[i] != IREE_VM_CCONV_TYPE_SPAN_END;
+               ++i) {
+            // TODO(benvanik): share with switch above.
+            switch (cconv_arguments.data[i]) {
+              case IREE_VM_CCONV_TYPE_VOID:
+                break;
+              case IREE_VM_CCONV_TYPE_I32:
+              case IREE_VM_CCONV_TYPE_F32: {
+                memcpy(p,
+                       &caller_registers.i32[src_reg_list->registers[reg_i++] &
+                                             caller_registers.i32_mask],
+                       sizeof(int32_t));
+                p += sizeof(int32_t);
+              } break;
+              case IREE_VM_CCONV_TYPE_I64:
+              case IREE_VM_CCONV_TYPE_F64: {
+                memcpy(p,
+                       &caller_registers.i32[src_reg_list->registers[reg_i++] &
+                                             (caller_registers.i32_mask & ~1)],
+                       sizeof(int64_t));
+                p += sizeof(int64_t);
+              } break;
+              case IREE_VM_CCONV_TYPE_REF: {
+                uint16_t src_reg = src_reg_list->registers[reg_i++];
+                iree_vm_ref_assign(
+                    &caller_registers.ref[src_reg & caller_registers.ref_mask],
+                    (iree_vm_ref_t*)p);
+                p += sizeof(iree_vm_ref_t);
+              } break;
+            }
+          }
+        }
+      } break;
+    }
+  }
+}
+
+// Issues a populated import call and marshals the results into |dst_reg_list|.
+// |call.arguments| must already be populated; on success the caller frame and
+// registers are requeried and returned via the out params.
+static iree_status_t iree_vm_bytecode_issue_import_call(
+    iree_vm_stack_t* stack, const iree_vm_function_call_t call,
+    iree_string_view_t cconv_results,
+    const iree_vm_register_list_t* IREE_RESTRICT dst_reg_list,
+    iree_vm_stack_frame_t** out_caller_frame,
+    iree_vm_registers_t* out_caller_registers,
+    iree_vm_execution_result_t* out_result) {
+  // Call external function.
+  // Dispatched through the target module's begin_call vtable entry.
+  iree_status_t call_status = call.function.module->begin_call(
+      call.function.module->self, stack, &call, out_result);
+  if (IREE_UNLIKELY(!iree_status_is_ok(call_status))) {
+    // TODO(benvanik): set execution result to failure/capture stack.
+    return iree_status_annotate(call_status,
+                                iree_make_cstring_view("while calling import"));
+  }
+
+  // NOTE: we don't support yielding within imported functions right now so it's
+  // safe to assume the stack is still valid here. If the called function can
+  // yield then we'll need to requery all pointers here.
+  *out_caller_frame = iree_vm_stack_current_frame(stack);
+  *out_caller_registers =
+      iree_vm_bytecode_get_register_storage(*out_caller_frame);
+
+  // Marshal outputs from the ABI results buffer to registers.
+  // The loop is bounded by both the cconv string and the destination register
+  // list so a short register list cannot be overrun.
+  iree_vm_registers_t caller_registers = *out_caller_registers;
+  uint8_t* IREE_RESTRICT p = call.results.data;
+  for (iree_host_size_t i = 0; i < cconv_results.size && i < dst_reg_list->size;
+       ++i) {
+    uint16_t dst_reg = dst_reg_list->registers[i];
+    switch (cconv_results.data[i]) {
+      case IREE_VM_CCONV_TYPE_VOID:
+        break;
+      case IREE_VM_CCONV_TYPE_I32:
+      case IREE_VM_CCONV_TYPE_F32:
+        memcpy(&caller_registers.i32[dst_reg & caller_registers.i32_mask], p,
+               sizeof(int32_t));
+        p += sizeof(int32_t);
+        break;
+      case IREE_VM_CCONV_TYPE_I64:
+      case IREE_VM_CCONV_TYPE_F64:
+        // 64-bit results land in an even-aligned i32 register pair
+        // ((i32_mask & ~1) forces an even index).
+        memcpy(
+            &caller_registers.i32[dst_reg & (caller_registers.i32_mask & ~1)],
+            p, sizeof(int64_t));
+        p += sizeof(int64_t);
+        break;
+      case IREE_VM_CCONV_TYPE_REF:
+        // Move: ownership transfers from the results buffer to the register.
+        iree_vm_ref_move(
+            (iree_vm_ref_t*)p,
+            &caller_registers.ref[dst_reg & caller_registers.ref_mask]);
+        p += sizeof(iree_vm_ref_t);
+        break;
+    }
+  }
+
+  return iree_ok_status();
+}
+
+// Verifies that the requested import is valid and returns its table entry.
+// On success |out_import| points into |module_state|'s import table (no
+// ownership transfer).
+static iree_status_t iree_vm_bytecode_verify_import(
+    iree_vm_stack_t* stack, const iree_vm_bytecode_module_state_t* module_state,
+    uint32_t import_ordinal, const iree_vm_bytecode_import_t** out_import) {
+  *out_import = NULL;
+
+  // Strip the high bit before indexing — NOTE(review): presumably a flag bit
+  // in the bytecode's import ordinal encoding; confirm against the encoder.
+  import_ordinal &= 0x7FFFFFFFu;
+  if (IREE_UNLIKELY(import_ordinal >= module_state->import_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "import ordinal %u out of range", import_ordinal);
+  }
+
+  const iree_vm_bytecode_import_t* import =
+      &module_state->import_table[import_ordinal];
+  if (!import->function.module) {
+    // A NULL module means the (optional) import was never resolved; look up
+    // the declaration only to produce a useful name in the error message.
+    iree_vm_function_t decl_function;
+    IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_ordinal(
+        iree_vm_stack_current_frame(stack)->function.module,
+        IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL, import_ordinal,
+        &decl_function));
+    iree_string_view_t import_name = iree_vm_function_name(&decl_function);
+    // (void) cast presumably silences unused warnings in builds where status
+    // message formatting is compiled out — confirm.
+    (void)import_name;
+    return iree_make_status(IREE_STATUS_NOT_FOUND,
+                            "optional import `%.*s` (ordinal %u) not resolved",
+                            (int)import_name.size, import_name.data,
+                            import_ordinal);
+  }
+
+  *out_import = import;
+  return iree_ok_status();
+}
+
+// Calls an imported function from another module.
+// Marshals the |src_reg_list| registers into ABI storage and results into
+// |dst_reg_list|.
+static iree_status_t iree_vm_bytecode_call_import(
+    iree_vm_stack_t* stack, const iree_vm_bytecode_module_state_t* module_state,
+    uint32_t import_ordinal, const iree_vm_registers_t caller_registers,
+    const iree_vm_register_list_t* IREE_RESTRICT src_reg_list,
+    const iree_vm_register_list_t* IREE_RESTRICT dst_reg_list,
+    iree_vm_stack_frame_t** out_caller_frame,
+    iree_vm_registers_t* out_caller_registers,
+    iree_vm_execution_result_t* out_result) {
+  // Prepare |call| by looking up the import information.
+  const iree_vm_bytecode_import_t* import = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_bytecode_verify_import(stack, module_state,
+                                                      import_ordinal, &import));
+
+  iree_vm_function_call_t call;
+  memset(&call, 0, sizeof(call));
+  call.function = import->function;
+
+  // Marshal inputs from registers to the ABI arguments buffer.
+  // Buffer sizes come precomputed from the import table; the buffers live on
+  // the native stack (alloca) for the duration of this call only, and are
+  // zeroed so unwritten padding bytes are deterministic.
+  call.arguments.data_length = import->argument_buffer_size;
+  call.arguments.data = iree_alloca(call.arguments.data_length);
+  memset(call.arguments.data, 0, call.arguments.data_length);
+  iree_vm_bytecode_populate_import_cconv_arguments(
+      import->arguments, caller_registers,
+      /*segment_size_list=*/NULL, src_reg_list, call.arguments);
+
+  // Issue the call and handle results.
+  call.results.data_length = import->result_buffer_size;
+  call.results.data = iree_alloca(call.results.data_length);
+  memset(call.results.data, 0, call.results.data_length);
+  return iree_vm_bytecode_issue_import_call(stack, call, import->results,
+                                            dst_reg_list, out_caller_frame,
+                                            out_caller_registers, out_result);
+}
+
+// Calls a variadic imported function from another module.
+// Marshals the |src_reg_list| registers into ABI storage and results into
+// |dst_reg_list|. |segment_size_list| contains the counts within each segment.
+static iree_status_t iree_vm_bytecode_call_import_variadic(
+    iree_vm_stack_t* stack, const iree_vm_bytecode_module_state_t* module_state,
+    uint32_t import_ordinal, const iree_vm_registers_t caller_registers,
+    const iree_vm_register_list_t* IREE_RESTRICT segment_size_list,
+    const iree_vm_register_list_t* IREE_RESTRICT src_reg_list,
+    const iree_vm_register_list_t* IREE_RESTRICT dst_reg_list,
+    iree_vm_stack_frame_t** out_caller_frame,
+    iree_vm_registers_t* out_caller_registers,
+    iree_vm_execution_result_t* out_result) {
+  // Prepare |call| by looking up the import information.
+  const iree_vm_bytecode_import_t* import = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_bytecode_verify_import(stack, module_state,
+                                                      import_ordinal, &import));
+
+  iree_vm_function_call_t call;
+  memset(&call, 0, sizeof(call));
+  call.function = import->function;
+
+  // Allocate ABI argument/result storage taking into account the variadic
+  // segments.
+  // Unlike the non-variadic path the argument size depends on runtime segment
+  // counts, so it is computed here instead of read from the import table. The
+  // buffers live on the native stack (alloca) for this call only.
+  IREE_RETURN_IF_ERROR(iree_vm_function_call_compute_cconv_fragment_size(
+      import->arguments, segment_size_list, &call.arguments.data_length));
+  call.arguments.data = iree_alloca(call.arguments.data_length);
+  memset(call.arguments.data, 0, call.arguments.data_length);
+
+  // Marshal inputs from registers to the ABI arguments buffer.
+  iree_vm_bytecode_populate_import_cconv_arguments(
+      import->arguments, caller_registers, segment_size_list, src_reg_list,
+      call.arguments);
+
+  // Issue the call and handle results.
+  call.results.data_length = import->result_buffer_size;
+  call.results.data = iree_alloca(call.results.data_length);
+  memset(call.results.data, 0, call.results.data_length);
+  return iree_vm_bytecode_issue_import_call(stack, call, import->results,
+                                            dst_reg_list, out_caller_frame,
+                                            out_caller_registers, out_result);
+}
+
+//===----------------------------------------------------------------------===//
+// Main interpreter dispatch routine
+//===----------------------------------------------------------------------===//
+
+iree_status_t iree_vm_bytecode_dispatch(
+    iree_vm_stack_t* stack, iree_vm_bytecode_module_t* module,
+    const iree_vm_function_call_t* call, iree_string_view_t cconv_arguments,
+    iree_string_view_t cconv_results, iree_vm_execution_result_t* out_result) {
+  memset(out_result, 0, sizeof(*out_result));
+
+  // When required emit the dispatch tables here referencing the labels we are
+  // defining below.
+  DEFINE_DISPATCH_TABLES();
+
+  // Enter function (as this is the initial call).
+  // The callee's return will take care of storing the output registers when it
+  // actually does return, either immediately or in the future via a resume.
+  iree_vm_stack_frame_t* current_frame = NULL;
+  iree_vm_registers_t regs;
+  IREE_RETURN_IF_ERROR(
+      iree_vm_bytecode_external_enter(stack, call->function, cconv_arguments,
+                                      call->arguments, &current_frame, &regs));
+
+  // Primary dispatch state. This is our 'native stack frame' and really
+  // just enough to make dereferencing common addresses (like the current
+  // offset) faster. You can think of this like CPU state (like PC).
+  //
+  // The hope is that the compiler decides to keep these in registers (as
+  // they are touched for every instruction executed). The frame will change
+  // as we call into different functions.
+  const iree_vm_bytecode_module_state_t* IREE_RESTRICT module_state =
+      (iree_vm_bytecode_module_state_t*)current_frame->module_state;
+  const uint8_t* IREE_RESTRICT bytecode_data =
+      module->bytecode_data.data +
+      module->function_descriptor_table[current_frame->function.ordinal]
+          .bytecode_offset;
+  iree_vm_source_offset_t pc = current_frame->pc;
+  const int32_t entry_frame_depth = current_frame->depth;
+
+  BEGIN_DISPATCH_CORE() {
+    //===------------------------------------------------------------------===//
+    // Globals
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP(CORE, GlobalLoadI32, {
+      uint32_t byte_offset = VM_DecGlobalAttr("global");
+      if (IREE_UNLIKELY(byte_offset >=
+                        module_state->rwdata_storage.data_length)) {
+        return iree_make_status(
+            IREE_STATUS_OUT_OF_RANGE,
+            "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+            module_state->rwdata_storage.data_length);
+      }
+      int32_t* value = VM_DecResultRegI32("value");
+      const int32_t global_value =
+          vm_global_load_i32(module_state->rwdata_storage.data, byte_offset);
+      *value = global_value;
+    });
+
+    DISPATCH_OP(CORE, GlobalStoreI32, {
+      uint32_t byte_offset = VM_DecGlobalAttr("global");
+      if (IREE_UNLIKELY(byte_offset >=
+                        module_state->rwdata_storage.data_length)) {
+        return iree_make_status(
+            IREE_STATUS_OUT_OF_RANGE,
+            "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+            module_state->rwdata_storage.data_length);
+      }
+      int32_t value = VM_DecOperandRegI32("value");
+      vm_global_store_i32(module_state->rwdata_storage.data, byte_offset,
+                          value);
+    });
+
+    DISPATCH_OP(CORE, GlobalLoadIndirectI32, {
+      uint32_t byte_offset = VM_DecOperandRegI32("global");
+      if (IREE_UNLIKELY(byte_offset >=
+                        module_state->rwdata_storage.data_length)) {
+        return iree_make_status(
+            IREE_STATUS_OUT_OF_RANGE,
+            "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+            module_state->rwdata_storage.data_length);
+      }
+      int32_t* value = VM_DecResultRegI32("value");
+      const int32_t global_value =
+          vm_global_load_i32(module_state->rwdata_storage.data, byte_offset);
+      *value = global_value;
+    });
+
+    DISPATCH_OP(CORE, GlobalStoreIndirectI32, {
+      uint32_t byte_offset = VM_DecOperandRegI32("global");
+      if (IREE_UNLIKELY(byte_offset >=
+                        module_state->rwdata_storage.data_length)) {
+        return iree_make_status(
+            IREE_STATUS_OUT_OF_RANGE,
+            "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+            module_state->rwdata_storage.data_length);
+      }
+      int32_t value = VM_DecOperandRegI32("value");
+      vm_global_store_i32(module_state->rwdata_storage.data, byte_offset,
+                          value);
+    });
+
+    DISPATCH_OP(CORE, GlobalLoadRef, {
+      uint32_t global = VM_DecGlobalAttr("global");
+      if (IREE_UNLIKELY(global >= module_state->global_ref_count)) {
+        return iree_make_status(
+            IREE_STATUS_OUT_OF_RANGE,
+            "global ref ordinal out of range: %d (table=%zu)", global,
+            module_state->global_ref_count);
+      }
+      const iree_vm_type_def_t* type_def = VM_DecTypeOf("value");
+      bool result_is_move;
+      iree_vm_ref_t* result = VM_DecResultRegRef("value", &result_is_move);
+      iree_vm_ref_t* global_ref = &module_state->global_ref_table[global];
+      IREE_RETURN_IF_ERROR(iree_vm_ref_retain_or_move_checked(
+          result_is_move, global_ref, type_def->ref_type, result));
+    });
+
+    DISPATCH_OP(CORE, GlobalStoreRef, {
+      uint32_t global = VM_DecGlobalAttr("global");
+      if (IREE_UNLIKELY(global >= module_state->global_ref_count)) {
+        return iree_make_status(
+            IREE_STATUS_OUT_OF_RANGE,
+            "global ref ordinal out of range: %d (table=%zu)", global,
+            module_state->global_ref_count);
+      }
+      const iree_vm_type_def_t* type_def = VM_DecTypeOf("value");
+      bool value_is_move;
+      iree_vm_ref_t* value = VM_DecOperandRegRef("value", &value_is_move);
+      iree_vm_ref_t* global_ref = &module_state->global_ref_table[global];
+      IREE_RETURN_IF_ERROR(iree_vm_ref_retain_or_move_checked(
+          value_is_move, value, type_def->ref_type, global_ref));
+    });
+
+    DISPATCH_OP(CORE, GlobalLoadIndirectRef, {
+      uint32_t global = VM_DecOperandRegI32("global");
+      if (IREE_UNLIKELY(global >= module_state->global_ref_count)) {
+        return iree_make_status(
+            IREE_STATUS_OUT_OF_RANGE,
+            "global ref ordinal out of range: %d (table=%zu)", global,
+            module_state->global_ref_count);
+      }
+      const iree_vm_type_def_t* type_def = VM_DecTypeOf("value");
+      bool result_is_move;
+      iree_vm_ref_t* result = VM_DecResultRegRef("value", &result_is_move);
+      iree_vm_ref_t* global_ref = &module_state->global_ref_table[global];
+      IREE_RETURN_IF_ERROR(iree_vm_ref_retain_or_move_checked(
+          result_is_move, global_ref, type_def->ref_type, result));
+    });
+
+    DISPATCH_OP(CORE, GlobalStoreIndirectRef, {
+      uint32_t global = VM_DecOperandRegI32("global");
+      if (IREE_UNLIKELY(global >= module_state->global_ref_count)) {
+        return iree_make_status(
+            IREE_STATUS_OUT_OF_RANGE,
+            "global ref ordinal out of range: %d (table=%zu)", global,
+            module_state->global_ref_count);
+      }
+      const iree_vm_type_def_t* type_def = VM_DecTypeOf("value");
+      bool value_is_move;
+      iree_vm_ref_t* value = VM_DecOperandRegRef("value", &value_is_move);
+      iree_vm_ref_t* global_ref = &module_state->global_ref_table[global];
+      IREE_RETURN_IF_ERROR(iree_vm_ref_retain_or_move_checked(
+          value_is_move, value, type_def->ref_type, global_ref));
+    });
+
+    //===------------------------------------------------------------------===//
+    // Constants
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP(CORE, ConstI32, {
+      int32_t value = VM_DecIntAttr32("value");
+      int32_t* result = VM_DecResultRegI32("result");
+      *result = value;
+    });
+
+    DISPATCH_OP(CORE, ConstI32Zero, {
+      int32_t* result = VM_DecResultRegI32("result");
+      *result = 0;
+    });
+
+    DISPATCH_OP(CORE, ConstRefZero, {
+      bool result_is_move;
+      iree_vm_ref_t* result = VM_DecResultRegRef("result", &result_is_move);
+      iree_vm_ref_release(result);
+    });
+
+    DISPATCH_OP(CORE, ConstRefRodata, {
+      uint32_t rodata_ordinal = VM_DecRodataAttr("rodata");
+      if (IREE_UNLIKELY(rodata_ordinal >= module_state->rodata_ref_count)) {
+        return iree_make_status(
+            IREE_STATUS_OUT_OF_RANGE,
+            "rodata ref ordinal out of range: %d (table=%zu)", rodata_ordinal,
+            module_state->rodata_ref_count);
+      }
+      bool result_is_move;
+      iree_vm_ref_t* result = VM_DecResultRegRef("value", &result_is_move);
+      IREE_RETURN_IF_ERROR(iree_vm_ref_wrap_retain(
+          &module_state->rodata_ref_table[rodata_ordinal],
+          iree_vm_buffer_type_id(), result));
+    });
+
+    //===------------------------------------------------------------------===//
+    // Buffers
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP(CORE, BufferAlloc, {
+      uint32_t length = VM_DecOperandRegI32("length");
+      bool result_is_move;
+      iree_vm_ref_t* result_ref = VM_DecResultRegRef("result", &result_is_move);
+      iree_vm_buffer_t* buffer = NULL;
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_create(
+          IREE_VM_BUFFER_ACCESS_MUTABLE | IREE_VM_BUFFER_ACCESS_ORIGIN_GUEST,
+          length, module_state->allocator, &buffer));
+      IREE_RETURN_IF_ERROR(iree_vm_ref_wrap_assign(
+          buffer, iree_vm_buffer_type_id(), result_ref));
+    });
+
+    DISPATCH_OP(CORE, BufferClone, {
+      bool source_is_move;
+      iree_vm_ref_t* source_ref =
+          VM_DecOperandRegRef("source", &source_is_move);
+      iree_vm_buffer_t* source = iree_vm_buffer_deref(*source_ref);
+      if (IREE_UNLIKELY(!source)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "source is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("offset");
+      uint32_t length = VM_DecOperandRegI32("length");
+      bool result_is_move;
+      iree_vm_ref_t* result_ref = VM_DecResultRegRef("result", &result_is_move);
+      iree_vm_buffer_t* result = NULL;
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_clone(
+          IREE_VM_BUFFER_ACCESS_MUTABLE | IREE_VM_BUFFER_ACCESS_ORIGIN_GUEST,
+          source, offset, length, module_state->allocator, &result));
+      IREE_RETURN_IF_ERROR(iree_vm_ref_wrap_assign(
+          result, iree_vm_buffer_type_id(), result_ref));
+    });
+
+    DISPATCH_OP(CORE, BufferLength, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "buffer is null");
+      }
+      uint32_t* result = VM_DecResultRegI32("result");
+      *result = (uint32_t)iree_vm_buffer_length(buffer);
+    });
+
+    DISPATCH_OP(CORE, BufferCopy, {
+      bool source_buffer_is_move;
+      iree_vm_ref_t* source_buffer_ref =
+          VM_DecOperandRegRef("source_buffer", &source_buffer_is_move);
+      iree_vm_buffer_t* source_buffer =
+          iree_vm_buffer_deref(*source_buffer_ref);
+      if (IREE_UNLIKELY(!source_buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "source_buffer is null");
+      }
+      uint32_t source_offset = VM_DecOperandRegI32("source_offset");
+      bool target_buffer_is_move;
+      iree_vm_ref_t* target_buffer_ref =
+          VM_DecOperandRegRef("target_buffer", &target_buffer_is_move);
+      iree_vm_buffer_t* target_buffer =
+          iree_vm_buffer_deref(*target_buffer_ref);
+      if (IREE_UNLIKELY(!target_buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "target_buffer is null");
+      }
+      uint32_t target_offset = VM_DecOperandRegI32("target_offset");
+      uint32_t length = VM_DecOperandRegI32("length");
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_copy_bytes(
+          source_buffer, source_offset, target_buffer, target_offset, length));
+    });
+
+    DISPATCH_OP(CORE, BufferCompare, {
+      bool lhs_buffer_is_move;
+      iree_vm_ref_t* lhs_buffer_ref =
+          VM_DecOperandRegRef("lhs_buffer", &lhs_buffer_is_move);
+      iree_vm_buffer_t* lhs_buffer = iree_vm_buffer_deref(*lhs_buffer_ref);
+      if (IREE_UNLIKELY(!lhs_buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "lhs_buffer is null");
+      }
+      uint32_t lhs_offset = VM_DecOperandRegI32("lhs_offset");
+      bool rhs_buffer_is_move;
+      iree_vm_ref_t* rhs_buffer_ref =
+          VM_DecOperandRegRef("rhs_buffer", &rhs_buffer_is_move);
+      iree_vm_buffer_t* rhs_buffer = iree_vm_buffer_deref(*rhs_buffer_ref);
+      if (IREE_UNLIKELY(!rhs_buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "rhs_buffer is null");
+      }
+      uint32_t rhs_offset = VM_DecOperandRegI32("rhs_offset");
+      uint32_t length = VM_DecOperandRegI32("length");
+      uint32_t* result_ptr = VM_DecResultRegI32("result");
+      bool result = false;
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_compare_bytes(
+          lhs_buffer, lhs_offset, rhs_buffer, rhs_offset, length, &result));
+      *result_ptr = result ? 1 : 0;
+    });
+
+    // TODO(benvanik): rework dispatch so that the FillI* ops can share the same
+    // body - they all only vary by the length passed to fill_elements. The
+    // gotcha is that on big-endian machines we'd have to flip around the bytes.
+    // See VMOpcodesCore.td for more information on the encoding.
+    DISPATCH_OP(CORE, BufferFillI8, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("target_offset");
+      uint32_t length = VM_DecOperandRegI32("length");
+      uint8_t value = (uint8_t)VM_DecOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_fill_elements(
+          buffer, offset, length / sizeof(uint8_t), sizeof(uint8_t), &value));
+    });
+    DISPATCH_OP(CORE, BufferFillI16, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("target_offset");
+      uint32_t length = VM_DecOperandRegI32("length");
+      uint16_t value = (uint16_t)VM_DecOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_fill_elements(
+          buffer, offset, length / sizeof(uint16_t), sizeof(uint16_t), &value));
+    });
+    DISPATCH_OP(CORE, BufferFillI32, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("target_offset");
+      uint32_t length = VM_DecOperandRegI32("length");
+      uint32_t value = VM_DecOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_fill_elements(
+          buffer, offset, length / sizeof(uint32_t), sizeof(uint32_t), &value));
+    });
+
+    // TODO(benvanik): rework dispatch so that the LoadI* ops can share the same
+    // body - they only vary on the length and sign/zero extension mode but
+    // can be packed into a single handler to reduce code-size.
+    // See VMOpcodesCore.td for more information on the encoding.
+    DISPATCH_OP(CORE, BufferLoadI8U, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("source_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "source_buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("source_offset");
+      uint32_t* result_ptr = VM_DecResultRegI32("result");
+      uint8_t result_x8 = 0;
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_read_elements(
+          buffer, offset, &result_x8, 1, sizeof(result_x8)));
+      *result_ptr = vm_ext_i8i32u(result_x8);
+    });
+    DISPATCH_OP(CORE, BufferLoadI8S, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("source_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "source_buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("source_offset");
+      uint32_t* result_ptr = VM_DecResultRegI32("result");
+      int8_t result_x8 = 0;
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_read_elements(
+          buffer, offset, &result_x8, 1, sizeof(result_x8)));
+      *result_ptr = vm_ext_i8i32s(result_x8);
+    });
+    DISPATCH_OP(CORE, BufferLoadI16U, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("source_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "source_buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("source_offset");
+      uint32_t* result_ptr = VM_DecResultRegI32("result");
+      uint16_t result_x16 = 0;
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_read_elements(
+          buffer, offset, &result_x16, 1, sizeof(result_x16)));
+      *result_ptr = vm_ext_i16i32u(result_x16);
+    });
+    DISPATCH_OP(CORE, BufferLoadI16S, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("source_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "source_buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("source_offset");
+      uint32_t* result_ptr = VM_DecResultRegI32("result");
+      int16_t result_x16 = 0;
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_read_elements(
+          buffer, offset, &result_x16, 1, sizeof(result_x16)));
+      *result_ptr = vm_ext_i16i32s(result_x16);
+    });
+    DISPATCH_OP(CORE, BufferLoadI32, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("source_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "source_buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("source_offset");
+      uint32_t* result = VM_DecResultRegI32("result");
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_read_elements(buffer, offset, result,
+                                                        1, sizeof(*result)));
+    });
+
+    // TODO(benvanik): rework dispatch so that the StoreI* ops can share the
+    // same body - they only vary on the length.
+    // See VMOpcodesCore.td for more information on the encoding.
+    DISPATCH_OP(CORE, BufferStoreI8, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "target_buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("target_offset");
+      uint8_t value = (uint8_t)VM_DecOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_write_elements(&value, buffer, offset,
+                                                         1, sizeof(uint8_t)));
+    });
+    DISPATCH_OP(CORE, BufferStoreI16, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "target_buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("target_offset");
+      uint16_t value = (uint16_t)VM_DecOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_write_elements(&value, buffer, offset,
+                                                         1, sizeof(uint16_t)));
+    });
+    DISPATCH_OP(CORE, BufferStoreI32, {
+      bool buffer_is_move;
+      iree_vm_ref_t* buffer_ref =
+          VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+      iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+      if (IREE_UNLIKELY(!buffer)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "target_buffer is null");
+      }
+      uint32_t offset = VM_DecOperandRegI32("target_offset");
+      uint32_t value = VM_DecOperandRegI32("value");
+      IREE_RETURN_IF_ERROR(iree_vm_buffer_write_elements(&value, buffer, offset,
+                                                         1, sizeof(uint32_t)));
+    });
+
+    //===------------------------------------------------------------------===//
+    // Lists
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP(CORE, ListAlloc, {
+      const iree_vm_type_def_t* element_type_def = VM_DecTypeOf("element_type");
+      uint32_t initial_capacity = VM_DecOperandRegI32("initial_capacity");
+      bool result_is_move;
+      iree_vm_ref_t* result = VM_DecResultRegRef("result", &result_is_move);
+      iree_vm_list_t* list = NULL;
+      IREE_RETURN_IF_ERROR(iree_vm_list_create(
+          element_type_def, initial_capacity, module_state->allocator, &list));
+      IREE_RETURN_IF_ERROR(
+          iree_vm_ref_wrap_assign(list, iree_vm_list_type_id(), result));
+    });
+
+    DISPATCH_OP(CORE, ListReserve, {
+      bool list_is_move;
+      iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+      iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+      if (IREE_UNLIKELY(!list)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+      }
+      uint32_t minimum_capacity = VM_DecOperandRegI32("minimum_capacity");
+      IREE_RETURN_IF_ERROR(iree_vm_list_reserve(list, minimum_capacity));
+    });
+
+    DISPATCH_OP(CORE, ListSize, {
+      bool list_is_move;
+      iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+      iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+      if (IREE_UNLIKELY(!list)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+      }
+      int32_t* result = VM_DecResultRegI32("result");
+      *result = (int32_t)iree_vm_list_size(list);
+    });
+
+    DISPATCH_OP(CORE, ListResize, {
+      bool list_is_move;
+      iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+      iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+      if (IREE_UNLIKELY(!list)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+      }
+      uint32_t new_size = VM_DecOperandRegI32("new_size");
+      IREE_RETURN_IF_ERROR(iree_vm_list_resize(list, new_size));
+    });
+
+    DISPATCH_OP(CORE, ListGetI32, {
+      bool list_is_move;
+      iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+      iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+      if (IREE_UNLIKELY(!list)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+      }
+      uint32_t index = VM_DecOperandRegI32("index");
+      int32_t* result = VM_DecResultRegI32("result");
+      iree_vm_value_t value;
+      IREE_RETURN_IF_ERROR(iree_vm_list_get_value_as(
+          list, index, IREE_VM_VALUE_TYPE_I32, &value));
+      *result = value.i32;
+    });
+
+    DISPATCH_OP(CORE, ListSetI32, {
+      bool list_is_move;
+      iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+      iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+      if (IREE_UNLIKELY(!list)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+      }
+      uint32_t index = VM_DecOperandRegI32("index");
+      int32_t raw_value = VM_DecOperandRegI32("raw_value");
+      iree_vm_value_t value = iree_vm_value_make_i32(raw_value);
+      IREE_RETURN_IF_ERROR(iree_vm_list_set_value(list, index, &value));
+    });
+
+    DISPATCH_OP(CORE, ListGetRef, {
+      bool list_is_move;
+      iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+      iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+      if (IREE_UNLIKELY(!list)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+      }
+      uint32_t index = VM_DecOperandRegI32("index");
+      const iree_vm_type_def_t* type_def = VM_DecTypeOf("result");
+      bool result_is_move;
+      iree_vm_ref_t* result = VM_DecResultRegRef("result", &result_is_move);
+      // TODO(benvanik): use result_is_move with a _retain_or_move.
+      IREE_RETURN_IF_ERROR(iree_vm_list_get_ref_retain(list, index, result));
+      if (result->type != IREE_VM_REF_TYPE_NULL &&
+          (iree_vm_type_def_is_value(type_def) ||
+           result->type != type_def->ref_type)) {
+        // Type mismatch; put null in the register instead.
+        // TODO(benvanik): return an error here and make a query type method?
+        iree_vm_ref_release(result);
+      }
+    });
+
+    DISPATCH_OP(CORE, ListSetRef, {
+      bool list_is_move;
+      iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+      iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+      if (IREE_UNLIKELY(!list)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+      }
+      uint32_t index = VM_DecOperandRegI32("index");
+      bool operand_is_move;
+      iree_vm_ref_t* operand = VM_DecOperandRegRef("value", &operand_is_move);
+      if (operand_is_move) {
+        IREE_RETURN_IF_ERROR(iree_vm_list_set_ref_move(list, index, operand));
+      } else {
+        IREE_RETURN_IF_ERROR(iree_vm_list_set_ref_retain(list, index, operand));
+      }
+    });
+
+    //===------------------------------------------------------------------===//
+    // Conditional assignment
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP(CORE, SelectI32, {
+      int32_t condition = VM_DecOperandRegI32("condition");
+      int32_t true_value = VM_DecOperandRegI32("true_value");
+      int32_t false_value = VM_DecOperandRegI32("false_value");
+      int32_t* result = VM_DecResultRegI32("result");
+      *result = vm_select_i32(condition, true_value, false_value);
+    });
+
+    DISPATCH_OP(CORE, SelectRef, {
+      int32_t condition = VM_DecOperandRegI32("condition");
+      // TODO(benvanik): remove the type_id and use either LHS/RHS (if both are
+      // null then output is always null so no need to know the type).
+      const iree_vm_type_def_t* type_def = VM_DecTypeOf("true_value");
+      bool true_value_is_move;
+      iree_vm_ref_t* true_value =
+          VM_DecOperandRegRef("true_value", &true_value_is_move);
+      bool false_value_is_move;
+      iree_vm_ref_t* false_value =
+          VM_DecOperandRegRef("false_value", &false_value_is_move);
+      bool result_is_move;
+      iree_vm_ref_t* result = VM_DecResultRegRef("result", &result_is_move);
+      if (condition) {
+        // Select LHS.
+        IREE_RETURN_IF_ERROR(iree_vm_ref_retain_or_move_checked(
+            true_value_is_move, true_value, type_def->ref_type, result));
+        if (false_value_is_move && false_value != result) {
+          iree_vm_ref_release(false_value);
+        }
+      } else {
+        // Select RHS.
+        IREE_RETURN_IF_ERROR(iree_vm_ref_retain_or_move_checked(
+            false_value_is_move, false_value, type_def->ref_type, result));
+        if (true_value_is_move && true_value != result) {
+          iree_vm_ref_release(true_value);
+        }
+      }
+    });
+
+    DISPATCH_OP(CORE, SwitchI32, {
+      int32_t index = VM_DecOperandRegI32("index");
+      int32_t default_value = VM_DecIntAttr32("default_value");
+      const iree_vm_register_list_t* value_reg_list =
+          VM_DecVariadicOperands("values");
+      int32_t* result = VM_DecResultRegI32("result");
+      if (index >= 0 && index < value_reg_list->size) {
+        *result = regs.i32[value_reg_list->registers[index] & regs.i32_mask];
+      } else {
+        *result = default_value;
+      }
+    });
+
+    DISPATCH_OP(CORE, SwitchRef, {
+      int32_t index = VM_DecOperandRegI32("index");
+      const iree_vm_type_def_t* type_def = VM_DecTypeOf("result");
+      bool default_is_move;
+      iree_vm_ref_t* default_value =
+          VM_DecOperandRegRef("default_value", &default_is_move);
+      const iree_vm_register_list_t* value_reg_list =
+          VM_DecVariadicOperands("values");
+      bool result_is_move;
+      iree_vm_ref_t* result = VM_DecResultRegRef("result", &result_is_move);
+      if (index >= 0 && index < value_reg_list->size) {
+        bool is_move =
+            value_reg_list->registers[index] & IREE_REF_REGISTER_MOVE_BIT;
+        iree_vm_ref_t* new_value =
+            &regs.ref[value_reg_list->registers[index] & regs.ref_mask];
+        IREE_RETURN_IF_ERROR(iree_vm_ref_retain_or_move_checked(
+            is_move, new_value, type_def->ref_type, result));
+      } else {
+        IREE_RETURN_IF_ERROR(iree_vm_ref_retain_or_move_checked(
+            default_is_move, default_value, type_def->ref_type, result));
+      }
+    });
+
+    //===------------------------------------------------------------------===//
+    // Native integer arithmetic
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP_CORE_BINARY_I32(AddI32, vm_add_i32);
+    DISPATCH_OP_CORE_BINARY_I32(SubI32, vm_sub_i32);
+    DISPATCH_OP_CORE_BINARY_I32(MulI32, vm_mul_i32);
+    DISPATCH_OP_CORE_BINARY_I32(DivI32S, vm_div_i32s);
+    DISPATCH_OP_CORE_BINARY_I32(DivI32U, vm_div_i32u);
+    DISPATCH_OP_CORE_BINARY_I32(RemI32S, vm_rem_i32s);
+    DISPATCH_OP_CORE_BINARY_I32(RemI32U, vm_rem_i32u);
+    DISPATCH_OP_CORE_TERNARY_I32(FMAI32, vm_fma_i32);
+    DISPATCH_OP_CORE_UNARY_I32(NotI32, vm_not_i32);
+    DISPATCH_OP_CORE_BINARY_I32(AndI32, vm_and_i32);
+    DISPATCH_OP_CORE_BINARY_I32(OrI32, vm_or_i32);
+    DISPATCH_OP_CORE_BINARY_I32(XorI32, vm_xor_i32);
+
+    //===------------------------------------------------------------------===//
+    // Casting and type conversion/emulation
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP_CORE_UNARY_I32(TruncI32I8, vm_trunc_i32i8);
+    DISPATCH_OP_CORE_UNARY_I32(TruncI32I16, vm_trunc_i32i16);
+    DISPATCH_OP_CORE_UNARY_I32(ExtI8I32S, vm_ext_i8i32s);
+    DISPATCH_OP_CORE_UNARY_I32(ExtI8I32U, vm_ext_i8i32u);
+    DISPATCH_OP_CORE_UNARY_I32(ExtI16I32S, vm_ext_i16i32s);
+    DISPATCH_OP_CORE_UNARY_I32(ExtI16I32U, vm_ext_i16i32u);
+
+    //===------------------------------------------------------------------===//
+    // Native bitwise shifts and rotates
+    //===------------------------------------------------------------------===//
+
+#define DISPATCH_OP_CORE_SHIFT_I32(op_name, op_func)  \
+  DISPATCH_OP(CORE, op_name, {                        \
+    int32_t operand = VM_DecOperandRegI32("operand"); \
+    int32_t amount = VM_DecOperandRegI32("amount");   \
+    int32_t* result = VM_DecResultRegI32("result");   \
+    *result = op_func(operand, amount);               \
+  });
+
+    DISPATCH_OP_CORE_SHIFT_I32(ShlI32, vm_shl_i32);
+    DISPATCH_OP_CORE_SHIFT_I32(ShrI32S, vm_shr_i32s);
+    DISPATCH_OP_CORE_SHIFT_I32(ShrI32U, vm_shr_i32u);
+
+    //===------------------------------------------------------------------===//
+    // Comparison ops
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP_CORE_BINARY_I32(CmpEQI32, vm_cmp_eq_i32);
+    DISPATCH_OP_CORE_BINARY_I32(CmpNEI32, vm_cmp_ne_i32);
+    DISPATCH_OP_CORE_BINARY_I32(CmpLTI32S, vm_cmp_lt_i32s);
+    DISPATCH_OP_CORE_BINARY_I32(CmpLTI32U, vm_cmp_lt_i32u);
+    DISPATCH_OP_CORE_UNARY_I32(CmpNZI32, vm_cmp_nz_i32);
+
+    DISPATCH_OP(CORE, CmpEQRef, {
+      bool lhs_is_move;
+      iree_vm_ref_t* lhs = VM_DecOperandRegRef("lhs", &lhs_is_move);
+      bool rhs_is_move;
+      iree_vm_ref_t* rhs = VM_DecOperandRegRef("rhs", &rhs_is_move);
+      int32_t* result = VM_DecResultRegI32("result");
+      *result = vm_cmp_eq_ref(lhs, rhs);
+      if (lhs_is_move) iree_vm_ref_release(lhs);
+      if (rhs_is_move) iree_vm_ref_release(rhs);
+    });
+    DISPATCH_OP(CORE, CmpNERef, {
+      bool lhs_is_move;
+      iree_vm_ref_t* lhs = VM_DecOperandRegRef("lhs", &lhs_is_move);
+      bool rhs_is_move;
+      iree_vm_ref_t* rhs = VM_DecOperandRegRef("rhs", &rhs_is_move);
+      int32_t* result = VM_DecResultRegI32("result");
+      *result = vm_cmp_ne_ref(lhs, rhs);
+      if (lhs_is_move) iree_vm_ref_release(lhs);
+      if (rhs_is_move) iree_vm_ref_release(rhs);
+    });
+    DISPATCH_OP(CORE, CmpNZRef, {
+      bool operand_is_move;
+      iree_vm_ref_t* operand = VM_DecOperandRegRef("operand", &operand_is_move);
+      int32_t* result = VM_DecResultRegI32("result");
+      *result = vm_cmp_nz_ref(operand);
+      if (operand_is_move) iree_vm_ref_release(operand);
+    });
+
+    //===------------------------------------------------------------------===//
+    // Control flow
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP(CORE, Branch, {
+      int32_t block_pc = VM_DecBranchTarget("dest");
+      const iree_vm_register_remap_list_t* remap_list =
+          VM_DecBranchOperands("operands");
+      pc = block_pc;
+      iree_vm_bytecode_dispatch_remap_branch_registers(regs, remap_list);
+    });
+
+    DISPATCH_OP(CORE, CondBranch, {
+      int32_t condition = VM_DecOperandRegI32("condition");
+      int32_t true_block_pc = VM_DecBranchTarget("true_dest");
+      const iree_vm_register_remap_list_t* true_remap_list =
+          VM_DecBranchOperands("true_operands");
+      int32_t false_block_pc = VM_DecBranchTarget("false_dest");
+      const iree_vm_register_remap_list_t* false_remap_list =
+          VM_DecBranchOperands("false_operands");
+      if (condition) {
+        pc = true_block_pc;
+        iree_vm_bytecode_dispatch_remap_branch_registers(regs, true_remap_list);
+      } else {
+        pc = false_block_pc;
+        iree_vm_bytecode_dispatch_remap_branch_registers(regs,
+                                                         false_remap_list);
+      }
+    });
+
+    DISPATCH_OP(CORE, Call, {
+      int32_t function_ordinal = VM_DecFuncAttr("callee");
+      const iree_vm_register_list_t* src_reg_list =
+          VM_DecVariadicOperands("operands");
+      const iree_vm_register_list_t* dst_reg_list =
+          VM_DecVariadicResults("results");
+      current_frame->pc = pc;
+
+      // NOTE: we assume validation has ensured these functions exist.
+      // TODO(benvanik): something more clever than just a high bit?
+      int is_import = (function_ordinal & 0x80000000u) != 0;
+      if (is_import) {
+        // Call import (and possible yield).
+        IREE_RETURN_IF_ERROR(iree_vm_bytecode_call_import(
+            stack, module_state, function_ordinal, regs, src_reg_list,
+            dst_reg_list, &current_frame, &regs, out_result));
+      } else {
+        // Switch execution to the target function and continue running in the
+        // bytecode dispatcher.
+        IREE_RETURN_IF_ERROR(iree_vm_bytecode_internal_enter(
+            stack, current_frame->function.module, function_ordinal,
+            src_reg_list, dst_reg_list, &current_frame, &regs));
+        bytecode_data =
+            module->bytecode_data.data +
+            module->function_descriptor_table[function_ordinal].bytecode_offset;
+        pc = current_frame->pc;
+      }
+    });
+
+    DISPATCH_OP(CORE, CallVariadic, {
+      // TODO(benvanik): dedupe with above or merge and always have the seg size
+      // list be present (but empty) for non-variadic calls.
+      int32_t function_ordinal = VM_DecFuncAttr("callee");
+      const iree_vm_register_list_t* segment_size_list =
+          VM_DecVariadicOperands("segment_sizes");
+      const iree_vm_register_list_t* src_reg_list =
+          VM_DecVariadicOperands("operands");
+      const iree_vm_register_list_t* dst_reg_list =
+          VM_DecVariadicResults("results");
+      current_frame->pc = pc;
+
+      // NOTE: we assume validation has ensured these functions exist.
+      // TODO(benvanik): something more clever than just a high bit?
+      int is_import = (function_ordinal & 0x80000000u) != 0;
+      if (IREE_UNLIKELY(!is_import)) {
+        // Variadic calls are currently only supported for import functions.
+        return iree_make_status(
+            IREE_STATUS_FAILED_PRECONDITION,
+            "variadic calls only supported for internal callees");
+      }
+
+      // Call import (and possible yield).
+      IREE_RETURN_IF_ERROR(iree_vm_bytecode_call_import_variadic(
+          stack, module_state, function_ordinal, regs, segment_size_list,
+          src_reg_list, dst_reg_list, &current_frame, &regs, out_result));
+    });
+
+    DISPATCH_OP(CORE, Return, {
+      const iree_vm_register_list_t* src_reg_list =
+          VM_DecVariadicOperands("operands");
+      current_frame->pc = pc;
+
+      if (current_frame->depth <= entry_frame_depth) {
+        // Return from the top-level entry frame - return back to call().
+        return iree_vm_bytecode_external_leave(stack, current_frame, &regs,
+                                               src_reg_list, cconv_results,
+                                               call->results);
+      }
+
+      // Store results into the caller frame and pop back to the parent.
+      IREE_RETURN_IF_ERROR(iree_vm_bytecode_internal_leave(
+          stack, current_frame, regs, src_reg_list, &current_frame, &regs));
+
+      // Reset dispatch state so we can continue executing in the caller.
+      bytecode_data =
+          module->bytecode_data.data +
+          module->function_descriptor_table[current_frame->function.ordinal]
+              .bytecode_offset;
+      pc = current_frame->pc;
+    });
+
+    DISPATCH_OP(CORE, Fail, {
+      uint32_t status_code = VM_DecOperandRegI32("status");
+      iree_string_view_t message;
+      VM_DecStrAttr("message", &message);
+      if (status_code != 0) {
+        // TODO(benvanik): capture source information.
+        return iree_status_allocate_f(status_code, "<vm>", 0, "%.*s",
+                                      (int)message.size, message.data);
+      }
+    });
+
+    DISPATCH_OP(CORE, ImportResolved, {
+      uint32_t function_ordinal = VM_DecFuncAttr("import");
+      int32_t* result = VM_DecResultRegI32("result");
+      uint32_t import_ordinal = function_ordinal & 0x7FFFFFFFu;
+      if (IREE_UNLIKELY(import_ordinal >= module_state->import_count)) {
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "import ordinal out of range");
+      }
+      const iree_vm_bytecode_import_t* import =
+          &module_state->import_table[import_ordinal];
+      *result = import->function.module != NULL ? 1 : 0;
+    });
+
+    //===------------------------------------------------------------------===//
+    // Async/fiber ops
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP(CORE, Yield, {
+      // Perform branch before yielding; in this way we will resume at the
+      // target without needing to retain any information about the yield.
+      int32_t block_pc = VM_DecBranchTarget("dest");
+      const iree_vm_register_remap_list_t* remap_list =
+          VM_DecBranchOperands("operands");
+      iree_vm_bytecode_dispatch_remap_branch_registers(regs, remap_list);
+      pc = block_pc;
+
+      // Return magic status code indicating a yield.
+      // This isn't an error, though callers not supporting coroutines will
+      // treat it as one and propagate it up.
+      return iree_status_from_code(IREE_STATUS_DEFERRED);
+    });
+
+    //===------------------------------------------------------------------===//
+    // Debugging
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP(CORE, Trace, {
+      iree_string_view_t event_name;
+      VM_DecStrAttr("event_name", &event_name);
+      const iree_vm_register_list_t* src_reg_list =
+          VM_DecVariadicOperands("operands");
+      // TODO(benvanik): trace (if enabled).
+      iree_vm_bytecode_dispatch_discard_registers(regs, src_reg_list);
+    });
+
+    DISPATCH_OP(CORE, Print, {
+      iree_string_view_t event_name;
+      VM_DecStrAttr("event_name", &event_name);
+      const iree_vm_register_list_t* src_reg_list =
+          VM_DecVariadicOperands("operands");
+      // TODO(benvanik): print.
+      iree_vm_bytecode_dispatch_discard_registers(regs, src_reg_list);
+    });
+
+    DISPATCH_OP(CORE, Break, {
+      // TODO(benvanik): break unconditionally.
+      int32_t block_pc = VM_DecBranchTarget("dest");
+      const iree_vm_register_remap_list_t* remap_list =
+          VM_DecBranchOperands("operands");
+      iree_vm_bytecode_dispatch_remap_branch_registers(regs, remap_list);
+      pc = block_pc;
+    });
+
+    DISPATCH_OP(CORE, CondBreak, {
+      int32_t condition = VM_DecOperandRegI32("condition");
+      if (condition) {
+        // TODO(benvanik): cond break.
+      }
+      int32_t block_pc = VM_DecBranchTarget("dest");
+      const iree_vm_register_remap_list_t* remap_list =
+          VM_DecBranchOperands("operands");
+      iree_vm_bytecode_dispatch_remap_branch_registers(regs, remap_list);
+      pc = block_pc;
+    });
+
+    //===------------------------------------------------------------------===//
+    // Extension trampolines
+    //===------------------------------------------------------------------===//
+
+#if IREE_VM_EXT_I64_ENABLE
+    BEGIN_DISPATCH_PREFIX(PrefixExtI64, EXT_I64) {
+      //===----------------------------------------------------------------===//
+      // ExtI64: Globals
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_I64, GlobalLoadI64, {
+        uint32_t byte_offset = VM_DecGlobalAttr("global");
+        if (IREE_UNLIKELY(byte_offset >=
+                          module_state->rwdata_storage.data_length)) {
+          return iree_make_status(
+              IREE_STATUS_OUT_OF_RANGE,
+              "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+              module_state->rwdata_storage.data_length);
+        }
+        int64_t* value = VM_DecResultRegI64("value");
+        const int64_t global_value =
+            vm_global_load_i64(module_state->rwdata_storage.data, byte_offset);
+        *value = global_value;
+      });
+
+      DISPATCH_OP(EXT_I64, GlobalStoreI64, {
+        uint32_t byte_offset = VM_DecGlobalAttr("global");
+        if (IREE_UNLIKELY(byte_offset >=
+                          module_state->rwdata_storage.data_length)) {
+          return iree_make_status(
+              IREE_STATUS_OUT_OF_RANGE,
+              "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+              module_state->rwdata_storage.data_length);
+        }
+        int64_t value = VM_DecOperandRegI64("value");
+        vm_global_store_i64(module_state->rwdata_storage.data, byte_offset,
+                            value);
+      });
+
+      DISPATCH_OP(EXT_I64, GlobalLoadIndirectI64, {
+        uint32_t byte_offset = VM_DecOperandRegI32("global");
+        if (IREE_UNLIKELY(byte_offset >=
+                          module_state->rwdata_storage.data_length)) {
+          return iree_make_status(
+              IREE_STATUS_OUT_OF_RANGE,
+              "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+              module_state->rwdata_storage.data_length);
+        }
+        int64_t* value = VM_DecResultRegI64("value");
+        const int64_t global_value =
+            vm_global_load_i64(module_state->rwdata_storage.data, byte_offset);
+        *value = global_value;
+      });
+
+      DISPATCH_OP(EXT_I64, GlobalStoreIndirectI64, {
+        uint32_t byte_offset = VM_DecOperandRegI32("global");
+        if (IREE_UNLIKELY(byte_offset >=
+                          module_state->rwdata_storage.data_length)) {
+          return iree_make_status(
+              IREE_STATUS_OUT_OF_RANGE,
+              "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+              module_state->rwdata_storage.data_length);
+        }
+        int64_t value = VM_DecOperandRegI64("value");
+        vm_global_store_i64(module_state->rwdata_storage.data, byte_offset,
+                            value);
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtI64: Constants
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_I64, ConstI64, {
+        int64_t value = VM_DecIntAttr64("value");
+        int64_t* result = VM_DecResultRegI64("result");
+        *result = value;
+      });
+
+      DISPATCH_OP(EXT_I64, ConstI64Zero, {
+        int64_t* result = VM_DecResultRegI64("result");
+        *result = 0;
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtI64: Lists
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_I64, ListGetI64, {
+        bool list_is_move;
+        iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+        iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+        if (IREE_UNLIKELY(!list)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+        }
+        uint32_t index = VM_DecOperandRegI32("index");
+        int64_t* result = VM_DecResultRegI64("result");
+        iree_vm_value_t value;
+        IREE_RETURN_IF_ERROR(iree_vm_list_get_value_as(
+            list, index, IREE_VM_VALUE_TYPE_I64, &value));
+        *result = value.i64;
+      });
+
+      DISPATCH_OP(EXT_I64, ListSetI64, {
+        bool list_is_move;
+        iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+        iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+        if (IREE_UNLIKELY(!list)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+        }
+        uint32_t index = VM_DecOperandRegI32("index");
+        int64_t raw_value = VM_DecOperandRegI64("value");
+        iree_vm_value_t value = iree_vm_value_make_i64(raw_value);
+        IREE_RETURN_IF_ERROR(iree_vm_list_set_value(list, index, &value));
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtI64: Conditional assignment
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_I64, SelectI64, {
+        int32_t condition = VM_DecOperandRegI32("condition");
+        int64_t true_value = VM_DecOperandRegI64("true_value");
+        int64_t false_value = VM_DecOperandRegI64("false_value");
+        int64_t* result = VM_DecResultRegI64("result");
+        *result = vm_select_i64(condition, true_value, false_value);
+      });
+
+      DISPATCH_OP(EXT_I64, SwitchI64, {
+        int32_t index = VM_DecOperandRegI32("index");
+        int64_t default_value = VM_DecIntAttr64("default_value");
+        const iree_vm_register_list_t* value_reg_list =
+            VM_DecVariadicOperands("values");
+        int64_t* result = VM_DecResultRegI64("result");
+        if (index >= 0 && index < value_reg_list->size) {
+          *result =
+              regs.i32[value_reg_list->registers[index] & (regs.i32_mask & ~1)];
+        } else {
+          *result = default_value;
+        }
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtI64: Native integer arithmetic
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP_EXT_I64_BINARY_I64(AddI64, vm_add_i64);
+      DISPATCH_OP_EXT_I64_BINARY_I64(SubI64, vm_sub_i64);
+      DISPATCH_OP_EXT_I64_BINARY_I64(MulI64, vm_mul_i64);
+      DISPATCH_OP_EXT_I64_BINARY_I64(DivI64S, vm_div_i64s);
+      DISPATCH_OP_EXT_I64_BINARY_I64(DivI64U, vm_div_i64u);
+      DISPATCH_OP_EXT_I64_BINARY_I64(RemI64S, vm_rem_i64s);
+      DISPATCH_OP_EXT_I64_BINARY_I64(RemI64U, vm_rem_i64u);
+      DISPATCH_OP_EXT_I64_TERNARY_I64(FMAI64, vm_fma_i64);
+      DISPATCH_OP_EXT_I64_UNARY_I64(NotI64, vm_not_i64);
+      DISPATCH_OP_EXT_I64_BINARY_I64(AndI64, vm_and_i64);
+      DISPATCH_OP_EXT_I64_BINARY_I64(OrI64, vm_or_i64);
+      DISPATCH_OP_EXT_I64_BINARY_I64(XorI64, vm_xor_i64);
+
+      //===----------------------------------------------------------------===//
+      // ExtI64: Casting and type conversion/emulation
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_I64, TruncI64I32, {
+        int64_t operand = VM_DecOperandRegI64("operand");
+        int32_t* result = VM_DecResultRegI32("result");
+        *result = vm_trunc_i64i32(operand);
+      });
+      DISPATCH_OP(EXT_I64, ExtI32I64S, {
+        int32_t operand = VM_DecOperandRegI32("operand");
+        int64_t* result = VM_DecResultRegI64("result");
+        *result = vm_ext_i32i64s(operand);
+      });
+      DISPATCH_OP(EXT_I64, ExtI32I64U, {
+        int32_t operand = VM_DecOperandRegI32("operand");
+        int64_t* result = VM_DecResultRegI64("result");
+        *result = vm_ext_i32i64u(operand);
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtI64: Native bitwise shifts and rotates
+      //===----------------------------------------------------------------===//
+
+#define DISPATCH_OP_EXT_I64_SHIFT_I64(op_name, op_func) \
+  DISPATCH_OP(EXT_I64, op_name, {                       \
+    int64_t operand = VM_DecOperandRegI64("operand");   \
+    int32_t amount = VM_DecOperandRegI32("amount");     \
+    int64_t* result = VM_DecResultRegI64("result");     \
+    *result = op_func(operand, amount);                 \
+  });
+
+      DISPATCH_OP_EXT_I64_SHIFT_I64(ShlI64, vm_shl_i64);
+      DISPATCH_OP_EXT_I64_SHIFT_I64(ShrI64S, vm_shr_i64s);
+      DISPATCH_OP_EXT_I64_SHIFT_I64(ShrI64U, vm_shr_i64u);
+
+      //===----------------------------------------------------------------===//
+      // ExtI64: Comparison ops
+      //===----------------------------------------------------------------===//
+
+#define DISPATCH_OP_EXT_I64_CMP_I64(op_name, op_func) \
+  DISPATCH_OP(EXT_I64, op_name, {                     \
+    int64_t lhs = VM_DecOperandRegI64("lhs");         \
+    int64_t rhs = VM_DecOperandRegI64("rhs");         \
+    int32_t* result = VM_DecResultRegI32("result");   \
+    *result = op_func(lhs, rhs);                      \
+  });
+
+      DISPATCH_OP_EXT_I64_CMP_I64(CmpEQI64, vm_cmp_eq_i64);
+      DISPATCH_OP_EXT_I64_CMP_I64(CmpNEI64, vm_cmp_ne_i64);
+      DISPATCH_OP_EXT_I64_CMP_I64(CmpLTI64S, vm_cmp_lt_i64s);
+      DISPATCH_OP_EXT_I64_CMP_I64(CmpLTI64U, vm_cmp_lt_i64u);
+      DISPATCH_OP(EXT_I64, CmpNZI64, {
+        int64_t operand = VM_DecOperandRegI64("operand");
+        int32_t* result = VM_DecResultRegI32("result");
+        *result = vm_cmp_nz_i64(operand);
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtI64: Buffers
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_I64, BufferFillI64, {
+        bool buffer_is_move;
+        iree_vm_ref_t* buffer_ref =
+            VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+        iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+        if (IREE_UNLIKELY(!buffer)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                  "buffer is null");
+        }
+        uint32_t offset = VM_DecOperandRegI32("target_offset");
+        uint32_t length = VM_DecOperandRegI32("length");
+        uint64_t value = VM_DecOperandRegI64("value");
+        IREE_RETURN_IF_ERROR(iree_vm_buffer_fill_elements(
+            buffer, offset, length / sizeof(uint64_t), sizeof(uint64_t),
+            &value));
+      });
+
+      DISPATCH_OP(EXT_I64, BufferLoadI64, {
+        bool buffer_is_move;
+        iree_vm_ref_t* buffer_ref =
+            VM_DecOperandRegRef("source_buffer", &buffer_is_move);
+        iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+        if (IREE_UNLIKELY(!buffer)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                  "source_buffer is null");
+        }
+        uint32_t offset = VM_DecOperandRegI32("source_offset");
+        uint64_t* result = VM_DecResultRegI64("result");
+        IREE_RETURN_IF_ERROR(iree_vm_buffer_read_elements(
+            buffer, offset, result, 1, sizeof(*result)));
+      });
+
+      DISPATCH_OP(EXT_I64, BufferStoreI64, {
+        bool buffer_is_move;
+        iree_vm_ref_t* buffer_ref =
+            VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+        iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+        if (IREE_UNLIKELY(!buffer)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                  "target_buffer is null");
+        }
+        uint32_t offset = VM_DecOperandRegI32("target_offset");
+        uint64_t value = (uint64_t)VM_DecOperandRegI64("value");
+        IREE_RETURN_IF_ERROR(iree_vm_buffer_write_elements(
+            &value, buffer, offset, 1, sizeof(uint64_t)));
+      });
+    }
+    END_DISPATCH_PREFIX();
+#else
+    UNHANDLED_DISPATCH_PREFIX(PrefixExtI64, EXT_I64);
+#endif  // IREE_VM_EXT_I64_ENABLE
+
+#if IREE_VM_EXT_F32_ENABLE
+    BEGIN_DISPATCH_PREFIX(PrefixExtF32, EXT_F32) {
+      //===----------------------------------------------------------------===//
+      // ExtF32: Globals
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_F32, GlobalLoadF32, {
+        uint32_t byte_offset = VM_DecGlobalAttr("global");
+        if (IREE_UNLIKELY(byte_offset >=
+                          module_state->rwdata_storage.data_length)) {
+          return iree_make_status(
+              IREE_STATUS_OUT_OF_RANGE,
+              "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+              module_state->rwdata_storage.data_length);
+        }
+        float* value = VM_DecResultRegF32("value");
+        const float global_value =
+            vm_global_load_f32(module_state->rwdata_storage.data, byte_offset);
+        *value = global_value;
+      });
+
+      DISPATCH_OP(EXT_F32, GlobalStoreF32, {
+        uint32_t byte_offset = VM_DecGlobalAttr("global");
+        if (IREE_UNLIKELY(byte_offset >=
+                          module_state->rwdata_storage.data_length)) {
+          return iree_make_status(
+              IREE_STATUS_OUT_OF_RANGE,
+              "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+              module_state->rwdata_storage.data_length);
+        }
+        float value = VM_DecOperandRegF32("value");
+        vm_global_store_f32(module_state->rwdata_storage.data, byte_offset,
+                            value);
+      });
+
+      DISPATCH_OP(EXT_F32, GlobalLoadIndirectF32, {
+        uint32_t byte_offset = VM_DecOperandRegI32("global");
+        if (IREE_UNLIKELY(byte_offset >=
+                          module_state->rwdata_storage.data_length)) {
+          return iree_make_status(
+              IREE_STATUS_OUT_OF_RANGE,
+              "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+              module_state->rwdata_storage.data_length);
+        }
+        float* value = VM_DecResultRegF32("value");
+        const float global_value =
+            vm_global_load_f32(module_state->rwdata_storage.data, byte_offset);
+        *value = global_value;
+      });
+
+      DISPATCH_OP(EXT_F32, GlobalStoreIndirectF32, {
+        uint32_t byte_offset = VM_DecOperandRegI32("global");
+        if (IREE_UNLIKELY(byte_offset >=
+                          module_state->rwdata_storage.data_length)) {
+          return iree_make_status(
+              IREE_STATUS_OUT_OF_RANGE,
+              "global byte_offset out of range: %d (rwdata=%zu)", byte_offset,
+              module_state->rwdata_storage.data_length);
+        }
+        float value = VM_DecOperandRegF32("value");
+        vm_global_store_f32(module_state->rwdata_storage.data, byte_offset,
+                            value);
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtF32: Constants
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_F32, ConstF32, {
+        float value = VM_DecFloatAttr32("value");
+        float* result = VM_DecResultRegF32("result");
+        *result = value;
+      });
+
+      DISPATCH_OP(EXT_F32, ConstF32Zero, {
+        float* result = VM_DecResultRegF32("result");
+        *result = 0;
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtF32: Lists
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_F32, ListGetF32, {
+        bool list_is_move;
+        iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+        iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+        if (IREE_UNLIKELY(!list)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+        }
+        uint32_t index = VM_DecOperandRegI32("index");
+        float* result = VM_DecResultRegF32("result");
+        iree_vm_value_t value;
+        IREE_RETURN_IF_ERROR(iree_vm_list_get_value_as(
+            list, index, IREE_VM_VALUE_TYPE_F32, &value));
+        *result = value.f32;
+      });
+
+      DISPATCH_OP(EXT_F32, ListSetF32, {
+        bool list_is_move;
+        iree_vm_ref_t* list_ref = VM_DecOperandRegRef("list", &list_is_move);
+        iree_vm_list_t* list = iree_vm_list_deref(*list_ref);
+        if (IREE_UNLIKELY(!list)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "list is null");
+        }
+        uint32_t index = VM_DecOperandRegI32("index");
+        float raw_value = VM_DecOperandRegF32("value");
+        iree_vm_value_t value = iree_vm_value_make_f32(raw_value);
+        IREE_RETURN_IF_ERROR(iree_vm_list_set_value(list, index, &value));
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtF32: Conditional assignment
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_F32, SelectF32, {
+        int32_t condition = VM_DecOperandRegI32("condition");
+        float true_value = VM_DecOperandRegF32("true_value");
+        float false_value = VM_DecOperandRegF32("false_value");
+        float* result = VM_DecResultRegF32("result");
+        *result = vm_select_f32(condition, true_value, false_value);
+      });
+
+      DISPATCH_OP(EXT_F32, SwitchF32, {
+        int32_t index = VM_DecOperandRegI32("index");
+        float default_value = VM_DecFloatAttr32("default_value");
+        const iree_vm_register_list_t* value_reg_list =
+            VM_DecVariadicOperands("values");
+        float* result = VM_DecResultRegF32("result");
+        if (index >= 0 && index < value_reg_list->size) {
+          *result = *((float*)&regs.i32[value_reg_list->registers[index] &
+                                        (regs.i32_mask & ~1)]);
+        } else {
+          *result = default_value;
+        }
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtF32: Native floating-point arithmetic
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP_EXT_F32_BINARY_F32(AddF32, vm_add_f32);
+      DISPATCH_OP_EXT_F32_BINARY_F32(SubF32, vm_sub_f32);
+      DISPATCH_OP_EXT_F32_BINARY_F32(MulF32, vm_mul_f32);
+      DISPATCH_OP_EXT_F32_BINARY_F32(DivF32, vm_div_f32);
+      DISPATCH_OP_EXT_F32_BINARY_F32(RemF32, vm_rem_f32);
+      DISPATCH_OP_EXT_F32_TERNARY_F32(FMAF32, vm_fma_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(AbsF32, vm_abs_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(NegF32, vm_neg_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(CeilF32, vm_ceil_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(FloorF32, vm_floor_f32);
+
+      DISPATCH_OP_EXT_F32_UNARY_F32(AtanF32, vm_atan_f32);
+      DISPATCH_OP_EXT_F32_BINARY_F32(Atan2F32, vm_atan2_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(CosF32, vm_cos_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(SinF32, vm_sin_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(ExpF32, vm_exp_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(Exp2F32, vm_exp2_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(ExpM1F32, vm_expm1_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(LogF32, vm_log_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(Log10F32, vm_log10_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(Log1pF32, vm_log1p_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(Log2F32, vm_log2_f32);
+      DISPATCH_OP_EXT_F32_BINARY_F32(PowF32, vm_pow_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(RsqrtF32, vm_rsqrt_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(SqrtF32, vm_sqrt_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(TanhF32, vm_tanh_f32);
+      DISPATCH_OP_EXT_F32_UNARY_F32(ErfF32, vm_erf_f32);
+
+      //===----------------------------------------------------------------===//
+      // ExtF32: Casting and type conversion/emulation
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_F32, CastSI32F32, {
+        int32_t operand = (int32_t)VM_DecOperandRegI32("operand");
+        float* result = VM_DecResultRegF32("result");
+        *result = vm_cast_si32f32(operand);
+      });
+      DISPATCH_OP(EXT_F32, CastUI32F32, {
+        int32_t operand = (int32_t)VM_DecOperandRegI32("operand");
+        float* result = VM_DecResultRegF32("result");
+        *result = vm_cast_ui32f32(operand);
+      });
+      DISPATCH_OP(EXT_F32, CastF32SI32, {
+        float operand = VM_DecOperandRegF32("operand");
+        int32_t* result = VM_DecResultRegI32("result");
+        *result = vm_cast_f32si32(operand);
+      });
+      DISPATCH_OP(EXT_F32, CastF32UI32, {
+        float operand = VM_DecOperandRegF32("operand");
+        int32_t* result = VM_DecResultRegI32("result");
+        *result = vm_cast_f32ui32(operand);
+      });
+      DISPATCH_OP(EXT_F32, BitcastI32F32, {
+        int32_t operand = (int32_t)VM_DecOperandRegI32("operand");
+        float* result = VM_DecResultRegF32("result");
+        *result = vm_bitcast_i32f32(operand);
+      });
+      DISPATCH_OP(EXT_F32, BitcastF32I32, {
+        float operand = VM_DecOperandRegF32("operand");
+        int32_t* result = VM_DecResultRegI32("result");
+        *result = vm_bitcast_f32i32(operand);
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtF32: Comparison ops
+      //===----------------------------------------------------------------===//
+
+#define DISPATCH_OP_EXT_F32_CMP_F32(op_name, op_func) \
+  DISPATCH_OP(EXT_F32, op_name, {                     \
+    float lhs = VM_DecOperandRegF32("lhs");           \
+    float rhs = VM_DecOperandRegF32("rhs");           \
+    int32_t* result = VM_DecResultRegI32("result");   \
+    *result = op_func(lhs, rhs);                      \
+  });
+
+      DISPATCH_OP_EXT_F32_CMP_F32(CmpEQF32O, vm_cmp_eq_f32o);
+      DISPATCH_OP_EXT_F32_CMP_F32(CmpEQF32U, vm_cmp_eq_f32u);
+      DISPATCH_OP_EXT_F32_CMP_F32(CmpNEF32O, vm_cmp_ne_f32o);
+      DISPATCH_OP_EXT_F32_CMP_F32(CmpNEF32U, vm_cmp_ne_f32u);
+      DISPATCH_OP_EXT_F32_CMP_F32(CmpLTF32O, vm_cmp_lt_f32o);
+      DISPATCH_OP_EXT_F32_CMP_F32(CmpLTF32U, vm_cmp_lt_f32u);
+      DISPATCH_OP_EXT_F32_CMP_F32(CmpLTEF32O, vm_cmp_lte_f32o);
+      DISPATCH_OP_EXT_F32_CMP_F32(CmpLTEF32U, vm_cmp_lte_f32u);
+      DISPATCH_OP(EXT_F32, CmpNaNF32, {
+        float operand = VM_DecOperandRegF32("operand");
+        int32_t* result = VM_DecResultRegI32("result");
+        *result = vm_cmp_nan_f32(operand);
+      });
+
+      //===----------------------------------------------------------------===//
+      // ExtF32: Buffers
+      //===----------------------------------------------------------------===//
+
+      DISPATCH_OP(EXT_F32, BufferFillF32, {
+        bool buffer_is_move;
+        iree_vm_ref_t* buffer_ref =
+            VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+        iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+        if (IREE_UNLIKELY(!buffer)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                  "buffer is null");
+        }
+        uint32_t offset = VM_DecOperandRegI32("target_offset");
+        uint32_t length = VM_DecOperandRegI32("length");
+        float value = VM_DecOperandRegF32("value");
+        IREE_RETURN_IF_ERROR(iree_vm_buffer_fill_elements(
+            buffer, offset, length / sizeof(float), sizeof(float), &value));
+      });
+
+      DISPATCH_OP(EXT_F32, BufferLoadF32, {
+        bool buffer_is_move;
+        iree_vm_ref_t* buffer_ref =
+            VM_DecOperandRegRef("source_buffer", &buffer_is_move);
+        iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+        if (IREE_UNLIKELY(!buffer)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                  "source_buffer is null");
+        }
+        uint32_t offset = VM_DecOperandRegI32("source_offset");
+        float* result = VM_DecResultRegF32("result");
+        IREE_RETURN_IF_ERROR(iree_vm_buffer_read_elements(
+            buffer, offset, result, 1, sizeof(*result)));
+      });
+
+      DISPATCH_OP(EXT_F32, BufferStoreF32, {
+        bool buffer_is_move;
+        iree_vm_ref_t* buffer_ref =
+            VM_DecOperandRegRef("target_buffer", &buffer_is_move);
+        iree_vm_buffer_t* buffer = iree_vm_buffer_deref(*buffer_ref);
+        if (IREE_UNLIKELY(!buffer)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                  "target_buffer is null");
+        }
+        uint32_t offset = VM_DecOperandRegI32("target_offset");
+        float value = VM_DecOperandRegF32("value");
+        IREE_RETURN_IF_ERROR(iree_vm_buffer_write_elements(
+            &value, buffer, offset, 1, sizeof(float)));
+      });
+    }
+    END_DISPATCH_PREFIX();
+#else
+    UNHANDLED_DISPATCH_PREFIX(PrefixExtF32, EXT_F32);
+#endif  // IREE_VM_EXT_F32_ENABLE
+
+    DISPATCH_OP(CORE, PrefixExtF64,
+                { return iree_make_status(IREE_STATUS_UNIMPLEMENTED); });
+
+    // NOLINTNEXTLINE(misc-static-assert)
+    DISPATCH_UNHANDLED_CORE();
+  }
+  END_DISPATCH_CORE();
+}
diff --git a/runtime/src/iree/vm/bytecode_dispatch_test.cc b/runtime/src/iree/vm/bytecode_dispatch_test.cc
new file mode 100644
index 0000000..fa2a1a5
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_dispatch_test.cc
@@ -0,0 +1,137 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Tests covering the dispatch logic for individual ops.
+//
+// iree/vm/test/*.mlir contains the functions used here for testing. We
+// avoid defining the IR inline here so that we can run this test on platforms
+// that we can't run the full MLIR compiler stack on.
+
+#include "iree/base/logging.h"
+#include "iree/base/status_cc.h"
+#include "iree/testing/gtest.h"
+#include "iree/vm/api.h"
+#include "iree/vm/bytecode_module.h"
+
+// Compiled module embedded here to avoid file IO:
+#include "iree/vm/test/all_bytecode_modules.h"
+
+namespace {
+
+// Parameters for one parameterized test case: which embedded compiled
+// bytecode module to load and which exported function within it to invoke.
+struct TestParams {
+  // Embedded module file entry (name + data pointer + size) from the
+  // generated all_bytecode_modules table of contents.
+  const struct iree_file_toc_t& module_file;
+  // Name of the exported function to run; names starting with "fail_" are
+  // expected to return a failing status (see the Check test below).
+  std::string function_name;
+};
+
+// Streams TestParams as "<sanitized module name>_<function name>" so that
+// gtest's PrintToStringParamName produces identifier-safe test names
+// (':' and '.' are not valid in gtest test names and are replaced by '_').
+// NOTE(review): this relies on iree_string_view_replace_char mutating the
+// viewed buffer in place — name_sv aliases name's storage, so the sanitized
+// characters show up when `name` is streamed below; confirm that contract.
+std::ostream& operator<<(std::ostream& os, const TestParams& params) {
+  std::string name{params.module_file.name};
+  auto name_sv = iree_make_string_view(name.data(), name.size());
+  iree_string_view_replace_char(name_sv, ':', '_');
+  iree_string_view_replace_char(name_sv, '.', '_');
+  return os << name << "_" << params.function_name;
+}
+
+// Enumerates every exported function of every embedded test bytecode module
+// and returns one TestParams entry per (module, export) pair. Called once at
+// static-initialization time by INSTANTIATE_TEST_SUITE_P below.
+std::vector<TestParams> GetModuleTestParams() {
+  std::vector<TestParams> test_params;
+
+  // Builtin ref types must be registered before bytecode modules can be
+  // created.
+  IREE_CHECK_OK(iree_vm_register_builtin_types());
+
+  const struct iree_file_toc_t* module_file_toc =
+      all_bytecode_modules_c_create();
+  for (size_t i = 0; i < all_bytecode_modules_c_size(); ++i) {
+    const auto& module_file = module_file_toc[i];
+    // Load each module transiently just to enumerate its exports; the test
+    // fixture reloads it per-test in SetUp. iree_allocator_null() as the
+    // data allocator indicates the embedded bytes are not owned/freed by
+    // the module.
+    iree_vm_module_t* module = nullptr;
+    IREE_CHECK_OK(iree_vm_bytecode_module_create(
+        iree_const_byte_span_t{
+            reinterpret_cast<const uint8_t*>(module_file.data),
+            module_file.size},
+        iree_allocator_null(), iree_allocator_system(), &module));
+    iree_vm_module_signature_t signature = module->signature(module->self);
+    test_params.reserve(test_params.size() + signature.export_function_count);
+    // NOTE(review): this inner `i` shadows the outer loop's `i` (and mixes
+    // int vs size_t); harmless as written but worth renaming, e.g. to `j`.
+    for (int i = 0; i < signature.export_function_count; ++i) {
+      iree_vm_function_t function;
+      IREE_CHECK_OK(iree_vm_module_lookup_function_by_ordinal(
+          module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
+      iree_string_view_t function_name = iree_vm_function_name(&function);
+      test_params.push_back(
+          {module_file, std::string(function_name.data, function_name.size)});
+    }
+    iree_vm_module_release(module);
+  }
+
+  return test_params;
+}
+
+// Fixture that, for each TestParams instance, creates a VM instance, loads
+// the parameterized bytecode module, and builds a context containing it.
+class VMBytecodeDispatchTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<TestParams> {
+ protected:
+  // Creates instance_, bytecode_module_ and context_ for the current param.
+  // Uses IREE_CHECK_OK (aborting) rather than GTest assertions since any
+  // failure here means the embedded test data itself is broken.
+  virtual void SetUp() {
+    const auto& test_params = GetParam();
+
+    IREE_CHECK_OK(iree_vm_instance_create(iree_allocator_system(), &instance_));
+
+    // iree_allocator_null(): the embedded module bytes are static data and
+    // must not be freed by the module.
+    IREE_CHECK_OK(iree_vm_bytecode_module_create(
+        iree_const_byte_span_t{
+            reinterpret_cast<const uint8_t*>(test_params.module_file.data),
+            test_params.module_file.size},
+        iree_allocator_null(), iree_allocator_system(), &bytecode_module_));
+
+    std::vector<iree_vm_module_t*> modules = {bytecode_module_};
+    IREE_CHECK_OK(iree_vm_context_create_with_modules(
+        instance_, IREE_VM_CONTEXT_FLAG_NONE, modules.data(), modules.size(),
+        iree_allocator_system(), &context_));
+  }
+
+  // Releases the objects created in SetUp (the module handle is released
+  // here; the context retains its own reference until released).
+  virtual void TearDown() {
+    iree_vm_module_release(bytecode_module_);
+    iree_vm_context_release(context_);
+    iree_vm_instance_release(instance_);
+  }
+
+  // Looks up |function_name| among the module's exports and synchronously
+  // invokes it with no inputs/outputs, returning the invocation status.
+  iree_status_t RunFunction(const char* function_name) {
+    iree_vm_function_t function;
+    IREE_CHECK_OK(iree_vm_module_lookup_function_by_name(
+        bytecode_module_, IREE_VM_FUNCTION_LINKAGE_EXPORT,
+        iree_make_cstring_view(function_name), &function));
+
+    return iree_vm_invoke(context_, function, IREE_VM_INVOCATION_FLAG_NONE,
+                          /*policy=*/nullptr, /*inputs=*/nullptr,
+                          /*outputs=*/nullptr, iree_allocator_system());
+  }
+
+  iree_vm_instance_t* instance_ = nullptr;        // owned; released in TearDown
+  iree_vm_context_t* context_ = nullptr;          // owned; released in TearDown
+  iree_vm_module_t* bytecode_module_ = nullptr;   // owned; released in TearDown
+};
+
+// Runs the parameterized exported function. Functions whose names start with
+// "fail_" encode expected-failure cases: the test passes iff the invocation
+// returns a non-OK status; all other functions must return OK.
+TEST_P(VMBytecodeDispatchTest, Check) {
+  const auto& test_params = GetParam();
+  // find(...) == 0 means the name begins with the "fail_" prefix.
+  bool expect_failure = test_params.function_name.find("fail_") == 0;
+
+  iree_status_t status = RunFunction(test_params.function_name.c_str());
+  if (iree_status_is_ok(status)) {
+    if (expect_failure) {
+      GTEST_FAIL() << "Function expected failure but succeeded";
+    } else {
+      GTEST_SUCCEED();
+    }
+  } else {
+    if (expect_failure) {
+      // Expected failure path: consume the status so it isn't leaked.
+      iree_status_ignore(status);
+      GTEST_SUCCEED();
+    } else {
+      GTEST_FAIL() << "Function expected success but failed with error: "
+                   << iree::Status(std::move(status));
+    }
+  }
+}
+
+// One test instance per (module, exported function); names come from the
+// operator<< above via PrintToStringParamName.
+INSTANTIATE_TEST_SUITE_P(VMIRFunctions, VMBytecodeDispatchTest,
+                         ::testing::ValuesIn(GetModuleTestParams()),
+                         ::testing::PrintToStringParamName());
+
+}  // namespace
diff --git a/runtime/src/iree/vm/bytecode_dispatch_util.h b/runtime/src/iree/vm/bytecode_dispatch_util.h
new file mode 100644
index 0000000..676b342
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_dispatch_util.h
@@ -0,0 +1,500 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_BYTECODE_DISPATCH_UTIL_H_
+#define IREE_VM_BYTECODE_DISPATCH_UTIL_H_
+
+#include <assert.h>
+#include <string.h>
+
+#include "iree/base/alignment.h"
+#include "iree/base/config.h"
+#include "iree/base/target_platform.h"
+#include "iree/vm/bytecode_module_impl.h"
+#include "iree/vm/generated/bytecode_op_table.h"
+
+//===----------------------------------------------------------------------===//
+// Shared data structures
+//===----------------------------------------------------------------------===//
+//
+// Register bounds checking
+// ------------------------
+// All accesses into the register lists are truncated to the valid range for the
+// typed bank. This allows us to directly use the register ordinals from the
+// bytecode without needing to perform any validation at load-time or run-time.
+// The worst that can happen is that the bytecode program being executed doesn't
+// work as intended - which, with a working compiler, shouldn't happen. Though
+// there are cases where the runtime produces the register values and may know
+// that they are in range it's a good habit to always mask the ordinal by the
+// type-specific mask so that it's not possible for out of bounds accesses to
+// sneak in. The iree_vm_registers_t struct is often kept in cache and the
+// masking is cheap relative to any other validation we could be performing.
+//
+// Alternative register widths
+// ---------------------------
+// Registers in the VM are just a blob of memory and not physical device
+// registers. They have a natural width of 32-bits as that covers a majority of
+// our usage for i32/f32 but can be accessed at larger widths such as 64-bits or
+// more for vector operations. The base of each frame's register memory is
+// 16-byte aligned and accessing any individual register as a 32-bit value is
+// always 4-byte aligned.
+//
+// Supporting other register widths is "free" in that the registers for all
+// widths alias the same register storage memory. This is similar to how
+// physical registers work in x86 where each register can be accessed at
+// different sizes (like EAX/RAX alias and the SIMD registers alias as XMM1 is
+// 128-bit, YMM1 is 256-bit, and ZMM1 is 512-bit but all the same storage).
+//
+// The requirements for doing this is that the base alignment for any register
+// must be a multiple of 4 (due to the native 32-bit storage) AND aligned to the
+// natural size of the register (so 8 bytes for i64, 16 bytes for v128, etc).
+// This alignment can easily be done by masking off the low bits such that we
+// know for any valid `reg` ordinal aligned to 4 bytes `reg/N` will still be
+// within register storage. For example, i64 registers are accessed as `reg&~1`
+// to align to 8 bytes starting at byte 0 of the register storage.
+//
+// Transferring between register types can be done with vm.ext.* and vm.trunc.*
+// ops. For example, vm.trunc.i64.i32 will read an 8 byte register and write a
+// two 4 byte registers (effectively) with hi=0 and lo=the lower 32-bits of the
+// value.
+
+// Pointers to typed register storage.
+typedef struct iree_vm_registers_t {
+  // Ordinal mask defining which ordinal bits are valid. All i32 indexing must
+  // be ANDed with this mask.
+  uint16_t i32_mask;
+  // 16-byte aligned i32 register array.
+  int32_t* i32;
+  // Ordinal mask defining which ordinal bits are valid. All ref indexing must
+  // be ANDed with this mask.
+  uint16_t ref_mask;
+  // Naturally aligned ref register array.
+  iree_vm_ref_t* ref;
+} iree_vm_registers_t;
+
+// Storage associated with each stack frame of a bytecode function.
+// NOTE: we cannot store pointers to the stack in here as the stack may be
+// reallocated.
+typedef struct iree_vm_bytecode_frame_storage_t {
+  // Pointer to a register list within the stack frame where return registers
+  // will be stored by callees upon return.
+  const iree_vm_register_list_t* return_registers;
+
+  // Counts of each register type rounded up to the next power of two.
+  iree_host_size_t i32_register_count;
+  iree_host_size_t ref_register_count;
+
+  // Relative byte offsets from the head of this struct.
+  iree_host_size_t i32_register_offset;
+  iree_host_size_t ref_register_offset;
+} iree_vm_bytecode_frame_storage_t;
+
+// Interleaved src-dst register sets for branch register remapping.
+// This structure is an overlay for the bytecode that is serialized in a
+// matching format.
+typedef struct iree_vm_register_remap_list_t {
+  uint16_t size;
+  struct pair {
+    uint16_t src_reg;
+    uint16_t dst_reg;
+  } pairs[];
+} iree_vm_register_remap_list_t;
+static_assert(iree_alignof(iree_vm_register_remap_list_t) == 2,
+              "Expecting byte alignment (to avoid padding)");
+static_assert(offsetof(iree_vm_register_remap_list_t, pairs) == 2,
+              "Expect no padding in the struct");
+
+// Maps a type ID from the bytecode to an entry in the module's type table.
+// Out-of-range IDs clamp to entry 0 so untrusted bytecode can never index
+// outside the table.
+static inline const iree_vm_type_def_t* iree_vm_map_type(
+    iree_vm_bytecode_module_t* module, int32_t type_id) {
+  // Clamp both directions explicitly: a negative id must not index before
+  // the table regardless of the signedness of module->type_count (the
+  // original single >= comparison only caught negatives when type_count was
+  // an unsigned type).
+  type_id = (type_id < 0 || type_id >= module->type_count) ? 0 : type_id;
+  return &module->type_table[type_id];
+}
+
+//===----------------------------------------------------------------------===//
+// Debugging utilities
+//===----------------------------------------------------------------------===//
+
+#if IREE_VM_EXECUTION_TRACING_FORCE_ENABLE
+#define IREE_IS_DISPATCH_TRACING_ENABLED() true
+#else
+#define IREE_IS_DISPATCH_TRACING_ENABLED()   \
+  !!(iree_vm_stack_invocation_flags(stack) & \
+     IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION)
+#endif  // IREE_VM_EXECUTION_TRACING_FORCE_ENABLE
+
+#if IREE_VM_EXECUTION_TRACING_ENABLE
+#define IREE_DISPATCH_TRACE_INSTRUCTION(pc_offset, op_name) \
+  if (IREE_IS_DISPATCH_TRACING_ENABLED()) {                 \
+    IREE_RETURN_IF_ERROR(iree_vm_bytecode_trace_disasm(     \
+        current_frame, (pc - (pc_offset)), &regs, stderr)); \
+  }
+
+#else
+#define IREE_DISPATCH_TRACE_INSTRUCTION(...)
+#endif  // IREE_VM_EXECUTION_TRACING_ENABLE
+
+#if defined(IREE_COMPILER_MSVC) && !defined(IREE_COMPILER_CLANG)
+#define IREE_DISPATCH_MODE_SWITCH 1
+#else
+#define IREE_DISPATCH_MODE_COMPUTED_GOTO 1
+#endif  // MSVC
+
+#ifndef NDEBUG
+#define VMCHECK(expr) assert(expr)
+#else
+#define VMCHECK(expr)
+#endif  // NDEBUG
+
+//===----------------------------------------------------------------------===//
+// Bytecode data reading with little-/big-endian support
+//===----------------------------------------------------------------------===//
+
+static const int kRegSize = sizeof(uint16_t);
+
+// Bytecode data access macros for reading values of a given type from a byte
+// offset within the current function.
+#define OP_I8(i) iree_unaligned_load_le((uint8_t*)&bytecode_data[pc + (i)])
+#define OP_I16(i) iree_unaligned_load_le((uint16_t*)&bytecode_data[pc + (i)])
+#define OP_I32(i) iree_unaligned_load_le((uint32_t*)&bytecode_data[pc + (i)])
+#define OP_I64(i) iree_unaligned_load_le((uint64_t*)&bytecode_data[pc + (i)])
+#define OP_F32(i) iree_unaligned_load_le((float*)&bytecode_data[pc + (i)])
+#define OP_F64(i) iree_unaligned_load_le((double*)&bytecode_data[pc + (i)])
+
+//===----------------------------------------------------------------------===//
+// Utilities matching the tablegen op encoding scheme
+//===----------------------------------------------------------------------===//
+// These utilities match the VM_Enc* statements in VMBase.td 1:1, allowing us
+// to have the inverse of the encoding which make things easier to read.
+//
+// Each macro will increment the pc by the number of bytes read and as such must
+// be called in the same order the values are encoded.
+
+#define VM_AlignPC(pc, alignment) \
+  (pc) = ((pc) + ((alignment)-1)) & ~((alignment)-1)
+
+#define VM_DecConstI8(name) \
+  OP_I8(0);                 \
+  ++pc;
+#define VM_DecConstI32(name) \
+  OP_I32(0);                 \
+  pc += 4;
+#define VM_DecConstI64(name) \
+  OP_I64(0);                 \
+  pc += 8;
+#define VM_DecConstF32(name) \
+  OP_F32(0);                 \
+  pc += 4;
+#define VM_DecConstF64(name) \
+  OP_F64(0);                 \
+  pc += 8;
+#define VM_DecOpcode(opcode) VM_DecConstI8(#opcode)
+#define VM_DecFuncAttr(name) VM_DecConstI32(name)
+#define VM_DecGlobalAttr(name) VM_DecConstI32(name)
+#define VM_DecRodataAttr(name) VM_DecConstI32(name)
+#define VM_DecType(name)               \
+  iree_vm_map_type(module, OP_I32(0)); \
+  pc += 4;
+#define VM_DecTypeOf(name) VM_DecType(name)
+#define VM_DecIntAttr32(name) VM_DecConstI32(name)
+#define VM_DecIntAttr64(name) VM_DecConstI64(name)
+#define VM_DecFloatAttr32(name) VM_DecConstF32(name)
+#define VM_DecFloatAttr64(name) VM_DecConstF64(name)
+#define VM_DecStrAttr(name, out_str)                     \
+  (out_str)->size = (iree_host_size_t)OP_I16(0);         \
+  (out_str)->data = (const char*)&bytecode_data[pc + 2]; \
+  pc += 2 + (out_str)->size;
+// NOTE: the operand token is ignored by VM_DecConstI32; forward the macro's
+// own parameter (block_name) so the expansion never references an unrelated
+// identifier.
+#define VM_DecBranchTarget(block_name) VM_DecConstI32(block_name)
+#define VM_DecBranchOperands(operands_name) \
+  VM_DecBranchOperandsImpl(bytecode_data, &pc)
+// Decodes an interleaved src->dst register remap list located at *pc,
+// first aligning *pc to the 2-byte register size. Returns a zero-copy
+// overlay pointer directly into the bytecode and advances *pc past the
+// 16-bit count plus `size` (src, dst) register pairs.
+static inline const iree_vm_register_remap_list_t* VM_DecBranchOperandsImpl(
+    const uint8_t* IREE_RESTRICT bytecode_data, iree_vm_source_offset_t* pc) {
+  VM_AlignPC(*pc, kRegSize);
+  const iree_vm_register_remap_list_t* list =
+      (const iree_vm_register_remap_list_t*)&bytecode_data[*pc];
+  // Skip the uint16 count followed by size pairs of uint16 registers.
+  *pc = *pc + kRegSize + list->size * 2 * kRegSize;
+  return list;
+}
+#define VM_DecOperandRegI32(name)      \
+  regs.i32[OP_I16(0) & regs.i32_mask]; \
+  pc += kRegSize;
+#define VM_DecOperandRegI64(name)                           \
+  *((int64_t*)&regs.i32[OP_I16(0) & (regs.i32_mask & ~1)]); \
+  pc += kRegSize;
+#define VM_DecOperandRegF32(name)                  \
+  *((float*)&regs.i32[OP_I16(0) & regs.i32_mask]); \
+  pc += kRegSize;
+#define VM_DecOperandRegF64(name)                          \
+  *((double*)&regs.i32[OP_I16(0) & (regs.i32_mask & ~1)]); \
+  pc += kRegSize;
+#define VM_DecOperandRegRef(name, out_is_move)                      \
+  &regs.ref[OP_I16(0) & regs.ref_mask];                             \
+  *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
+  pc += kRegSize;
+#define VM_DecVariadicOperands(name) \
+  VM_DecVariadicOperandsImpl(bytecode_data, &pc)
+// Decodes a variadic register list located at *pc, first aligning *pc to
+// the 2-byte register size. Returns a zero-copy overlay pointer directly
+// into the bytecode and advances *pc past the 16-bit count plus `size`
+// register ordinals.
+static inline const iree_vm_register_list_t* VM_DecVariadicOperandsImpl(
+    const uint8_t* IREE_RESTRICT bytecode_data, iree_vm_source_offset_t* pc) {
+  VM_AlignPC(*pc, kRegSize);
+  const iree_vm_register_list_t* list =
+      (const iree_vm_register_list_t*)&bytecode_data[*pc];
+  // Skip the uint16 count followed by size uint16 register ordinals.
+  *pc = *pc + kRegSize + list->size * kRegSize;
+  return list;
+}
+#define VM_DecResultRegI32(name)        \
+  &regs.i32[OP_I16(0) & regs.i32_mask]; \
+  pc += kRegSize;
+#define VM_DecResultRegI64(name)                           \
+  ((int64_t*)&regs.i32[OP_I16(0) & (regs.i32_mask & ~1)]); \
+  pc += kRegSize;
+#define VM_DecResultRegF32(name)                  \
+  ((float*)&regs.i32[OP_I16(0) & regs.i32_mask]); \
+  pc += kRegSize;
+#define VM_DecResultRegF64(name)                          \
+  ((double*)&regs.i32[OP_I16(0) & (regs.i32_mask & ~1)]); \
+  pc += kRegSize;
+#define VM_DecResultRegRef(name, out_is_move)                       \
+  &regs.ref[OP_I16(0) & regs.ref_mask];                             \
+  *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
+  pc += kRegSize;
+#define VM_DecVariadicResults(name) VM_DecVariadicOperands(name)
+
+//===----------------------------------------------------------------------===//
+// Dispatch table structure
+//===----------------------------------------------------------------------===//
+// We support both computed goto (gcc/clang) and switch-based dispatch. Computed
+// goto is preferred when available as it has the most efficient codegen. MSVC
+// doesn't support it, though, and there may be other targets (like wasm) that
+// can only handle the switch-based approach.
+
+// Bytecode data -offset used when looking for the start of the currently
+// dispatched instruction: `instruction_start = pc - OFFSET`
+#define VM_PC_OFFSET_CORE 1
+#define VM_PC_OFFSET_EXT_I32 2
+#define VM_PC_OFFSET_EXT_I64 2
+#define VM_PC_OFFSET_EXT_F32 2
+#define VM_PC_OFFSET_EXT_F64 2
+
+#if defined(IREE_DISPATCH_MODE_COMPUTED_GOTO)
+
+// Dispatch table mapping 1:1 with bytecode ops.
+// Each entry is a label within this function that can be used for computed
+// goto. You can find more information on computed goto here:
+// https://eli.thegreenplace.net/2012/07/12/computed-goto-for-efficient-dispatch-tables
+//
+// Note that we ensure the table is 256 elements long exactly to make sure
+// that unused opcodes are handled gracefully.
+//
+// Computed gotos are pretty much the best way to dispatch interpreters but are
+// not part of the C standard; GCC and clang support them but MSVC does not.
+// Because the performance difference is significant we support both here but
+// prefer the computed goto path where available. Empirical data shows them to
+// still be a win in 2019 on x64 desktops and arm32/arm64 mobile devices.
+#define BEGIN_DISPATCH_CORE()                     \
+  goto* kDispatchTable_CORE[bytecode_data[pc++]]; \
+  while (1)
+#define END_DISPATCH_CORE()
+
+#define DECLARE_DISPATCH_CORE_OPC(ordinal, name) &&_dispatch_CORE_##name,
+#define DECLARE_DISPATCH_CORE_RSV(ordinal) &&_dispatch_unhandled,
+#define DEFINE_DISPATCH_TABLE_CORE()                                    \
+  static const void* kDispatchTable_CORE[256] = {IREE_VM_OP_CORE_TABLE( \
+      DECLARE_DISPATCH_CORE_OPC, DECLARE_DISPATCH_CORE_RSV)};
+
+#define DECLARE_DISPATCH_EXT_RSV(ordinal) &&_dispatch_unhandled,
+#if IREE_VM_EXT_I64_ENABLE
+#define DECLARE_DISPATCH_EXT_I64_OPC(ordinal, name) &&_dispatch_EXT_I64_##name,
+#define DEFINE_DISPATCH_TABLE_EXT_I64()                                       \
+  static const void* kDispatchTable_EXT_I64[256] = {IREE_VM_OP_EXT_I64_TABLE( \
+      DECLARE_DISPATCH_EXT_I64_OPC, DECLARE_DISPATCH_EXT_RSV)};
+#else
+#define DEFINE_DISPATCH_TABLE_EXT_I64()
+#endif  // IREE_VM_EXT_I64_ENABLE
+#if IREE_VM_EXT_F32_ENABLE
+#define DECLARE_DISPATCH_EXT_F32_OPC(ordinal, name) &&_dispatch_EXT_F32_##name,
+#define DEFINE_DISPATCH_TABLE_EXT_F32()                                       \
+  static const void* kDispatchTable_EXT_F32[256] = {IREE_VM_OP_EXT_F32_TABLE( \
+      DECLARE_DISPATCH_EXT_F32_OPC, DECLARE_DISPATCH_EXT_RSV)};
+#else
+#define DEFINE_DISPATCH_TABLE_EXT_F32()
+#endif  // IREE_VM_EXT_F32_ENABLE
+#if IREE_VM_EXT_F64_ENABLE
+#define DECLARE_DISPATCH_EXT_F64_OPC(ordinal, name) &&_dispatch_EXT_F64_##name,
+#define DEFINE_DISPATCH_TABLE_EXT_F64()                                       \
+  static const void* kDispatchTable_EXT_F64[256] = {IREE_VM_OP_EXT_F64_TABLE( \
+      DECLARE_DISPATCH_EXT_F64_OPC, DECLARE_DISPATCH_EXT_RSV)};
+#else
+#define DEFINE_DISPATCH_TABLE_EXT_F64()
+#endif  // IREE_VM_EXT_F64_ENABLE
+
+#define DEFINE_DISPATCH_TABLES()   \
+  DEFINE_DISPATCH_TABLE_CORE();    \
+  DEFINE_DISPATCH_TABLE_EXT_I64(); \
+  DEFINE_DISPATCH_TABLE_EXT_F32(); \
+  DEFINE_DISPATCH_TABLE_EXT_F64();
+
+#define DISPATCH_UNHANDLED_CORE()                                           \
+  _dispatch_unhandled : {                                                   \
+    VMCHECK(0);                                                             \
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unhandled opcode"); \
+  }
+#define UNHANDLED_DISPATCH_PREFIX(op_name, ext)                    \
+  _dispatch_CORE_##op_name : {                                     \
+    VMCHECK(0);                                                    \
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,             \
+                            "unhandled dispatch extension " #ext); \
+  }
+
+#define DISPATCH_OP(ext, op_name, body)                          \
+  _dispatch_##ext##_##op_name:;                                  \
+  IREE_DISPATCH_TRACE_INSTRUCTION(VM_PC_OFFSET_##ext, #op_name); \
+  body;                                                          \
+  goto* kDispatchTable_CORE[bytecode_data[pc++]];
+
+#define BEGIN_DISPATCH_PREFIX(op_name, ext)                                   \
+  _dispatch_CORE_##op_name : goto* kDispatchTable_##ext[bytecode_data[pc++]]; \
+  while (1)
+#define END_DISPATCH_PREFIX() goto* kDispatchTable_CORE[bytecode_data[pc++]];
+
+#else
+
+// Switch-based dispatch. This is strictly less efficient than the computed
+// goto approach above but is universally supported.
+
+#define BEGIN_DISPATCH_CORE() \
+  while (1) {                 \
+    switch (bytecode_data[pc++])
+#define END_DISPATCH_CORE() }
+
+#define DEFINE_DISPATCH_TABLES()
+
+#define DISPATCH_UNHANDLED_CORE()                      \
+  default: {                                           \
+    VMCHECK(0);                                        \
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED, \
+                            "unhandled core opcode");  \
+  }
+#define UNHANDLED_DISPATCH_PREFIX(op_name, ext)                    \
+  case IREE_VM_OP_CORE_##op_name: {                                \
+    VMCHECK(0);                                                    \
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,             \
+                            "unhandled dispatch extension " #ext); \
+  }
+
+#define DISPATCH_OP(ext, op_name, body)                            \
+  case IREE_VM_OP_##ext##_##op_name: {                             \
+    IREE_DISPATCH_TRACE_INSTRUCTION(VM_PC_OFFSET_##ext, #op_name); \
+    body;                                                          \
+  } break;
+
+#define BEGIN_DISPATCH_PREFIX(op_name, ext) \
+  case IREE_VM_OP_CORE_##op_name: {         \
+    switch (bytecode_data[pc++])
+#define END_DISPATCH_PREFIX() \
+  break;                      \
+  }
+
+#endif  // IREE_DISPATCH_MODE_COMPUTED_GOTO
+
+// Common dispatch op macros
+
+#define DISPATCH_OP_CORE_UNARY_I32(op_name, op_func)  \
+  DISPATCH_OP(CORE, op_name, {                        \
+    int32_t operand = VM_DecOperandRegI32("operand"); \
+    int32_t* result = VM_DecResultRegI32("result");   \
+    *result = op_func(operand);                       \
+  });
+
+#define DISPATCH_OP_CORE_BINARY_I32(op_name, op_func) \
+  DISPATCH_OP(CORE, op_name, {                        \
+    int32_t lhs = VM_DecOperandRegI32("lhs");         \
+    int32_t rhs = VM_DecOperandRegI32("rhs");         \
+    int32_t* result = VM_DecResultRegI32("result");   \
+    *result = op_func(lhs, rhs);                      \
+  });
+
+#define DISPATCH_OP_CORE_TERNARY_I32(op_name, op_func) \
+  DISPATCH_OP(CORE, op_name, {                         \
+    int32_t a = VM_DecOperandRegI32("a");              \
+    int32_t b = VM_DecOperandRegI32("b");              \
+    int32_t c = VM_DecOperandRegI32("c");              \
+    int32_t* result = VM_DecResultRegI32("result");    \
+    *result = op_func(a, b, c);                        \
+  });
+
+#define DISPATCH_OP_EXT_I64_UNARY_I64(op_name, op_func) \
+  DISPATCH_OP(EXT_I64, op_name, {                       \
+    int64_t operand = VM_DecOperandRegI64("operand");   \
+    int64_t* result = VM_DecResultRegI64("result");     \
+    *result = op_func(operand);                         \
+  });
+
+#define DISPATCH_OP_EXT_I64_BINARY_I64(op_name, op_func) \
+  DISPATCH_OP(EXT_I64, op_name, {                        \
+    int64_t lhs = VM_DecOperandRegI64("lhs");            \
+    int64_t rhs = VM_DecOperandRegI64("rhs");            \
+    int64_t* result = VM_DecResultRegI64("result");      \
+    *result = op_func(lhs, rhs);                         \
+  });
+
+#define DISPATCH_OP_EXT_I64_TERNARY_I64(op_name, op_func) \
+  DISPATCH_OP(EXT_I64, op_name, {                         \
+    int64_t a = VM_DecOperandRegI64("a");                 \
+    int64_t b = VM_DecOperandRegI64("b");                 \
+    int64_t c = VM_DecOperandRegI64("c");                 \
+    int64_t* result = VM_DecResultRegI64("result");       \
+    *result = op_func(a, b, c);                           \
+  });
+
+#define DISPATCH_OP_EXT_F32_UNARY_F32(op_name, op_func) \
+  DISPATCH_OP(EXT_F32, op_name, {                       \
+    float operand = VM_DecOperandRegF32("operand");     \
+    float* result = VM_DecResultRegF32("result");       \
+    *result = op_func(operand);                         \
+  });
+
+#define DISPATCH_OP_EXT_F32_BINARY_F32(op_name, op_func) \
+  DISPATCH_OP(EXT_F32, op_name, {                        \
+    float lhs = VM_DecOperandRegF32("lhs");              \
+    float rhs = VM_DecOperandRegF32("rhs");              \
+    float* result = VM_DecResultRegF32("result");        \
+    *result = op_func(lhs, rhs);                         \
+  });
+
+#define DISPATCH_OP_EXT_F32_TERNARY_F32(op_name, op_func) \
+  DISPATCH_OP(EXT_F32, op_name, {                         \
+    float a = VM_DecOperandRegF32("a");                   \
+    float b = VM_DecOperandRegF32("b");                   \
+    float c = VM_DecOperandRegF32("c");                   \
+    float* result = VM_DecResultRegF32("result");         \
+    *result = op_func(a, b, c);                           \
+  });
+
+#define DISPATCH_OP_EXT_F64_UNARY_F64(op_name, op_func) \
+  DISPATCH_OP(EXT_F64, op_name, {                       \
+    double operand = VM_DecOperandRegF64("operand");    \
+    double* result = VM_DecResultRegF64("result");      \
+    *result = op_func(operand);                         \
+  });
+
+#define DISPATCH_OP_EXT_F64_BINARY_F64(op_name, op_func) \
+  DISPATCH_OP(EXT_F64, op_name, {                        \
+    double lhs = VM_DecOperandRegF64("lhs");             \
+    double rhs = VM_DecOperandRegF64("rhs");             \
+    double* result = VM_DecResultRegF64("result");       \
+    *result = op_func(lhs, rhs);                         \
+  });
+
+#define DISPATCH_OP_EXT_F64_TERNARY_F64(op_name, op_func) \
+  DISPATCH_OP(EXT_F64, op_name, {                         \
+    double a = VM_DecOperandRegF64("a");                  \
+    double b = VM_DecOperandRegF64("b");                  \
+    double c = VM_DecOperandRegF64("c");                  \
+    double* result = VM_DecResultRegF64("result");        \
+    *result = op_func(a, b, c);                           \
+  });
+
+#endif  // IREE_VM_BYTECODE_DISPATCH_UTIL_H_
diff --git a/runtime/src/iree/vm/bytecode_module.c b/runtime/src/iree/vm/bytecode_module.c
new file mode 100644
index 0000000..192464d
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_module.c
@@ -0,0 +1,941 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/bytecode_module.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/vm/api.h"
+#include "iree/vm/bytecode_module_impl.h"
+
+// Performs a strcmp-style three-way comparison between a flatbuffers string
+// and an IREE string view: returns <0, 0, or >0 when |lhs| orders before,
+// equal to, or after |rhs|.
+// NOTE: the return type must be int, not bool — collapsing the result to a
+// bool folds -1 and +1 into `true`, discarding the ordering the ternary
+// below computes. Callers comparing `== 0` behave identically either way.
+static int iree_vm_flatbuffer_strcmp(flatbuffers_string_t lhs,
+                                     iree_string_view_t rhs) {
+  size_t lhs_size = flatbuffers_string_len(lhs);
+  // Compare the common prefix; fall back to a length comparison only when
+  // the shared prefix is identical.
+  int x = strncmp(lhs, rhs.data, lhs_size < rhs.size ? lhs_size : rhs.size);
+  return x != 0 ? x : lhs_size < rhs.size ? -1 : lhs_size > rhs.size;
+}
+
+// Resolves a type through either builtin rules or the ref registered types.
+// Builtin value types (i8/i16/i32/i64/f32/f64) are matched by name; any name
+// beginning with '!' is treated as a ref type and looked up in the global
+// ref type registry. Returns true and populates |out_type| on success;
+// returns false (leaving |out_type| zeroed) for an empty or unrecognized
+// name.
+static bool iree_vm_bytecode_module_resolve_type(
+    iree_vm_TypeDef_table_t type_def, iree_vm_type_def_t* out_type) {
+  memset(out_type, 0, sizeof(*out_type));
+  flatbuffers_string_t full_name = iree_vm_TypeDef_full_name(type_def);
+  if (!flatbuffers_string_len(full_name)) {
+    return false;
+  } else if (iree_vm_flatbuffer_strcmp(full_name,
+                                       iree_make_cstring_view("i8")) == 0) {
+    out_type->value_type = IREE_VM_VALUE_TYPE_I8;
+    return true;
+  } else if (iree_vm_flatbuffer_strcmp(full_name,
+                                       iree_make_cstring_view("i16")) == 0) {
+    out_type->value_type = IREE_VM_VALUE_TYPE_I16;
+    return true;
+  } else if (iree_vm_flatbuffer_strcmp(full_name,
+                                       iree_make_cstring_view("i32")) == 0) {
+    out_type->value_type = IREE_VM_VALUE_TYPE_I32;
+    return true;
+  } else if (iree_vm_flatbuffer_strcmp(full_name,
+                                       iree_make_cstring_view("i64")) == 0) {
+    out_type->value_type = IREE_VM_VALUE_TYPE_I64;
+    return true;
+  } else if (iree_vm_flatbuffer_strcmp(full_name,
+                                       iree_make_cstring_view("f32")) == 0) {
+    out_type->value_type = IREE_VM_VALUE_TYPE_F32;
+    return true;
+  } else if (iree_vm_flatbuffer_strcmp(full_name,
+                                       iree_make_cstring_view("f64")) == 0) {
+    out_type->value_type = IREE_VM_VALUE_TYPE_F64;
+    return true;
+  } else if (iree_vm_flatbuffer_strcmp(
+                 full_name, iree_make_cstring_view("!vm.opaque")) == 0) {
+    // !vm.opaque: explicitly no value type and a NULL ref type.
+    out_type->value_type = IREE_VM_VALUE_TYPE_NONE;
+    out_type->ref_type = IREE_VM_REF_TYPE_NULL;
+    return true;
+  } else if (full_name[0] == '!') {
+    // Note that we drop the ! prefix:
+    iree_string_view_t type_name = {full_name + 1,
+                                    flatbuffers_string_len(full_name) - 1};
+    if (iree_string_view_starts_with(type_name,
+                                     iree_make_cstring_view("vm.list"))) {
+      // This is a !vm.list<...> type. We don't actually care about the type as
+      // we allow list types to be widened. Rewrite to just vm.list as that's
+      // all we have registered.
+      type_name = iree_make_cstring_view("vm.list");
+    }
+    const iree_vm_ref_type_descriptor_t* type_descriptor =
+        iree_vm_ref_lookup_registered_type(type_name);
+    if (type_descriptor) {
+      out_type->ref_type = type_descriptor->type;
+    }
+    // NOTE(review): when no descriptor is registered this still returns true
+    // with ref_type left zeroed — presumably treated as an opaque/unusable
+    // ref by downstream code; confirm this is intentional.
+    return true;
+  }
+  return false;
+}
+
+// Resolves all types through either builtin rules or the ref registered
+// types, writing each resolved entry into |type_table|.
+// NOTE(review): an earlier comment claimed |type_table| could be omitted for
+// verification-only use, but the loop below writes type_table[i]
+// unconditionally — callers must supply storage for at least
+// iree_vm_TypeDef_vec_len(type_defs) entries.
+static iree_status_t iree_vm_bytecode_module_resolve_types(
+    iree_vm_TypeDef_vec_t type_defs, iree_vm_type_def_t* type_table) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = iree_ok_status();
+  for (size_t i = 0; i < iree_vm_TypeDef_vec_len(type_defs); ++i) {
+    iree_vm_TypeDef_table_t type_def = iree_vm_TypeDef_vec_at(type_defs, i);
+    if (!iree_vm_bytecode_module_resolve_type(type_def, &type_table[i])) {
+      // Stop at the first unresolvable type; the status carries its name.
+      status = iree_make_status(IREE_STATUS_NOT_FOUND,
+                                "no type registered with name '%s'",
+                                iree_vm_TypeDef_full_name(type_def));
+      break;
+    }
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Verifies the structure of the flatbuffer so that we can avoid doing so during
+// runtime. There are still some conditions we must be aware of (such as omitted
+// names on functions with internal linkage), however we shouldn't need to
+// bounds check anything within the flatbuffer after this succeeds.
+// Checks performed, in order: minimum size, flatcc structural verification,
+// required module name, per-type names, per-import full names, per-export
+// local names and internal ordinals, and per-function bytecode spans and
+// register counts.
+static iree_status_t iree_vm_bytecode_module_flatbuffer_verify(
+    iree_const_byte_span_t flatbuffer_data) {
+  // Cheap sanity check before handing the buffer to the flatcc verifier.
+  if (!flatbuffer_data.data || flatbuffer_data.data_length < 16) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "flatbuffer data is not present or less than 16 bytes (%zu total)",
+        flatbuffer_data.data_length);
+  }
+
+  // Run flatcc generated verification. This ensures all pointers are in-bounds
+  // and that we can safely walk the file, but not that the actual contents of
+  // the flatbuffer meet our expectations.
+  int verify_ret = iree_vm_BytecodeModuleDef_verify_as_root(
+      flatbuffer_data.data, flatbuffer_data.data_length);
+  if (verify_ret != flatcc_verify_ok) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "flatbuffer verification failed: %s",
+                            flatcc_verify_error_string(verify_ret));
+  }
+
+  iree_vm_BytecodeModuleDef_table_t module_def =
+      iree_vm_BytecodeModuleDef_as_root(flatbuffer_data.data);
+
+  flatbuffers_string_t name = iree_vm_BytecodeModuleDef_name(module_def);
+  if (!flatbuffers_string_len(name)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "module missing name field");
+  }
+
+  // Every type entry must be present and carry a non-empty name; actual
+  // resolution against the type registry happens later.
+  iree_vm_TypeDef_vec_t types = iree_vm_BytecodeModuleDef_types(module_def);
+  for (size_t i = 0; i < iree_vm_TypeDef_vec_len(types); ++i) {
+    iree_vm_TypeDef_table_t type_def = iree_vm_TypeDef_vec_at(types, i);
+    if (!type_def) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "types[%zu] missing body", i);
+    }
+    flatbuffers_string_t full_name = iree_vm_TypeDef_full_name(type_def);
+    if (flatbuffers_string_len(full_name) <= 0) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "types[%zu] missing name", i);
+    }
+  }
+
+  iree_vm_ImportFunctionDef_vec_t imported_functions =
+      iree_vm_BytecodeModuleDef_imported_functions(module_def);
+  iree_vm_ExportFunctionDef_vec_t exported_functions =
+      iree_vm_BytecodeModuleDef_exported_functions(module_def);
+  iree_vm_FunctionDescriptor_vec_t function_descriptors =
+      iree_vm_BytecodeModuleDef_function_descriptors(module_def);
+
+  // Imports must each have a body and a full (dotted) name to resolve against.
+  for (size_t i = 0; i < iree_vm_ImportFunctionDef_vec_len(imported_functions);
+       ++i) {
+    iree_vm_ImportFunctionDef_table_t import_def =
+        iree_vm_ImportFunctionDef_vec_at(imported_functions, i);
+    if (!import_def) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "imports[%zu] missing body", i);
+    }
+    flatbuffers_string_t full_name =
+        iree_vm_ImportFunctionDef_full_name(import_def);
+    if (!flatbuffers_string_len(full_name)) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "imports[%zu] missing full_name", i);
+    }
+  }
+
+  // Exports must each have a body, a local name, and an internal ordinal
+  // that indexes into the function descriptor table.
+  for (size_t i = 0; i < iree_vm_ExportFunctionDef_vec_len(exported_functions);
+       ++i) {
+    iree_vm_ExportFunctionDef_table_t export_def =
+        iree_vm_ExportFunctionDef_vec_at(exported_functions, i);
+    if (!export_def) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "exports[%zu] missing body", i);
+    }
+    flatbuffers_string_t local_name =
+        iree_vm_ExportFunctionDef_local_name(export_def);
+    if (!flatbuffers_string_len(local_name)) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "exports[%zu] missing local_name", i);
+    }
+    iree_host_size_t internal_ordinal =
+        iree_vm_ExportFunctionDef_internal_ordinal(export_def);
+    if (internal_ordinal >=
+        iree_vm_FunctionDescriptor_vec_len(function_descriptors)) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "exports[%zu] internal_ordinal out of bounds (0 < %zu < %zu)", i,
+          internal_ordinal,
+          iree_vm_FunctionDescriptor_vec_len(function_descriptors));
+    }
+  }
+
+  // Each function descriptor's bytecode span must lie within bytecode_data
+  // and its register counts must fit the fixed per-frame limits.
+  flatbuffers_uint8_vec_t bytecode_data =
+      iree_vm_BytecodeModuleDef_bytecode_data(module_def);
+  for (size_t i = 0;
+       i < iree_vm_FunctionDescriptor_vec_len(function_descriptors); ++i) {
+    iree_vm_FunctionDescriptor_struct_t function_descriptor =
+        iree_vm_FunctionDescriptor_vec_at(function_descriptors, i);
+    // NOTE(review): bytecode_offset + bytecode_length could overflow before
+    // the comparison for hostile inputs — consider checked arithmetic.
+    if (function_descriptor->bytecode_offset < 0 ||
+        function_descriptor->bytecode_offset +
+                function_descriptor->bytecode_length >
+            flatbuffers_uint8_vec_len(bytecode_data)) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "functions[%zu] descriptor bytecode span out of range (0 < %d < %zu)",
+          i, function_descriptor->bytecode_offset,
+          flatbuffers_uint8_vec_len(bytecode_data));
+    }
+    if (function_descriptor->i32_register_count > IREE_I32_REGISTER_COUNT ||
+        function_descriptor->ref_register_count > IREE_REF_REGISTER_COUNT) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "functions[%zu] descriptor register count out of range", i);
+    }
+
+    // TODO(benvanik): run bytecode verifier on contents.
+  }
+
+  return iree_ok_status();
+}
+
+// Maps a caller-visible |function| reference to the internal ordinal used to
+// index the module's function descriptor table. Only EXPORT linkage is
+// supported today; import/internal references fail with INVALID_ARGUMENT.
+// Optionally returns the export's signature def via |out_signature_def|.
+static iree_status_t iree_vm_bytecode_map_internal_ordinal(
+    iree_vm_bytecode_module_t* module, iree_vm_function_t function,
+    uint16_t* out_ordinal,
+    iree_vm_FunctionSignatureDef_table_t* out_signature_def) {
+  // Pre-clear outputs so callers observe deterministic values on failure.
+  *out_ordinal = 0;
+  if (out_signature_def) *out_signature_def = NULL;
+
+  uint16_t ordinal = function.ordinal;
+  iree_vm_FunctionSignatureDef_table_t signature_def = NULL;
+  if (function.linkage == IREE_VM_FUNCTION_LINKAGE_EXPORT) {
+    // Look up the internal ordinal index of this export in the function table.
+    iree_vm_ExportFunctionDef_vec_t exported_functions =
+        iree_vm_BytecodeModuleDef_exported_functions(module->def);
+    IREE_ASSERT_LT(ordinal,
+                   iree_vm_ExportFunctionDef_vec_len(exported_functions),
+                   "export ordinal out of range (0 < %zu < %zu)", ordinal,
+                   iree_vm_ExportFunctionDef_vec_len(exported_functions));
+    iree_vm_ExportFunctionDef_table_t function_def =
+        iree_vm_ExportFunctionDef_vec_at(exported_functions, function.ordinal);
+    ordinal = iree_vm_ExportFunctionDef_internal_ordinal(function_def);
+    signature_def = iree_vm_ExportFunctionDef_signature(function_def);
+  } else {
+    // TODO(benvanik): support querying the internal functions, which could be
+    // useful for debugging. Or maybe we just drop them forever?
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "cannot map imported/internal functions; no entry "
+                            "in the function table");
+  }
+
+  // Defense-in-depth: internal_ordinal was bounds-checked at load time by the
+  // module verifier but is re-validated here before use.
+  if (ordinal >= module->function_descriptor_count) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "function ordinal out of range (0 < %u < %zu)",
+                            function.ordinal,
+                            module->function_descriptor_count);
+  }
+
+  *out_ordinal = ordinal;
+  if (out_signature_def) *out_signature_def = signature_def;
+  return iree_ok_status();
+}
+
+// Module interface destroy callback: returns the FlatBuffer storage to the
+// allocator that provided it and then frees the module structure itself.
+static void iree_vm_bytecode_module_destroy(void* self) {
+  iree_vm_bytecode_module_t* bytecode_module =
+      (iree_vm_bytecode_module_t*)self;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Release the FlatBuffer memory first and neutralize the references so any
+  // accidental reuse is detectable.
+  iree_allocator_free(bytecode_module->flatbuffer_allocator,
+                      (void*)bytecode_module->flatbuffer_data.data);
+  bytecode_module->flatbuffer_data = iree_make_const_byte_span(NULL, 0);
+  bytecode_module->flatbuffer_allocator = iree_allocator_null();
+
+  // The module allocation goes last as it owns everything above.
+  iree_allocator_free(bytecode_module->allocator, bytecode_module);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Module interface name query: returns a view over the module name stored in
+// the FlatBuffer (memory remains owned by the module).
+static iree_string_view_t iree_vm_bytecode_module_name(void* self) {
+  iree_vm_bytecode_module_t* bytecode_module =
+      (iree_vm_bytecode_module_t*)self;
+  flatbuffers_string_t module_name =
+      iree_vm_BytecodeModuleDef_name(bytecode_module->def);
+  return iree_make_string_view(module_name,
+                               flatbuffers_string_len(module_name));
+}
+
+// Module interface signature query: import/export counts come from the
+// FlatBuffer tables and internal count from the function descriptor table.
+static iree_vm_module_signature_t iree_vm_bytecode_module_signature(
+    void* self) {
+  iree_vm_bytecode_module_t* bytecode_module =
+      (iree_vm_bytecode_module_t*)self;
+  iree_vm_module_signature_t result;
+  memset(&result, 0, sizeof(result));
+  result.import_function_count = iree_vm_ImportFunctionDef_vec_len(
+      iree_vm_BytecodeModuleDef_imported_functions(bytecode_module->def));
+  result.export_function_count = iree_vm_ExportFunctionDef_vec_len(
+      iree_vm_BytecodeModuleDef_exported_functions(bytecode_module->def));
+  result.internal_function_count =
+      bytecode_module->function_descriptor_count;
+  return result;
+}
+
+// Module interface get_function: resolves the function at |ordinal| for the
+// given |linkage|, optionally returning its name and calling convention.
+// For linkages other than import/export only the ordinal/linkage are
+// populated; name and signature outputs remain cleared.
+static iree_status_t iree_vm_bytecode_module_get_function(
+    void* self, iree_vm_function_linkage_t linkage, iree_host_size_t ordinal,
+    iree_vm_function_t* out_function, iree_string_view_t* out_name,
+    iree_vm_function_signature_t* out_signature) {
+  // Clear all outputs first so partial results are never observed on error.
+  if (out_function) {
+    memset(out_function, 0, sizeof(*out_function));
+  }
+  if (out_name) {
+    memset(out_name, 0, sizeof(*out_name));
+  }
+  if (out_signature) {
+    memset(out_signature, 0, sizeof(*out_signature));
+  }
+
+  iree_vm_bytecode_module_t* module = (iree_vm_bytecode_module_t*)self;
+  flatbuffers_string_t name = NULL;
+  iree_vm_FunctionSignatureDef_table_t signature = NULL;
+  if (linkage == IREE_VM_FUNCTION_LINKAGE_IMPORT ||
+      linkage == IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL) {
+    iree_vm_ImportFunctionDef_vec_t imported_functions =
+        iree_vm_BytecodeModuleDef_imported_functions(module->def);
+    if (ordinal >= iree_vm_ImportFunctionDef_vec_len(imported_functions)) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "import ordinal out of range (0 < %zu < %zu)", ordinal,
+          iree_vm_ImportFunctionDef_vec_len(imported_functions));
+    }
+    iree_vm_ImportFunctionDef_table_t import_def =
+        iree_vm_ImportFunctionDef_vec_at(imported_functions, ordinal);
+    name = iree_vm_ImportFunctionDef_full_name(import_def);
+    signature = iree_vm_ImportFunctionDef_signature(import_def);
+    // Promote to optional-import linkage when the import def carries the
+    // OPTIONAL flag so callers can treat unresolved imports as non-fatal.
+    if (iree_all_bits_set(iree_vm_ImportFunctionDef_flags(import_def),
+                          iree_vm_ImportFlagBits_OPTIONAL)) {
+      linkage = IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL;
+    }
+  } else if (linkage == IREE_VM_FUNCTION_LINKAGE_EXPORT) {
+    iree_vm_ExportFunctionDef_vec_t exported_functions =
+        iree_vm_BytecodeModuleDef_exported_functions(module->def);
+    if (ordinal >= iree_vm_ExportFunctionDef_vec_len(exported_functions)) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "export ordinal out of range (0 < %zu < %zu)", ordinal,
+          iree_vm_ExportFunctionDef_vec_len(exported_functions));
+    }
+    iree_vm_ExportFunctionDef_table_t export_def =
+        iree_vm_ExportFunctionDef_vec_at(exported_functions, ordinal);
+    name = iree_vm_ExportFunctionDef_local_name(export_def);
+    signature = iree_vm_ExportFunctionDef_signature(export_def);
+  }
+
+  // Populate only the outputs the caller asked for; name/signature views
+  // point directly into the FlatBuffer and stay valid for the module's life.
+  if (out_function) {
+    out_function->module = &module->interface;
+    out_function->linkage = linkage;
+    out_function->ordinal = (uint16_t)ordinal;
+  }
+  if (out_name && name) {
+    out_name->data = name;
+    out_name->size = flatbuffers_string_len(name);
+  }
+  if (out_signature && signature) {
+    flatbuffers_string_t calling_convention =
+        iree_vm_FunctionSignatureDef_calling_convention(signature);
+    out_signature->calling_convention.data = calling_convention;
+    out_signature->calling_convention.size =
+        flatbuffers_string_len(calling_convention);
+  }
+
+  return iree_ok_status();
+}
+
+// Module interface get_function_reflection_attr: returns the reflection
+// attribute key/value pair at |index| on an exported function's signature.
+// Reflection metadata is not validated at module load time; it is verified
+// structurally here, on demand.
+static iree_status_t iree_vm_bytecode_module_get_function_reflection_attr(
+    void* self, iree_vm_function_linkage_t linkage, iree_host_size_t ordinal,
+    iree_host_size_t index, iree_string_view_t* key,
+    iree_string_view_t* value) {
+  if (linkage != IREE_VM_FUNCTION_LINKAGE_EXPORT) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "only exported functions can be queried");
+  }
+
+  iree_vm_bytecode_module_t* module = (iree_vm_bytecode_module_t*)self;
+  iree_vm_ExportFunctionDef_vec_t exported_functions =
+      iree_vm_BytecodeModuleDef_exported_functions(module->def);
+
+  if (ordinal >= iree_vm_ExportFunctionDef_vec_len(exported_functions)) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "function ordinal out of range (0 < %zu < %zu)", ordinal,
+        iree_vm_ExportFunctionDef_vec_len(exported_functions));
+  }
+
+  iree_vm_ExportFunctionDef_table_t function_def =
+      iree_vm_ExportFunctionDef_vec_at(exported_functions, ordinal);
+  iree_vm_FunctionSignatureDef_table_t signature_def =
+      iree_vm_ExportFunctionDef_signature(function_def);
+  if (!signature_def) {
+    return iree_make_status(
+        IREE_STATUS_NOT_FOUND,
+        "reflection attribute at index %zu not found; no signature", index);
+  }
+  iree_vm_ReflectionAttrDef_vec_t reflection_attrs =
+      iree_vm_FunctionSignatureDef_reflection_attrs(signature_def);
+  if (!reflection_attrs ||
+      index >= iree_vm_ReflectionAttrDef_vec_len(reflection_attrs)) {
+    return iree_make_status(IREE_STATUS_NOT_FOUND,
+                            "reflection attribute at index %zu not found",
+                            index);
+  }
+  iree_vm_ReflectionAttrDef_table_t attr =
+      iree_vm_ReflectionAttrDef_vec_at(reflection_attrs, index);
+  flatbuffers_string_t attr_key = iree_vm_ReflectionAttrDef_key(attr);
+  flatbuffers_string_t attr_value = iree_vm_ReflectionAttrDef_value(attr);
+  if (!flatbuffers_string_len(attr_key) ||
+      !flatbuffers_string_len(attr_value)) {
+    // Because reflection metadata should not impose any overhead for the
+    // non reflection case, we do not eagerly validate it on load -- instead
+    // verify it structurally as needed.
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "reflection attribute missing fields");
+  }
+
+  // The returned views alias the FlatBuffer memory owned by the module.
+  key->data = attr_key;
+  key->size = flatbuffers_string_len(attr_key);
+  value->data = attr_value;
+  value->size = flatbuffers_string_len(attr_value);
+
+  return iree_ok_status();
+}
+
+// Module interface lookup_function: linear scan of the import or export table
+// for an exact string match on |name|. Returns NOT_FOUND when no function
+// with the given name exists under the requested linkage.
+static iree_status_t iree_vm_bytecode_module_lookup_function(
+    void* self, iree_vm_function_linkage_t linkage, iree_string_view_t name,
+    iree_vm_function_t* out_function) {
+  IREE_ASSERT_ARGUMENT(out_function);
+  memset(out_function, 0, sizeof(iree_vm_function_t));
+
+  if (iree_string_view_is_empty(name)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "function name required for query");
+  }
+
+  iree_vm_bytecode_module_t* module = (iree_vm_bytecode_module_t*)self;
+  out_function->linkage = linkage;
+  out_function->module = &module->interface;
+
+  // NOTE: we could organize exports alphabetically so we could bsearch.
+  if (linkage == IREE_VM_FUNCTION_LINKAGE_IMPORT ||
+      linkage == IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL) {
+    // Imports match on their fully-qualified name; optional imports are
+    // promoted to IMPORT_OPTIONAL linkage based on the def's flags.
+    iree_vm_ImportFunctionDef_vec_t imported_functions =
+        iree_vm_BytecodeModuleDef_imported_functions(module->def);
+    for (iree_host_size_t ordinal = 0;
+         ordinal < iree_vm_ImportFunctionDef_vec_len(imported_functions);
+         ++ordinal) {
+      iree_vm_ImportFunctionDef_table_t import_def =
+          iree_vm_ImportFunctionDef_vec_at(imported_functions, ordinal);
+      if (iree_vm_flatbuffer_strcmp(
+              iree_vm_ImportFunctionDef_full_name(import_def), name) == 0) {
+        out_function->ordinal = ordinal;
+        if (iree_all_bits_set(iree_vm_ImportFunctionDef_flags(import_def),
+                              iree_vm_ImportFlagBits_OPTIONAL)) {
+          out_function->linkage = IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL;
+        }
+        return iree_ok_status();
+      }
+    }
+  } else if (linkage == IREE_VM_FUNCTION_LINKAGE_EXPORT) {
+    // Exports match on their module-local name.
+    iree_vm_ExportFunctionDef_vec_t exported_functions =
+        iree_vm_BytecodeModuleDef_exported_functions(module->def);
+    for (iree_host_size_t ordinal = 0;
+         ordinal < iree_vm_ExportFunctionDef_vec_len(exported_functions);
+         ++ordinal) {
+      iree_vm_ExportFunctionDef_table_t export_def =
+          iree_vm_ExportFunctionDef_vec_at(exported_functions, ordinal);
+      if (iree_vm_flatbuffer_strcmp(
+              iree_vm_ExportFunctionDef_local_name(export_def), name) == 0) {
+        out_function->ordinal = ordinal;
+        return iree_ok_status();
+      }
+    }
+  }
+
+  return iree_make_status(IREE_STATUS_NOT_FOUND,
+                          "function with the given name not found");
+}
+
+// Recursively formats the location at |location_ordinal| within the module's
+// |location_table| into |builder|, producing a stack-trace-style rendering.
+// Call-site, fused, and name locations recurse into their child locations;
+// unknown/NONE locations render as "[unknown]".
+static iree_status_t iree_vm_bytecode_location_format(
+    int32_t location_ordinal,
+    iree_vm_LocationTypeDef_union_vec_t location_table,
+    iree_vm_source_location_format_flags_t flags,
+    iree_string_builder_t* builder) {
+  iree_vm_LocationTypeDef_union_t location =
+      iree_vm_LocationTypeDef_union_vec_at(location_table, location_ordinal);
+  switch (location.type) {
+    default:
+    case iree_vm_LocationTypeDef_NONE: {
+      return iree_string_builder_append_cstring(builder, "[unknown]");
+    }
+    case iree_vm_LocationTypeDef_CallSiteLocDef: {
+      // NOTE: MLIR prints caller->callee, but in a stack trace we want the
+      // upside-down callee->caller.
+      iree_vm_CallSiteLocDef_table_t loc =
+          (iree_vm_CallSiteLocDef_table_t)location.value;
+      IREE_RETURN_IF_ERROR(iree_vm_bytecode_location_format(
+          iree_vm_CallSiteLocDef_callee(loc), location_table, flags, builder));
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(builder, "\n      at "));
+      return iree_vm_bytecode_location_format(
+          iree_vm_CallSiteLocDef_caller(loc), location_table, flags, builder);
+    }
+    case iree_vm_LocationTypeDef_FileLineColLocDef: {
+      // Renders as filename:line:column.
+      iree_vm_FileLineColLocDef_table_t loc =
+          (iree_vm_FileLineColLocDef_table_t)location.value;
+      flatbuffers_string_t filename = iree_vm_FileLineColLocDef_filename(loc);
+      return iree_string_builder_append_format(
+          builder, "%.*s:%d:%d", (int)flatbuffers_string_len(filename),
+          filename, iree_vm_FileLineColLocDef_line(loc),
+          iree_vm_FileLineColLocDef_column(loc));
+    }
+    case iree_vm_LocationTypeDef_FusedLocDef: {
+      // Renders as <metadata>[ child, child, ... ] with optional metadata.
+      iree_vm_FusedLocDef_table_t loc =
+          (iree_vm_FusedLocDef_table_t)location.value;
+      if (iree_vm_FusedLocDef_metadata_is_present(loc)) {
+        flatbuffers_string_t metadata = iree_vm_FusedLocDef_metadata(loc);
+        IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+            builder, "<%.*s>", (int)flatbuffers_string_len(metadata),
+            metadata));
+      }
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "[\n"));
+      flatbuffers_int32_vec_t child_locs = iree_vm_FusedLocDef_locations(loc);
+      for (size_t i = 0; i < flatbuffers_int32_vec_len(child_locs); ++i) {
+        if (i == 0) {
+          IREE_RETURN_IF_ERROR(
+              iree_string_builder_append_cstring(builder, "    "));
+        } else {
+          IREE_RETURN_IF_ERROR(
+              iree_string_builder_append_cstring(builder, ",\n    "));
+        }
+        IREE_RETURN_IF_ERROR(iree_vm_bytecode_location_format(
+            flatbuffers_int32_vec_at(child_locs, i), location_table, flags,
+            builder));
+      }
+      IREE_RETURN_IF_ERROR(
+          iree_string_builder_append_cstring(builder, "\n  ]"));
+      return iree_ok_status();
+    }
+    case iree_vm_LocationTypeDef_NameLocDef: {
+      // Renders as "name"(child) with the child location optional.
+      iree_vm_NameLocDef_table_t loc =
+          (iree_vm_NameLocDef_table_t)location.value;
+      flatbuffers_string_t name = iree_vm_NameLocDef_name(loc);
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          builder, "\"%.*s\"", (int)flatbuffers_string_len(name), name));
+      if (iree_vm_NameLocDef_child_location_is_present(loc)) {
+        IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "("));
+        IREE_RETURN_IF_ERROR(iree_vm_bytecode_location_format(
+            iree_vm_NameLocDef_child_location(loc), location_table, flags,
+            builder));
+        IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, ")"));
+      }
+      return iree_ok_status();
+    }
+  }
+}
+
+// iree_vm_source_location_t format thunk. The fields were packed by
+// iree_vm_bytecode_module_resolve_source_location: |self| is the
+// DebugDatabaseDef, data[0] the FunctionSourceMapDef, and data[1] the
+// bytecode source offset to look up.
+static iree_status_t iree_vm_bytecode_module_source_location_format(
+    void* self, uint64_t data[2], iree_vm_source_location_format_flags_t flags,
+    iree_string_builder_t* builder) {
+  iree_vm_DebugDatabaseDef_table_t debug_database_def =
+      (iree_vm_DebugDatabaseDef_table_t)self;
+  iree_vm_FunctionSourceMapDef_table_t source_map_def =
+      (iree_vm_FunctionSourceMapDef_table_t)data[0];
+  iree_vm_BytecodeLocationDef_vec_t locations =
+      iree_vm_FunctionSourceMapDef_locations(source_map_def);
+  iree_vm_source_offset_t source_offset = (iree_vm_source_offset_t)data[1];
+
+  // Find the location def covering the source offset; scan returns -1
+  // (SIZE_MAX after conversion) when no entry matches.
+  size_t location_def_ordinal =
+      iree_vm_BytecodeLocationDef_vec_scan_by_bytecode_offset(
+          locations, (int32_t)source_offset);
+  if (location_def_ordinal == -1) {
+    return iree_status_from_code(IREE_STATUS_UNAVAILABLE);
+  }
+  iree_vm_BytecodeLocationDef_struct_t location_def =
+      iree_vm_BytecodeLocationDef_vec_at(locations, location_def_ordinal);
+  if (!location_def) {
+    return iree_status_from_code(IREE_STATUS_UNAVAILABLE);
+  }
+
+  // Print source location stack trace.
+  iree_vm_LocationTypeDef_union_vec_t location_table =
+      iree_vm_DebugDatabaseDef_location_table_union(debug_database_def);
+  IREE_RETURN_IF_ERROR(iree_vm_bytecode_location_format(
+      location_def->location, location_table, flags, builder));
+
+  return iree_ok_status();
+}
+
+// Module interface resolve_source_location: captures the debug database,
+// per-function source map, and frame PC into |out_source_location| so the
+// actual lookup/formatting can happen lazily on demand. Returns UNAVAILABLE
+// when the module carries no debug database or no map for the function.
+static iree_status_t iree_vm_bytecode_module_resolve_source_location(
+    void* self, iree_vm_stack_frame_t* frame,
+    iree_vm_source_location_t* out_source_location) {
+  // Get module debug database, if available.
+  iree_vm_bytecode_module_t* module = (iree_vm_bytecode_module_t*)self;
+  iree_vm_BytecodeModuleDef_table_t module_def = module->def;
+  iree_vm_DebugDatabaseDef_table_t debug_database_def =
+      iree_vm_BytecodeModuleDef_debug_database(module_def);
+  if (!debug_database_def) {
+    return iree_status_from_code(IREE_STATUS_UNAVAILABLE);
+  }
+
+  // Map the (potentially) export ordinal into the internal function ordinal in
+  // the function descriptor table.
+  uint16_t ordinal;
+  if (frame->function.linkage == IREE_VM_FUNCTION_LINKAGE_INTERNAL) {
+    ordinal = frame->function.ordinal;
+  } else {
+    IREE_RETURN_IF_ERROR(iree_vm_bytecode_map_internal_ordinal(
+        module, frame->function, &ordinal, NULL));
+  }
+
+  // Lookup the source map for the function, if available.
+  iree_vm_FunctionSourceMapDef_vec_t source_maps_vec =
+      iree_vm_DebugDatabaseDef_functions(debug_database_def);
+  iree_vm_FunctionSourceMapDef_table_t source_map_def =
+      ordinal < iree_vm_FunctionSourceMapDef_vec_len(source_maps_vec)
+          ? iree_vm_FunctionSourceMapDef_vec_at(source_maps_vec, ordinal)
+          : NULL;
+  if (!source_map_def) {
+    return iree_status_from_code(IREE_STATUS_UNAVAILABLE);
+  }
+
+  // The source location stores the source map and PC and will perform the
+  // actual lookup within the source map on demand.
+  out_source_location->self = (void*)debug_database_def;
+  out_source_location->data[0] = (uint64_t)source_map_def;
+  out_source_location->data[1] = (uint64_t)frame->pc;
+  out_source_location->format = iree_vm_bytecode_module_source_location_format;
+  return iree_ok_status();
+}
+
+// Lays out the nested tables within a |state| structure.
+// Returns the total size of the structure and all tables with padding applied.
+// |state| may be null if only the structure size is required for allocation.
+// Called twice: once with NULL to size the allocation and once with the
+// allocated storage to assign the table pointers (see alloc_state).
+static iree_host_size_t iree_vm_bytecode_module_layout_state(
+    iree_vm_BytecodeModuleDef_table_t module_def,
+    iree_vm_bytecode_module_state_t* state) {
+  iree_vm_ModuleStateDef_table_t module_state_def =
+      iree_vm_BytecodeModuleDef_module_state(module_def);
+  iree_host_size_t rwdata_storage_capacity = 0;
+  iree_host_size_t global_ref_count = 0;
+  if (module_state_def) {
+    rwdata_storage_capacity =
+        iree_vm_ModuleStateDef_global_bytes_capacity(module_state_def);
+    global_ref_count =
+        iree_vm_ModuleStateDef_global_ref_count(module_state_def);
+  }
+  iree_host_size_t rodata_ref_count = iree_vm_RodataSegmentDef_vec_len(
+      iree_vm_BytecodeModuleDef_rodata_segments(module_def));
+  iree_host_size_t import_function_count = iree_vm_ImportFunctionDef_vec_len(
+      iree_vm_BytecodeModuleDef_imported_functions(module_def));
+
+  // Tables are appended after the struct, each start padded to 16 bytes.
+  uint8_t* base_ptr = (uint8_t*)state;
+  iree_host_size_t offset =
+      iree_host_align(sizeof(iree_vm_bytecode_module_state_t), 16);
+
+  // Global rwdata storage.
+  if (state) {
+    state->rwdata_storage =
+        iree_make_byte_span(base_ptr + offset, rwdata_storage_capacity);
+  }
+  offset += iree_host_align(rwdata_storage_capacity, 16);
+
+  // Global ref table.
+  if (state) {
+    state->global_ref_count = global_ref_count;
+    state->global_ref_table = (iree_vm_ref_t*)(base_ptr + offset);
+  }
+  offset += iree_host_align(global_ref_count * sizeof(iree_vm_ref_t), 16);
+
+  // Rodata buffer table.
+  if (state) {
+    state->rodata_ref_count = rodata_ref_count;
+    state->rodata_ref_table = (iree_vm_buffer_t*)(base_ptr + offset);
+  }
+  offset += iree_host_align(rodata_ref_count * sizeof(iree_vm_buffer_t), 16);
+
+  // Import table.
+  if (state) {
+    state->import_count = import_function_count;
+    state->import_table = (iree_vm_bytecode_import_t*)(base_ptr + offset);
+  }
+  offset +=
+      iree_host_align(import_function_count * sizeof(*state->import_table), 16);
+
+  return offset;
+}
+
+// Module interface alloc_state: allocates per-context module state as a
+// single slab (struct + all nested tables) and points rodata buffer refs
+// directly at the FlatBuffer memory (no copies).
+static iree_status_t iree_vm_bytecode_module_alloc_state(
+    void* self, iree_allocator_t allocator,
+    iree_vm_module_state_t** out_module_state) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_module_state);
+  *out_module_state = NULL;
+
+  iree_vm_bytecode_module_t* module = (iree_vm_bytecode_module_t*)self;
+  iree_vm_BytecodeModuleDef_table_t module_def = module->def;
+
+  // Compute the total size required (with padding) for the state structure.
+  iree_host_size_t total_state_struct_size =
+      iree_vm_bytecode_module_layout_state(module_def, NULL);
+
+  // Allocate the storage for the structure and all its nested tables.
+  iree_vm_bytecode_module_state_t* state = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(allocator, total_state_struct_size,
+                                (void**)&state));
+  state->allocator = allocator;
+
+  // Perform layout to get the pointers into the storage for each nested table.
+  iree_vm_bytecode_module_layout_state(module_def, state);
+
+  // Setup rodata segments to point directly at the flatbuffer memory.
+  iree_vm_RodataSegmentDef_vec_t rodata_segments =
+      iree_vm_BytecodeModuleDef_rodata_segments(module_def);
+  for (int i = 0; i < state->rodata_ref_count; ++i) {
+    iree_vm_RodataSegmentDef_table_t segment =
+        iree_vm_RodataSegmentDef_vec_at(rodata_segments, i);
+    iree_vm_buffer_t* ref = &state->rodata_ref_table[i];
+    // iree_allocator_null(): the buffer does not own the bytes; the module's
+    // FlatBuffer allocation does.
+    iree_vm_buffer_initialize(
+        IREE_VM_BUFFER_ACCESS_ORIGIN_MODULE,
+        iree_make_byte_span(
+            (uint8_t*)iree_vm_RodataSegmentDef_data(segment),
+            flatbuffers_uint8_vec_len(iree_vm_RodataSegmentDef_data(segment))),
+        iree_allocator_null(), ref);
+  }
+
+  *out_module_state = (iree_vm_module_state_t*)state;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Module interface free_state: releases all global refs, deinitializes the
+// rodata buffer views, and frees the single slab allocated by alloc_state.
+static void iree_vm_bytecode_module_free_state(
+    void* self, iree_vm_module_state_t* module_state) {
+  if (!module_state) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_vm_bytecode_module_state_t* state =
+      (iree_vm_bytecode_module_state_t*)module_state;
+
+  // Release remaining global references.
+  for (int i = 0; i < state->global_ref_count; ++i) {
+    iree_vm_ref_release(&state->global_ref_table[i]);
+  }
+
+  // Ensure all rodata references are unused and deinitialized.
+  for (int i = 0; i < state->rodata_ref_count; ++i) {
+    iree_vm_buffer_t* ref = &state->rodata_ref_table[i];
+    iree_vm_buffer_deinitialize(ref);
+  }
+
+  // Tables live inside the same slab as the state struct; a single free
+  // releases everything.
+  iree_allocator_free(state->allocator, module_state);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Module interface resolve_import: records the resolved |function| for the
+// import at |ordinal| and precomputes calling convention fragments and
+// marshaling buffer sizes so per-call dispatch stays cheap.
+static iree_status_t iree_vm_bytecode_module_resolve_import(
+    void* self, iree_vm_module_state_t* module_state, iree_host_size_t ordinal,
+    const iree_vm_function_t* function,
+    const iree_vm_function_signature_t* signature) {
+  IREE_ASSERT_ARGUMENT(module_state);
+  iree_vm_bytecode_module_state_t* state =
+      (iree_vm_bytecode_module_state_t*)module_state;
+  if (ordinal >= state->import_count) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "import ordinal out of range (0 < %zu < %zu)",
+                            ordinal, state->import_count);
+  }
+
+  iree_vm_bytecode_import_t* import = &state->import_table[ordinal];
+  import->function = *function;
+
+  // Split up arguments/results into fragments so that we can avoid scanning
+  // during calling.
+  IREE_RETURN_IF_ERROR(iree_vm_function_call_get_cconv_fragments(
+      signature, &import->arguments, &import->results));
+
+  // Precalculate bytes required to marshal argument/results across the ABI
+  // boundary.
+  iree_host_size_t argument_buffer_size = 0;
+  iree_host_size_t result_buffer_size = 0;
+  if (!iree_vm_function_call_is_variadic_cconv(import->arguments)) {
+    // NOTE: variadic types don't support precalculation and the vm.call.import
+    // dispatch code will handle calculating it per-call.
+    IREE_RETURN_IF_ERROR(iree_vm_function_call_compute_cconv_fragment_size(
+        import->arguments, /*segment_size_list=*/NULL, &argument_buffer_size));
+  }
+  IREE_RETURN_IF_ERROR(iree_vm_function_call_compute_cconv_fragment_size(
+      import->results, /*segment_size_list=*/NULL, &result_buffer_size));
+  // Sanity cap (16KB each way) keeping the sizes within the uint16_t fields
+  // stored on the import entry below.
+  if (argument_buffer_size > 16 * 1024 || result_buffer_size > 16 * 1024) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "ABI marshaling buffer overflow on import %zu",
+                            ordinal);
+  }
+  import->argument_buffer_size = (uint16_t)argument_buffer_size;
+  import->result_buffer_size = (uint16_t)result_buffer_size;
+
+  return iree_ok_status();
+}
+
+// Module interface notify callback: bytecode modules have no
+// signal-dependent state today so all notifications are ignored.
+static iree_status_t IREE_API_PTR iree_vm_bytecode_module_notify(
+    void* self, iree_vm_module_state_t* module_state, iree_vm_signal_t signal) {
+  (void)self;
+  (void)module_state;
+  (void)signal;
+  return iree_ok_status();
+}
+
+// Module interface begin_call: maps the callee to its internal ordinal and
+// enters the bytecode dispatch loop until the function returns (synchronous)
+// or yields (asynchronous). This is the invocation hot path.
+static iree_status_t iree_vm_bytecode_module_begin_call(
+    void* self, iree_vm_stack_t* stack, const iree_vm_function_call_t* call,
+    iree_vm_execution_result_t* out_result) {
+  // NOTE: any work here adds directly to the invocation time. Avoid doing too
+  // much work or touching too many unlikely-to-be-cached structures (such as
+  // walking the FlatBuffer, which may cause page faults).
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_result);
+  memset(out_result, 0, sizeof(iree_vm_execution_result_t));
+
+  // Map the (potentially) export ordinal into the internal function ordinal in
+  // the function descriptor table.
+  iree_vm_bytecode_module_t* module = (iree_vm_bytecode_module_t*)self;
+  uint16_t ordinal = 0;
+  iree_vm_FunctionSignatureDef_table_t signature_def = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_vm_bytecode_map_internal_ordinal(module, call->function,
+                                                &ordinal, &signature_def));
+
+  // Grab calling convention string. This is not great as we are guaranteed to
+  // have a bunch of cache misses, but without putting it on the descriptor
+  // (which would duplicate data and slow down normal intra-module calls)
+  // there's not a good way around it. In the grand scheme of things users
+  // should be keeping their calls across this boundary relatively fat (compared
+  // to the real work they do), so this only needs to be fast enough to blend
+  // into the noise. Similar to JNI, P/Invoke, etc you don't want to have
+  // imports that cost less to execute than the marshaling overhead (dozens to
+  // hundreds of instructions).
+  flatbuffers_string_t calling_convention =
+      signature_def
+          ? iree_vm_FunctionSignatureDef_calling_convention(signature_def)
+          : 0;
+  iree_vm_function_signature_t signature;
+  memset(&signature, 0, sizeof(signature));
+  signature.calling_convention.data = calling_convention;
+  signature.calling_convention.size =
+      flatbuffers_string_len(calling_convention);
+  iree_string_view_t cconv_arguments = iree_string_view_empty();
+  iree_string_view_t cconv_results = iree_string_view_empty();
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_vm_function_call_get_cconv_fragments(
+              &signature, &cconv_arguments, &cconv_results));
+
+  // Jump into the dispatch routine to execute bytecode until the function
+  // either returns (synchronous) or yields (asynchronous).
+  iree_status_t status = iree_vm_bytecode_dispatch(
+      stack, module, call, cconv_arguments, cconv_results, out_result);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// See bytecode_module.h. Verifies the FlatBuffer, allocates the module with
+// the type table stored inline after the struct, resolves types, and
+// populates the module interface vtable.
+IREE_API_EXPORT iree_status_t iree_vm_bytecode_module_create(
+    iree_const_byte_span_t flatbuffer_data,
+    iree_allocator_t flatbuffer_allocator, iree_allocator_t allocator,
+    iree_vm_module_t** out_module) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_module);
+  *out_module = NULL;
+
+  // Structural verification happens once here so the rest of the module can
+  // trust the FlatBuffer contents.
+  IREE_TRACE_ZONE_BEGIN_NAMED(z1, "iree_vm_bytecode_module_flatbuffer_verify");
+  iree_status_t status =
+      iree_vm_bytecode_module_flatbuffer_verify(flatbuffer_data);
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z1);
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+  IREE_TRACE_ZONE_END(z1);
+
+  iree_vm_BytecodeModuleDef_table_t module_def =
+      iree_vm_BytecodeModuleDef_as_root(flatbuffer_data.data);
+  if (!module_def) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "failed getting root from flatbuffer; expected identifier "
+        "'" iree_vm_BytecodeModuleDef_file_identifier "' not found");
+  }
+
+  // The resolved type table is stored inline after the module struct in a
+  // single allocation.
+  iree_vm_TypeDef_vec_t type_defs = iree_vm_BytecodeModuleDef_types(module_def);
+  size_t type_table_size =
+      iree_vm_TypeDef_vec_len(type_defs) * sizeof(iree_vm_type_def_t);
+
+  iree_vm_bytecode_module_t* module = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(allocator, sizeof(*module) + type_table_size,
+                                (void**)&module));
+  module->allocator = allocator;
+
+  iree_vm_FunctionDescriptor_vec_t function_descriptors =
+      iree_vm_BytecodeModuleDef_function_descriptors(module_def);
+  module->function_descriptor_count =
+      iree_vm_FunctionDescriptor_vec_len(function_descriptors);
+  module->function_descriptor_table = function_descriptors;
+
+  flatbuffers_uint8_vec_t bytecode_data =
+      iree_vm_BytecodeModuleDef_bytecode_data(module_def);
+  module->bytecode_data = iree_make_const_byte_span(
+      bytecode_data, flatbuffers_uint8_vec_len(bytecode_data));
+
+  module->flatbuffer_data = flatbuffer_data;
+  module->flatbuffer_allocator = flatbuffer_allocator;
+  module->def = module_def;
+
+  module->type_count = iree_vm_TypeDef_vec_len(type_defs);
+  iree_status_t resolve_status =
+      iree_vm_bytecode_module_resolve_types(type_defs, module->type_table);
+  if (!iree_status_is_ok(resolve_status)) {
+    // Manual unwind: the module allocation is the only resource held so far.
+    iree_allocator_free(allocator, module);
+    IREE_TRACE_ZONE_END(z0);
+    return resolve_status;
+  }
+
+  // Wire up the module interface vtable; unset entries keep the defaults from
+  // iree_vm_module_initialize.
+  iree_vm_module_initialize(&module->interface, module);
+  module->interface.destroy = iree_vm_bytecode_module_destroy;
+  module->interface.name = iree_vm_bytecode_module_name;
+  module->interface.signature = iree_vm_bytecode_module_signature;
+  module->interface.get_function = iree_vm_bytecode_module_get_function;
+  module->interface.lookup_function = iree_vm_bytecode_module_lookup_function;
+#if IREE_VM_BACKTRACE_ENABLE
+  module->interface.resolve_source_location =
+      iree_vm_bytecode_module_resolve_source_location;
+#endif  // IREE_VM_BACKTRACE_ENABLE
+  module->interface.alloc_state = iree_vm_bytecode_module_alloc_state;
+  module->interface.free_state = iree_vm_bytecode_module_free_state;
+  module->interface.resolve_import = iree_vm_bytecode_module_resolve_import;
+  module->interface.notify = iree_vm_bytecode_module_notify;
+  module->interface.begin_call = iree_vm_bytecode_module_begin_call;
+  module->interface.get_function_reflection_attr =
+      iree_vm_bytecode_module_get_function_reflection_attr;
+
+  *out_module = &module->interface;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree/vm/bytecode_module.h b/runtime/src/iree/vm/bytecode_module.h
new file mode 100644
index 0000000..ed7bc04
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_module.h
@@ -0,0 +1,32 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_BYTECODE_MODULE_H_
+#define IREE_VM_BYTECODE_MODULE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/vm/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a VM module from an in-memory ModuleDef FlatBuffer.
+// If a |flatbuffer_allocator| is provided then it will be used to free the
+// |flatbuffer_data| when the module is destroyed and otherwise the ownership of
+// the flatbuffer_data remains with the caller.
+IREE_API_EXPORT iree_status_t iree_vm_bytecode_module_create(
+    iree_const_byte_span_t flatbuffer_data,
+    iree_allocator_t flatbuffer_allocator, iree_allocator_t allocator,
+    iree_vm_module_t** out_module);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_BYTECODE_MODULE_H_
diff --git a/runtime/src/iree/vm/bytecode_module_benchmark.cc b/runtime/src/iree/vm/bytecode_module_benchmark.cc
new file mode 100644
index 0000000..9dd7960
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_module_benchmark.cc
@@ -0,0 +1,348 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <array>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "iree/base/api.h"
+#include "iree/base/logging.h"
+#include "iree/vm/api.h"
+#include "iree/vm/bytecode_module.h"
+#include "iree/vm/bytecode_module_benchmark_module_c.h"
+
+namespace {
+
+struct native_import_module_s;
+struct native_import_module_state_s;
+typedef struct native_import_module_t native_import_module_t;
+typedef struct native_import_module_state_t native_import_module_state_t;
+
+// vm.import @native_import_module.add_1(%arg0 : i32) -> i32
+static iree_status_t native_import_module_add_1(
+    iree_vm_stack_t* stack, const iree_vm_function_call_t* call,
+    iree_vm_native_function_target_t target_fn, void* module,
+    void* module_state, iree_vm_execution_result_t* out_result) {
+  // Add 1 to arg0 and return.
+  int32_t arg0 = *reinterpret_cast<int32_t*>(call->arguments.data);
+  int32_t ret0 = arg0 + 1;
+  *reinterpret_cast<int32_t*>(call->results.data) = ret0;
+  return iree_ok_status();
+}
+
+static const iree_vm_native_export_descriptor_t
+    native_import_module_exports_[] = {
+        {iree_make_cstring_view("add_1"), iree_make_cstring_view("0i_i"), 0,
+         NULL},
+};
+static const iree_vm_native_function_ptr_t native_import_module_funcs_[] = {
+    {(iree_vm_native_function_shim_t)native_import_module_add_1, NULL},
+};
+static_assert(IREE_ARRAYSIZE(native_import_module_funcs_) ==
+                  IREE_ARRAYSIZE(native_import_module_exports_),
+              "function pointer table must be 1:1 with exports");
+static const iree_vm_native_module_descriptor_t
+    native_import_module_descriptor_ = {
+        iree_make_cstring_view("native_import_module"),
+        0,
+        NULL,
+        IREE_ARRAYSIZE(native_import_module_exports_),
+        native_import_module_exports_,
+        IREE_ARRAYSIZE(native_import_module_funcs_),
+        native_import_module_funcs_,
+        0,
+        NULL,
+};
+
+static iree_status_t native_import_module_create(
+    iree_allocator_t allocator, iree_vm_module_t** out_module) {
+  iree_vm_module_t interface;
+  IREE_RETURN_IF_ERROR(iree_vm_module_initialize(&interface, NULL));
+  return iree_vm_native_module_create(
+      &interface, &native_import_module_descriptor_, allocator, out_module);
+}
+
+// Benchmarks the given exported function, optionally passing in arguments.
+static iree_status_t RunFunction(benchmark::State& state,
+                                 iree_string_view_t function_name,
+                                 std::vector<int32_t> i32_args,
+                                 int result_count, int64_t batch_size = 1) {
+  iree_vm_instance_t* instance = NULL;
+  IREE_CHECK_OK(iree_vm_instance_create(iree_allocator_system(), &instance));
+
+  iree_vm_module_t* import_module = NULL;
+  IREE_CHECK_OK(
+      native_import_module_create(iree_allocator_system(), &import_module));
+
+  const auto* module_file_toc =
+      iree_vm_bytecode_module_benchmark_module_create();
+  iree_vm_module_t* bytecode_module = nullptr;
+  IREE_CHECK_OK(iree_vm_bytecode_module_create(
+      iree_const_byte_span_t{
+          reinterpret_cast<const uint8_t*>(module_file_toc->data),
+          module_file_toc->size},
+      iree_allocator_null(), iree_allocator_system(), &bytecode_module));
+
+  std::array<iree_vm_module_t*, 2> modules = {import_module, bytecode_module};
+  iree_vm_context_t* context = NULL;
+  IREE_CHECK_OK(iree_vm_context_create_with_modules(
+      instance, IREE_VM_CONTEXT_FLAG_NONE, modules.data(), modules.size(),
+      iree_allocator_system(), &context));
+
+  iree_vm_function_t function;
+  IREE_CHECK_OK(
+      iree_vm_context_resolve_function(context, function_name, &function));
+
+  iree_vm_function_call_t call;
+  memset(&call, 0, sizeof(call));
+  call.function = function;
+  call.arguments =
+      iree_make_byte_span(iree_alloca(i32_args.size() * sizeof(int32_t)),
+                          i32_args.size() * sizeof(int32_t));
+  call.results =
+      iree_make_byte_span(iree_alloca(result_count * sizeof(int32_t)),
+                          result_count * sizeof(int32_t));
+
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+                                  iree_vm_context_state_resolver(context),
+                                  iree_allocator_system());
+  while (state.KeepRunningBatch(batch_size)) {
+    for (iree_host_size_t i = 0; i < i32_args.size(); ++i) {
+      reinterpret_cast<int32_t*>(call.arguments.data)[i] = i32_args[i];
+    }
+
+    iree_vm_execution_result_t result;
+    IREE_CHECK_OK(bytecode_module->begin_call(bytecode_module->self, stack,
+                                              &call, &result));
+  }
+  iree_vm_stack_deinitialize(stack);
+
+  iree_vm_module_release(import_module);
+  iree_vm_module_release(bytecode_module);
+  iree_vm_context_release(context);
+  iree_vm_instance_release(instance);
+
+  return iree_ok_status();
+}
+
+static void BM_ModuleCreate(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    const auto* module_file_toc =
+        iree_vm_bytecode_module_benchmark_module_create();
+    iree_vm_module_t* module = nullptr;
+    IREE_CHECK_OK(iree_vm_bytecode_module_create(
+        iree_const_byte_span_t{
+            reinterpret_cast<const uint8_t*>(module_file_toc->data),
+            module_file_toc->size},
+        iree_allocator_null(), iree_allocator_system(), &module));
+
+    // Just testing creation and verification here!
+    benchmark::DoNotOptimize(module);
+
+    iree_vm_module_release(module);
+  }
+}
+BENCHMARK(BM_ModuleCreate);
+
+static void BM_ModuleCreateState(benchmark::State& state) {
+  const auto* module_file_toc =
+      iree_vm_bytecode_module_benchmark_module_create();
+  iree_vm_module_t* module = nullptr;
+  IREE_CHECK_OK(iree_vm_bytecode_module_create(
+      iree_const_byte_span_t{
+          reinterpret_cast<const uint8_t*>(module_file_toc->data),
+          module_file_toc->size},
+      iree_allocator_null(), iree_allocator_system(), &module));
+
+  while (state.KeepRunning()) {
+    iree_vm_module_state_t* module_state;
+    module->alloc_state(module->self, iree_allocator_system(), &module_state);
+
+    // Really just testing malloc overhead, though it'll be module-dependent
+    // and if we do anything heavyweight on state init it'll show here.
+    benchmark::DoNotOptimize(module_state);
+
+    module->free_state(module->self, module_state);
+  }
+
+  iree_vm_module_release(module);
+}
+BENCHMARK(BM_ModuleCreateState);
+
+static void BM_FullModuleInit(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    const auto* module_file_toc =
+        iree_vm_bytecode_module_benchmark_module_create();
+    iree_vm_module_t* module = nullptr;
+    IREE_CHECK_OK(iree_vm_bytecode_module_create(
+        iree_const_byte_span_t{
+            reinterpret_cast<const uint8_t*>(module_file_toc->data),
+            module_file_toc->size},
+        iree_allocator_null(), iree_allocator_system(), &module));
+
+    iree_vm_module_state_t* module_state;
+    module->alloc_state(module->self, iree_allocator_system(), &module_state);
+
+    benchmark::DoNotOptimize(module_state);
+
+    module->free_state(module->self, module_state);
+    iree_vm_module_release(module);
+  }
+}
+BENCHMARK(BM_FullModuleInit);
+
+IREE_ATTRIBUTE_NOINLINE static int empty_fn(void) {
+  int ret = 1;
+  benchmark::DoNotOptimize(ret);
+  return ret;
+}
+
+static void BM_EmptyFuncReference(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    int ret = empty_fn();
+    benchmark::DoNotOptimize(ret);
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_EmptyFuncReference);
+
+static void BM_EmptyFuncBytecode(benchmark::State& state) {
+  IREE_CHECK_OK(RunFunction(
+      state, iree_make_cstring_view("bytecode_module_benchmark.empty_func"), {},
+      /*result_count=*/0));
+}
+BENCHMARK(BM_EmptyFuncBytecode);
+
+IREE_ATTRIBUTE_NOINLINE static int add_fn(int value) {
+  benchmark::DoNotOptimize(value += value);
+  return value;
+}
+
+static void BM_CallInternalFuncReference(benchmark::State& state) {
+  while (state.KeepRunningBatch(10)) {
+    int value = 1;
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    value = add_fn(value);
+    benchmark::DoNotOptimize(value);
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_CallInternalFuncReference);
+
+static void BM_CallInternalFuncBytecode(benchmark::State& state) {
+  IREE_CHECK_OK(RunFunction(
+      state,
+      iree_make_cstring_view("bytecode_module_benchmark.call_internal_func"),
+      {100},
+      /*result_count=*/1,
+      /*batch_size=*/20));
+}
+BENCHMARK(BM_CallInternalFuncBytecode);
+
+static void BM_CallImportedFuncBytecode(benchmark::State& state) {
+  IREE_CHECK_OK(RunFunction(
+      state,
+      iree_make_cstring_view("bytecode_module_benchmark.call_imported_func"),
+      {100},
+      /*result_count=*/1,
+      /*batch_size=*/20));
+}
+BENCHMARK(BM_CallImportedFuncBytecode);
+
+static void BM_LoopSumReference(benchmark::State& state) {
+  static auto work = +[](int x) {
+    benchmark::DoNotOptimize(x);
+    return x;
+  };
+  static auto loop = +[](int count) {
+    int i = 0;
+    for (; i < count; ++i) {
+      benchmark::DoNotOptimize(i = work(i));
+    }
+    return i;
+  };
+  while (state.KeepRunningBatch(state.range(0))) {
+    int ret = loop(static_cast<int>(state.range(0)));
+    benchmark::DoNotOptimize(ret);
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_LoopSumReference)->Arg(100000);
+
+static void BM_LoopSumBytecode(benchmark::State& state) {
+  IREE_CHECK_OK(RunFunction(
+      state, iree_make_cstring_view("bytecode_module_benchmark.loop_sum"),
+      {static_cast<int32_t>(state.range(0))},
+      /*result_count=*/1,
+      /*batch_size=*/state.range(0)));
+}
+BENCHMARK(BM_LoopSumBytecode)->Arg(100000);
+
+static void BM_BufferReduceReference(benchmark::State& state) {
+  static auto work = +[](int32_t* buffer, int i, int sum) {
+    int new_sum = buffer[i] + sum;
+    benchmark::DoNotOptimize(new_sum);
+    return new_sum;
+  };
+  static auto loop = +[](int32_t* buffer, int count) {
+    int sum = 0;
+    for (int i = 0; i < count; ++i) {
+      benchmark::DoNotOptimize(sum = work(buffer, i, sum));
+    }
+    return sum;
+  };
+  while (state.KeepRunningBatch(state.range(0))) {
+    int32_t* buffer = (int32_t*)malloc(state.range(0) * 4);
+    for (int i = 0; i < state.range(0); ++i) {
+      buffer[i] = 1;
+    }
+    int ret = loop(buffer, static_cast<int>(state.range(0)));
+    benchmark::DoNotOptimize(ret);
+    benchmark::ClobberMemory();
+    free(buffer);
+  }
+}
+BENCHMARK(BM_BufferReduceReference)->Arg(100000);
+
+static void BM_BufferReduceBytecode(benchmark::State& state) {
+  IREE_CHECK_OK(RunFunction(
+      state, iree_make_cstring_view("bytecode_module_benchmark.buffer_reduce"),
+      {static_cast<int32_t>(state.range(0))},
+      /*result_count=*/1,
+      /*batch_size=*/state.range(0)));
+}
+BENCHMARK(BM_BufferReduceBytecode)->Arg(100000);
+
+// NOTE: unrolled 8x, requires %count to be % 8 = 0.
+static void BM_BufferReduceBytecodeUnrolled(benchmark::State& state) {
+  IREE_CHECK_OK(
+      RunFunction(state,
+                  iree_make_cstring_view(
+                      "bytecode_module_benchmark.buffer_reduce_unrolled"),
+                  {static_cast<int32_t>(state.range(0))},
+                  /*result_count=*/1,
+                  /*batch_size=*/state.range(0)));
+}
+BENCHMARK(BM_BufferReduceBytecodeUnrolled)->Arg(100000);
+
+}  // namespace
diff --git a/runtime/src/iree/vm/bytecode_module_benchmark.mlir b/runtime/src/iree/vm/bytecode_module_benchmark.mlir
new file mode 100644
index 0000000..6a076f5
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_module_benchmark.mlir
@@ -0,0 +1,143 @@
+vm.module @bytecode_module_benchmark {
+  // Measures the pure overhead of calling into/returning from a module.
+  vm.export @empty_func
+  vm.func @empty_func() {
+    vm.return
+  }
+
+  // Measures the cost of a call to an internal function.
+  vm.func @internal_func(%arg0 : i32) -> i32 attributes {noinline} {
+    vm.return %arg0 : i32
+  }
+  vm.export @call_internal_func
+  vm.func @call_internal_func(%arg0 : i32) -> i32 {
+    %0 = vm.call @internal_func(%arg0) : (i32) -> i32
+    %1 = vm.call @internal_func(%0) : (i32) -> i32
+    %2 = vm.call @internal_func(%1) : (i32) -> i32
+    %3 = vm.call @internal_func(%2) : (i32) -> i32
+    %4 = vm.call @internal_func(%3) : (i32) -> i32
+    %5 = vm.call @internal_func(%4) : (i32) -> i32
+    %6 = vm.call @internal_func(%5) : (i32) -> i32
+    %7 = vm.call @internal_func(%6) : (i32) -> i32
+    %8 = vm.call @internal_func(%7) : (i32) -> i32
+    %9 = vm.call @internal_func(%8) : (i32) -> i32
+    %10 = vm.call @internal_func(%9) : (i32) -> i32
+    %11 = vm.call @internal_func(%10) : (i32) -> i32
+    %12 = vm.call @internal_func(%11) : (i32) -> i32
+    %13 = vm.call @internal_func(%12) : (i32) -> i32
+    %14 = vm.call @internal_func(%13) : (i32) -> i32
+    %15 = vm.call @internal_func(%14) : (i32) -> i32
+    %16 = vm.call @internal_func(%15) : (i32) -> i32
+    %17 = vm.call @internal_func(%16) : (i32) -> i32
+    %18 = vm.call @internal_func(%17) : (i32) -> i32
+    %19 = vm.call @internal_func(%18) : (i32) -> i32
+    %20 = vm.call @internal_func(%19) : (i32) -> i32
+    vm.return %20 : i32
+  }
+
+  // Measures the cost of a call to an imported function.
+  vm.import @native_import_module.add_1(%arg : i32) -> i32
+  vm.export @call_imported_func
+  vm.func @call_imported_func(%arg0 : i32) -> i32 {
+    %0 = vm.call @native_import_module.add_1(%arg0) : (i32) -> i32
+    %1 = vm.call @native_import_module.add_1(%0) : (i32) -> i32
+    %2 = vm.call @native_import_module.add_1(%1) : (i32) -> i32
+    %3 = vm.call @native_import_module.add_1(%2) : (i32) -> i32
+    %4 = vm.call @native_import_module.add_1(%3) : (i32) -> i32
+    %5 = vm.call @native_import_module.add_1(%4) : (i32) -> i32
+    %6 = vm.call @native_import_module.add_1(%5) : (i32) -> i32
+    %7 = vm.call @native_import_module.add_1(%6) : (i32) -> i32
+    %8 = vm.call @native_import_module.add_1(%7) : (i32) -> i32
+    %9 = vm.call @native_import_module.add_1(%8) : (i32) -> i32
+    %10 = vm.call @native_import_module.add_1(%9) : (i32) -> i32
+    %11 = vm.call @native_import_module.add_1(%10) : (i32) -> i32
+    %12 = vm.call @native_import_module.add_1(%11) : (i32) -> i32
+    %13 = vm.call @native_import_module.add_1(%12) : (i32) -> i32
+    %14 = vm.call @native_import_module.add_1(%13) : (i32) -> i32
+    %15 = vm.call @native_import_module.add_1(%14) : (i32) -> i32
+    %16 = vm.call @native_import_module.add_1(%15) : (i32) -> i32
+    %17 = vm.call @native_import_module.add_1(%16) : (i32) -> i32
+    %18 = vm.call @native_import_module.add_1(%17) : (i32) -> i32
+    %19 = vm.call @native_import_module.add_1(%18) : (i32) -> i32
+    %20 = vm.call @native_import_module.add_1(%19) : (i32) -> i32
+    vm.return %20 : i32
+  }
+
+  // Measures the cost of a simple for-loop.
+  vm.export @loop_sum
+  vm.func @loop_sum(%count : i32) -> i32 {
+    %c1 = vm.const.i32 1
+    %i0 = vm.const.i32.zero
+    vm.br ^loop(%i0 : i32)
+  ^loop(%i : i32):
+    %in = vm.add.i32 %i, %c1 : i32
+    %cmp = vm.cmp.lt.i32.s %in, %count : i32
+    vm.cond_br %cmp, ^loop(%in : i32), ^loop_exit(%in : i32)
+  ^loop_exit(%ie : i32):
+    vm.return %ie : i32
+  }
+
+  // Measures the cost of lots of buffer loads.
+  vm.export @buffer_reduce
+  vm.func @buffer_reduce(%count : i32) -> i32 {
+    %c0 = vm.const.i32.zero
+    %c1 = vm.const.i32 1
+    %c4 = vm.const.i32 4
+    %max = vm.mul.i32 %count, %c4 : i32
+    %buf = vm.buffer.alloc %max : !vm.buffer
+    vm.buffer.fill.i32 %buf, %c0, %max, %c1 : i32 -> !vm.buffer
+    vm.br ^loop(%c0, %c0 : i32, i32)
+  ^loop(%i : i32, %sum : i32):
+    %element = vm.buffer.load.i32 %buf[%i] : !vm.buffer -> i32
+    %new_sum = vm.add.i32 %sum, %element : i32
+    %ip4 = vm.add.i32 %i, %c4 : i32
+    %cmp = vm.cmp.lt.i32.s %ip4, %max : i32
+    vm.cond_br %cmp, ^loop(%ip4, %new_sum : i32, i32), ^loop_exit(%new_sum : i32)
+  ^loop_exit(%result : i32):
+    vm.return %result : i32
+  }
+
+  // Measures the cost of lots of buffer loads when somewhat unrolled.
+  // NOTE: unrolled 8x, requires %count to be % 8 = 0.
+  vm.export @buffer_reduce_unrolled
+  vm.func @buffer_reduce_unrolled(%count : i32) -> i32 {
+    %c0 = vm.const.i32.zero
+    %c1 = vm.const.i32 1
+    %c4 = vm.const.i32 4
+    %max = vm.mul.i32 %count, %c4 : i32
+    %buf = vm.buffer.alloc %max : !vm.buffer
+    vm.buffer.fill.i32 %buf, %c0, %max, %c1 : i32 -> !vm.buffer
+    vm.br ^loop(%c0, %c0 : i32, i32)
+  ^loop(%i0 : i32, %sum : i32):
+    // TODO(#5544): add addressing modes to load/store.
+    %e0 = vm.buffer.load.i32 %buf[%i0] : !vm.buffer -> i32
+    %i1 = vm.add.i32 %i0, %c4 : i32
+    %e1 = vm.buffer.load.i32 %buf[%i1] : !vm.buffer -> i32
+    %i2 = vm.add.i32 %i1, %c4 : i32
+    %e2 = vm.buffer.load.i32 %buf[%i2] : !vm.buffer -> i32
+    %i3 = vm.add.i32 %i2, %c4 : i32
+    %e3 = vm.buffer.load.i32 %buf[%i3] : !vm.buffer -> i32
+    %i4 = vm.add.i32 %i3, %c4 : i32
+    %e4 = vm.buffer.load.i32 %buf[%i4] : !vm.buffer -> i32
+    %i5 = vm.add.i32 %i4, %c4 : i32
+    %e5 = vm.buffer.load.i32 %buf[%i5] : !vm.buffer -> i32
+    %i6 = vm.add.i32 %i5, %c4 : i32
+    %e6 = vm.buffer.load.i32 %buf[%i6] : !vm.buffer -> i32
+    %i7 = vm.add.i32 %i6, %c4 : i32
+    %e7 = vm.buffer.load.i32 %buf[%i7] : !vm.buffer -> i32
+    // If we do reductions like this we could add a horizontal-add op.
+    %new_sum0 = vm.add.i32 %sum, %e0 : i32
+    %new_sum1 = vm.add.i32 %new_sum0, %e1 : i32
+    %new_sum2 = vm.add.i32 %new_sum1, %e2 : i32
+    %new_sum3 = vm.add.i32 %new_sum2, %e3 : i32
+    %new_sum4 = vm.add.i32 %new_sum3, %e4 : i32
+    %new_sum5 = vm.add.i32 %new_sum4, %e5 : i32
+    %new_sum6 = vm.add.i32 %new_sum5, %e6 : i32
+    %new_sum7 = vm.add.i32 %new_sum6, %e7 : i32
+    %next_i = vm.add.i32 %i7, %c4 : i32
+    %cmp = vm.cmp.lt.i32.s %next_i, %max : i32
+    vm.cond_br %cmp, ^loop(%next_i, %new_sum7 : i32, i32), ^loop_exit(%new_sum7 : i32)
+  ^loop_exit(%result : i32):
+    vm.return %result : i32
+  }
+}
diff --git a/runtime/src/iree/vm/bytecode_module_impl.h b/runtime/src/iree/vm/bytecode_module_impl.h
new file mode 100644
index 0000000..01031b1
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_module_impl.h
@@ -0,0 +1,137 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_BYTECODE_MODULE_IMPL_H_
+#define IREE_VM_BYTECODE_MODULE_IMPL_H_
+
+#include <stdint.h>
+#include <string.h>
+
+// VC++ does not have C11's stdalign.h.
+#if !defined(_MSC_VER)
+#include <stdalign.h>
+#endif  // _MSC_VER
+
+#include "iree/base/api.h"
+#include "iree/vm/api.h"
+
+// NOTE: include order matters:
+#include "iree/base/internal/flatcc/parsing.h"
+#include "iree/schemas/bytecode_module_def_reader.h"
+#include "iree/schemas/bytecode_module_def_verifier.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#define VMMAX(a, b) (((a) > (b)) ? (a) : (b))
+#define VMMIN(a, b) (((a) < (b)) ? (a) : (b))
+
+// Maximum register count per bank.
+// This determines the bits required to reference registers in the VM bytecode.
+#define IREE_I32_REGISTER_COUNT 0x7FFF
+#define IREE_REF_REGISTER_COUNT 0x7FFF
+
+#define IREE_I32_REGISTER_MASK 0x7FFF
+
+#define IREE_REF_REGISTER_TYPE_BIT 0x8000
+#define IREE_REF_REGISTER_MOVE_BIT 0x4000
+#define IREE_REF_REGISTER_MASK 0x3FFF
+
+// A loaded bytecode module.
+typedef struct iree_vm_bytecode_module_t {
+  // Interface routing to the bytecode module functions.
+  // Must be first in the struct as we dereference the interface to find our
+  // members below.
+  iree_vm_module_t interface;
+
+  // Table of internal function bytecode descriptors.
+  // Mapped 1:1 with internal functions. Each defined bytecode span represents a
+  // range of bytes in |bytecode_data|.
+  iree_host_size_t function_descriptor_count;
+  const iree_vm_FunctionDescriptor_t* function_descriptor_table;
+
+  // A pointer to the bytecode data embedded within the module.
+  iree_const_byte_span_t bytecode_data;
+
+  // Allocator this module was allocated with and must be freed with.
+  iree_allocator_t allocator;
+
+  // Underlying FlatBuffer data and allocator (which may be null).
+  iree_const_byte_span_t flatbuffer_data;
+  iree_allocator_t flatbuffer_allocator;
+  iree_vm_BytecodeModuleDef_table_t def;
+
+  // Type table mapping module type IDs to registered VM types.
+  iree_host_size_t type_count;
+  iree_vm_type_def_t type_table[];
+} iree_vm_bytecode_module_t;
+
+// A resolved and split import in the module state table.
+//
+// NOTE: a table of these are stored per module per context so ideally we'd
+// only store the absolute minimum information to reduce our fixed overhead.
+// There's a big tradeoff though as a few extra bytes here can avoid non-trivial
+// work per import function invocation.
+typedef struct iree_vm_bytecode_import_t {
+  // Import function in the source module.
+  iree_vm_function_t function;
+
+  // Pre-parsed argument/result calling convention string fragments.
+  // For example, 0ii.r will be split to arguments=ii and results=r.
+  iree_string_view_t arguments;
+  iree_string_view_t results;
+
+  // Precomputed argument/result size requirements for marshaling values.
+  // Only usable for non-variadic signatures. Results are always usable as they
+  // don't support variadic values (yet).
+  uint16_t argument_buffer_size;
+  uint16_t result_buffer_size;
+} iree_vm_bytecode_import_t;
+
+// Per-instance module state.
+// This is allocated with a provided allocator as a single flat allocation.
+// This struct is a prefix to the allocation pointing into the dynamic offsets
+// of the allocation storage.
+typedef struct iree_vm_bytecode_module_state_t {
+  // Combined rwdata storage for the entire module, including globals.
+  // Aligned to 16 bytes (128-bits) for SIMD usage.
+  iree_byte_span_t rwdata_storage;
+
+  // Global ref values, indexed by global ordinal.
+  iree_host_size_t global_ref_count;
+  iree_vm_ref_t* global_ref_table;
+
+  // TODO(benvanik): move to iree_vm_bytecode_module_t if always static.
+  // Initialized references to rodata segments.
+  // Right now these don't do much, however we can perform lazy caching and
+  // on-the-fly decompression using this information.
+  iree_host_size_t rodata_ref_count;
+  iree_vm_buffer_t* rodata_ref_table;
+
+  // Resolved function imports.
+  iree_host_size_t import_count;
+  iree_vm_bytecode_import_t* import_table;
+
+  // Allocator used for the state itself and any runtime allocations needed.
+  iree_allocator_t allocator;
+} iree_vm_bytecode_module_state_t;
+
+// Begins (or resumes) execution of the current frame and continues until
+// either a yield or return. |out_result| will contain the result status for
+// continuation, if needed.
+iree_status_t iree_vm_bytecode_dispatch(iree_vm_stack_t* stack,
+                                        iree_vm_bytecode_module_t* module,
+                                        const iree_vm_function_call_t* call,
+                                        iree_string_view_t cconv_arguments,
+                                        iree_string_view_t cconv_results,
+                                        iree_vm_execution_result_t* out_result);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_BYTECODE_MODULE_IMPL_H_
diff --git a/runtime/src/iree/vm/bytecode_module_size_benchmark.cc b/runtime/src/iree/vm/bytecode_module_size_benchmark.cc
new file mode 100644
index 0000000..164a223
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_module_size_benchmark.cc
@@ -0,0 +1,44 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/api.h"
+#include "iree/vm/api.h"
+#include "iree/vm/bytecode_module.h"
+#include "iree/vm/bytecode_module_size_benchmark_module_c.h"
+
+extern "C" int main(int argc, char** argv) {
+  iree_vm_instance_t* instance = nullptr;
+  iree_vm_instance_create(iree_allocator_system(), &instance);
+
+  const auto* module_file_toc =
+      iree_vm_bytecode_module_size_benchmark_module_create();
+  iree_vm_module_t* module = nullptr;
+  iree_vm_bytecode_module_create(
+      iree_const_byte_span_t{
+          reinterpret_cast<const uint8_t*>(module_file_toc->data),
+          module_file_toc->size},
+      iree_allocator_null(), iree_allocator_system(), &module);
+
+  iree_vm_context_t* context = nullptr;
+  iree_vm_context_create_with_modules(instance, IREE_VM_CONTEXT_FLAG_NONE,
+                                      &module, /*module_count=*/1,
+                                      iree_allocator_system(), &context);
+
+  iree_vm_function_t function;
+  iree_vm_module_lookup_function_by_name(
+      module, IREE_VM_FUNCTION_LINKAGE_EXPORT,
+      iree_make_cstring_view("empty_func"), &function);
+
+  iree_vm_invoke(context, function, IREE_VM_INVOCATION_FLAG_NONE,
+                 /*policy=*/nullptr, /*inputs=*/nullptr,
+                 /*outputs=*/nullptr, iree_allocator_system());
+
+  iree_vm_module_release(module);
+  iree_vm_context_release(context);
+  iree_vm_instance_release(instance);
+
+  return 0;
+}
diff --git a/runtime/src/iree/vm/bytecode_module_size_benchmark.mlir b/runtime/src/iree/vm/bytecode_module_size_benchmark.mlir
new file mode 100644
index 0000000..84ad13c
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_module_size_benchmark.mlir
@@ -0,0 +1,6 @@
+vm.module @bytecode_module_size_benchmark {
+  vm.export @empty_func
+  vm.func @empty_func() {
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/bytecode_module_test.cc b/runtime/src/iree/vm/bytecode_module_test.cc
new file mode 100644
index 0000000..71ffb27
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_module_test.cc
@@ -0,0 +1,19 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Tests for bytecode_module.cc implementations.
+// This means mostly just flatbuffer verification, module interface functions,
+// etc. bytecode_dispatch_test.cc covers actual dispatch.
+
+#include "iree/vm/bytecode_module.h"
+
+#include "iree/testing/gtest.h"
+
+namespace {
+
+// TODO(benvanik): bytecode_module_test.cc for flatbuffer/module implementation.
+
+}  // namespace
diff --git a/runtime/src/iree/vm/context.c b/runtime/src/iree/vm/context.c
new file mode 100644
index 0000000..2508075
--- /dev/null
+++ b/runtime/src/iree/vm/context.c
@@ -0,0 +1,627 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/context.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/tracing.h"
+
struct iree_vm_context_t {
  // Reference count; the context is destroyed when the count hits zero.
  iree_atomic_ref_count_t ref_count;
  // Instance the context was created from; retained for the context lifetime.
  iree_vm_instance_t* instance;
  // Allocator used for the context itself and any grown list storage.
  iree_allocator_t allocator;
  // Process-unique ID assigned at creation (see iree_vm_context_id).
  intptr_t context_id;

  // Context has been frozen and can no longer be modified.
  uint32_t is_frozen : 1;
  // Context storage is statically allocated and need not be freed.
  uint32_t is_static : 1;

  // Configuration flags.
  iree_vm_context_flags_t flags;

  // Registered modules and their per-context state as parallel arrays
  // indexed by registration order.
  struct {
    // Number of registered modules.
    iree_host_size_t count;
    // Allocated slots in each of the parallel arrays.
    iree_host_size_t capacity;
    iree_vm_module_t** modules;
    iree_vm_module_state_t** module_states;
  } list;
};
+
+static void iree_vm_context_destroy(iree_vm_context_t* context);
+
// Runs a single `() -> ()` function from the module if it exists.
// A missing function is not an error (returns OK): this is used for the
// optional __init/__deinit lifecycle hooks that modules may choose to omit.
static iree_status_t iree_vm_context_run_function(
    iree_vm_stack_t* stack, iree_vm_module_t* module,
    iree_string_view_t function_name) {
  IREE_TRACE_ZONE_BEGIN(z0);

  iree_vm_function_call_t call;
  memset(&call, 0, sizeof(call));
  iree_status_t status = iree_vm_module_lookup_function_by_name(
      module, IREE_VM_FUNCTION_LINKAGE_EXPORT, function_name, &call.function);
  if (iree_status_is_not_found(status)) {
    // Function doesn't exist; that's ok as this was an optional call.
    iree_status_ignore(status);
    IREE_TRACE_ZONE_END(z0);
    return iree_ok_status();
  } else if (!iree_status_is_ok(status)) {
    IREE_TRACE_ZONE_END(z0);
    return status;
  }

  // Call with no arguments/results; failures are annotated with a backtrace
  // when that feature is compiled in.
  iree_vm_execution_result_t result;
  status = module->begin_call(module->self, stack, &call, &result);
  if (!iree_status_is_ok(status)) {
    status = IREE_VM_STACK_ANNOTATE_BACKTRACE_IF_ENABLED(stack, status);
  }

  // TODO(benvanik): ensure completed synchronously.

  IREE_TRACE_ZONE_END(z0);
  return status;
}
+
+static iree_status_t iree_vm_context_query_module_state(
+    void* state_resolver, iree_vm_module_t* module,
+    iree_vm_module_state_t** out_module_state) {
+  IREE_ASSERT_ARGUMENT(state_resolver);
+  IREE_ASSERT_ARGUMENT(module);
+  IREE_ASSERT_ARGUMENT(out_module_state);
+  iree_vm_context_t* context = (iree_vm_context_t*)state_resolver;
+  // NOTE: this is a linear scan, but given that the list of modules should be
+  // N<4 this is faster than just about anything else we could do.
+  // To future performance profilers: sorry when N>>4 :)
+  for (int i = 0; i < context->list.count; ++i) {
+    if (context->list.modules[i] == module) {
+      *out_module_state = context->list.module_states[i];
+      return iree_ok_status();
+    }
+  }
+  return iree_make_status(IREE_STATUS_NOT_FOUND);
+}
+
// Resolves all imports declared by |module| against modules already
// registered with |context|, handing each resolved function to the module
// via its resolve_import callback on |module_state|.
// Optional imports that cannot be found are skipped; a missing required
// import or a calling-convention mismatch fails the whole resolution.
static iree_status_t iree_vm_context_resolve_module_imports(
    iree_vm_context_t* context, iree_vm_module_t* module,
    iree_vm_module_state_t* module_state) {
  IREE_TRACE_ZONE_BEGIN(z0);

  // NOTE: this has some bad characteristics, but the number of modules and the
  // number of imported functions should be relatively small (even if the number
  // of exported functions for particular modules is large).
  iree_vm_module_signature_t module_signature = module->signature(module->self);
  for (int i = 0; i < module_signature.import_function_count; ++i) {
    iree_vm_function_t decl_function;
    iree_string_view_t full_name;
    iree_vm_function_signature_t expected_signature;
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0,
        module->get_function(module->self, IREE_VM_FUNCTION_LINKAGE_IMPORT, i,
                             /*out_function=*/&decl_function,
                             /*out_name=*/&full_name,
                             /*out_signature=*/&expected_signature));

    // Resolve the function to the module that contains it and return the
    // information.
    iree_vm_function_t import_function;
    iree_status_t resolve_status =
        iree_vm_context_resolve_function(context, full_name, &import_function);
    if (!iree_status_is_ok(resolve_status)) {
      if (iree_status_is_not_found(resolve_status) &&
          decl_function.linkage == IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL) {
        // Failed to find the function but it was optionally imported and that's
        // ok. We'll just continue the resolution process and leave the import
        // unspecified on the target module.
        iree_status_ignore(resolve_status);
        continue;
      } else {
        // Failed to find the function.
        IREE_TRACE_ZONE_END(z0);
        return resolve_status;
      }
    }

    // Query the function signature from the module that contains it; we don't
    // use the signature from the module requesting the import as we want a
    // single source of truth.
    iree_vm_function_signature_t import_signature =
        iree_vm_function_signature(&import_function);

    // Simple check to confirm the signatures match. We still can't trust that
    // the module using the import *actually* calls it with the right convention
    // (so this is not a safety check!), but this will catch the 99% case of a
    // signature changing out from under a module or using a module with a newer
    // signature than that provided by the imported module.
    //
    // We allow modules to not define their cconv expectation as in a lot of
    // cases where modules are all compiled into the same binary there's no
    // value in performing the verification. Runtime checks during calls will
    // fail with less awesome logging but that's the tradeoff.
    if (expected_signature.calling_convention.size &&
        !iree_string_view_equal(import_signature.calling_convention,
                                expected_signature.calling_convention)) {
      IREE_TRACE_ZONE_END(z0);
      return iree_make_status(
          IREE_STATUS_INTERNAL,
          "import function signature mismatch between %.*s "
          "and source %.*s; expected %.*s but got %.*s",
          (int)iree_vm_module_name(module).size,
          iree_vm_module_name(module).data,
          (int)iree_vm_module_name(import_function.module).size,
          iree_vm_module_name(import_function.module).data,
          (int)expected_signature.calling_convention.size,
          expected_signature.calling_convention.data,
          (int)import_signature.calling_convention.size,
          import_signature.calling_convention.data);
    }

    // Hand the verified import to the module so it can wire up its own
    // import table entry for ordinal |i|.
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0, module->resolve_import(module->self, module_state, i,
                                   &import_function, &import_signature));
  }

  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
+
// Deinitializes and releases modules in the *inclusive* index range
// [start, end] in reverse registration order: optional __deinit hooks run
// first, then per-context module state is freed, then the module references
// themselves are released. Entries left NULL by a partial registration are
// skipped. Does not modify context->list.count; callers adjust that.
static void iree_vm_context_release_modules(iree_vm_context_t* context,
                                            iree_host_size_t start,
                                            iree_host_size_t end) {
  IREE_TRACE_ZONE_BEGIN(z0);

  // Run module __deinit functions, if present (in reverse init order).
  IREE_VM_INLINE_STACK_INITIALIZE(
      stack,
      context->flags & IREE_VM_CONTEXT_FLAG_TRACE_EXECUTION
          ? IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION
          : IREE_VM_INVOCATION_FLAG_NONE,
      iree_vm_context_state_resolver(context), context->allocator);
  for (int i = (int)end; i >= (int)start; --i) {
    iree_vm_module_t* module = context->list.modules[i];
    iree_vm_module_state_t* module_state = context->list.module_states[i];
    if (!module_state) {
      // Partially initialized; skip.
      continue;
    }
    IREE_IGNORE_ERROR(iree_vm_context_run_function(
        stack, module, iree_make_cstring_view("__deinit")));
  }
  iree_vm_stack_deinitialize(stack);

  // Release all module state (in reverse init order).
  for (int i = (int)end; i >= (int)start; --i) {
    iree_vm_module_t* module = context->list.modules[i];
    // It is possible in error states to have partially initialized.
    if (context->list.module_states[i]) {
      module->free_state(module->self, context->list.module_states[i]);
      context->list.module_states[i] = NULL;
    }
  }

  // Release modules now that there are no import tables remaining.
  for (int i = (int)end; i >= (int)start; --i) {
    if (context->list.modules[i]) {
      iree_vm_module_release(context->list.modules[i]);
      context->list.modules[i] = NULL;
    }
  }

  IREE_TRACE_ZONE_END(z0);
}
+
+IREE_API_EXPORT iree_status_t iree_vm_context_create(
+    iree_vm_instance_t* instance, iree_vm_context_flags_t flags,
+    iree_allocator_t allocator, iree_vm_context_t** out_context) {
+  return iree_vm_context_create_with_modules(instance, flags, NULL, 0,
+                                             allocator, out_context);
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_context_create_with_modules(
+    iree_vm_instance_t* instance, iree_vm_context_flags_t flags,
+    iree_vm_module_t** modules, iree_host_size_t module_count,
+    iree_allocator_t allocator, iree_vm_context_t** out_context) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_context);
+  *out_context = NULL;
+
+  iree_host_size_t context_size =
+      sizeof(iree_vm_context_t) + sizeof(iree_vm_module_t*) * module_count +
+      sizeof(iree_vm_module_state_t*) * module_count;
+
+  iree_vm_context_t* context = NULL;
+  iree_allocator_malloc(allocator, context_size, (void**)&context);
+  iree_atomic_ref_count_init(&context->ref_count);
+  context->instance = instance;
+  iree_vm_instance_retain(context->instance);
+  context->allocator = allocator;
+
+  static iree_atomic_int32_t next_context_id = IREE_ATOMIC_VAR_INIT(1);
+  context->context_id = iree_atomic_fetch_add_int32(&next_context_id, 1,
+                                                    iree_memory_order_seq_cst);
+
+  // TODO(benvanik): allow for non-frozen but static contexts.
+  context->is_frozen = module_count > 0;
+  context->is_static = module_count > 0;
+  context->flags = flags;
+
+  uint8_t* p = (uint8_t*)context + sizeof(iree_vm_context_t);
+  context->list.modules = (iree_vm_module_t**)p;
+  p += sizeof(iree_vm_module_t*) * module_count;
+  context->list.module_states = (iree_vm_module_state_t**)p;
+  p += sizeof(iree_vm_module_state_t*) * module_count;
+  context->list.count = 0;
+  context->list.capacity = module_count;
+
+  iree_status_t register_status =
+      iree_vm_context_register_modules(context, modules, module_count);
+  if (!iree_status_is_ok(register_status)) {
+    iree_vm_context_destroy(context);
+    IREE_TRACE_ZONE_END(z0);
+    return register_status;
+  }
+
+  *out_context = context;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
// Tears down |context|: releases all registered modules (running __deinit
// hooks), frees any dynamically-grown list storage, drops the instance
// reference, and finally frees the context memory itself. NULL-safe.
static void iree_vm_context_destroy(iree_vm_context_t* context) {
  if (!context) return;

  IREE_TRACE_ZONE_BEGIN(z0);

  if (context->list.count > 0) {
    // Inclusive range covering every registered module.
    iree_vm_context_release_modules(context, 0, context->list.count - 1);
  }

  // Note: For non-static module lists, it is only dynamically allocated if
  // capacity > 0.
  if (!context->is_static && context->list.capacity > 0) {
    iree_allocator_free(context->allocator, context->list.modules);
    context->list.modules = NULL;
    iree_allocator_free(context->allocator, context->list.module_states);
    context->list.module_states = NULL;
  }

  iree_vm_instance_release(context->instance);
  context->instance = NULL;

  iree_allocator_free(context->allocator, context);

  IREE_TRACE_ZONE_END(z0);
}
+
+IREE_API_EXPORT void iree_vm_context_retain(iree_vm_context_t* context) {
+  if (context) {
+    iree_atomic_ref_count_inc(&context->ref_count);
+  }
+}
+
+IREE_API_EXPORT void iree_vm_context_release(iree_vm_context_t* context) {
+  if (context && iree_atomic_ref_count_dec(&context->ref_count) == 1) {
+    iree_vm_context_destroy(context);
+  }
+}
+
+IREE_API_EXPORT intptr_t iree_vm_context_id(const iree_vm_context_t* context) {
+  if (!context) {
+    return -1;
+  }
+  return context->context_id;
+}
+
+IREE_API_EXPORT iree_vm_context_flags_t
+iree_vm_context_flags(const iree_vm_context_t* context) {
+  IREE_ASSERT_ARGUMENT(context);
+  return context->flags;
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_context_register_modules(
+    iree_vm_context_t* context, iree_vm_module_t** modules,
+    iree_host_size_t module_count) {
+  IREE_ASSERT_ARGUMENT(context);
+  if (!modules && module_count > 1) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "modules/module_count mismatch");
+  }
+  for (iree_host_size_t i = 0; i < module_count; ++i) {
+    if (!modules[i]) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "modules[%zu] is null", i);
+    }
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Try growing both our storage lists first, if needed.
+  if (context->list.count + module_count > context->list.capacity) {
+    if (context->is_frozen) {
+      IREE_TRACE_ZONE_END(z0);
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "context was allocated as static and cannot "
+                              "register modules after creation");
+    }
+    iree_host_size_t new_capacity = context->list.capacity + module_count;
+    if (new_capacity < context->list.capacity * 2) {
+      // TODO(benvanik): tune list growth for module count >> 4.
+      new_capacity = context->list.capacity * 2;
+    }
+    iree_vm_module_t** new_module_list = NULL;
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_allocator_malloc(context->allocator,
+                                  sizeof(iree_vm_module_t*) * new_capacity,
+                                  (void**)&new_module_list));
+    iree_vm_module_state_t** new_module_state_list = NULL;
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
+        iree_allocator_malloc(context->allocator,
+                              sizeof(iree_vm_module_state_t*) * new_capacity,
+                              (void**)&new_module_state_list));
+    memcpy(new_module_list, context->list.modules,
+           sizeof(iree_vm_module_t*) * context->list.count);
+    memcpy(new_module_state_list, context->list.module_states,
+           sizeof(iree_vm_module_state_t*) * context->list.count);
+    // The existing memory is only dynamically allocated if it has been
+    // grown.
+    if (context->list.capacity > 0) {
+      iree_allocator_free(context->allocator, context->list.modules);
+      iree_allocator_free(context->allocator, context->list.module_states);
+    }
+    context->list.modules = new_module_list;
+    context->list.module_states = new_module_state_list;
+    context->list.capacity = new_capacity;
+  }
+
+  // VM stack used to call into module __init methods.
+  IREE_VM_INLINE_STACK_INITIALIZE(
+      stack,
+      context->flags & IREE_VM_CONTEXT_FLAG_TRACE_EXECUTION
+          ? IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION
+          : IREE_VM_INVOCATION_FLAG_NONE,
+      iree_vm_context_state_resolver(context), context->allocator);
+
+  // Retain all modules and allocate their state.
+  assert(context->list.capacity >= context->list.count + module_count);
+  iree_host_size_t original_count = context->list.count;
+  iree_status_t status = iree_ok_status();
+  iree_host_size_t i = 0;
+  for (i = 0; i < module_count; ++i) {
+    iree_vm_module_t* module = modules[i];
+    context->list.modules[original_count + i] = module;
+    context->list.module_states[original_count + i] = NULL;
+
+    iree_vm_module_retain(module);
+
+    // Allocate module state.
+    iree_vm_module_state_t* module_state = NULL;
+    status =
+        module->alloc_state(module->self, context->allocator, &module_state);
+    if (!iree_status_is_ok(status)) {
+      // Cleanup handled below.
+      break;
+    }
+    context->list.module_states[original_count + i] = module_state;
+
+    // Resolve imports for the modules.
+    status =
+        iree_vm_context_resolve_module_imports(context, module, module_state);
+    if (!iree_status_is_ok(status)) {
+      // Cleanup handled below.
+      break;
+    }
+
+    ++context->list.count;
+
+    // Run module __init functions, if present.
+    // As initialization functions may reference imports we need to perform
+    // all of these after we have resolved the imports above.
+    status = iree_vm_context_run_function(stack, module,
+                                          iree_make_cstring_view("__init"));
+    if (!iree_status_is_ok(status)) {
+      // Cleanup handled below.
+      break;
+    }
+  }
+
+  iree_vm_stack_deinitialize(stack);
+
+  // Cleanup for failure cases during module initialization; we need to
+  // ensure we release any modules we'd already initialized.
+  if (!iree_status_is_ok(status)) {
+    iree_vm_context_release_modules(context, original_count,
+                                    original_count + i);
+    context->list.count = original_count;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t
+iree_vm_context_freeze(iree_vm_context_t* context) {
+  IREE_ASSERT_ARGUMENT(context);
+  context->is_frozen = 1;
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_vm_state_resolver_t
+iree_vm_context_state_resolver(const iree_vm_context_t* context) {
+  iree_vm_state_resolver_t state_resolver = {0};
+  state_resolver.self = (void*)context;
+  state_resolver.query_module_state = iree_vm_context_query_module_state;
+  return state_resolver;
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_context_resolve_module_state(
+    const iree_vm_context_t* context, iree_vm_module_t* module,
+    iree_vm_module_state_t** out_module_state) {
+  return iree_vm_context_query_module_state((void*)context, module,
+                                            out_module_state);
+}
+
// Resolves a fully-qualified `module.func` name to an exported function.
// Modules are scanned most-recently-registered first so later registrations
// can override exports of earlier ones.
IREE_API_EXPORT iree_status_t iree_vm_context_resolve_function(
    const iree_vm_context_t* context, iree_string_view_t full_name,
    iree_vm_function_t* out_function) {
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_ASSERT_ARGUMENT(out_function);
  memset(out_function, 0, sizeof(iree_vm_function_t));

  // Split `module.func` into its two components.
  // NOTE(review): relies on iree_string_view_split returning -1 when the
  // '.' separator is absent — confirm against the string_view API.
  iree_string_view_t module_name;
  iree_string_view_t function_name;
  if (iree_string_view_split(full_name, '.', &module_name, &function_name) ==
      -1) {
    IREE_TRACE_ZONE_END(z0);
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "import name not fully-qualified (module.func): '%.*s'",
        (int)full_name.size, full_name.data);
  }

  // Reverse scan: newest registration wins on module-name collisions.
  for (int i = (int)context->list.count - 1; i >= 0; --i) {
    iree_vm_module_t* module = context->list.modules[i];
    if (iree_string_view_equal(module_name, iree_vm_module_name(module))) {
      iree_status_t status = iree_vm_module_lookup_function_by_name(
          module, IREE_VM_FUNCTION_LINKAGE_EXPORT, function_name, out_function);
      IREE_TRACE_ZONE_END(z0);
      return status;
    }
  }

  IREE_TRACE_ZONE_END(z0);
  return iree_make_status(IREE_STATUS_NOT_FOUND,
                          "module '%.*s' required for import '%.*s' not "
                          "registered with the context",
                          (int)module_name.size, module_name.data,
                          (int)full_name.size, full_name.data);
}
+
// Calls the '__notify(i32)' function in |module|, if present.
// Modules are not required to export the hook; NOT_FOUND is swallowed and
// treated as success.
static iree_status_t iree_vm_context_call_module_notify(
    iree_vm_stack_t* stack, iree_vm_module_t* module,
    iree_vm_module_state_t* module_state, iree_vm_signal_t signal) {
  // Single i32 argument with the signal number.
  uint32_t signal_arg = (uint32_t)signal;
  iree_vm_function_call_t call;
  memset(&call, 0, sizeof(call));
  call.arguments = iree_make_byte_span(&signal_arg, sizeof(signal_arg));

  // Try to find the function. Modules are not required to export it.
  iree_status_t status = iree_vm_module_lookup_function_by_name(
      module, IREE_VM_FUNCTION_LINKAGE_EXPORT,
      iree_make_cstring_view("__notify"), &call.function);
  if (iree_status_is_not_found(status)) {
    // Function doesn't exist; that's ok as this was an optional call.
    return iree_status_ignore(status);
  } else if (!iree_status_is_ok(status)) {
    // Lookup failed for some other reason; propagate.
    return status;
  }

  // Call the resolved function.
  iree_vm_execution_result_t result;
  status = module->begin_call(module->self, stack, &call, &result);
  if (!iree_status_is_ok(status)) {
    status = IREE_VM_STACK_ANNOTATE_BACKTRACE_IF_ENABLED(stack, status);
  }

  // TODO(benvanik): ensure completed synchronously.

  return status;
}
+
// Calls the module notify methods in registration order.
// Stops and returns on the first failure; modules after that point are not
// notified.
static iree_status_t iree_vm_context_notify_forward(iree_vm_stack_t* stack,
                                                    iree_vm_context_t* context,
                                                    iree_vm_signal_t signal) {
  IREE_TRACE_ZONE_BEGIN(z0);
  iree_status_t status = iree_ok_status();
  for (iree_host_size_t i = 0; i < context->list.count; ++i) {
    iree_vm_module_t* module = context->list.modules[i];
    iree_vm_module_state_t* module_state = context->list.module_states[i];

    // Call the module internal interface notify method.
    // This handles the resources owned by the module implementation itself
    // such as JITed binaries or other module infrastructure.
    status = module->notify(module->self, module_state, signal);
    if (!iree_status_is_ok(status)) break;

    // Call the user-level notify method.
    // This may now use the reallocated resources from the module internal
    // implementation above.
    status =
        iree_vm_context_call_module_notify(stack, module, module_state, signal);
    if (!iree_status_is_ok(status)) break;
  }
  IREE_TRACE_ZONE_END(z0);
  return status;
}
+
// Calls the module notify methods in reverse registration order.
// Used for suspend/low-memory so user modules can return resources to
// system-module pools before those system modules trim them.
static iree_status_t iree_vm_context_notify_reverse(iree_vm_stack_t* stack,
                                                    iree_vm_context_t* context,
                                                    iree_vm_signal_t signal) {
  IREE_TRACE_ZONE_BEGIN(z0);
  iree_status_t status = iree_ok_status();
  for (int i = (int)context->list.count - 1; i >= 0; --i) {
    iree_vm_module_t* module = context->list.modules[i];
    iree_vm_module_state_t* module_state = context->list.module_states[i];

    // Call the user-level notify method first.
    // This allows users to drop any state that they can rematerialize and
    // return the resources to pools/caches to be trimmed below.
    status =
        iree_vm_context_call_module_notify(stack, module, module_state, signal);
    if (!iree_status_is_ok(status)) break;

    // Call the module internal interface notify method.
    // This handles the resources owned by the module implementation itself
    // such as JITed binaries or other module infrastructure. Since we've
    // already called the user-level function we likely have all of the
    // resources that could be returned to pools there for this to reclaim.
    status = module->notify(module->self, module_state, signal);
    if (!iree_status_is_ok(status)) break;
  }
  IREE_TRACE_ZONE_END(z0);
  return status;
}
+
// Broadcasts |signal| to every module registered with |context|, calling
// both the module-internal notify interface and the optional exported
// '__notify' function. Stops on the first failing module.
IREE_API_EXPORT iree_status_t iree_vm_context_notify(iree_vm_context_t* context,
                                                     iree_vm_signal_t signal) {
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_TRACE_ZONE_APPEND_VALUE(z0, (uint64_t)signal);

  // VM stack used to call into module __notify methods.
  IREE_VM_INLINE_STACK_INITIALIZE(
      stack,
      context->flags & IREE_VM_CONTEXT_FLAG_TRACE_EXECUTION
          ? IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION
          : IREE_VM_INVOCATION_FLAG_NONE,
      iree_vm_context_state_resolver(context), context->allocator);

  // Resumes are walked forward while suspends are walked backward.
  // This follows the expected construction/destruction pattern where for
  // example on suspend one would walk user modules to release resources back
  // to system module pools before the system modules then clean up the pools.
  // Unrecognized signals take the default/forward path.
  iree_status_t status = iree_ok_status();
  switch (signal) {
    default:
    case IREE_VM_SIGNAL_RESUME:
      status = iree_vm_context_notify_forward(stack, context, signal);
      break;
    case IREE_VM_SIGNAL_SUSPEND:
    case IREE_VM_SIGNAL_LOW_MEMORY:
      status = iree_vm_context_notify_reverse(stack, context, signal);
      break;
  }

  iree_vm_stack_deinitialize(stack);
  IREE_TRACE_ZONE_END(z0);
  return status;
}
diff --git a/runtime/src/iree/vm/context.h b/runtime/src/iree/vm/context.h
new file mode 100644
index 0000000..b58bca6
--- /dev/null
+++ b/runtime/src/iree/vm/context.h
@@ -0,0 +1,117 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_CONTEXT_H_
+#define IREE_VM_CONTEXT_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/vm/instance.h"
+#include "iree/vm/module.h"
+#include "iree/vm/stack.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// An isolated execution context.
+// Effectively a sandbox where modules can be loaded and run with restricted
+// visibility and where they can maintain state.
+//
+// Modules have imports resolved automatically when registered by searching
+// existing modules registered within the context and load order is used for
+// resolution. Functions are resolved from the most recently registered module
+// back to the first, such that modules can override implementations of
+// functions in previously registered modules.
+//
+// Thread-compatible and must be externally synchronized.
+typedef struct iree_vm_context_t iree_vm_context_t;
+
+enum iree_vm_context_flag_bits_t {
+  IREE_VM_CONTEXT_FLAG_NONE = 0u,
+
+  // Enables tracing of execution to stderr (when available).
+  // See iree/base/config.h for the flags that control whether this
+  // functionality is available; specifically:
+  //   -DIREE_VM_EXECUTION_TRACING_ENABLE=1
+  // All invocations made to this context - including initializers - will be
+  // traced. For fine-grained control use `iree_vm_invocation_flags_t`.
+  IREE_VM_CONTEXT_FLAG_TRACE_EXECUTION = 1u << 0,
+};
+typedef uint32_t iree_vm_context_flags_t;
+
+// Creates a new context that uses the given |instance| for device management.
+// |out_context| must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_vm_context_create(
+    iree_vm_instance_t* instance, iree_vm_context_flags_t flags,
+    iree_allocator_t allocator, iree_vm_context_t** out_context);
+
+// Creates a new context with the given static set of modules.
+// This is equivalent to iree_vm_context_create+iree_vm_context_register_modules
+// but may be more efficient to allocate. Contexts created in this way cannot
+// have additional modules registered after creation.
+// |out_context| must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_vm_context_create_with_modules(
+    iree_vm_instance_t* instance, iree_vm_context_flags_t flags,
+    iree_vm_module_t** modules, iree_host_size_t module_count,
+    iree_allocator_t allocator, iree_vm_context_t** out_context);
+
+// Retains the given |context| for the caller.
+IREE_API_EXPORT void iree_vm_context_retain(iree_vm_context_t* context);
+
+// Releases the given |context| from the caller.
+IREE_API_EXPORT void iree_vm_context_release(iree_vm_context_t* context);
+
+// Returns a process-unique ID for the |context|.
+IREE_API_EXPORT intptr_t iree_vm_context_id(const iree_vm_context_t* context);
+
+// Returns |context| flags.
+IREE_API_EXPORT iree_vm_context_flags_t
+iree_vm_context_flags(const iree_vm_context_t* context);
+
+// Registers a list of modules with the context and resolves imports in the
+// order provided.
+// The modules will be retained by the context until destruction.
+IREE_API_EXPORT iree_status_t iree_vm_context_register_modules(
+    iree_vm_context_t* context, iree_vm_module_t** modules,
+    iree_host_size_t module_count);
+
+// Freezes a context such that no more modules can be registered.
+// This can be used to ensure that context contents cannot be modified by other
+// code as the context is made available to other parts of the program.
+// No-op if already frozen.
+IREE_API_EXPORT iree_status_t
+iree_vm_context_freeze(iree_vm_context_t* context);
+
+// Returns a state resolver setup to use the |context| for resolving module
+// state.
+IREE_API_EXPORT iree_vm_state_resolver_t
+iree_vm_context_state_resolver(const iree_vm_context_t* context);
+
+// Sets |out_module_state| to the context-specific state for the given |module|.
+// The state is owned by the context and will only be live for as long as the
+// context is.
+IREE_API_EXPORT iree_status_t iree_vm_context_resolve_module_state(
+    const iree_vm_context_t* context, iree_vm_module_t* module,
+    iree_vm_module_state_t** out_module_state);
+
+// Sets |out_function| to an exported function with the fully-qualified name
+// of |full_name| or returns IREE_STATUS_NOT_FOUND. The function reference is
+// valid for the lifetime of |context|.
+IREE_API_EXPORT iree_status_t iree_vm_context_resolve_function(
+    const iree_vm_context_t* context, iree_string_view_t full_name,
+    iree_vm_function_t* out_function);
+
+// Notifies all modules in the context of a system signal.
+IREE_API_EXPORT iree_status_t iree_vm_context_notify(iree_vm_context_t* context,
+                                                     iree_vm_signal_t signal);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_CONTEXT_H_
diff --git a/runtime/src/iree/vm/generated/.clang-format b/runtime/src/iree/vm/generated/.clang-format
new file mode 100644
index 0000000..8844258
--- /dev/null
+++ b/runtime/src/iree/vm/generated/.clang-format
@@ -0,0 +1,9 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Disable formatting for generated code.
+DisableFormat: true
+
diff --git a/runtime/src/iree/vm/generated/bytecode_op_table.h b/runtime/src/iree/vm/generated/bytecode_op_table.h
new file mode 100644
index 0000000..6e172d3
--- /dev/null
+++ b/runtime/src/iree/vm/generated/bytecode_op_table.h
@@ -0,0 +1,2076 @@
+/*===- TableGen'erated file -------------------------------------*- C++ -*-===*\
+|*                                                                            *|
+|* IREE VM Operation Tables                                                   *|
+|*                                                                            *|
+|* Automatically generated file, do not edit!                                 *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+typedef enum {
+  IREE_VM_OP_CORE_GlobalLoadI32 = 0x00,
+  IREE_VM_OP_CORE_GlobalStoreI32 = 0x01,
+  IREE_VM_OP_CORE_GlobalLoadIndirectI32 = 0x02,
+  IREE_VM_OP_CORE_GlobalStoreIndirectI32 = 0x03,
+  IREE_VM_OP_CORE_GlobalLoadRef = 0x04,
+  IREE_VM_OP_CORE_GlobalStoreRef = 0x05,
+  IREE_VM_OP_CORE_GlobalLoadIndirectRef = 0x06,
+  IREE_VM_OP_CORE_GlobalStoreIndirectRef = 0x07,
+  IREE_VM_OP_CORE_ConstI32Zero = 0x08,
+  IREE_VM_OP_CORE_ConstI32 = 0x09,
+  IREE_VM_OP_CORE_ConstRefZero = 0x0A,
+  IREE_VM_OP_CORE_ConstRefRodata = 0x0B,
+  IREE_VM_OP_CORE_RSV_0x0C,
+  IREE_VM_OP_CORE_RSV_0x0D,
+  IREE_VM_OP_CORE_RSV_0x0E,
+  IREE_VM_OP_CORE_RSV_0x0F,
+  IREE_VM_OP_CORE_ListAlloc = 0x10,
+  IREE_VM_OP_CORE_ListReserve = 0x11,
+  IREE_VM_OP_CORE_ListSize = 0x12,
+  IREE_VM_OP_CORE_ListResize = 0x13,
+  IREE_VM_OP_CORE_ListGetI32 = 0x14,
+  IREE_VM_OP_CORE_ListSetI32 = 0x15,
+  IREE_VM_OP_CORE_ListGetRef = 0x16,
+  IREE_VM_OP_CORE_ListSetRef = 0x17,
+  IREE_VM_OP_CORE_RSV_0x18,
+  IREE_VM_OP_CORE_RSV_0x19,
+  IREE_VM_OP_CORE_RSV_0x1A,
+  IREE_VM_OP_CORE_RSV_0x1B,
+  IREE_VM_OP_CORE_RSV_0x1C,
+  IREE_VM_OP_CORE_RSV_0x1D,
+  IREE_VM_OP_CORE_SelectI32 = 0x1E,
+  IREE_VM_OP_CORE_SelectRef = 0x1F,
+  IREE_VM_OP_CORE_SwitchI32 = 0x20,
+  IREE_VM_OP_CORE_SwitchRef = 0x21,
+  IREE_VM_OP_CORE_AddI32 = 0x22,
+  IREE_VM_OP_CORE_SubI32 = 0x23,
+  IREE_VM_OP_CORE_MulI32 = 0x24,
+  IREE_VM_OP_CORE_DivI32S = 0x25,
+  IREE_VM_OP_CORE_DivI32U = 0x26,
+  IREE_VM_OP_CORE_RemI32S = 0x27,
+  IREE_VM_OP_CORE_RemI32U = 0x28,
+  IREE_VM_OP_CORE_FMAI32 = 0x29,
+  IREE_VM_OP_CORE_RSV_0x2A,
+  IREE_VM_OP_CORE_RSV_0x2B,
+  IREE_VM_OP_CORE_RSV_0x2C,
+  IREE_VM_OP_CORE_RSV_0x2D,
+  IREE_VM_OP_CORE_RSV_0x2E,
+  IREE_VM_OP_CORE_RSV_0x2F,
+  IREE_VM_OP_CORE_NotI32 = 0x30,
+  IREE_VM_OP_CORE_AndI32 = 0x31,
+  IREE_VM_OP_CORE_OrI32 = 0x32,
+  IREE_VM_OP_CORE_XorI32 = 0x33,
+  IREE_VM_OP_CORE_ShlI32 = 0x34,
+  IREE_VM_OP_CORE_ShrI32S = 0x35,
+  IREE_VM_OP_CORE_ShrI32U = 0x36,
+  IREE_VM_OP_CORE_TruncI32I8 = 0x37,
+  IREE_VM_OP_CORE_TruncI32I16 = 0x38,
+  IREE_VM_OP_CORE_ExtI8I32S = 0x39,
+  IREE_VM_OP_CORE_ExtI8I32U = 0x3A,
+  IREE_VM_OP_CORE_ExtI16I32S = 0x3B,
+  IREE_VM_OP_CORE_ExtI16I32U = 0x3C,
+  IREE_VM_OP_CORE_RSV_0x3D,
+  IREE_VM_OP_CORE_RSV_0x3E,
+  IREE_VM_OP_CORE_RSV_0x3F,
+  IREE_VM_OP_CORE_CmpEQI32 = 0x40,
+  IREE_VM_OP_CORE_CmpNEI32 = 0x41,
+  IREE_VM_OP_CORE_CmpLTI32S = 0x42,
+  IREE_VM_OP_CORE_CmpLTI32U = 0x43,
+  IREE_VM_OP_CORE_RSV_0x44,
+  IREE_VM_OP_CORE_RSV_0x45,
+  IREE_VM_OP_CORE_RSV_0x46,
+  IREE_VM_OP_CORE_RSV_0x47,
+  IREE_VM_OP_CORE_RSV_0x48,
+  IREE_VM_OP_CORE_RSV_0x49,
+  IREE_VM_OP_CORE_CmpEQRef = 0x4A,
+  IREE_VM_OP_CORE_CmpNERef = 0x4B,
+  IREE_VM_OP_CORE_CmpNZRef = 0x4C,
+  IREE_VM_OP_CORE_CmpNZI32 = 0x4D,
+  IREE_VM_OP_CORE_RSV_0x4E,
+  IREE_VM_OP_CORE_RSV_0x4F,
+  IREE_VM_OP_CORE_Branch = 0x50,
+  IREE_VM_OP_CORE_CondBranch = 0x51,
+  IREE_VM_OP_CORE_Call = 0x52,
+  IREE_VM_OP_CORE_CallVariadic = 0x53,
+  IREE_VM_OP_CORE_Return = 0x54,
+  IREE_VM_OP_CORE_Fail = 0x55,
+  IREE_VM_OP_CORE_ImportResolved = 0x56,
+  IREE_VM_OP_CORE_RSV_0x57,
+  IREE_VM_OP_CORE_RSV_0x58,
+  IREE_VM_OP_CORE_RSV_0x59,
+  IREE_VM_OP_CORE_RSV_0x5A,
+  IREE_VM_OP_CORE_RSV_0x5B,
+  IREE_VM_OP_CORE_RSV_0x5C,
+  IREE_VM_OP_CORE_RSV_0x5D,
+  IREE_VM_OP_CORE_RSV_0x5E,
+  IREE_VM_OP_CORE_RSV_0x5F,
+  IREE_VM_OP_CORE_Yield = 0x60,
+  IREE_VM_OP_CORE_RSV_0x61,
+  IREE_VM_OP_CORE_RSV_0x62,
+  IREE_VM_OP_CORE_RSV_0x63,
+  IREE_VM_OP_CORE_RSV_0x64,
+  IREE_VM_OP_CORE_RSV_0x65,
+  IREE_VM_OP_CORE_RSV_0x66,
+  IREE_VM_OP_CORE_RSV_0x67,
+  IREE_VM_OP_CORE_RSV_0x68,
+  IREE_VM_OP_CORE_RSV_0x69,
+  IREE_VM_OP_CORE_RSV_0x6A,
+  IREE_VM_OP_CORE_RSV_0x6B,
+  IREE_VM_OP_CORE_RSV_0x6C,
+  IREE_VM_OP_CORE_RSV_0x6D,
+  IREE_VM_OP_CORE_RSV_0x6E,
+  IREE_VM_OP_CORE_RSV_0x6F,
+  IREE_VM_OP_CORE_RSV_0x70,
+  IREE_VM_OP_CORE_RSV_0x71,
+  IREE_VM_OP_CORE_RSV_0x72,
+  IREE_VM_OP_CORE_RSV_0x73,
+  IREE_VM_OP_CORE_RSV_0x74,
+  IREE_VM_OP_CORE_RSV_0x75,
+  IREE_VM_OP_CORE_RSV_0x76,
+  IREE_VM_OP_CORE_RSV_0x77,
+  IREE_VM_OP_CORE_RSV_0x78,
+  IREE_VM_OP_CORE_RSV_0x79,
+  IREE_VM_OP_CORE_RSV_0x7A,
+  IREE_VM_OP_CORE_RSV_0x7B,
+  IREE_VM_OP_CORE_Trace = 0x7C,
+  IREE_VM_OP_CORE_Print = 0x7D,
+  IREE_VM_OP_CORE_CondBreak = 0x7E,
+  IREE_VM_OP_CORE_Break = 0x7F,
+  IREE_VM_OP_CORE_RSV_0x80,
+  IREE_VM_OP_CORE_RSV_0x81,
+  IREE_VM_OP_CORE_RSV_0x82,
+  IREE_VM_OP_CORE_RSV_0x83,
+  IREE_VM_OP_CORE_RSV_0x84,
+  IREE_VM_OP_CORE_RSV_0x85,
+  IREE_VM_OP_CORE_RSV_0x86,
+  IREE_VM_OP_CORE_RSV_0x87,
+  IREE_VM_OP_CORE_RSV_0x88,
+  IREE_VM_OP_CORE_RSV_0x89,
+  IREE_VM_OP_CORE_RSV_0x8A,
+  IREE_VM_OP_CORE_RSV_0x8B,
+  IREE_VM_OP_CORE_RSV_0x8C,
+  IREE_VM_OP_CORE_RSV_0x8D,
+  IREE_VM_OP_CORE_RSV_0x8E,
+  IREE_VM_OP_CORE_RSV_0x8F,
+  IREE_VM_OP_CORE_RSV_0x90,
+  IREE_VM_OP_CORE_RSV_0x91,
+  IREE_VM_OP_CORE_RSV_0x92,
+  IREE_VM_OP_CORE_RSV_0x93,
+  IREE_VM_OP_CORE_RSV_0x94,
+  IREE_VM_OP_CORE_RSV_0x95,
+  IREE_VM_OP_CORE_RSV_0x96,
+  IREE_VM_OP_CORE_RSV_0x97,
+  IREE_VM_OP_CORE_RSV_0x98,
+  IREE_VM_OP_CORE_RSV_0x99,
+  IREE_VM_OP_CORE_RSV_0x9A,
+  IREE_VM_OP_CORE_RSV_0x9B,
+  IREE_VM_OP_CORE_RSV_0x9C,
+  IREE_VM_OP_CORE_RSV_0x9D,
+  IREE_VM_OP_CORE_RSV_0x9E,
+  IREE_VM_OP_CORE_RSV_0x9F,
+  IREE_VM_OP_CORE_RSV_0xA0,
+  IREE_VM_OP_CORE_RSV_0xA1,
+  IREE_VM_OP_CORE_RSV_0xA2,
+  IREE_VM_OP_CORE_RSV_0xA3,
+  IREE_VM_OP_CORE_RSV_0xA4,
+  IREE_VM_OP_CORE_RSV_0xA5,
+  IREE_VM_OP_CORE_RSV_0xA6,
+  IREE_VM_OP_CORE_RSV_0xA7,
+  IREE_VM_OP_CORE_RSV_0xA8,
+  IREE_VM_OP_CORE_RSV_0xA9,
+  IREE_VM_OP_CORE_RSV_0xAA,
+  IREE_VM_OP_CORE_RSV_0xAB,
+  IREE_VM_OP_CORE_RSV_0xAC,
+  IREE_VM_OP_CORE_RSV_0xAD,
+  IREE_VM_OP_CORE_RSV_0xAE,
+  IREE_VM_OP_CORE_RSV_0xAF,
+  IREE_VM_OP_CORE_BufferLoadI8U = 0xB0,
+  IREE_VM_OP_CORE_BufferLoadI16U = 0xB1,
+  IREE_VM_OP_CORE_RSV_0xB2,
+  IREE_VM_OP_CORE_BufferLoadI32 = 0xB3,
+  IREE_VM_OP_CORE_BufferLoadI8S = 0xB4,
+  IREE_VM_OP_CORE_BufferLoadI16S = 0xB5,
+  IREE_VM_OP_CORE_RSV_0xB6,
+  IREE_VM_OP_CORE_RSV_0xB7,
+  IREE_VM_OP_CORE_BufferStoreI8 = 0xB8,
+  IREE_VM_OP_CORE_BufferStoreI16 = 0xB9,
+  IREE_VM_OP_CORE_RSV_0xBA,
+  IREE_VM_OP_CORE_BufferStoreI32 = 0xBB,
+  IREE_VM_OP_CORE_RSV_0xBC,
+  IREE_VM_OP_CORE_RSV_0xBD,
+  IREE_VM_OP_CORE_RSV_0xBE,
+  IREE_VM_OP_CORE_RSV_0xBF,
+  IREE_VM_OP_CORE_BufferAlloc = 0xC0,
+  IREE_VM_OP_CORE_BufferClone = 0xC1,
+  IREE_VM_OP_CORE_BufferLength = 0xC2,
+  IREE_VM_OP_CORE_RSV_0xC3,
+  IREE_VM_OP_CORE_RSV_0xC4,
+  IREE_VM_OP_CORE_RSV_0xC5,
+  IREE_VM_OP_CORE_BufferCopy = 0xC6,
+  IREE_VM_OP_CORE_BufferCompare = 0xC7,
+  IREE_VM_OP_CORE_RSV_0xC8,
+  IREE_VM_OP_CORE_RSV_0xC9,
+  IREE_VM_OP_CORE_RSV_0xCA,
+  IREE_VM_OP_CORE_RSV_0xCB,
+  IREE_VM_OP_CORE_BufferFillI8 = 0xCC,
+  IREE_VM_OP_CORE_BufferFillI16 = 0xCD,
+  IREE_VM_OP_CORE_RSV_0xCE,
+  IREE_VM_OP_CORE_BufferFillI32 = 0xCF,
+  IREE_VM_OP_CORE_RSV_0xD0,
+  IREE_VM_OP_CORE_RSV_0xD1,
+  IREE_VM_OP_CORE_RSV_0xD2,
+  IREE_VM_OP_CORE_RSV_0xD3,
+  IREE_VM_OP_CORE_RSV_0xD4,
+  IREE_VM_OP_CORE_RSV_0xD5,
+  IREE_VM_OP_CORE_RSV_0xD6,
+  IREE_VM_OP_CORE_RSV_0xD7,
+  IREE_VM_OP_CORE_RSV_0xD8,
+  IREE_VM_OP_CORE_RSV_0xD9,
+  IREE_VM_OP_CORE_RSV_0xDA,
+  IREE_VM_OP_CORE_RSV_0xDB,
+  IREE_VM_OP_CORE_RSV_0xDC,
+  IREE_VM_OP_CORE_RSV_0xDD,
+  IREE_VM_OP_CORE_RSV_0xDE,
+  IREE_VM_OP_CORE_RSV_0xDF,
+  IREE_VM_OP_CORE_PrefixExtI64 = 0xE0,
+  IREE_VM_OP_CORE_PrefixExtF32 = 0xE1,
+  IREE_VM_OP_CORE_PrefixExtF64 = 0xE2,
+  IREE_VM_OP_CORE_RSV_0xE3,
+  IREE_VM_OP_CORE_RSV_0xE4,
+  IREE_VM_OP_CORE_RSV_0xE5,
+  IREE_VM_OP_CORE_RSV_0xE6,
+  IREE_VM_OP_CORE_RSV_0xE7,
+  IREE_VM_OP_CORE_RSV_0xE8,
+  IREE_VM_OP_CORE_RSV_0xE9,
+  IREE_VM_OP_CORE_RSV_0xEA,
+  IREE_VM_OP_CORE_RSV_0xEB,
+  IREE_VM_OP_CORE_RSV_0xEC,
+  IREE_VM_OP_CORE_RSV_0xED,
+  IREE_VM_OP_CORE_RSV_0xEE,
+  IREE_VM_OP_CORE_RSV_0xEF,
+  IREE_VM_OP_CORE_RSV_0xF0,
+  IREE_VM_OP_CORE_RSV_0xF1,
+  IREE_VM_OP_CORE_RSV_0xF2,
+  IREE_VM_OP_CORE_RSV_0xF3,
+  IREE_VM_OP_CORE_RSV_0xF4,
+  IREE_VM_OP_CORE_RSV_0xF5,
+  IREE_VM_OP_CORE_RSV_0xF6,
+  IREE_VM_OP_CORE_RSV_0xF7,
+  IREE_VM_OP_CORE_RSV_0xF8,
+  IREE_VM_OP_CORE_RSV_0xF9,
+  IREE_VM_OP_CORE_RSV_0xFA,
+  IREE_VM_OP_CORE_RSV_0xFB,
+  IREE_VM_OP_CORE_RSV_0xFC,
+  IREE_VM_OP_CORE_RSV_0xFD,
+  IREE_VM_OP_CORE_RSV_0xFE,
+  IREE_VM_OP_CORE_RSV_0xFF,
+} iree_vm_core_op_t;
+
+#define IREE_VM_OP_CORE_TABLE(OPC, RSV) \
+    OPC(0x00, GlobalLoadI32) \
+    OPC(0x01, GlobalStoreI32) \
+    OPC(0x02, GlobalLoadIndirectI32) \
+    OPC(0x03, GlobalStoreIndirectI32) \
+    OPC(0x04, GlobalLoadRef) \
+    OPC(0x05, GlobalStoreRef) \
+    OPC(0x06, GlobalLoadIndirectRef) \
+    OPC(0x07, GlobalStoreIndirectRef) \
+    OPC(0x08, ConstI32Zero) \
+    OPC(0x09, ConstI32) \
+    OPC(0x0A, ConstRefZero) \
+    OPC(0x0B, ConstRefRodata) \
+    RSV(0x0C) \
+    RSV(0x0D) \
+    RSV(0x0E) \
+    RSV(0x0F) \
+    OPC(0x10, ListAlloc) \
+    OPC(0x11, ListReserve) \
+    OPC(0x12, ListSize) \
+    OPC(0x13, ListResize) \
+    OPC(0x14, ListGetI32) \
+    OPC(0x15, ListSetI32) \
+    OPC(0x16, ListGetRef) \
+    OPC(0x17, ListSetRef) \
+    RSV(0x18) \
+    RSV(0x19) \
+    RSV(0x1A) \
+    RSV(0x1B) \
+    RSV(0x1C) \
+    RSV(0x1D) \
+    OPC(0x1E, SelectI32) \
+    OPC(0x1F, SelectRef) \
+    OPC(0x20, SwitchI32) \
+    OPC(0x21, SwitchRef) \
+    OPC(0x22, AddI32) \
+    OPC(0x23, SubI32) \
+    OPC(0x24, MulI32) \
+    OPC(0x25, DivI32S) \
+    OPC(0x26, DivI32U) \
+    OPC(0x27, RemI32S) \
+    OPC(0x28, RemI32U) \
+    OPC(0x29, FMAI32) \
+    RSV(0x2A) \
+    RSV(0x2B) \
+    RSV(0x2C) \
+    RSV(0x2D) \
+    RSV(0x2E) \
+    RSV(0x2F) \
+    OPC(0x30, NotI32) \
+    OPC(0x31, AndI32) \
+    OPC(0x32, OrI32) \
+    OPC(0x33, XorI32) \
+    OPC(0x34, ShlI32) \
+    OPC(0x35, ShrI32S) \
+    OPC(0x36, ShrI32U) \
+    OPC(0x37, TruncI32I8) \
+    OPC(0x38, TruncI32I16) \
+    OPC(0x39, ExtI8I32S) \
+    OPC(0x3A, ExtI8I32U) \
+    OPC(0x3B, ExtI16I32S) \
+    OPC(0x3C, ExtI16I32U) \
+    RSV(0x3D) \
+    RSV(0x3E) \
+    RSV(0x3F) \
+    OPC(0x40, CmpEQI32) \
+    OPC(0x41, CmpNEI32) \
+    OPC(0x42, CmpLTI32S) \
+    OPC(0x43, CmpLTI32U) \
+    RSV(0x44) \
+    RSV(0x45) \
+    RSV(0x46) \
+    RSV(0x47) \
+    RSV(0x48) \
+    RSV(0x49) \
+    OPC(0x4A, CmpEQRef) \
+    OPC(0x4B, CmpNERef) \
+    OPC(0x4C, CmpNZRef) \
+    OPC(0x4D, CmpNZI32) \
+    RSV(0x4E) \
+    RSV(0x4F) \
+    OPC(0x50, Branch) \
+    OPC(0x51, CondBranch) \
+    OPC(0x52, Call) \
+    OPC(0x53, CallVariadic) \
+    OPC(0x54, Return) \
+    OPC(0x55, Fail) \
+    OPC(0x56, ImportResolved) \
+    RSV(0x57) \
+    RSV(0x58) \
+    RSV(0x59) \
+    RSV(0x5A) \
+    RSV(0x5B) \
+    RSV(0x5C) \
+    RSV(0x5D) \
+    RSV(0x5E) \
+    RSV(0x5F) \
+    OPC(0x60, Yield) \
+    RSV(0x61) \
+    RSV(0x62) \
+    RSV(0x63) \
+    RSV(0x64) \
+    RSV(0x65) \
+    RSV(0x66) \
+    RSV(0x67) \
+    RSV(0x68) \
+    RSV(0x69) \
+    RSV(0x6A) \
+    RSV(0x6B) \
+    RSV(0x6C) \
+    RSV(0x6D) \
+    RSV(0x6E) \
+    RSV(0x6F) \
+    RSV(0x70) \
+    RSV(0x71) \
+    RSV(0x72) \
+    RSV(0x73) \
+    RSV(0x74) \
+    RSV(0x75) \
+    RSV(0x76) \
+    RSV(0x77) \
+    RSV(0x78) \
+    RSV(0x79) \
+    RSV(0x7A) \
+    RSV(0x7B) \
+    OPC(0x7C, Trace) \
+    OPC(0x7D, Print) \
+    OPC(0x7E, CondBreak) \
+    OPC(0x7F, Break) \
+    RSV(0x80) \
+    RSV(0x81) \
+    RSV(0x82) \
+    RSV(0x83) \
+    RSV(0x84) \
+    RSV(0x85) \
+    RSV(0x86) \
+    RSV(0x87) \
+    RSV(0x88) \
+    RSV(0x89) \
+    RSV(0x8A) \
+    RSV(0x8B) \
+    RSV(0x8C) \
+    RSV(0x8D) \
+    RSV(0x8E) \
+    RSV(0x8F) \
+    RSV(0x90) \
+    RSV(0x91) \
+    RSV(0x92) \
+    RSV(0x93) \
+    RSV(0x94) \
+    RSV(0x95) \
+    RSV(0x96) \
+    RSV(0x97) \
+    RSV(0x98) \
+    RSV(0x99) \
+    RSV(0x9A) \
+    RSV(0x9B) \
+    RSV(0x9C) \
+    RSV(0x9D) \
+    RSV(0x9E) \
+    RSV(0x9F) \
+    RSV(0xA0) \
+    RSV(0xA1) \
+    RSV(0xA2) \
+    RSV(0xA3) \
+    RSV(0xA4) \
+    RSV(0xA5) \
+    RSV(0xA6) \
+    RSV(0xA7) \
+    RSV(0xA8) \
+    RSV(0xA9) \
+    RSV(0xAA) \
+    RSV(0xAB) \
+    RSV(0xAC) \
+    RSV(0xAD) \
+    RSV(0xAE) \
+    RSV(0xAF) \
+    OPC(0xB0, BufferLoadI8U) \
+    OPC(0xB1, BufferLoadI16U) \
+    RSV(0xB2) \
+    OPC(0xB3, BufferLoadI32) \
+    OPC(0xB4, BufferLoadI8S) \
+    OPC(0xB5, BufferLoadI16S) \
+    RSV(0xB6) \
+    RSV(0xB7) \
+    OPC(0xB8, BufferStoreI8) \
+    OPC(0xB9, BufferStoreI16) \
+    RSV(0xBA) \
+    OPC(0xBB, BufferStoreI32) \
+    RSV(0xBC) \
+    RSV(0xBD) \
+    RSV(0xBE) \
+    RSV(0xBF) \
+    OPC(0xC0, BufferAlloc) \
+    OPC(0xC1, BufferClone) \
+    OPC(0xC2, BufferLength) \
+    RSV(0xC3) \
+    RSV(0xC4) \
+    RSV(0xC5) \
+    OPC(0xC6, BufferCopy) \
+    OPC(0xC7, BufferCompare) \
+    RSV(0xC8) \
+    RSV(0xC9) \
+    RSV(0xCA) \
+    RSV(0xCB) \
+    OPC(0xCC, BufferFillI8) \
+    OPC(0xCD, BufferFillI16) \
+    RSV(0xCE) \
+    OPC(0xCF, BufferFillI32) \
+    RSV(0xD0) \
+    RSV(0xD1) \
+    RSV(0xD2) \
+    RSV(0xD3) \
+    RSV(0xD4) \
+    RSV(0xD5) \
+    RSV(0xD6) \
+    RSV(0xD7) \
+    RSV(0xD8) \
+    RSV(0xD9) \
+    RSV(0xDA) \
+    RSV(0xDB) \
+    RSV(0xDC) \
+    RSV(0xDD) \
+    RSV(0xDE) \
+    RSV(0xDF) \
+    OPC(0xE0, PrefixExtI64) \
+    OPC(0xE1, PrefixExtF32) \
+    OPC(0xE2, PrefixExtF64) \
+    RSV(0xE3) \
+    RSV(0xE4) \
+    RSV(0xE5) \
+    RSV(0xE6) \
+    RSV(0xE7) \
+    RSV(0xE8) \
+    RSV(0xE9) \
+    RSV(0xEA) \
+    RSV(0xEB) \
+    RSV(0xEC) \
+    RSV(0xED) \
+    RSV(0xEE) \
+    RSV(0xEF) \
+    RSV(0xF0) \
+    RSV(0xF1) \
+    RSV(0xF2) \
+    RSV(0xF3) \
+    RSV(0xF4) \
+    RSV(0xF5) \
+    RSV(0xF6) \
+    RSV(0xF7) \
+    RSV(0xF8) \
+    RSV(0xF9) \
+    RSV(0xFA) \
+    RSV(0xFB) \
+    RSV(0xFC) \
+    RSV(0xFD) \
+    RSV(0xFE) \
+    RSV(0xFF)
+
+typedef enum {
+  IREE_VM_OP_EXT_F32_GlobalLoadF32 = 0x00,
+  IREE_VM_OP_EXT_F32_GlobalStoreF32 = 0x01,
+  IREE_VM_OP_EXT_F32_GlobalLoadIndirectF32 = 0x02,
+  IREE_VM_OP_EXT_F32_GlobalStoreIndirectF32 = 0x03,
+  IREE_VM_OP_EXT_F32_RSV_0x04,
+  IREE_VM_OP_EXT_F32_RSV_0x05,
+  IREE_VM_OP_EXT_F32_RSV_0x06,
+  IREE_VM_OP_EXT_F32_RSV_0x07,
+  IREE_VM_OP_EXT_F32_ConstF32Zero = 0x08,
+  IREE_VM_OP_EXT_F32_ConstF32 = 0x09,
+  IREE_VM_OP_EXT_F32_RSV_0x0A,
+  IREE_VM_OP_EXT_F32_RSV_0x0B,
+  IREE_VM_OP_EXT_F32_RSV_0x0C,
+  IREE_VM_OP_EXT_F32_RSV_0x0D,
+  IREE_VM_OP_EXT_F32_RSV_0x0E,
+  IREE_VM_OP_EXT_F32_RSV_0x0F,
+  IREE_VM_OP_EXT_F32_RSV_0x10,
+  IREE_VM_OP_EXT_F32_RSV_0x11,
+  IREE_VM_OP_EXT_F32_RSV_0x12,
+  IREE_VM_OP_EXT_F32_RSV_0x13,
+  IREE_VM_OP_EXT_F32_ListGetF32 = 0x14,
+  IREE_VM_OP_EXT_F32_ListSetF32 = 0x15,
+  IREE_VM_OP_EXT_F32_RSV_0x16,
+  IREE_VM_OP_EXT_F32_RSV_0x17,
+  IREE_VM_OP_EXT_F32_RSV_0x18,
+  IREE_VM_OP_EXT_F32_RSV_0x19,
+  IREE_VM_OP_EXT_F32_RSV_0x1A,
+  IREE_VM_OP_EXT_F32_RSV_0x1B,
+  IREE_VM_OP_EXT_F32_RSV_0x1C,
+  IREE_VM_OP_EXT_F32_RSV_0x1D,
+  IREE_VM_OP_EXT_F32_SelectF32 = 0x1E,
+  IREE_VM_OP_EXT_F32_RSV_0x1F,
+  IREE_VM_OP_EXT_F32_SwitchF32 = 0x20,
+  IREE_VM_OP_EXT_F32_RSV_0x21,
+  IREE_VM_OP_EXT_F32_AddF32 = 0x22,
+  IREE_VM_OP_EXT_F32_SubF32 = 0x23,
+  IREE_VM_OP_EXT_F32_MulF32 = 0x24,
+  IREE_VM_OP_EXT_F32_DivF32 = 0x25,
+  IREE_VM_OP_EXT_F32_RemF32 = 0x26,
+  IREE_VM_OP_EXT_F32_FMAF32 = 0x27,
+  IREE_VM_OP_EXT_F32_AbsF32 = 0x28,
+  IREE_VM_OP_EXT_F32_NegF32 = 0x29,
+  IREE_VM_OP_EXT_F32_CeilF32 = 0x2A,
+  IREE_VM_OP_EXT_F32_FloorF32 = 0x2B,
+  IREE_VM_OP_EXT_F32_RSV_0x2C,
+  IREE_VM_OP_EXT_F32_RSV_0x2D,
+  IREE_VM_OP_EXT_F32_RSV_0x2E,
+  IREE_VM_OP_EXT_F32_RSV_0x2F,
+  IREE_VM_OP_EXT_F32_CastSI32F32 = 0x30,
+  IREE_VM_OP_EXT_F32_CastUI32F32 = 0x31,
+  IREE_VM_OP_EXT_F32_CastF32SI32 = 0x32,
+  IREE_VM_OP_EXT_F32_CastF32UI32 = 0x33,
+  IREE_VM_OP_EXT_F32_BitcastI32F32 = 0x34,
+  IREE_VM_OP_EXT_F32_BitcastF32I32 = 0x35,
+  IREE_VM_OP_EXT_F32_RSV_0x36,
+  IREE_VM_OP_EXT_F32_RSV_0x37,
+  IREE_VM_OP_EXT_F32_RSV_0x38,
+  IREE_VM_OP_EXT_F32_RSV_0x39,
+  IREE_VM_OP_EXT_F32_RSV_0x3A,
+  IREE_VM_OP_EXT_F32_RSV_0x3B,
+  IREE_VM_OP_EXT_F32_RSV_0x3C,
+  IREE_VM_OP_EXT_F32_RSV_0x3D,
+  IREE_VM_OP_EXT_F32_RSV_0x3E,
+  IREE_VM_OP_EXT_F32_RSV_0x3F,
+  IREE_VM_OP_EXT_F32_AtanF32 = 0x40,
+  IREE_VM_OP_EXT_F32_Atan2F32 = 0x41,
+  IREE_VM_OP_EXT_F32_CosF32 = 0x42,
+  IREE_VM_OP_EXT_F32_SinF32 = 0x43,
+  IREE_VM_OP_EXT_F32_ExpF32 = 0x44,
+  IREE_VM_OP_EXT_F32_Exp2F32 = 0x45,
+  IREE_VM_OP_EXT_F32_ExpM1F32 = 0x46,
+  IREE_VM_OP_EXT_F32_LogF32 = 0x47,
+  IREE_VM_OP_EXT_F32_Log10F32 = 0x48,
+  IREE_VM_OP_EXT_F32_Log1pF32 = 0x49,
+  IREE_VM_OP_EXT_F32_Log2F32 = 0x4A,
+  IREE_VM_OP_EXT_F32_PowF32 = 0x4B,
+  IREE_VM_OP_EXT_F32_RsqrtF32 = 0x4C,
+  IREE_VM_OP_EXT_F32_SqrtF32 = 0x4D,
+  IREE_VM_OP_EXT_F32_TanhF32 = 0x4E,
+  IREE_VM_OP_EXT_F32_ErfF32 = 0x4F,
+  IREE_VM_OP_EXT_F32_RSV_0x50,
+  IREE_VM_OP_EXT_F32_RSV_0x51,
+  IREE_VM_OP_EXT_F32_RSV_0x52,
+  IREE_VM_OP_EXT_F32_RSV_0x53,
+  IREE_VM_OP_EXT_F32_RSV_0x54,
+  IREE_VM_OP_EXT_F32_RSV_0x55,
+  IREE_VM_OP_EXT_F32_RSV_0x56,
+  IREE_VM_OP_EXT_F32_RSV_0x57,
+  IREE_VM_OP_EXT_F32_RSV_0x58,
+  IREE_VM_OP_EXT_F32_RSV_0x59,
+  IREE_VM_OP_EXT_F32_RSV_0x5A,
+  IREE_VM_OP_EXT_F32_RSV_0x5B,
+  IREE_VM_OP_EXT_F32_RSV_0x5C,
+  IREE_VM_OP_EXT_F32_RSV_0x5D,
+  IREE_VM_OP_EXT_F32_RSV_0x5E,
+  IREE_VM_OP_EXT_F32_RSV_0x5F,
+  IREE_VM_OP_EXT_F32_CmpEQF32O = 0x60,
+  IREE_VM_OP_EXT_F32_CmpEQF32U = 0x61,
+  IREE_VM_OP_EXT_F32_CmpNEF32O = 0x62,
+  IREE_VM_OP_EXT_F32_CmpNEF32U = 0x63,
+  IREE_VM_OP_EXT_F32_CmpLTF32O = 0x64,
+  IREE_VM_OP_EXT_F32_CmpLTF32U = 0x65,
+  IREE_VM_OP_EXT_F32_CmpLTEF32O = 0x66,
+  IREE_VM_OP_EXT_F32_CmpLTEF32U = 0x67,
+  IREE_VM_OP_EXT_F32_RSV_0x68,
+  IREE_VM_OP_EXT_F32_RSV_0x69,
+  IREE_VM_OP_EXT_F32_RSV_0x6A,
+  IREE_VM_OP_EXT_F32_RSV_0x6B,
+  IREE_VM_OP_EXT_F32_RSV_0x6C,
+  IREE_VM_OP_EXT_F32_RSV_0x6D,
+  IREE_VM_OP_EXT_F32_RSV_0x6E,
+  IREE_VM_OP_EXT_F32_RSV_0x6F,
+  IREE_VM_OP_EXT_F32_CmpNaNF32 = 0x70,
+  IREE_VM_OP_EXT_F32_RSV_0x71,
+  IREE_VM_OP_EXT_F32_RSV_0x72,
+  IREE_VM_OP_EXT_F32_RSV_0x73,
+  IREE_VM_OP_EXT_F32_RSV_0x74,
+  IREE_VM_OP_EXT_F32_RSV_0x75,
+  IREE_VM_OP_EXT_F32_RSV_0x76,
+  IREE_VM_OP_EXT_F32_RSV_0x77,
+  IREE_VM_OP_EXT_F32_RSV_0x78,
+  IREE_VM_OP_EXT_F32_RSV_0x79,
+  IREE_VM_OP_EXT_F32_RSV_0x7A,
+  IREE_VM_OP_EXT_F32_RSV_0x7B,
+  IREE_VM_OP_EXT_F32_RSV_0x7C,
+  IREE_VM_OP_EXT_F32_RSV_0x7D,
+  IREE_VM_OP_EXT_F32_RSV_0x7E,
+  IREE_VM_OP_EXT_F32_RSV_0x7F,
+  IREE_VM_OP_EXT_F32_RSV_0x80,
+  IREE_VM_OP_EXT_F32_RSV_0x81,
+  IREE_VM_OP_EXT_F32_RSV_0x82,
+  IREE_VM_OP_EXT_F32_RSV_0x83,
+  IREE_VM_OP_EXT_F32_RSV_0x84,
+  IREE_VM_OP_EXT_F32_RSV_0x85,
+  IREE_VM_OP_EXT_F32_RSV_0x86,
+  IREE_VM_OP_EXT_F32_RSV_0x87,
+  IREE_VM_OP_EXT_F32_RSV_0x88,
+  IREE_VM_OP_EXT_F32_RSV_0x89,
+  IREE_VM_OP_EXT_F32_RSV_0x8A,
+  IREE_VM_OP_EXT_F32_RSV_0x8B,
+  IREE_VM_OP_EXT_F32_RSV_0x8C,
+  IREE_VM_OP_EXT_F32_RSV_0x8D,
+  IREE_VM_OP_EXT_F32_RSV_0x8E,
+  IREE_VM_OP_EXT_F32_RSV_0x8F,
+  IREE_VM_OP_EXT_F32_RSV_0x90,
+  IREE_VM_OP_EXT_F32_RSV_0x91,
+  IREE_VM_OP_EXT_F32_RSV_0x92,
+  IREE_VM_OP_EXT_F32_RSV_0x93,
+  IREE_VM_OP_EXT_F32_RSV_0x94,
+  IREE_VM_OP_EXT_F32_RSV_0x95,
+  IREE_VM_OP_EXT_F32_RSV_0x96,
+  IREE_VM_OP_EXT_F32_RSV_0x97,
+  IREE_VM_OP_EXT_F32_RSV_0x98,
+  IREE_VM_OP_EXT_F32_RSV_0x99,
+  IREE_VM_OP_EXT_F32_RSV_0x9A,
+  IREE_VM_OP_EXT_F32_RSV_0x9B,
+  IREE_VM_OP_EXT_F32_RSV_0x9C,
+  IREE_VM_OP_EXT_F32_RSV_0x9D,
+  IREE_VM_OP_EXT_F32_RSV_0x9E,
+  IREE_VM_OP_EXT_F32_RSV_0x9F,
+  IREE_VM_OP_EXT_F32_RSV_0xA0,
+  IREE_VM_OP_EXT_F32_RSV_0xA1,
+  IREE_VM_OP_EXT_F32_RSV_0xA2,
+  IREE_VM_OP_EXT_F32_RSV_0xA3,
+  IREE_VM_OP_EXT_F32_RSV_0xA4,
+  IREE_VM_OP_EXT_F32_RSV_0xA5,
+  IREE_VM_OP_EXT_F32_RSV_0xA6,
+  IREE_VM_OP_EXT_F32_RSV_0xA7,
+  IREE_VM_OP_EXT_F32_RSV_0xA8,
+  IREE_VM_OP_EXT_F32_RSV_0xA9,
+  IREE_VM_OP_EXT_F32_RSV_0xAA,
+  IREE_VM_OP_EXT_F32_RSV_0xAB,
+  IREE_VM_OP_EXT_F32_RSV_0xAC,
+  IREE_VM_OP_EXT_F32_RSV_0xAD,
+  IREE_VM_OP_EXT_F32_RSV_0xAE,
+  IREE_VM_OP_EXT_F32_RSV_0xAF,
+  IREE_VM_OP_EXT_F32_BufferLoadF32 = 0xB0,
+  IREE_VM_OP_EXT_F32_BufferStoreF32 = 0xB1,
+  IREE_VM_OP_EXT_F32_RSV_0xB2,
+  IREE_VM_OP_EXT_F32_RSV_0xB3,
+  IREE_VM_OP_EXT_F32_RSV_0xB4,
+  IREE_VM_OP_EXT_F32_RSV_0xB5,
+  IREE_VM_OP_EXT_F32_RSV_0xB6,
+  IREE_VM_OP_EXT_F32_RSV_0xB7,
+  IREE_VM_OP_EXT_F32_RSV_0xB8,
+  IREE_VM_OP_EXT_F32_RSV_0xB9,
+  IREE_VM_OP_EXT_F32_RSV_0xBA,
+  IREE_VM_OP_EXT_F32_RSV_0xBB,
+  IREE_VM_OP_EXT_F32_RSV_0xBC,
+  IREE_VM_OP_EXT_F32_RSV_0xBD,
+  IREE_VM_OP_EXT_F32_RSV_0xBE,
+  IREE_VM_OP_EXT_F32_RSV_0xBF,
+  IREE_VM_OP_EXT_F32_BufferFillF32 = 0xC0,
+  IREE_VM_OP_EXT_F32_RSV_0xC1,
+  IREE_VM_OP_EXT_F32_RSV_0xC2,
+  IREE_VM_OP_EXT_F32_RSV_0xC3,
+  IREE_VM_OP_EXT_F32_RSV_0xC4,
+  IREE_VM_OP_EXT_F32_RSV_0xC5,
+  IREE_VM_OP_EXT_F32_RSV_0xC6,
+  IREE_VM_OP_EXT_F32_RSV_0xC7,
+  IREE_VM_OP_EXT_F32_RSV_0xC8,
+  IREE_VM_OP_EXT_F32_RSV_0xC9,
+  IREE_VM_OP_EXT_F32_RSV_0xCA,
+  IREE_VM_OP_EXT_F32_RSV_0xCB,
+  IREE_VM_OP_EXT_F32_RSV_0xCC,
+  IREE_VM_OP_EXT_F32_RSV_0xCD,
+  IREE_VM_OP_EXT_F32_RSV_0xCE,
+  IREE_VM_OP_EXT_F32_RSV_0xCF,
+  IREE_VM_OP_EXT_F32_RSV_0xD0,
+  IREE_VM_OP_EXT_F32_RSV_0xD1,
+  IREE_VM_OP_EXT_F32_RSV_0xD2,
+  IREE_VM_OP_EXT_F32_RSV_0xD3,
+  IREE_VM_OP_EXT_F32_RSV_0xD4,
+  IREE_VM_OP_EXT_F32_RSV_0xD5,
+  IREE_VM_OP_EXT_F32_RSV_0xD6,
+  IREE_VM_OP_EXT_F32_RSV_0xD7,
+  IREE_VM_OP_EXT_F32_RSV_0xD8,
+  IREE_VM_OP_EXT_F32_RSV_0xD9,
+  IREE_VM_OP_EXT_F32_RSV_0xDA,
+  IREE_VM_OP_EXT_F32_RSV_0xDB,
+  IREE_VM_OP_EXT_F32_RSV_0xDC,
+  IREE_VM_OP_EXT_F32_RSV_0xDD,
+  IREE_VM_OP_EXT_F32_RSV_0xDE,
+  IREE_VM_OP_EXT_F32_RSV_0xDF,
+  IREE_VM_OP_EXT_F32_RSV_0xE0,
+  IREE_VM_OP_EXT_F32_RSV_0xE1,
+  IREE_VM_OP_EXT_F32_RSV_0xE2,
+  IREE_VM_OP_EXT_F32_RSV_0xE3,
+  IREE_VM_OP_EXT_F32_RSV_0xE4,
+  IREE_VM_OP_EXT_F32_RSV_0xE5,
+  IREE_VM_OP_EXT_F32_RSV_0xE6,
+  IREE_VM_OP_EXT_F32_RSV_0xE7,
+  IREE_VM_OP_EXT_F32_RSV_0xE8,
+  IREE_VM_OP_EXT_F32_RSV_0xE9,
+  IREE_VM_OP_EXT_F32_RSV_0xEA,
+  IREE_VM_OP_EXT_F32_RSV_0xEB,
+  IREE_VM_OP_EXT_F32_RSV_0xEC,
+  IREE_VM_OP_EXT_F32_RSV_0xED,
+  IREE_VM_OP_EXT_F32_RSV_0xEE,
+  IREE_VM_OP_EXT_F32_RSV_0xEF,
+  IREE_VM_OP_EXT_F32_RSV_0xF0,
+  IREE_VM_OP_EXT_F32_RSV_0xF1,
+  IREE_VM_OP_EXT_F32_RSV_0xF2,
+  IREE_VM_OP_EXT_F32_RSV_0xF3,
+  IREE_VM_OP_EXT_F32_RSV_0xF4,
+  IREE_VM_OP_EXT_F32_RSV_0xF5,
+  IREE_VM_OP_EXT_F32_RSV_0xF6,
+  IREE_VM_OP_EXT_F32_RSV_0xF7,
+  IREE_VM_OP_EXT_F32_RSV_0xF8,
+  IREE_VM_OP_EXT_F32_RSV_0xF9,
+  IREE_VM_OP_EXT_F32_RSV_0xFA,
+  IREE_VM_OP_EXT_F32_RSV_0xFB,
+  IREE_VM_OP_EXT_F32_RSV_0xFC,
+  IREE_VM_OP_EXT_F32_RSV_0xFD,
+  IREE_VM_OP_EXT_F32_RSV_0xFE,
+  IREE_VM_OP_EXT_F32_RSV_0xFF,
+} iree_vm_ext_f32_op_t;
+
+#define IREE_VM_OP_EXT_F32_TABLE(OPC, RSV) \
+    OPC(0x00, GlobalLoadF32) \
+    OPC(0x01, GlobalStoreF32) \
+    OPC(0x02, GlobalLoadIndirectF32) \
+    OPC(0x03, GlobalStoreIndirectF32) \
+    RSV(0x04) \
+    RSV(0x05) \
+    RSV(0x06) \
+    RSV(0x07) \
+    OPC(0x08, ConstF32Zero) \
+    OPC(0x09, ConstF32) \
+    RSV(0x0A) \
+    RSV(0x0B) \
+    RSV(0x0C) \
+    RSV(0x0D) \
+    RSV(0x0E) \
+    RSV(0x0F) \
+    RSV(0x10) \
+    RSV(0x11) \
+    RSV(0x12) \
+    RSV(0x13) \
+    OPC(0x14, ListGetF32) \
+    OPC(0x15, ListSetF32) \
+    RSV(0x16) \
+    RSV(0x17) \
+    RSV(0x18) \
+    RSV(0x19) \
+    RSV(0x1A) \
+    RSV(0x1B) \
+    RSV(0x1C) \
+    RSV(0x1D) \
+    OPC(0x1E, SelectF32) \
+    RSV(0x1F) \
+    OPC(0x20, SwitchF32) \
+    RSV(0x21) \
+    OPC(0x22, AddF32) \
+    OPC(0x23, SubF32) \
+    OPC(0x24, MulF32) \
+    OPC(0x25, DivF32) \
+    OPC(0x26, RemF32) \
+    OPC(0x27, FMAF32) \
+    OPC(0x28, AbsF32) \
+    OPC(0x29, NegF32) \
+    OPC(0x2A, CeilF32) \
+    OPC(0x2B, FloorF32) \
+    RSV(0x2C) \
+    RSV(0x2D) \
+    RSV(0x2E) \
+    RSV(0x2F) \
+    OPC(0x30, CastSI32F32) \
+    OPC(0x31, CastUI32F32) \
+    OPC(0x32, CastF32SI32) \
+    OPC(0x33, CastF32UI32) \
+    OPC(0x34, BitcastI32F32) \
+    OPC(0x35, BitcastF32I32) \
+    RSV(0x36) \
+    RSV(0x37) \
+    RSV(0x38) \
+    RSV(0x39) \
+    RSV(0x3A) \
+    RSV(0x3B) \
+    RSV(0x3C) \
+    RSV(0x3D) \
+    RSV(0x3E) \
+    RSV(0x3F) \
+    OPC(0x40, AtanF32) \
+    OPC(0x41, Atan2F32) \
+    OPC(0x42, CosF32) \
+    OPC(0x43, SinF32) \
+    OPC(0x44, ExpF32) \
+    OPC(0x45, Exp2F32) \
+    OPC(0x46, ExpM1F32) \
+    OPC(0x47, LogF32) \
+    OPC(0x48, Log10F32) \
+    OPC(0x49, Log1pF32) \
+    OPC(0x4A, Log2F32) \
+    OPC(0x4B, PowF32) \
+    OPC(0x4C, RsqrtF32) \
+    OPC(0x4D, SqrtF32) \
+    OPC(0x4E, TanhF32) \
+    OPC(0x4F, ErfF32) \
+    RSV(0x50) \
+    RSV(0x51) \
+    RSV(0x52) \
+    RSV(0x53) \
+    RSV(0x54) \
+    RSV(0x55) \
+    RSV(0x56) \
+    RSV(0x57) \
+    RSV(0x58) \
+    RSV(0x59) \
+    RSV(0x5A) \
+    RSV(0x5B) \
+    RSV(0x5C) \
+    RSV(0x5D) \
+    RSV(0x5E) \
+    RSV(0x5F) \
+    OPC(0x60, CmpEQF32O) \
+    OPC(0x61, CmpEQF32U) \
+    OPC(0x62, CmpNEF32O) \
+    OPC(0x63, CmpNEF32U) \
+    OPC(0x64, CmpLTF32O) \
+    OPC(0x65, CmpLTF32U) \
+    OPC(0x66, CmpLTEF32O) \
+    OPC(0x67, CmpLTEF32U) \
+    RSV(0x68) \
+    RSV(0x69) \
+    RSV(0x6A) \
+    RSV(0x6B) \
+    RSV(0x6C) \
+    RSV(0x6D) \
+    RSV(0x6E) \
+    RSV(0x6F) \
+    OPC(0x70, CmpNaNF32) \
+    RSV(0x71) \
+    RSV(0x72) \
+    RSV(0x73) \
+    RSV(0x74) \
+    RSV(0x75) \
+    RSV(0x76) \
+    RSV(0x77) \
+    RSV(0x78) \
+    RSV(0x79) \
+    RSV(0x7A) \
+    RSV(0x7B) \
+    RSV(0x7C) \
+    RSV(0x7D) \
+    RSV(0x7E) \
+    RSV(0x7F) \
+    RSV(0x80) \
+    RSV(0x81) \
+    RSV(0x82) \
+    RSV(0x83) \
+    RSV(0x84) \
+    RSV(0x85) \
+    RSV(0x86) \
+    RSV(0x87) \
+    RSV(0x88) \
+    RSV(0x89) \
+    RSV(0x8A) \
+    RSV(0x8B) \
+    RSV(0x8C) \
+    RSV(0x8D) \
+    RSV(0x8E) \
+    RSV(0x8F) \
+    RSV(0x90) \
+    RSV(0x91) \
+    RSV(0x92) \
+    RSV(0x93) \
+    RSV(0x94) \
+    RSV(0x95) \
+    RSV(0x96) \
+    RSV(0x97) \
+    RSV(0x98) \
+    RSV(0x99) \
+    RSV(0x9A) \
+    RSV(0x9B) \
+    RSV(0x9C) \
+    RSV(0x9D) \
+    RSV(0x9E) \
+    RSV(0x9F) \
+    RSV(0xA0) \
+    RSV(0xA1) \
+    RSV(0xA2) \
+    RSV(0xA3) \
+    RSV(0xA4) \
+    RSV(0xA5) \
+    RSV(0xA6) \
+    RSV(0xA7) \
+    RSV(0xA8) \
+    RSV(0xA9) \
+    RSV(0xAA) \
+    RSV(0xAB) \
+    RSV(0xAC) \
+    RSV(0xAD) \
+    RSV(0xAE) \
+    RSV(0xAF) \
+    OPC(0xB0, BufferLoadF32) \
+    OPC(0xB1, BufferStoreF32) \
+    RSV(0xB2) \
+    RSV(0xB3) \
+    RSV(0xB4) \
+    RSV(0xB5) \
+    RSV(0xB6) \
+    RSV(0xB7) \
+    RSV(0xB8) \
+    RSV(0xB9) \
+    RSV(0xBA) \
+    RSV(0xBB) \
+    RSV(0xBC) \
+    RSV(0xBD) \
+    RSV(0xBE) \
+    RSV(0xBF) \
+    OPC(0xC0, BufferFillF32) \
+    RSV(0xC1) \
+    RSV(0xC2) \
+    RSV(0xC3) \
+    RSV(0xC4) \
+    RSV(0xC5) \
+    RSV(0xC6) \
+    RSV(0xC7) \
+    RSV(0xC8) \
+    RSV(0xC9) \
+    RSV(0xCA) \
+    RSV(0xCB) \
+    RSV(0xCC) \
+    RSV(0xCD) \
+    RSV(0xCE) \
+    RSV(0xCF) \
+    RSV(0xD0) \
+    RSV(0xD1) \
+    RSV(0xD2) \
+    RSV(0xD3) \
+    RSV(0xD4) \
+    RSV(0xD5) \
+    RSV(0xD6) \
+    RSV(0xD7) \
+    RSV(0xD8) \
+    RSV(0xD9) \
+    RSV(0xDA) \
+    RSV(0xDB) \
+    RSV(0xDC) \
+    RSV(0xDD) \
+    RSV(0xDE) \
+    RSV(0xDF) \
+    RSV(0xE0) \
+    RSV(0xE1) \
+    RSV(0xE2) \
+    RSV(0xE3) \
+    RSV(0xE4) \
+    RSV(0xE5) \
+    RSV(0xE6) \
+    RSV(0xE7) \
+    RSV(0xE8) \
+    RSV(0xE9) \
+    RSV(0xEA) \
+    RSV(0xEB) \
+    RSV(0xEC) \
+    RSV(0xED) \
+    RSV(0xEE) \
+    RSV(0xEF) \
+    RSV(0xF0) \
+    RSV(0xF1) \
+    RSV(0xF2) \
+    RSV(0xF3) \
+    RSV(0xF4) \
+    RSV(0xF5) \
+    RSV(0xF6) \
+    RSV(0xF7) \
+    RSV(0xF8) \
+    RSV(0xF9) \
+    RSV(0xFA) \
+    RSV(0xFB) \
+    RSV(0xFC) \
+    RSV(0xFD) \
+    RSV(0xFE) \
+    RSV(0xFF)
+
+typedef enum {
+  IREE_VM_OP_EXT_F64_GlobalLoadF64 = 0x00,
+  IREE_VM_OP_EXT_F64_GlobalStoreF64 = 0x01,
+  IREE_VM_OP_EXT_F64_GlobalLoadIndirectF64 = 0x02,
+  IREE_VM_OP_EXT_F64_GlobalStoreIndirectF64 = 0x03,
+  IREE_VM_OP_EXT_F64_RSV_0x04,
+  IREE_VM_OP_EXT_F64_RSV_0x05,
+  IREE_VM_OP_EXT_F64_RSV_0x06,
+  IREE_VM_OP_EXT_F64_RSV_0x07,
+  IREE_VM_OP_EXT_F64_ConstF64Zero = 0x08,
+  IREE_VM_OP_EXT_F64_ConstF64 = 0x09,
+  IREE_VM_OP_EXT_F64_RSV_0x0A,
+  IREE_VM_OP_EXT_F64_RSV_0x0B,
+  IREE_VM_OP_EXT_F64_RSV_0x0C,
+  IREE_VM_OP_EXT_F64_RSV_0x0D,
+  IREE_VM_OP_EXT_F64_RSV_0x0E,
+  IREE_VM_OP_EXT_F64_RSV_0x0F,
+  IREE_VM_OP_EXT_F64_RSV_0x10,
+  IREE_VM_OP_EXT_F64_RSV_0x11,
+  IREE_VM_OP_EXT_F64_RSV_0x12,
+  IREE_VM_OP_EXT_F64_RSV_0x13,
+  IREE_VM_OP_EXT_F64_ListGetF64 = 0x14,
+  IREE_VM_OP_EXT_F64_ListSetF64 = 0x15,
+  IREE_VM_OP_EXT_F64_RSV_0x16,
+  IREE_VM_OP_EXT_F64_RSV_0x17,
+  IREE_VM_OP_EXT_F64_RSV_0x18,
+  IREE_VM_OP_EXT_F64_RSV_0x19,
+  IREE_VM_OP_EXT_F64_RSV_0x1A,
+  IREE_VM_OP_EXT_F64_RSV_0x1B,
+  IREE_VM_OP_EXT_F64_RSV_0x1C,
+  IREE_VM_OP_EXT_F64_RSV_0x1D,
+  IREE_VM_OP_EXT_F64_SelectF64 = 0x1E,
+  IREE_VM_OP_EXT_F64_RSV_0x1F,
+  IREE_VM_OP_EXT_F64_SwitchF64 = 0x20,
+  IREE_VM_OP_EXT_F64_RSV_0x21,
+  IREE_VM_OP_EXT_F64_AddF64 = 0x22,
+  IREE_VM_OP_EXT_F64_SubF64 = 0x23,
+  IREE_VM_OP_EXT_F64_MulF64 = 0x24,
+  IREE_VM_OP_EXT_F64_DivF64 = 0x25,
+  IREE_VM_OP_EXT_F64_RemF64 = 0x26,
+  IREE_VM_OP_EXT_F64_FMAF64 = 0x27,
+  IREE_VM_OP_EXT_F64_AbsF64 = 0x28,
+  IREE_VM_OP_EXT_F64_NegF64 = 0x29,
+  IREE_VM_OP_EXT_F64_CeilF64 = 0x2A,
+  IREE_VM_OP_EXT_F64_FloorF64 = 0x2B,
+  IREE_VM_OP_EXT_F64_TruncF64F32 = 0x2C,
+  IREE_VM_OP_EXT_F64_ExtF32F64 = 0x2D,
+  IREE_VM_OP_EXT_F64_RSV_0x2E,
+  IREE_VM_OP_EXT_F64_RSV_0x2F,
+  IREE_VM_OP_EXT_F64_CastSI32F64 = 0x30,
+  IREE_VM_OP_EXT_F64_CastUI32F64 = 0x31,
+  IREE_VM_OP_EXT_F64_CastF64SI32 = 0x32,
+  IREE_VM_OP_EXT_F64_CastF64UI32 = 0x33,
+  IREE_VM_OP_EXT_F64_CastSI64F64 = 0x34,
+  IREE_VM_OP_EXT_F64_CastUI64F64 = 0x35,
+  IREE_VM_OP_EXT_F64_CastF64SI64 = 0x36,
+  IREE_VM_OP_EXT_F64_CastF64UI64 = 0x37,
+  IREE_VM_OP_EXT_F64_BitcastI64F64 = 0x38,
+  IREE_VM_OP_EXT_F64_BitcastF64I64 = 0x39,
+  IREE_VM_OP_EXT_F64_RSV_0x3A,
+  IREE_VM_OP_EXT_F64_RSV_0x3B,
+  IREE_VM_OP_EXT_F64_RSV_0x3C,
+  IREE_VM_OP_EXT_F64_RSV_0x3D,
+  IREE_VM_OP_EXT_F64_RSV_0x3E,
+  IREE_VM_OP_EXT_F64_RSV_0x3F,
+  IREE_VM_OP_EXT_F64_AtanF64 = 0x40,
+  IREE_VM_OP_EXT_F64_Atan2F64 = 0x41,
+  IREE_VM_OP_EXT_F64_CosF64 = 0x42,
+  IREE_VM_OP_EXT_F64_SinF64 = 0x43,
+  IREE_VM_OP_EXT_F64_ExpF64 = 0x44,
+  IREE_VM_OP_EXT_F64_Exp2F64 = 0x45,
+  IREE_VM_OP_EXT_F64_ExpM1F64 = 0x46,
+  IREE_VM_OP_EXT_F64_LogF64 = 0x47,
+  IREE_VM_OP_EXT_F64_Log10F64 = 0x48,
+  IREE_VM_OP_EXT_F64_Log1pF64 = 0x49,
+  IREE_VM_OP_EXT_F64_Log2F64 = 0x4A,
+  IREE_VM_OP_EXT_F64_PowF64 = 0x4B,
+  IREE_VM_OP_EXT_F64_RsqrtF64 = 0x4C,
+  IREE_VM_OP_EXT_F64_SqrtF64 = 0x4D,
+  IREE_VM_OP_EXT_F64_TanhF64 = 0x4E,
+  IREE_VM_OP_EXT_F64_ErfF64 = 0x4F,
+  IREE_VM_OP_EXT_F64_RSV_0x50,
+  IREE_VM_OP_EXT_F64_RSV_0x51,
+  IREE_VM_OP_EXT_F64_RSV_0x52,
+  IREE_VM_OP_EXT_F64_RSV_0x53,
+  IREE_VM_OP_EXT_F64_RSV_0x54,
+  IREE_VM_OP_EXT_F64_RSV_0x55,
+  IREE_VM_OP_EXT_F64_RSV_0x56,
+  IREE_VM_OP_EXT_F64_RSV_0x57,
+  IREE_VM_OP_EXT_F64_RSV_0x58,
+  IREE_VM_OP_EXT_F64_RSV_0x59,
+  IREE_VM_OP_EXT_F64_RSV_0x5A,
+  IREE_VM_OP_EXT_F64_RSV_0x5B,
+  IREE_VM_OP_EXT_F64_RSV_0x5C,
+  IREE_VM_OP_EXT_F64_RSV_0x5D,
+  IREE_VM_OP_EXT_F64_RSV_0x5E,
+  IREE_VM_OP_EXT_F64_RSV_0x5F,
+  IREE_VM_OP_EXT_F64_CmpEQF64O = 0x60,
+  IREE_VM_OP_EXT_F64_CmpEQF64U = 0x61,
+  IREE_VM_OP_EXT_F64_CmpNEF64O = 0x62,
+  IREE_VM_OP_EXT_F64_CmpNEF64U = 0x63,
+  IREE_VM_OP_EXT_F64_CmpLTF64O = 0x64,
+  IREE_VM_OP_EXT_F64_CmpLTF64U = 0x65,
+  IREE_VM_OP_EXT_F64_CmpLTEF64O = 0x66,
+  IREE_VM_OP_EXT_F64_CmpLTEF64U = 0x67,
+  IREE_VM_OP_EXT_F64_RSV_0x68,
+  IREE_VM_OP_EXT_F64_RSV_0x69,
+  IREE_VM_OP_EXT_F64_RSV_0x6A,
+  IREE_VM_OP_EXT_F64_RSV_0x6B,
+  IREE_VM_OP_EXT_F64_RSV_0x6C,
+  IREE_VM_OP_EXT_F64_RSV_0x6D,
+  IREE_VM_OP_EXT_F64_RSV_0x6E,
+  IREE_VM_OP_EXT_F64_RSV_0x6F,
+  IREE_VM_OP_EXT_F64_CmpNaNF64 = 0x70,
+  IREE_VM_OP_EXT_F64_RSV_0x71,
+  IREE_VM_OP_EXT_F64_RSV_0x72,
+  IREE_VM_OP_EXT_F64_RSV_0x73,
+  IREE_VM_OP_EXT_F64_RSV_0x74,
+  IREE_VM_OP_EXT_F64_RSV_0x75,
+  IREE_VM_OP_EXT_F64_RSV_0x76,
+  IREE_VM_OP_EXT_F64_RSV_0x77,
+  IREE_VM_OP_EXT_F64_RSV_0x78,
+  IREE_VM_OP_EXT_F64_RSV_0x79,
+  IREE_VM_OP_EXT_F64_RSV_0x7A,
+  IREE_VM_OP_EXT_F64_RSV_0x7B,
+  IREE_VM_OP_EXT_F64_RSV_0x7C,
+  IREE_VM_OP_EXT_F64_RSV_0x7D,
+  IREE_VM_OP_EXT_F64_RSV_0x7E,
+  IREE_VM_OP_EXT_F64_RSV_0x7F,
+  IREE_VM_OP_EXT_F64_RSV_0x80,
+  IREE_VM_OP_EXT_F64_RSV_0x81,
+  IREE_VM_OP_EXT_F64_RSV_0x82,
+  IREE_VM_OP_EXT_F64_RSV_0x83,
+  IREE_VM_OP_EXT_F64_RSV_0x84,
+  IREE_VM_OP_EXT_F64_RSV_0x85,
+  IREE_VM_OP_EXT_F64_RSV_0x86,
+  IREE_VM_OP_EXT_F64_RSV_0x87,
+  IREE_VM_OP_EXT_F64_RSV_0x88,
+  IREE_VM_OP_EXT_F64_RSV_0x89,
+  IREE_VM_OP_EXT_F64_RSV_0x8A,
+  IREE_VM_OP_EXT_F64_RSV_0x8B,
+  IREE_VM_OP_EXT_F64_RSV_0x8C,
+  IREE_VM_OP_EXT_F64_RSV_0x8D,
+  IREE_VM_OP_EXT_F64_RSV_0x8E,
+  IREE_VM_OP_EXT_F64_RSV_0x8F,
+  IREE_VM_OP_EXT_F64_RSV_0x90,
+  IREE_VM_OP_EXT_F64_RSV_0x91,
+  IREE_VM_OP_EXT_F64_RSV_0x92,
+  IREE_VM_OP_EXT_F64_RSV_0x93,
+  IREE_VM_OP_EXT_F64_RSV_0x94,
+  IREE_VM_OP_EXT_F64_RSV_0x95,
+  IREE_VM_OP_EXT_F64_RSV_0x96,
+  IREE_VM_OP_EXT_F64_RSV_0x97,
+  IREE_VM_OP_EXT_F64_RSV_0x98,
+  IREE_VM_OP_EXT_F64_RSV_0x99,
+  IREE_VM_OP_EXT_F64_RSV_0x9A,
+  IREE_VM_OP_EXT_F64_RSV_0x9B,
+  IREE_VM_OP_EXT_F64_RSV_0x9C,
+  IREE_VM_OP_EXT_F64_RSV_0x9D,
+  IREE_VM_OP_EXT_F64_RSV_0x9E,
+  IREE_VM_OP_EXT_F64_RSV_0x9F,
+  IREE_VM_OP_EXT_F64_RSV_0xA0,
+  IREE_VM_OP_EXT_F64_RSV_0xA1,
+  IREE_VM_OP_EXT_F64_RSV_0xA2,
+  IREE_VM_OP_EXT_F64_RSV_0xA3,
+  IREE_VM_OP_EXT_F64_RSV_0xA4,
+  IREE_VM_OP_EXT_F64_RSV_0xA5,
+  IREE_VM_OP_EXT_F64_RSV_0xA6,
+  IREE_VM_OP_EXT_F64_RSV_0xA7,
+  IREE_VM_OP_EXT_F64_RSV_0xA8,
+  IREE_VM_OP_EXT_F64_RSV_0xA9,
+  IREE_VM_OP_EXT_F64_RSV_0xAA,
+  IREE_VM_OP_EXT_F64_RSV_0xAB,
+  IREE_VM_OP_EXT_F64_RSV_0xAC,
+  IREE_VM_OP_EXT_F64_RSV_0xAD,
+  IREE_VM_OP_EXT_F64_RSV_0xAE,
+  IREE_VM_OP_EXT_F64_RSV_0xAF,
+  IREE_VM_OP_EXT_F64_BufferLoadF64 = 0xB0,
+  IREE_VM_OP_EXT_F64_BufferStoreF64 = 0xB1,
+  IREE_VM_OP_EXT_F64_RSV_0xB2,
+  IREE_VM_OP_EXT_F64_RSV_0xB3,
+  IREE_VM_OP_EXT_F64_RSV_0xB4,
+  IREE_VM_OP_EXT_F64_RSV_0xB5,
+  IREE_VM_OP_EXT_F64_RSV_0xB6,
+  IREE_VM_OP_EXT_F64_RSV_0xB7,
+  IREE_VM_OP_EXT_F64_RSV_0xB8,
+  IREE_VM_OP_EXT_F64_RSV_0xB9,
+  IREE_VM_OP_EXT_F64_RSV_0xBA,
+  IREE_VM_OP_EXT_F64_RSV_0xBB,
+  IREE_VM_OP_EXT_F64_RSV_0xBC,
+  IREE_VM_OP_EXT_F64_RSV_0xBD,
+  IREE_VM_OP_EXT_F64_RSV_0xBE,
+  IREE_VM_OP_EXT_F64_RSV_0xBF,
+  IREE_VM_OP_EXT_F64_BufferFillF64 = 0xC0,
+  IREE_VM_OP_EXT_F64_RSV_0xC1,
+  IREE_VM_OP_EXT_F64_RSV_0xC2,
+  IREE_VM_OP_EXT_F64_RSV_0xC3,
+  IREE_VM_OP_EXT_F64_RSV_0xC4,
+  IREE_VM_OP_EXT_F64_RSV_0xC5,
+  IREE_VM_OP_EXT_F64_RSV_0xC6,
+  IREE_VM_OP_EXT_F64_RSV_0xC7,
+  IREE_VM_OP_EXT_F64_RSV_0xC8,
+  IREE_VM_OP_EXT_F64_RSV_0xC9,
+  IREE_VM_OP_EXT_F64_RSV_0xCA,
+  IREE_VM_OP_EXT_F64_RSV_0xCB,
+  IREE_VM_OP_EXT_F64_RSV_0xCC,
+  IREE_VM_OP_EXT_F64_RSV_0xCD,
+  IREE_VM_OP_EXT_F64_RSV_0xCE,
+  IREE_VM_OP_EXT_F64_RSV_0xCF,
+  IREE_VM_OP_EXT_F64_RSV_0xD0,
+  IREE_VM_OP_EXT_F64_RSV_0xD1,
+  IREE_VM_OP_EXT_F64_RSV_0xD2,
+  IREE_VM_OP_EXT_F64_RSV_0xD3,
+  IREE_VM_OP_EXT_F64_RSV_0xD4,
+  IREE_VM_OP_EXT_F64_RSV_0xD5,
+  IREE_VM_OP_EXT_F64_RSV_0xD6,
+  IREE_VM_OP_EXT_F64_RSV_0xD7,
+  IREE_VM_OP_EXT_F64_RSV_0xD8,
+  IREE_VM_OP_EXT_F64_RSV_0xD9,
+  IREE_VM_OP_EXT_F64_RSV_0xDA,
+  IREE_VM_OP_EXT_F64_RSV_0xDB,
+  IREE_VM_OP_EXT_F64_RSV_0xDC,
+  IREE_VM_OP_EXT_F64_RSV_0xDD,
+  IREE_VM_OP_EXT_F64_RSV_0xDE,
+  IREE_VM_OP_EXT_F64_RSV_0xDF,
+  IREE_VM_OP_EXT_F64_RSV_0xE0,
+  IREE_VM_OP_EXT_F64_RSV_0xE1,
+  IREE_VM_OP_EXT_F64_RSV_0xE2,
+  IREE_VM_OP_EXT_F64_RSV_0xE3,
+  IREE_VM_OP_EXT_F64_RSV_0xE4,
+  IREE_VM_OP_EXT_F64_RSV_0xE5,
+  IREE_VM_OP_EXT_F64_RSV_0xE6,
+  IREE_VM_OP_EXT_F64_RSV_0xE7,
+  IREE_VM_OP_EXT_F64_RSV_0xE8,
+  IREE_VM_OP_EXT_F64_RSV_0xE9,
+  IREE_VM_OP_EXT_F64_RSV_0xEA,
+  IREE_VM_OP_EXT_F64_RSV_0xEB,
+  IREE_VM_OP_EXT_F64_RSV_0xEC,
+  IREE_VM_OP_EXT_F64_RSV_0xED,
+  IREE_VM_OP_EXT_F64_RSV_0xEE,
+  IREE_VM_OP_EXT_F64_RSV_0xEF,
+  IREE_VM_OP_EXT_F64_RSV_0xF0,
+  IREE_VM_OP_EXT_F64_RSV_0xF1,
+  IREE_VM_OP_EXT_F64_RSV_0xF2,
+  IREE_VM_OP_EXT_F64_RSV_0xF3,
+  IREE_VM_OP_EXT_F64_RSV_0xF4,
+  IREE_VM_OP_EXT_F64_RSV_0xF5,
+  IREE_VM_OP_EXT_F64_RSV_0xF6,
+  IREE_VM_OP_EXT_F64_RSV_0xF7,
+  IREE_VM_OP_EXT_F64_RSV_0xF8,
+  IREE_VM_OP_EXT_F64_RSV_0xF9,
+  IREE_VM_OP_EXT_F64_RSV_0xFA,
+  IREE_VM_OP_EXT_F64_RSV_0xFB,
+  IREE_VM_OP_EXT_F64_RSV_0xFC,
+  IREE_VM_OP_EXT_F64_RSV_0xFD,
+  IREE_VM_OP_EXT_F64_RSV_0xFE,
+  IREE_VM_OP_EXT_F64_RSV_0xFF,
+} iree_vm_ext_f64_op_t;
+
+#define IREE_VM_OP_EXT_F64_TABLE(OPC, RSV) \
+    OPC(0x00, GlobalLoadF64) \
+    OPC(0x01, GlobalStoreF64) \
+    OPC(0x02, GlobalLoadIndirectF64) \
+    OPC(0x03, GlobalStoreIndirectF64) \
+    RSV(0x04) \
+    RSV(0x05) \
+    RSV(0x06) \
+    RSV(0x07) \
+    OPC(0x08, ConstF64Zero) \
+    OPC(0x09, ConstF64) \
+    RSV(0x0A) \
+    RSV(0x0B) \
+    RSV(0x0C) \
+    RSV(0x0D) \
+    RSV(0x0E) \
+    RSV(0x0F) \
+    RSV(0x10) \
+    RSV(0x11) \
+    RSV(0x12) \
+    RSV(0x13) \
+    OPC(0x14, ListGetF64) \
+    OPC(0x15, ListSetF64) \
+    RSV(0x16) \
+    RSV(0x17) \
+    RSV(0x18) \
+    RSV(0x19) \
+    RSV(0x1A) \
+    RSV(0x1B) \
+    RSV(0x1C) \
+    RSV(0x1D) \
+    OPC(0x1E, SelectF64) \
+    RSV(0x1F) \
+    OPC(0x20, SwitchF64) \
+    RSV(0x21) \
+    OPC(0x22, AddF64) \
+    OPC(0x23, SubF64) \
+    OPC(0x24, MulF64) \
+    OPC(0x25, DivF64) \
+    OPC(0x26, RemF64) \
+    OPC(0x27, FMAF64) \
+    OPC(0x28, AbsF64) \
+    OPC(0x29, NegF64) \
+    OPC(0x2A, CeilF64) \
+    OPC(0x2B, FloorF64) \
+    OPC(0x2C, TruncF64F32) \
+    OPC(0x2D, ExtF32F64) \
+    RSV(0x2E) \
+    RSV(0x2F) \
+    OPC(0x30, CastSI32F64) \
+    OPC(0x31, CastUI32F64) \
+    OPC(0x32, CastF64SI32) \
+    OPC(0x33, CastF64UI32) \
+    OPC(0x34, CastSI64F64) \
+    OPC(0x35, CastUI64F64) \
+    OPC(0x36, CastF64SI64) \
+    OPC(0x37, CastF64UI64) \
+    OPC(0x38, BitcastI64F64) \
+    OPC(0x39, BitcastF64I64) \
+    RSV(0x3A) \
+    RSV(0x3B) \
+    RSV(0x3C) \
+    RSV(0x3D) \
+    RSV(0x3E) \
+    RSV(0x3F) \
+    OPC(0x40, AtanF64) \
+    OPC(0x41, Atan2F64) \
+    OPC(0x42, CosF64) \
+    OPC(0x43, SinF64) \
+    OPC(0x44, ExpF64) \
+    OPC(0x45, Exp2F64) \
+    OPC(0x46, ExpM1F64) \
+    OPC(0x47, LogF64) \
+    OPC(0x48, Log10F64) \
+    OPC(0x49, Log1pF64) \
+    OPC(0x4A, Log2F64) \
+    OPC(0x4B, PowF64) \
+    OPC(0x4C, RsqrtF64) \
+    OPC(0x4D, SqrtF64) \
+    OPC(0x4E, TanhF64) \
+    OPC(0x4F, ErfF64) \
+    RSV(0x50) \
+    RSV(0x51) \
+    RSV(0x52) \
+    RSV(0x53) \
+    RSV(0x54) \
+    RSV(0x55) \
+    RSV(0x56) \
+    RSV(0x57) \
+    RSV(0x58) \
+    RSV(0x59) \
+    RSV(0x5A) \
+    RSV(0x5B) \
+    RSV(0x5C) \
+    RSV(0x5D) \
+    RSV(0x5E) \
+    RSV(0x5F) \
+    OPC(0x60, CmpEQF64O) \
+    OPC(0x61, CmpEQF64U) \
+    OPC(0x62, CmpNEF64O) \
+    OPC(0x63, CmpNEF64U) \
+    OPC(0x64, CmpLTF64O) \
+    OPC(0x65, CmpLTF64U) \
+    OPC(0x66, CmpLTEF64O) \
+    OPC(0x67, CmpLTEF64U) \
+    RSV(0x68) \
+    RSV(0x69) \
+    RSV(0x6A) \
+    RSV(0x6B) \
+    RSV(0x6C) \
+    RSV(0x6D) \
+    RSV(0x6E) \
+    RSV(0x6F) \
+    OPC(0x70, CmpNaNF64) \
+    RSV(0x71) \
+    RSV(0x72) \
+    RSV(0x73) \
+    RSV(0x74) \
+    RSV(0x75) \
+    RSV(0x76) \
+    RSV(0x77) \
+    RSV(0x78) \
+    RSV(0x79) \
+    RSV(0x7A) \
+    RSV(0x7B) \
+    RSV(0x7C) \
+    RSV(0x7D) \
+    RSV(0x7E) \
+    RSV(0x7F) \
+    RSV(0x80) \
+    RSV(0x81) \
+    RSV(0x82) \
+    RSV(0x83) \
+    RSV(0x84) \
+    RSV(0x85) \
+    RSV(0x86) \
+    RSV(0x87) \
+    RSV(0x88) \
+    RSV(0x89) \
+    RSV(0x8A) \
+    RSV(0x8B) \
+    RSV(0x8C) \
+    RSV(0x8D) \
+    RSV(0x8E) \
+    RSV(0x8F) \
+    RSV(0x90) \
+    RSV(0x91) \
+    RSV(0x92) \
+    RSV(0x93) \
+    RSV(0x94) \
+    RSV(0x95) \
+    RSV(0x96) \
+    RSV(0x97) \
+    RSV(0x98) \
+    RSV(0x99) \
+    RSV(0x9A) \
+    RSV(0x9B) \
+    RSV(0x9C) \
+    RSV(0x9D) \
+    RSV(0x9E) \
+    RSV(0x9F) \
+    RSV(0xA0) \
+    RSV(0xA1) \
+    RSV(0xA2) \
+    RSV(0xA3) \
+    RSV(0xA4) \
+    RSV(0xA5) \
+    RSV(0xA6) \
+    RSV(0xA7) \
+    RSV(0xA8) \
+    RSV(0xA9) \
+    RSV(0xAA) \
+    RSV(0xAB) \
+    RSV(0xAC) \
+    RSV(0xAD) \
+    RSV(0xAE) \
+    RSV(0xAF) \
+    OPC(0xB0, BufferLoadF64) \
+    OPC(0xB1, BufferStoreF64) \
+    RSV(0xB2) \
+    RSV(0xB3) \
+    RSV(0xB4) \
+    RSV(0xB5) \
+    RSV(0xB6) \
+    RSV(0xB7) \
+    RSV(0xB8) \
+    RSV(0xB9) \
+    RSV(0xBA) \
+    RSV(0xBB) \
+    RSV(0xBC) \
+    RSV(0xBD) \
+    RSV(0xBE) \
+    RSV(0xBF) \
+    OPC(0xC0, BufferFillF64) \
+    RSV(0xC1) \
+    RSV(0xC2) \
+    RSV(0xC3) \
+    RSV(0xC4) \
+    RSV(0xC5) \
+    RSV(0xC6) \
+    RSV(0xC7) \
+    RSV(0xC8) \
+    RSV(0xC9) \
+    RSV(0xCA) \
+    RSV(0xCB) \
+    RSV(0xCC) \
+    RSV(0xCD) \
+    RSV(0xCE) \
+    RSV(0xCF) \
+    RSV(0xD0) \
+    RSV(0xD1) \
+    RSV(0xD2) \
+    RSV(0xD3) \
+    RSV(0xD4) \
+    RSV(0xD5) \
+    RSV(0xD6) \
+    RSV(0xD7) \
+    RSV(0xD8) \
+    RSV(0xD9) \
+    RSV(0xDA) \
+    RSV(0xDB) \
+    RSV(0xDC) \
+    RSV(0xDD) \
+    RSV(0xDE) \
+    RSV(0xDF) \
+    RSV(0xE0) \
+    RSV(0xE1) \
+    RSV(0xE2) \
+    RSV(0xE3) \
+    RSV(0xE4) \
+    RSV(0xE5) \
+    RSV(0xE6) \
+    RSV(0xE7) \
+    RSV(0xE8) \
+    RSV(0xE9) \
+    RSV(0xEA) \
+    RSV(0xEB) \
+    RSV(0xEC) \
+    RSV(0xED) \
+    RSV(0xEE) \
+    RSV(0xEF) \
+    RSV(0xF0) \
+    RSV(0xF1) \
+    RSV(0xF2) \
+    RSV(0xF3) \
+    RSV(0xF4) \
+    RSV(0xF5) \
+    RSV(0xF6) \
+    RSV(0xF7) \
+    RSV(0xF8) \
+    RSV(0xF9) \
+    RSV(0xFA) \
+    RSV(0xFB) \
+    RSV(0xFC) \
+    RSV(0xFD) \
+    RSV(0xFE) \
+    RSV(0xFF)
+
+typedef enum {
+  IREE_VM_OP_EXT_I64_GlobalLoadI64 = 0x00,
+  IREE_VM_OP_EXT_I64_GlobalStoreI64 = 0x01,
+  IREE_VM_OP_EXT_I64_GlobalLoadIndirectI64 = 0x02,
+  IREE_VM_OP_EXT_I64_GlobalStoreIndirectI64 = 0x03,
+  IREE_VM_OP_EXT_I64_RSV_0x04,
+  IREE_VM_OP_EXT_I64_RSV_0x05,
+  IREE_VM_OP_EXT_I64_RSV_0x06,
+  IREE_VM_OP_EXT_I64_RSV_0x07,
+  IREE_VM_OP_EXT_I64_ConstI64Zero = 0x08,
+  IREE_VM_OP_EXT_I64_ConstI64 = 0x09,
+  IREE_VM_OP_EXT_I64_RSV_0x0A,
+  IREE_VM_OP_EXT_I64_RSV_0x0B,
+  IREE_VM_OP_EXT_I64_RSV_0x0C,
+  IREE_VM_OP_EXT_I64_RSV_0x0D,
+  IREE_VM_OP_EXT_I64_RSV_0x0E,
+  IREE_VM_OP_EXT_I64_RSV_0x0F,
+  IREE_VM_OP_EXT_I64_RSV_0x10,
+  IREE_VM_OP_EXT_I64_RSV_0x11,
+  IREE_VM_OP_EXT_I64_RSV_0x12,
+  IREE_VM_OP_EXT_I64_RSV_0x13,
+  IREE_VM_OP_EXT_I64_ListGetI64 = 0x14,
+  IREE_VM_OP_EXT_I64_ListSetI64 = 0x15,
+  IREE_VM_OP_EXT_I64_RSV_0x16,
+  IREE_VM_OP_EXT_I64_RSV_0x17,
+  IREE_VM_OP_EXT_I64_RSV_0x18,
+  IREE_VM_OP_EXT_I64_RSV_0x19,
+  IREE_VM_OP_EXT_I64_RSV_0x1A,
+  IREE_VM_OP_EXT_I64_RSV_0x1B,
+  IREE_VM_OP_EXT_I64_RSV_0x1C,
+  IREE_VM_OP_EXT_I64_RSV_0x1D,
+  IREE_VM_OP_EXT_I64_SelectI64 = 0x1E,
+  IREE_VM_OP_EXT_I64_RSV_0x1F,
+  IREE_VM_OP_EXT_I64_SwitchI64 = 0x20,
+  IREE_VM_OP_EXT_I64_RSV_0x21,
+  IREE_VM_OP_EXT_I64_AddI64 = 0x22,
+  IREE_VM_OP_EXT_I64_SubI64 = 0x23,
+  IREE_VM_OP_EXT_I64_MulI64 = 0x24,
+  IREE_VM_OP_EXT_I64_DivI64S = 0x25,
+  IREE_VM_OP_EXT_I64_DivI64U = 0x26,
+  IREE_VM_OP_EXT_I64_RemI64S = 0x27,
+  IREE_VM_OP_EXT_I64_RemI64U = 0x28,
+  IREE_VM_OP_EXT_I64_FMAI64 = 0x29,
+  IREE_VM_OP_EXT_I64_RSV_0x2A,
+  IREE_VM_OP_EXT_I64_RSV_0x2B,
+  IREE_VM_OP_EXT_I64_RSV_0x2C,
+  IREE_VM_OP_EXT_I64_RSV_0x2D,
+  IREE_VM_OP_EXT_I64_RSV_0x2E,
+  IREE_VM_OP_EXT_I64_RSV_0x2F,
+  IREE_VM_OP_EXT_I64_NotI64 = 0x30,
+  IREE_VM_OP_EXT_I64_AndI64 = 0x31,
+  IREE_VM_OP_EXT_I64_OrI64 = 0x32,
+  IREE_VM_OP_EXT_I64_XorI64 = 0x33,
+  IREE_VM_OP_EXT_I64_ShlI64 = 0x34,
+  IREE_VM_OP_EXT_I64_ShrI64S = 0x35,
+  IREE_VM_OP_EXT_I64_ShrI64U = 0x36,
+  IREE_VM_OP_EXT_I64_TruncI64I32 = 0x37,
+  IREE_VM_OP_EXT_I64_ExtI32I64S = 0x38,
+  IREE_VM_OP_EXT_I64_ExtI32I64U = 0x39,
+  IREE_VM_OP_EXT_I64_RSV_0x3A,
+  IREE_VM_OP_EXT_I64_RSV_0x3B,
+  IREE_VM_OP_EXT_I64_RSV_0x3C,
+  IREE_VM_OP_EXT_I64_RSV_0x3D,
+  IREE_VM_OP_EXT_I64_RSV_0x3E,
+  IREE_VM_OP_EXT_I64_RSV_0x3F,
+  IREE_VM_OP_EXT_I64_CmpEQI64 = 0x40,
+  IREE_VM_OP_EXT_I64_CmpNEI64 = 0x41,
+  IREE_VM_OP_EXT_I64_CmpLTI64S = 0x42,
+  IREE_VM_OP_EXT_I64_CmpLTI64U = 0x43,
+  IREE_VM_OP_EXT_I64_RSV_0x44,
+  IREE_VM_OP_EXT_I64_RSV_0x45,
+  IREE_VM_OP_EXT_I64_RSV_0x46,
+  IREE_VM_OP_EXT_I64_RSV_0x47,
+  IREE_VM_OP_EXT_I64_RSV_0x48,
+  IREE_VM_OP_EXT_I64_RSV_0x49,
+  IREE_VM_OP_EXT_I64_RSV_0x4A,
+  IREE_VM_OP_EXT_I64_RSV_0x4B,
+  IREE_VM_OP_EXT_I64_RSV_0x4C,
+  IREE_VM_OP_EXT_I64_CmpNZI64 = 0x4D,
+  IREE_VM_OP_EXT_I64_RSV_0x4E,
+  IREE_VM_OP_EXT_I64_RSV_0x4F,
+  IREE_VM_OP_EXT_I64_RSV_0x50,
+  IREE_VM_OP_EXT_I64_RSV_0x51,
+  IREE_VM_OP_EXT_I64_RSV_0x52,
+  IREE_VM_OP_EXT_I64_RSV_0x53,
+  IREE_VM_OP_EXT_I64_RSV_0x54,
+  IREE_VM_OP_EXT_I64_RSV_0x55,
+  IREE_VM_OP_EXT_I64_RSV_0x56,
+  IREE_VM_OP_EXT_I64_RSV_0x57,
+  IREE_VM_OP_EXT_I64_RSV_0x58,
+  IREE_VM_OP_EXT_I64_RSV_0x59,
+  IREE_VM_OP_EXT_I64_RSV_0x5A,
+  IREE_VM_OP_EXT_I64_RSV_0x5B,
+  IREE_VM_OP_EXT_I64_RSV_0x5C,
+  IREE_VM_OP_EXT_I64_RSV_0x5D,
+  IREE_VM_OP_EXT_I64_RSV_0x5E,
+  IREE_VM_OP_EXT_I64_RSV_0x5F,
+  IREE_VM_OP_EXT_I64_RSV_0x60,
+  IREE_VM_OP_EXT_I64_RSV_0x61,
+  IREE_VM_OP_EXT_I64_RSV_0x62,
+  IREE_VM_OP_EXT_I64_RSV_0x63,
+  IREE_VM_OP_EXT_I64_RSV_0x64,
+  IREE_VM_OP_EXT_I64_RSV_0x65,
+  IREE_VM_OP_EXT_I64_RSV_0x66,
+  IREE_VM_OP_EXT_I64_RSV_0x67,
+  IREE_VM_OP_EXT_I64_RSV_0x68,
+  IREE_VM_OP_EXT_I64_RSV_0x69,
+  IREE_VM_OP_EXT_I64_RSV_0x6A,
+  IREE_VM_OP_EXT_I64_RSV_0x6B,
+  IREE_VM_OP_EXT_I64_RSV_0x6C,
+  IREE_VM_OP_EXT_I64_RSV_0x6D,
+  IREE_VM_OP_EXT_I64_RSV_0x6E,
+  IREE_VM_OP_EXT_I64_RSV_0x6F,
+  IREE_VM_OP_EXT_I64_RSV_0x70,
+  IREE_VM_OP_EXT_I64_RSV_0x71,
+  IREE_VM_OP_EXT_I64_RSV_0x72,
+  IREE_VM_OP_EXT_I64_RSV_0x73,
+  IREE_VM_OP_EXT_I64_RSV_0x74,
+  IREE_VM_OP_EXT_I64_RSV_0x75,
+  IREE_VM_OP_EXT_I64_RSV_0x76,
+  IREE_VM_OP_EXT_I64_RSV_0x77,
+  IREE_VM_OP_EXT_I64_RSV_0x78,
+  IREE_VM_OP_EXT_I64_RSV_0x79,
+  IREE_VM_OP_EXT_I64_RSV_0x7A,
+  IREE_VM_OP_EXT_I64_RSV_0x7B,
+  IREE_VM_OP_EXT_I64_RSV_0x7C,
+  IREE_VM_OP_EXT_I64_RSV_0x7D,
+  IREE_VM_OP_EXT_I64_RSV_0x7E,
+  IREE_VM_OP_EXT_I64_RSV_0x7F,
+  IREE_VM_OP_EXT_I64_RSV_0x80,
+  IREE_VM_OP_EXT_I64_RSV_0x81,
+  IREE_VM_OP_EXT_I64_RSV_0x82,
+  IREE_VM_OP_EXT_I64_RSV_0x83,
+  IREE_VM_OP_EXT_I64_RSV_0x84,
+  IREE_VM_OP_EXT_I64_RSV_0x85,
+  IREE_VM_OP_EXT_I64_RSV_0x86,
+  IREE_VM_OP_EXT_I64_RSV_0x87,
+  IREE_VM_OP_EXT_I64_RSV_0x88,
+  IREE_VM_OP_EXT_I64_RSV_0x89,
+  IREE_VM_OP_EXT_I64_RSV_0x8A,
+  IREE_VM_OP_EXT_I64_RSV_0x8B,
+  IREE_VM_OP_EXT_I64_RSV_0x8C,
+  IREE_VM_OP_EXT_I64_RSV_0x8D,
+  IREE_VM_OP_EXT_I64_RSV_0x8E,
+  IREE_VM_OP_EXT_I64_RSV_0x8F,
+  IREE_VM_OP_EXT_I64_RSV_0x90,
+  IREE_VM_OP_EXT_I64_RSV_0x91,
+  IREE_VM_OP_EXT_I64_RSV_0x92,
+  IREE_VM_OP_EXT_I64_RSV_0x93,
+  IREE_VM_OP_EXT_I64_RSV_0x94,
+  IREE_VM_OP_EXT_I64_RSV_0x95,
+  IREE_VM_OP_EXT_I64_RSV_0x96,
+  IREE_VM_OP_EXT_I64_RSV_0x97,
+  IREE_VM_OP_EXT_I64_RSV_0x98,
+  IREE_VM_OP_EXT_I64_RSV_0x99,
+  IREE_VM_OP_EXT_I64_RSV_0x9A,
+  IREE_VM_OP_EXT_I64_RSV_0x9B,
+  IREE_VM_OP_EXT_I64_RSV_0x9C,
+  IREE_VM_OP_EXT_I64_RSV_0x9D,
+  IREE_VM_OP_EXT_I64_RSV_0x9E,
+  IREE_VM_OP_EXT_I64_RSV_0x9F,
+  IREE_VM_OP_EXT_I64_RSV_0xA0,
+  IREE_VM_OP_EXT_I64_RSV_0xA1,
+  IREE_VM_OP_EXT_I64_RSV_0xA2,
+  IREE_VM_OP_EXT_I64_RSV_0xA3,
+  IREE_VM_OP_EXT_I64_RSV_0xA4,
+  IREE_VM_OP_EXT_I64_RSV_0xA5,
+  IREE_VM_OP_EXT_I64_RSV_0xA6,
+  IREE_VM_OP_EXT_I64_RSV_0xA7,
+  IREE_VM_OP_EXT_I64_RSV_0xA8,
+  IREE_VM_OP_EXT_I64_RSV_0xA9,
+  IREE_VM_OP_EXT_I64_RSV_0xAA,
+  IREE_VM_OP_EXT_I64_RSV_0xAB,
+  IREE_VM_OP_EXT_I64_RSV_0xAC,
+  IREE_VM_OP_EXT_I64_RSV_0xAD,
+  IREE_VM_OP_EXT_I64_RSV_0xAE,
+  IREE_VM_OP_EXT_I64_RSV_0xAF,
+  IREE_VM_OP_EXT_I64_BufferLoadI64 = 0xB0,
+  IREE_VM_OP_EXT_I64_BufferStoreI64 = 0xB1,
+  IREE_VM_OP_EXT_I64_RSV_0xB2,
+  IREE_VM_OP_EXT_I64_RSV_0xB3,
+  IREE_VM_OP_EXT_I64_RSV_0xB4,
+  IREE_VM_OP_EXT_I64_RSV_0xB5,
+  IREE_VM_OP_EXT_I64_RSV_0xB6,
+  IREE_VM_OP_EXT_I64_RSV_0xB7,
+  IREE_VM_OP_EXT_I64_RSV_0xB8,
+  IREE_VM_OP_EXT_I64_RSV_0xB9,
+  IREE_VM_OP_EXT_I64_RSV_0xBA,
+  IREE_VM_OP_EXT_I64_RSV_0xBB,
+  IREE_VM_OP_EXT_I64_RSV_0xBC,
+  IREE_VM_OP_EXT_I64_RSV_0xBD,
+  IREE_VM_OP_EXT_I64_RSV_0xBE,
+  IREE_VM_OP_EXT_I64_RSV_0xBF,
+  IREE_VM_OP_EXT_I64_BufferFillI64 = 0xC0,
+  IREE_VM_OP_EXT_I64_RSV_0xC1,
+  IREE_VM_OP_EXT_I64_RSV_0xC2,
+  IREE_VM_OP_EXT_I64_RSV_0xC3,
+  IREE_VM_OP_EXT_I64_RSV_0xC4,
+  IREE_VM_OP_EXT_I64_RSV_0xC5,
+  IREE_VM_OP_EXT_I64_RSV_0xC6,
+  IREE_VM_OP_EXT_I64_RSV_0xC7,
+  IREE_VM_OP_EXT_I64_RSV_0xC8,
+  IREE_VM_OP_EXT_I64_RSV_0xC9,
+  IREE_VM_OP_EXT_I64_RSV_0xCA,
+  IREE_VM_OP_EXT_I64_RSV_0xCB,
+  IREE_VM_OP_EXT_I64_RSV_0xCC,
+  IREE_VM_OP_EXT_I64_RSV_0xCD,
+  IREE_VM_OP_EXT_I64_RSV_0xCE,
+  IREE_VM_OP_EXT_I64_RSV_0xCF,
+  IREE_VM_OP_EXT_I64_RSV_0xD0,
+  IREE_VM_OP_EXT_I64_RSV_0xD1,
+  IREE_VM_OP_EXT_I64_RSV_0xD2,
+  IREE_VM_OP_EXT_I64_RSV_0xD3,
+  IREE_VM_OP_EXT_I64_RSV_0xD4,
+  IREE_VM_OP_EXT_I64_RSV_0xD5,
+  IREE_VM_OP_EXT_I64_RSV_0xD6,
+  IREE_VM_OP_EXT_I64_RSV_0xD7,
+  IREE_VM_OP_EXT_I64_RSV_0xD8,
+  IREE_VM_OP_EXT_I64_RSV_0xD9,
+  IREE_VM_OP_EXT_I64_RSV_0xDA,
+  IREE_VM_OP_EXT_I64_RSV_0xDB,
+  IREE_VM_OP_EXT_I64_RSV_0xDC,
+  IREE_VM_OP_EXT_I64_RSV_0xDD,
+  IREE_VM_OP_EXT_I64_RSV_0xDE,
+  IREE_VM_OP_EXT_I64_RSV_0xDF,
+  IREE_VM_OP_EXT_I64_RSV_0xE0,
+  IREE_VM_OP_EXT_I64_RSV_0xE1,
+  IREE_VM_OP_EXT_I64_RSV_0xE2,
+  IREE_VM_OP_EXT_I64_RSV_0xE3,
+  IREE_VM_OP_EXT_I64_RSV_0xE4,
+  IREE_VM_OP_EXT_I64_RSV_0xE5,
+  IREE_VM_OP_EXT_I64_RSV_0xE6,
+  IREE_VM_OP_EXT_I64_RSV_0xE7,
+  IREE_VM_OP_EXT_I64_RSV_0xE8,
+  IREE_VM_OP_EXT_I64_RSV_0xE9,
+  IREE_VM_OP_EXT_I64_RSV_0xEA,
+  IREE_VM_OP_EXT_I64_RSV_0xEB,
+  IREE_VM_OP_EXT_I64_RSV_0xEC,
+  IREE_VM_OP_EXT_I64_RSV_0xED,
+  IREE_VM_OP_EXT_I64_RSV_0xEE,
+  IREE_VM_OP_EXT_I64_RSV_0xEF,
+  IREE_VM_OP_EXT_I64_RSV_0xF0,
+  IREE_VM_OP_EXT_I64_RSV_0xF1,
+  IREE_VM_OP_EXT_I64_RSV_0xF2,
+  IREE_VM_OP_EXT_I64_RSV_0xF3,
+  IREE_VM_OP_EXT_I64_RSV_0xF4,
+  IREE_VM_OP_EXT_I64_RSV_0xF5,
+  IREE_VM_OP_EXT_I64_RSV_0xF6,
+  IREE_VM_OP_EXT_I64_RSV_0xF7,
+  IREE_VM_OP_EXT_I64_RSV_0xF8,
+  IREE_VM_OP_EXT_I64_RSV_0xF9,
+  IREE_VM_OP_EXT_I64_RSV_0xFA,
+  IREE_VM_OP_EXT_I64_RSV_0xFB,
+  IREE_VM_OP_EXT_I64_RSV_0xFC,
+  IREE_VM_OP_EXT_I64_RSV_0xFD,
+  IREE_VM_OP_EXT_I64_RSV_0xFE,
+  IREE_VM_OP_EXT_I64_RSV_0xFF,
+} iree_vm_ext_i64_op_t;
+
+#define IREE_VM_OP_EXT_I64_TABLE(OPC, RSV) \
+    OPC(0x00, GlobalLoadI64) \
+    OPC(0x01, GlobalStoreI64) \
+    OPC(0x02, GlobalLoadIndirectI64) \
+    OPC(0x03, GlobalStoreIndirectI64) \
+    RSV(0x04) \
+    RSV(0x05) \
+    RSV(0x06) \
+    RSV(0x07) \
+    OPC(0x08, ConstI64Zero) \
+    OPC(0x09, ConstI64) \
+    RSV(0x0A) \
+    RSV(0x0B) \
+    RSV(0x0C) \
+    RSV(0x0D) \
+    RSV(0x0E) \
+    RSV(0x0F) \
+    RSV(0x10) \
+    RSV(0x11) \
+    RSV(0x12) \
+    RSV(0x13) \
+    OPC(0x14, ListGetI64) \
+    OPC(0x15, ListSetI64) \
+    RSV(0x16) \
+    RSV(0x17) \
+    RSV(0x18) \
+    RSV(0x19) \
+    RSV(0x1A) \
+    RSV(0x1B) \
+    RSV(0x1C) \
+    RSV(0x1D) \
+    OPC(0x1E, SelectI64) \
+    RSV(0x1F) \
+    OPC(0x20, SwitchI64) \
+    RSV(0x21) \
+    OPC(0x22, AddI64) \
+    OPC(0x23, SubI64) \
+    OPC(0x24, MulI64) \
+    OPC(0x25, DivI64S) \
+    OPC(0x26, DivI64U) \
+    OPC(0x27, RemI64S) \
+    OPC(0x28, RemI64U) \
+    OPC(0x29, FMAI64) \
+    RSV(0x2A) \
+    RSV(0x2B) \
+    RSV(0x2C) \
+    RSV(0x2D) \
+    RSV(0x2E) \
+    RSV(0x2F) \
+    OPC(0x30, NotI64) \
+    OPC(0x31, AndI64) \
+    OPC(0x32, OrI64) \
+    OPC(0x33, XorI64) \
+    OPC(0x34, ShlI64) \
+    OPC(0x35, ShrI64S) \
+    OPC(0x36, ShrI64U) \
+    OPC(0x37, TruncI64I32) \
+    OPC(0x38, ExtI32I64S) \
+    OPC(0x39, ExtI32I64U) \
+    RSV(0x3A) \
+    RSV(0x3B) \
+    RSV(0x3C) \
+    RSV(0x3D) \
+    RSV(0x3E) \
+    RSV(0x3F) \
+    OPC(0x40, CmpEQI64) \
+    OPC(0x41, CmpNEI64) \
+    OPC(0x42, CmpLTI64S) \
+    OPC(0x43, CmpLTI64U) \
+    RSV(0x44) \
+    RSV(0x45) \
+    RSV(0x46) \
+    RSV(0x47) \
+    RSV(0x48) \
+    RSV(0x49) \
+    RSV(0x4A) \
+    RSV(0x4B) \
+    RSV(0x4C) \
+    OPC(0x4D, CmpNZI64) \
+    RSV(0x4E) \
+    RSV(0x4F) \
+    RSV(0x50) \
+    RSV(0x51) \
+    RSV(0x52) \
+    RSV(0x53) \
+    RSV(0x54) \
+    RSV(0x55) \
+    RSV(0x56) \
+    RSV(0x57) \
+    RSV(0x58) \
+    RSV(0x59) \
+    RSV(0x5A) \
+    RSV(0x5B) \
+    RSV(0x5C) \
+    RSV(0x5D) \
+    RSV(0x5E) \
+    RSV(0x5F) \
+    RSV(0x60) \
+    RSV(0x61) \
+    RSV(0x62) \
+    RSV(0x63) \
+    RSV(0x64) \
+    RSV(0x65) \
+    RSV(0x66) \
+    RSV(0x67) \
+    RSV(0x68) \
+    RSV(0x69) \
+    RSV(0x6A) \
+    RSV(0x6B) \
+    RSV(0x6C) \
+    RSV(0x6D) \
+    RSV(0x6E) \
+    RSV(0x6F) \
+    RSV(0x70) \
+    RSV(0x71) \
+    RSV(0x72) \
+    RSV(0x73) \
+    RSV(0x74) \
+    RSV(0x75) \
+    RSV(0x76) \
+    RSV(0x77) \
+    RSV(0x78) \
+    RSV(0x79) \
+    RSV(0x7A) \
+    RSV(0x7B) \
+    RSV(0x7C) \
+    RSV(0x7D) \
+    RSV(0x7E) \
+    RSV(0x7F) \
+    RSV(0x80) \
+    RSV(0x81) \
+    RSV(0x82) \
+    RSV(0x83) \
+    RSV(0x84) \
+    RSV(0x85) \
+    RSV(0x86) \
+    RSV(0x87) \
+    RSV(0x88) \
+    RSV(0x89) \
+    RSV(0x8A) \
+    RSV(0x8B) \
+    RSV(0x8C) \
+    RSV(0x8D) \
+    RSV(0x8E) \
+    RSV(0x8F) \
+    RSV(0x90) \
+    RSV(0x91) \
+    RSV(0x92) \
+    RSV(0x93) \
+    RSV(0x94) \
+    RSV(0x95) \
+    RSV(0x96) \
+    RSV(0x97) \
+    RSV(0x98) \
+    RSV(0x99) \
+    RSV(0x9A) \
+    RSV(0x9B) \
+    RSV(0x9C) \
+    RSV(0x9D) \
+    RSV(0x9E) \
+    RSV(0x9F) \
+    RSV(0xA0) \
+    RSV(0xA1) \
+    RSV(0xA2) \
+    RSV(0xA3) \
+    RSV(0xA4) \
+    RSV(0xA5) \
+    RSV(0xA6) \
+    RSV(0xA7) \
+    RSV(0xA8) \
+    RSV(0xA9) \
+    RSV(0xAA) \
+    RSV(0xAB) \
+    RSV(0xAC) \
+    RSV(0xAD) \
+    RSV(0xAE) \
+    RSV(0xAF) \
+    OPC(0xB0, BufferLoadI64) \
+    OPC(0xB1, BufferStoreI64) \
+    RSV(0xB2) \
+    RSV(0xB3) \
+    RSV(0xB4) \
+    RSV(0xB5) \
+    RSV(0xB6) \
+    RSV(0xB7) \
+    RSV(0xB8) \
+    RSV(0xB9) \
+    RSV(0xBA) \
+    RSV(0xBB) \
+    RSV(0xBC) \
+    RSV(0xBD) \
+    RSV(0xBE) \
+    RSV(0xBF) \
+    OPC(0xC0, BufferFillI64) \
+    RSV(0xC1) \
+    RSV(0xC2) \
+    RSV(0xC3) \
+    RSV(0xC4) \
+    RSV(0xC5) \
+    RSV(0xC6) \
+    RSV(0xC7) \
+    RSV(0xC8) \
+    RSV(0xC9) \
+    RSV(0xCA) \
+    RSV(0xCB) \
+    RSV(0xCC) \
+    RSV(0xCD) \
+    RSV(0xCE) \
+    RSV(0xCF) \
+    RSV(0xD0) \
+    RSV(0xD1) \
+    RSV(0xD2) \
+    RSV(0xD3) \
+    RSV(0xD4) \
+    RSV(0xD5) \
+    RSV(0xD6) \
+    RSV(0xD7) \
+    RSV(0xD8) \
+    RSV(0xD9) \
+    RSV(0xDA) \
+    RSV(0xDB) \
+    RSV(0xDC) \
+    RSV(0xDD) \
+    RSV(0xDE) \
+    RSV(0xDF) \
+    RSV(0xE0) \
+    RSV(0xE1) \
+    RSV(0xE2) \
+    RSV(0xE3) \
+    RSV(0xE4) \
+    RSV(0xE5) \
+    RSV(0xE6) \
+    RSV(0xE7) \
+    RSV(0xE8) \
+    RSV(0xE9) \
+    RSV(0xEA) \
+    RSV(0xEB) \
+    RSV(0xEC) \
+    RSV(0xED) \
+    RSV(0xEE) \
+    RSV(0xEF) \
+    RSV(0xF0) \
+    RSV(0xF1) \
+    RSV(0xF2) \
+    RSV(0xF3) \
+    RSV(0xF4) \
+    RSV(0xF5) \
+    RSV(0xF6) \
+    RSV(0xF7) \
+    RSV(0xF8) \
+    RSV(0xF9) \
+    RSV(0xFA) \
+    RSV(0xFB) \
+    RSV(0xFC) \
+    RSV(0xFD) \
+    RSV(0xFE) \
+    RSV(0xFF)
+
diff --git a/runtime/src/iree/vm/instance.c b/runtime/src/iree/vm/instance.c
new file mode 100644
index 0000000..9d1f4c6
--- /dev/null
+++ b/runtime/src/iree/vm/instance.c
@@ -0,0 +1,57 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/instance.h"
+
+#include <stddef.h>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/tracing.h"
+#include "iree/vm/builtin_types.h"
+
+struct iree_vm_instance_t {
+  iree_atomic_ref_count_t ref_count;
+  iree_allocator_t allocator;
+};
+
+IREE_API_EXPORT iree_status_t iree_vm_instance_create(
+    iree_allocator_t allocator, iree_vm_instance_t** out_instance) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_instance);
+  *out_instance = NULL;
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, iree_vm_register_builtin_types());
+
+  iree_vm_instance_t* instance = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0,
+      iree_allocator_malloc(allocator, sizeof(*instance), (void**)&instance));
+  instance->allocator = allocator;
+  iree_atomic_ref_count_init(&instance->ref_count);
+
+  *out_instance = instance;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+static void iree_vm_instance_destroy(iree_vm_instance_t* instance) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(instance);
+  iree_allocator_free(instance->allocator, instance);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+IREE_API_EXPORT void iree_vm_instance_retain(iree_vm_instance_t* instance) {
+  if (instance) {
+    iree_atomic_ref_count_inc(&instance->ref_count);
+  }
+}
+
+IREE_API_EXPORT void iree_vm_instance_release(iree_vm_instance_t* instance) {
+  if (instance && iree_atomic_ref_count_dec(&instance->ref_count) == 1) {
+    iree_vm_instance_destroy(instance);
+  }
+}
diff --git a/runtime/src/iree/vm/instance.h b/runtime/src/iree/vm/instance.h
new file mode 100644
index 0000000..e54e7bb
--- /dev/null
+++ b/runtime/src/iree/vm/instance.h
@@ -0,0 +1,46 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_INSTANCE_H_
+#define IREE_VM_INSTANCE_H_
+
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Shared runtime instance responsible for routing iree_vm_context_t events,
+// enumerating and creating hardware device interfaces, and managing device
+// resource pools.
+//
+// A single runtime instance can service multiple contexts, and hosting
+// applications should try to reuse instances as much as possible. This ensures
+// that resource allocation across contexts is handled and extraneous device
+// interaction is avoided. For devices that may have exclusive access
+// restrictions, it is mandatory to share instances, so plan accordingly.
+//
+// Thread-safe.
+typedef struct iree_vm_instance_t iree_vm_instance_t;
+
+// Creates a new instance. This should be shared with all contexts in an
+// application to ensure that resources are tracked properly and threads are
+// managed correctly.
+// |out_instance| must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_vm_instance_create(
+    iree_allocator_t allocator, iree_vm_instance_t** out_instance);
+
+// Retains the given |instance| for the caller.
+IREE_API_EXPORT void iree_vm_instance_retain(iree_vm_instance_t* instance);
+
+// Releases the given |instance| from the caller.
+IREE_API_EXPORT void iree_vm_instance_release(iree_vm_instance_t* instance);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_INSTANCE_H_
diff --git a/runtime/src/iree/vm/invocation.c b/runtime/src/iree/vm/invocation.c
new file mode 100644
index 0000000..58e385c
--- /dev/null
+++ b/runtime/src/iree/vm/invocation.c
@@ -0,0 +1,226 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/invocation.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/vm/ref.h"
+#include "iree/vm/stack.h"
+#include "iree/vm/value.h"
+
+// Marshals caller arguments from the variant list to the ABI convention.
+static iree_status_t iree_vm_invoke_marshal_inputs(
+    iree_string_view_t cconv_arguments, iree_vm_list_t* inputs,
+    iree_byte_span_t arguments) {
+  // We are 1:1 right now with no variadic args, so do a quick verification on
+  // the input list.
+  iree_host_size_t expected_input_count =
+      cconv_arguments.size > 0
+          ? (cconv_arguments.data[0] == 'v' ? 0 : cconv_arguments.size)
+          : 0;
+  if (IREE_UNLIKELY(!inputs)) {
+    if (IREE_UNLIKELY(expected_input_count > 0)) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "no input provided to a function that has inputs");
+    }
+    return iree_ok_status();
+  } else if (IREE_UNLIKELY(expected_input_count != iree_vm_list_size(inputs))) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "input list and function mismatch; expected %zu "
+                            "arguments but passed %zu",
+                            expected_input_count, iree_vm_list_size(inputs));
+  }
+
+  uint8_t* p = arguments.data;
+  for (iree_host_size_t cconv_i = 0, arg_i = 0; cconv_i < cconv_arguments.size;
+       ++cconv_i, ++arg_i) {
+    switch (cconv_arguments.data[cconv_i]) {
+      case IREE_VM_CCONV_TYPE_VOID:
+        break;
+      case IREE_VM_CCONV_TYPE_I32: {
+        iree_vm_value_t value;
+        IREE_RETURN_IF_ERROR(iree_vm_list_get_value_as(
+            inputs, arg_i, IREE_VM_VALUE_TYPE_I32, &value));
+        memcpy(p, &value.i32, sizeof(int32_t));
+        p += sizeof(int32_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_I64: {
+        iree_vm_value_t value;
+        IREE_RETURN_IF_ERROR(iree_vm_list_get_value_as(
+            inputs, arg_i, IREE_VM_VALUE_TYPE_I64, &value));
+        memcpy(p, &value.i64, sizeof(int64_t));
+        p += sizeof(int64_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_F32: {
+        iree_vm_value_t value;
+        IREE_RETURN_IF_ERROR(iree_vm_list_get_value_as(
+            inputs, arg_i, IREE_VM_VALUE_TYPE_F32, &value));
+        memcpy(p, &value.f32, sizeof(float));
+        p += sizeof(float);
+      } break;
+      case IREE_VM_CCONV_TYPE_F64: {
+        iree_vm_value_t value;
+        IREE_RETURN_IF_ERROR(iree_vm_list_get_value_as(
+            inputs, arg_i, IREE_VM_VALUE_TYPE_F64, &value));
+        memcpy(p, &value.f64, sizeof(double));
+        p += sizeof(double);
+      } break;
+      case IREE_VM_CCONV_TYPE_REF: {
+        // TODO(benvanik): see if we can't remove this retain by instead relying
+        // on the caller still owning the list.
+        IREE_RETURN_IF_ERROR(
+            iree_vm_list_get_ref_retain(inputs, arg_i, (iree_vm_ref_t*)p));
+        p += sizeof(iree_vm_ref_t);
+      } break;
+    }
+  }
+  return iree_ok_status();
+}
+
+// Marshals callee results from the ABI convention to the variant list.
+//
+// |cconv_results| is the calling-convention fragment describing the result
+// types; a leading 'v' means the function returns nothing. |results| is the
+// packed byte buffer written by the callee. |outputs| may be NULL only when
+// there are no results. Ref results are moved out of |results| into |outputs|
+// (ownership transfers; the buffer entries are reset by the move).
+static iree_status_t iree_vm_invoke_marshal_outputs(
+    iree_string_view_t cconv_results, iree_byte_span_t results,
+    iree_vm_list_t* outputs) {
+  // Each cconv character is one result element unless the fragment is the
+  // single void marker 'v'.
+  iree_host_size_t expected_output_count =
+      cconv_results.size > 0
+          ? (cconv_results.data[0] == 'v' ? 0 : cconv_results.size)
+          : 0;
+  if (IREE_UNLIKELY(!outputs)) {
+    if (IREE_UNLIKELY(expected_output_count > 0)) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "no output provided to a function that has outputs");
+    }
+    return iree_ok_status();
+  }
+
+  // Resize the output list to hold all results (and kill anything that may
+  // have been in there). The resize-to-zero first releases any retained refs
+  // the caller left in the list.
+  IREE_RETURN_IF_ERROR(iree_vm_list_resize(outputs, 0));
+  IREE_RETURN_IF_ERROR(iree_vm_list_resize(outputs, expected_output_count));
+
+  // Walk the packed result buffer; |p| advances by each element's ABI size.
+  uint8_t* p = results.data;
+  for (iree_host_size_t cconv_i = 0, arg_i = 0; cconv_i < cconv_results.size;
+       ++cconv_i, ++arg_i) {
+    switch (cconv_results.data[cconv_i]) {
+      case IREE_VM_CCONV_TYPE_VOID:
+        break;
+      case IREE_VM_CCONV_TYPE_I32: {
+        iree_vm_value_t value = iree_vm_value_make_i32(*(int32_t*)p);
+        IREE_RETURN_IF_ERROR(iree_vm_list_set_value(outputs, arg_i, &value));
+        p += sizeof(int32_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_I64: {
+        iree_vm_value_t value = iree_vm_value_make_i64(*(int64_t*)p);
+        IREE_RETURN_IF_ERROR(iree_vm_list_set_value(outputs, arg_i, &value));
+        p += sizeof(int64_t);
+      } break;
+      case IREE_VM_CCONV_TYPE_F32: {
+        iree_vm_value_t value = iree_vm_value_make_f32(*(float*)p);
+        IREE_RETURN_IF_ERROR(iree_vm_list_set_value(outputs, arg_i, &value));
+        p += sizeof(float);
+      } break;
+      case IREE_VM_CCONV_TYPE_F64: {
+        iree_vm_value_t value = iree_vm_value_make_f64(*(double*)p);
+        IREE_RETURN_IF_ERROR(iree_vm_list_set_value(outputs, arg_i, &value));
+        p += sizeof(double);
+      } break;
+      case IREE_VM_CCONV_TYPE_REF: {
+        // Move semantics: the list takes ownership and the buffer slot is
+        // cleared, so no release of |results| entries is needed on success.
+        IREE_RETURN_IF_ERROR(
+            iree_vm_list_set_ref_move(outputs, arg_i, (iree_vm_ref_t*)p));
+        p += sizeof(iree_vm_ref_t);
+      } break;
+    }
+  }
+  return iree_ok_status();
+}
+
+// TODO(benvanik): implement this as an iree_vm_invocation_t sequence.
+//
+// Synchronously invokes |function| on the given |stack|: marshals |inputs|
+// into stack-allocated ABI argument storage, performs the call, and marshals
+// the packed results back into |outputs|. |policy| is currently unused here.
+static iree_status_t iree_vm_invoke_within(
+    iree_vm_context_t* context, iree_vm_stack_t* stack,
+    iree_vm_function_t function, const iree_vm_invocation_policy_t* policy,
+    iree_vm_list_t* inputs, iree_vm_list_t* outputs) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(stack);
+
+  // Query the calling-convention fragments that describe the packed
+  // argument/result buffer layouts for this function.
+  iree_vm_function_signature_t signature =
+      iree_vm_function_signature(&function);
+  iree_string_view_t cconv_arguments = iree_string_view_empty();
+  iree_string_view_t cconv_results = iree_string_view_empty();
+  IREE_RETURN_IF_ERROR(iree_vm_function_call_get_cconv_fragments(
+      &signature, &cconv_arguments, &cconv_results));
+
+  // Marshal the input arguments into the VM ABI and preallocate the result
+  // buffer.
+  // NOTE: today we don't support variadic arguments through this interface.
+  // The buffers live on the host stack (alloca) and are only valid within
+  // this frame.
+  iree_byte_span_t arguments = iree_make_byte_span(NULL, 0);
+  IREE_RETURN_IF_ERROR(iree_vm_function_call_compute_cconv_fragment_size(
+      cconv_arguments, /*segment_size_list=*/NULL, &arguments.data_length));
+  arguments.data = iree_alloca(arguments.data_length);
+  memset(arguments.data, 0, arguments.data_length);
+  // NOTE(review): if marshaling fails partway, refs already retained into the
+  // |arguments| buffer appear not to be released here — confirm whether
+  // callers tolerate this or a cleanup pass is needed.
+  IREE_RETURN_IF_ERROR(
+      iree_vm_invoke_marshal_inputs(cconv_arguments, inputs, arguments));
+
+  // Allocate the result output that will be populated by the callee.
+  iree_byte_span_t results = iree_make_byte_span(NULL, 0);
+  IREE_RETURN_IF_ERROR(iree_vm_function_call_compute_cconv_fragment_size(
+      cconv_results, /*segment_size_list=*/NULL, &results.data_length));
+  results.data = iree_alloca(results.data_length);
+  memset(results.data, 0, results.data_length);
+
+  // Perform execution. Note that for synchronous execution we expect this to
+  // complete without yielding.
+  iree_vm_function_call_t call;
+  memset(&call, 0, sizeof(call));
+  call.function = function;
+  call.arguments = arguments;
+  call.results = results;
+  iree_vm_execution_result_t result;
+  iree_status_t status =
+      function.module->begin_call(function.module->self, stack, &call, &result);
+  if (!iree_status_is_ok(status)) {
+    // Release any refs held in the argument/result buffers before bailing.
+    iree_vm_function_call_release(&call, &signature);
+    return status;
+  }
+
+  // Read back the outputs from the result buffer.
+  // NOTE(review): on marshal failure any refs remaining in |results| look
+  // unreleased here — confirm against iree_vm_function_call_release usage.
+  IREE_RETURN_IF_ERROR(
+      iree_vm_invoke_marshal_outputs(cconv_results, results, outputs));
+
+  return iree_ok_status();
+}
+
+// Synchronously invokes |function| in |context|; see invocation.h for the
+// full contract. The VM stack is placed on the host stack for the duration
+// of the call and torn down before returning.
+IREE_API_EXPORT iree_status_t iree_vm_invoke(
+    iree_vm_context_t* context, iree_vm_function_t function,
+    iree_vm_invocation_flags_t flags, const iree_vm_invocation_policy_t* policy,
+    iree_vm_list_t* inputs, iree_vm_list_t* outputs,
+    iree_allocator_t allocator) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Force tracing if specified on the context.
+  if (iree_vm_context_flags(context) & IREE_VM_CONTEXT_FLAG_TRACE_EXECUTION) {
+    flags |= IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION;
+  }
+
+  // Allocate a VM stack on the host stack and initialize it.
+  IREE_VM_INLINE_STACK_INITIALIZE(
+      stack, flags, iree_vm_context_state_resolver(context), allocator);
+  iree_status_t status =
+      iree_vm_invoke_within(context, stack, function, policy, inputs, outputs);
+  if (!iree_status_is_ok(status)) {
+    // Attach a backtrace annotation when the build has them enabled.
+    status = IREE_VM_STACK_ANNOTATE_BACKTRACE_IF_ENABLED(stack, status);
+  }
+  iree_vm_stack_deinitialize(stack);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
diff --git a/runtime/src/iree/vm/invocation.h b/runtime/src/iree/vm/invocation.h
new file mode 100644
index 0000000..9de07b2
--- /dev/null
+++ b/runtime/src/iree/vm/invocation.h
@@ -0,0 +1,93 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// See iree/base/api.h for documentation on the API conventions used.
+
+#ifndef IREE_VM_INVOCATION_H_
+#define IREE_VM_INVOCATION_H_
+
+#include "iree/base/api.h"
+#include "iree/vm/context.h"
+#include "iree/vm/list.h"
+#include "iree/vm/module.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_vm_invocation_t iree_vm_invocation_t;
+typedef struct iree_vm_invocation_policy_t iree_vm_invocation_policy_t;
+
+// Synchronously invokes a function in the VM.
+//
+// |policy| is used to schedule the invocation relative to other pending or
+// in-flight invocations. It may be omitted to leave the behavior up to the
+// implementation.
+//
+// |inputs| is used to pass values and objects into the target function and must
+// match the signature defined by the compiled function. List ownership remains
+// with the caller.
+//
+// |outputs| is populated after the function completes execution with the
+// output values and objects of the function. List ownership remains with the
+// caller.
+IREE_API_EXPORT iree_status_t iree_vm_invoke(
+    iree_vm_context_t* context, iree_vm_function_t function,
+    iree_vm_invocation_flags_t flags, const iree_vm_invocation_policy_t* policy,
+    iree_vm_list_t* inputs, iree_vm_list_t* outputs,
+    iree_allocator_t allocator);
+
+// TODO(benvanik): document and implement.
+IREE_API_EXPORT iree_status_t iree_vm_invocation_create(
+    iree_vm_context_t* context, iree_vm_function_t function,
+    iree_vm_invocation_flags_t flags, const iree_vm_invocation_policy_t* policy,
+    const iree_vm_list_t* inputs, iree_allocator_t allocator,
+    iree_vm_invocation_t** out_invocation);
+
+// Retains the given |invocation| for the caller.
+IREE_API_EXPORT iree_status_t
+iree_vm_invocation_retain(iree_vm_invocation_t* invocation);
+
+// Releases the given |invocation| from the caller.
+IREE_API_EXPORT iree_status_t
+iree_vm_invocation_release(iree_vm_invocation_t* invocation);
+
+// Queries the completion status of the invocation.
+// Returns one of the following:
+//   IREE_STATUS_OK: the invocation completed successfully.
+//   IREE_STATUS_UNAVAILABLE: the invocation has not yet completed.
+//   IREE_STATUS_CANCELLED: the invocation was cancelled internally.
+//   IREE_STATUS_ABORTED: the invocation was aborted.
+//   IREE_STATUS_*: an error occurred during invocation.
+IREE_API_EXPORT iree_status_t
+iree_vm_invocation_query_status(iree_vm_invocation_t* invocation);
+
+// Returns a reference to the output of the invocation.
+// The returned structure is valid for the lifetime of the invocation and
+// callers must retain any refs they want to outlive the invocation once
+// released.
+//
+// Returns NULL if the invocation did not complete successfully.
+IREE_API_EXPORT const iree_vm_list_t* iree_vm_invocation_output(
+    iree_vm_invocation_t* invocation);
+
+// Blocks the caller until the invocation completes (successfully or otherwise).
+//
+// Returns IREE_STATUS_DEADLINE_EXCEEDED if |deadline| elapses before the
+// invocation completes and otherwise returns iree_vm_invocation_query_status.
+IREE_API_EXPORT iree_status_t iree_vm_invocation_await(
+    iree_vm_invocation_t* invocation, iree_time_t deadline);
+
+// Attempts to abort the invocation if it is in-flight.
+// A no-op if the invocation has already completed.
+IREE_API_EXPORT iree_status_t
+iree_vm_invocation_abort(iree_vm_invocation_t* invocation);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_INVOCATION_H_
diff --git a/runtime/src/iree/vm/list.c b/runtime/src/iree/vm/list.c
new file mode 100644
index 0000000..e9a47b1
--- /dev/null
+++ b/runtime/src/iree/vm/list.c
@@ -0,0 +1,707 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/list.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/tracing.h"
+
+// Returns the storage size in bytes of a primitive |type|.
+// Unknown/none types map to 0.
+static uint8_t iree_vm_value_type_size(iree_vm_value_type_t type) {
+  // Dense table keyed by the low 3 bits of the type enum:
+  //   NONE=0, I8=1, I16=2, I32=4, I64=8, F32=4, F64=8, unused=0.
+  static const uint8_t kValueTypeSizes[8] = {0, 1, 2, 4, 8, 4, 8, 0};
+  return kValueTypeSizes[type & 0x7];
+}
+
+// Defines how the iree_vm_list_t storage is allocated and what elements are
+// interpreted as.
+typedef enum iree_vm_list_storage_mode_e {
+  // Each element is a primitive value and stored as a dense array.
+  IREE_VM_LIST_STORAGE_MODE_VALUE = 0,
+  // Each element is an iree_vm_ref_t of some type.
+  IREE_VM_LIST_STORAGE_MODE_REF,
+  // Each element is a variant of any type (possibly all different).
+  IREE_VM_LIST_STORAGE_MODE_VARIANT,
+} iree_vm_list_storage_mode_t;
+
+// A list able to hold either flat primitive elements or ref values.
+struct iree_vm_list_t {
+  // Intrusive ref-counting header; must match iree_vm_list_descriptor's
+  // offsetof_counter registration below.
+  iree_vm_ref_object_t ref_object;
+  // Allocator used for |storage|; unused for statically-initialized lists.
+  iree_allocator_t allocator;
+
+  // Current capacity of the list storage, in elements.
+  iree_host_size_t capacity;
+  // Current count of elements in the list.
+  iree_host_size_t count;
+
+  // Element type stored within the list.
+  iree_vm_type_def_t element_type;
+  // Size of each element in the storage in bytes.
+  iree_host_size_t element_size;
+
+  // Storage mode defining how the storage array is managed.
+  iree_vm_list_storage_mode_t storage_mode;
+  // A flat dense array of elements in the type defined by storage_mode.
+  // For certain storage modes, such as IREE_VM_STORAGE_MODE_REF, special
+  // lifetime management and cleanup logic is required.
+  void* storage;
+};
+
+// Ref type descriptor for vm.list; populated by
+// iree_vm_list_register_types at startup.
+static iree_vm_ref_type_descriptor_t iree_vm_list_descriptor = {0};
+
+IREE_VM_DEFINE_TYPE_ADAPTERS(iree_vm_list, iree_vm_list_t);
+
+// Resets elements [offset, offset+length) to the empty/zero state, releasing
+// any refs held by those elements first so no references leak.
+static void iree_vm_list_reset_range(iree_vm_list_t* list,
+                                     iree_host_size_t offset,
+                                     iree_host_size_t length) {
+  switch (list->storage_mode) {
+    case IREE_VM_LIST_STORAGE_MODE_VALUE: {
+      // Primitive values carry no ownership; a bulk zero suffices.
+      void* base_ptr =
+          (void*)((uintptr_t)list->storage + offset * list->element_size);
+      memset(base_ptr, 0, length * list->element_size);
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_REF: {
+      // Release each ref; iree_vm_ref_release also resets the slot.
+      iree_vm_ref_t* ref_storage = (iree_vm_ref_t*)list->storage;
+      for (iree_host_size_t i = offset; i < offset + length; ++i) {
+        iree_vm_ref_release(&ref_storage[i]);
+      }
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_VARIANT: {
+      iree_vm_variant_t* variant_storage = (iree_vm_variant_t*)list->storage;
+      for (iree_host_size_t i = offset; i < offset + length; ++i) {
+        if (iree_vm_type_def_is_ref(&variant_storage[i].type)) {
+          // Release the held ref then clear the type tag.
+          iree_vm_ref_release(&variant_storage[i].ref);
+          memset(&variant_storage[i].type, 0, sizeof(variant_storage[i].type));
+        } else {
+          memset(&variant_storage[i], 0, sizeof(variant_storage[i]));
+        }
+      }
+      break;
+    }
+  }
+}
+
+// Computes the total byte size needed to hold a list header plus |capacity|
+// elements of |element_type| (variant storage when NULL/unknown), suitable
+// for passing to iree_vm_list_initialize.
+IREE_API_EXPORT iree_host_size_t iree_vm_list_storage_size(
+    const iree_vm_type_def_t* element_type, iree_host_size_t capacity) {
+  // Variant storage is the fallback when no element type is provided or the
+  // type is neither a primitive value nor a ref.
+  iree_host_size_t element_size = sizeof(iree_vm_variant_t);
+  if (element_type && iree_vm_type_def_is_value(element_type)) {
+    element_size = iree_vm_value_type_size(element_type->value_type);
+  } else if (element_type && iree_vm_type_def_is_ref(element_type)) {
+    element_size = sizeof(iree_vm_ref_t);
+  }
+  // Header and element array are each 8-byte aligned.
+  return iree_host_align(sizeof(iree_vm_list_t), 8) +
+         iree_host_align(capacity * element_size, 8);
+}
+
+// Initializes a list in caller-provided |storage|; see list.h for the
+// contract. The storage must be at least iree_vm_list_storage_size bytes.
+IREE_API_EXPORT iree_status_t iree_vm_list_initialize(
+    iree_byte_span_t storage, const iree_vm_type_def_t* element_type,
+    iree_host_size_t capacity, iree_vm_list_t** out_list) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Select the storage mode/element size from the (optional) element type;
+  // untyped lists fall back to variant storage.
+  iree_vm_list_storage_mode_t storage_mode = IREE_VM_LIST_STORAGE_MODE_VARIANT;
+  iree_host_size_t element_size = sizeof(iree_vm_variant_t);
+  if (element_type) {
+    if (iree_vm_type_def_is_value(element_type)) {
+      storage_mode = IREE_VM_LIST_STORAGE_MODE_VALUE;
+      element_size = iree_vm_value_type_size(element_type->value_type);
+    } else if (iree_vm_type_def_is_ref(element_type)) {
+      storage_mode = IREE_VM_LIST_STORAGE_MODE_REF;
+      element_size = sizeof(iree_vm_ref_t);
+    } else {
+      storage_mode = IREE_VM_LIST_STORAGE_MODE_VARIANT;
+      element_size = sizeof(iree_vm_variant_t);
+    }
+  }
+
+  // The list header lives at the front of the storage with the element array
+  // following at an 8-byte-aligned offset.
+  iree_host_size_t storage_offset = iree_host_align(sizeof(iree_vm_list_t), 8);
+  iree_host_size_t required_storage_size =
+      storage_offset + iree_host_align(capacity * element_size, 8);
+  if (storage.data_length < required_storage_size) {
+    // Fix: end the trace zone before returning on this error path so the
+    // zone begun above is always balanced.
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "storage buffer underflow: provided=%zu < required=%zu",
+        storage.data_length, required_storage_size);
+  }
+  memset(storage.data, 0, required_storage_size);
+
+  iree_vm_list_t* list = (iree_vm_list_t*)storage.data;
+  iree_atomic_ref_count_init(&list->ref_object.counter);
+  if (element_type) {
+    list->element_type = *element_type;
+  }
+  list->element_size = element_size;
+  list->storage_mode = storage_mode;
+  list->capacity = capacity;
+  list->storage = storage.data + storage_offset;
+
+  *out_list = list;
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Deinitializes a statically-initialized list, releasing any refs held by
+// its elements. The caller-provided storage itself is not freed here.
+IREE_API_EXPORT void iree_vm_list_deinitialize(iree_vm_list_t* list) {
+  IREE_ASSERT_ARGUMENT(list);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // It is a programming error to deinitialize a list something still holds.
+  iree_atomic_ref_count_abort_if_uses(&list->ref_object.counter);
+  iree_vm_list_reset_range(list, 0, list->count);
+  list->count = 0;
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Allocates and initializes a new heap list with room for |initial_capacity|
+// elements. On success the caller owns a reference to |out_list|.
+IREE_API_EXPORT iree_status_t iree_vm_list_create(
+    const iree_vm_type_def_t* element_type, iree_host_size_t initial_capacity,
+    iree_allocator_t allocator, iree_vm_list_t** out_list) {
+  IREE_ASSERT_ARGUMENT(out_list);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_vm_list_t* list = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(allocator, sizeof(*list), (void**)&list));
+  memset(list, 0, sizeof(*list));
+  iree_atomic_ref_count_init(&list->ref_object.counter);
+  list->allocator = allocator;
+  if (element_type) {
+    list->element_type = *element_type;
+  }
+
+  // Select the storage mode/element size from the (optional) element type;
+  // untyped lists fall back to variant storage. This mirrors the identical
+  // selection logic in iree_vm_list_initialize for consistency.
+  if (element_type && iree_vm_type_def_is_value(element_type)) {
+    list->storage_mode = IREE_VM_LIST_STORAGE_MODE_VALUE;
+    list->element_size = iree_vm_value_type_size(element_type->value_type);
+  } else if (element_type && iree_vm_type_def_is_ref(element_type)) {
+    list->storage_mode = IREE_VM_LIST_STORAGE_MODE_REF;
+    list->element_size = sizeof(iree_vm_ref_t);
+  } else {
+    list->storage_mode = IREE_VM_LIST_STORAGE_MODE_VARIANT;
+    list->element_size = sizeof(iree_vm_variant_t);
+  }
+
+  iree_status_t status = iree_vm_list_reserve(list, initial_capacity);
+
+  if (iree_status_is_ok(status)) {
+    *out_list = list;
+  } else {
+    // Reserve failed before any elements existed; just free the header.
+    iree_allocator_free(allocator, list);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Ref-type destroy callback: releases element refs and frees both the
+// element storage and the list header with the list's own allocator.
+static void iree_vm_list_destroy(void* ptr) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_vm_list_t* list = (iree_vm_list_t*)ptr;
+  iree_vm_list_reset_range(list, 0, list->count);
+  iree_allocator_free(list->allocator, list->storage);
+  iree_allocator_free(list->allocator, list);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Retains a reference to |list| on behalf of the caller.
+IREE_API_EXPORT void iree_vm_list_retain(iree_vm_list_t* list) {
+  iree_vm_ref_object_retain(list, &iree_vm_list_descriptor);
+}
+
+// Releases the caller's reference to |list|, destroying it at zero uses.
+IREE_API_EXPORT void iree_vm_list_release(iree_vm_list_t* list) {
+  iree_vm_ref_object_release(list, &iree_vm_list_descriptor);
+}
+
+// Returns the declared element type of |list| (zeroed for untyped/variant
+// lists). NOTE(review): |list| is dereferenced without a NULL check —
+// presumably callers guarantee non-NULL; confirm.
+IREE_API_EXPORT iree_status_t iree_vm_list_element_type(
+    const iree_vm_list_t* list, iree_vm_type_def_t* out_element_type) {
+  *out_element_type = list->element_type;
+  return iree_ok_status();
+}
+
+// Returns the current capacity of |list| in elements (not bytes).
+IREE_API_EXPORT iree_host_size_t
+iree_vm_list_capacity(const iree_vm_list_t* list) {
+  return list->capacity;
+}
+
+// Grows the list storage (if needed) to hold at least |minimum_capacity|
+// elements; never shrinks. Newly-reserved storage is zero-initialized.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_reserve(iree_vm_list_t* list, iree_host_size_t minimum_capacity) {
+  if (list->capacity >= minimum_capacity) {
+    return iree_ok_status();
+  }
+  iree_host_size_t old_capacity = list->capacity;
+  // Round capacity up to a 64-element granule to amortize reallocs.
+  // NOTE(review): new_capacity * element_size has no overflow guard — likely
+  // fine for realistic sizes but confirm against huge requested capacities.
+  iree_host_size_t new_capacity = iree_host_align(minimum_capacity, 64);
+  IREE_RETURN_IF_ERROR(iree_allocator_realloc(
+      list->allocator, new_capacity * list->element_size, &list->storage));
+  memset((void*)((uintptr_t)list->storage + old_capacity * list->element_size),
+         0, (new_capacity - old_capacity) * list->element_size);
+  list->capacity = new_capacity;
+  return iree_ok_status();
+}
+
+// Returns the number of elements currently in |list|.
+IREE_API_EXPORT iree_host_size_t iree_vm_list_size(const iree_vm_list_t* list) {
+  return list->count;
+}
+
+// Resizes |list| to |new_size| elements. Truncation releases refs in the
+// removed tail; growth exposes zero/empty elements (prior truncation and
+// reserve both leave storage zeroed).
+IREE_API_EXPORT iree_status_t iree_vm_list_resize(iree_vm_list_t* list,
+                                                  iree_host_size_t new_size) {
+  if (new_size == list->count) {
+    return iree_ok_status();
+  } else if (new_size < list->count) {
+    // Truncating.
+    iree_vm_list_reset_range(list, new_size, list->count - new_size);
+    list->count = new_size;
+  } else if (new_size > list->capacity) {
+    // Extending beyond capacity. Grow geometrically (2x) with a 64-element
+    // aligned floor so repeated small resizes stay amortized O(1).
+    IREE_RETURN_IF_ERROR(iree_vm_list_reserve(
+        list, iree_max(list->capacity * 2, iree_host_align(new_size, 64))));
+  }
+  list->count = new_size;
+  return iree_ok_status();
+}
+
+// Converts |source_value| to |target_value_type|, storing the result in
+// |out_value|. Same-type conversions are a straight copy. Integer widening
+// sign-extends and narrowing truncates; unsupported pairs (e.g. to/from
+// floats) leave |out_value| as the zero of the target type.
+static void iree_vm_list_convert_value_type(
+    const iree_vm_value_t* source_value, iree_vm_value_type_t target_value_type,
+    iree_vm_value_t* out_value) {
+  if (target_value_type == source_value->type) {
+    memcpy(out_value, source_value, sizeof(*out_value));
+    return;
+  }
+  // Default result: target type with a zeroed payload.
+  out_value->type = target_value_type;
+  out_value->i64 = 0;
+  switch (source_value->type) {
+    default:
+      return;
+    case IREE_VM_VALUE_TYPE_I8:
+      switch (target_value_type) {
+        case IREE_VM_VALUE_TYPE_I16:
+          out_value->i16 = (int16_t)source_value->i8;
+          return;
+        case IREE_VM_VALUE_TYPE_I32:
+          out_value->i32 = (int32_t)source_value->i8;
+          return;
+        case IREE_VM_VALUE_TYPE_I64:
+          out_value->i64 = (int64_t)source_value->i8;
+          return;
+        default:
+          return;
+      }
+    case IREE_VM_VALUE_TYPE_I16:
+      switch (target_value_type) {
+        case IREE_VM_VALUE_TYPE_I8:
+          out_value->i8 = (int8_t)source_value->i16;
+          return;
+        case IREE_VM_VALUE_TYPE_I32:
+          out_value->i32 = (int32_t)source_value->i16;
+          return;
+        case IREE_VM_VALUE_TYPE_I64:
+          out_value->i64 = (int64_t)source_value->i16;
+          return;
+        default:
+          return;
+      }
+    case IREE_VM_VALUE_TYPE_I32:
+      switch (target_value_type) {
+        case IREE_VM_VALUE_TYPE_I8:
+          out_value->i8 = (int8_t)source_value->i32;
+          return;
+        case IREE_VM_VALUE_TYPE_I16:
+          out_value->i16 = (int16_t)source_value->i32;
+          return;
+        case IREE_VM_VALUE_TYPE_I64:
+          out_value->i64 = (int64_t)source_value->i32;
+          return;
+        default:
+          return;
+      }
+    case IREE_VM_VALUE_TYPE_I64:
+      switch (target_value_type) {
+        case IREE_VM_VALUE_TYPE_I8:
+          out_value->i8 = (int8_t)source_value->i64;
+          return;
+        case IREE_VM_VALUE_TYPE_I16:
+          out_value->i16 = (int16_t)source_value->i64;
+          return;
+        case IREE_VM_VALUE_TYPE_I32:
+          out_value->i32 = (int32_t)source_value->i64;
+          return;
+        default:
+          return;
+      }
+  }
+}
+
+// Reads the value element at |i| into |out_value| without conversion.
+// Fails with OUT_OF_RANGE for bad indices and FAILED_PRECONDITION when the
+// list (or the variant at |i|) does not hold a primitive value.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_get_value(const iree_vm_list_t* list, iree_host_size_t i,
+                       iree_vm_value_t* out_value) {
+  if (i >= list->count) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "index %zu out of bounds (%zu)", i, list->count);
+  }
+  uintptr_t element_ptr = (uintptr_t)list->storage + i * list->element_size;
+  memset(out_value, 0, sizeof(*out_value));
+  switch (list->storage_mode) {
+    case IREE_VM_LIST_STORAGE_MODE_VALUE: {
+      out_value->type = list->element_type.value_type;
+      // TODO(benvanik): #ifdef on LITTLE/BIG_ENDIAN and just memcpy.
+      // element_size uniquely identifies which union field to load here
+      // except that 4/8 also cover f32/f64; the raw bits are preserved.
+      switch (list->element_size) {
+        case 1:
+          out_value->i8 = *(int8_t*)element_ptr;
+          break;
+        case 2:
+          out_value->i16 = *(int16_t*)element_ptr;
+          break;
+        case 4:
+          out_value->i32 = *(int32_t*)element_ptr;
+          break;
+        case 8:
+          out_value->i64 = *(int64_t*)element_ptr;
+          break;
+      }
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_VARIANT: {
+      iree_vm_variant_t* variant = (iree_vm_variant_t*)element_ptr;
+      if (!iree_vm_type_def_is_value(&variant->type)) {
+        return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                "variant at index %zu is not a value type", i);
+      }
+      out_value->type = variant->type.value_type;
+      memcpy(out_value->value_storage, variant->value_storage,
+             sizeof(out_value->value_storage));
+      break;
+    }
+    default:
+      // REF storage cannot yield primitive values.
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION);
+  }
+  return iree_ok_status();
+}
+
+// Reads the value element at |i| and converts it to |value_type| (see
+// iree_vm_list_convert_value_type for the supported conversions).
+IREE_API_EXPORT iree_status_t iree_vm_list_get_value_as(
+    const iree_vm_list_t* list, iree_host_size_t i,
+    iree_vm_value_type_t value_type, iree_vm_value_t* out_value) {
+  if (i >= list->count) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "index %zu out of bounds (%zu)", i, list->count);
+  }
+  uintptr_t element_ptr = (uintptr_t)list->storage + i * list->element_size;
+  // Load the raw value first, then convert into |out_value| below.
+  iree_vm_value_t value;
+  value.i64 = 0;
+  switch (list->storage_mode) {
+    case IREE_VM_LIST_STORAGE_MODE_VALUE: {
+      value.type = list->element_type.value_type;
+      // TODO(benvanik): #ifdef on LITTLE/BIG_ENDIAN and just memcpy.
+      switch (list->element_size) {
+        case 1:
+          value.i8 = *(int8_t*)element_ptr;
+          break;
+        case 2:
+          value.i16 = *(int16_t*)element_ptr;
+          break;
+        case 4:
+          value.i32 = *(int32_t*)element_ptr;
+          break;
+        case 8:
+          value.i64 = *(int64_t*)element_ptr;
+          break;
+      }
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_VARIANT: {
+      iree_vm_variant_t* variant = (iree_vm_variant_t*)element_ptr;
+      if (!iree_vm_type_def_is_value(&variant->type)) {
+        return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                                "variant at index %zu is not a value type", i);
+      }
+      value.type = variant->type.value_type;
+      memcpy(value.value_storage, variant->value_storage,
+             sizeof(value.value_storage));
+      break;
+    }
+    default:
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "list does not store values");
+  }
+  iree_vm_list_convert_value_type(&value, value_type, out_value);
+  return iree_ok_status();
+}
+
+// Stores |value| at index |i|, converting to the list's element type for
+// typed value lists; variant lists store the value with its own type.
+IREE_API_EXPORT iree_status_t iree_vm_list_set_value(
+    iree_vm_list_t* list, iree_host_size_t i, const iree_vm_value_t* value) {
+  if (i >= list->count) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "index %zu out of bounds (%zu)", i, list->count);
+  }
+  // First pass: pick the conversion target based on storage mode.
+  iree_vm_value_type_t target_type;
+  switch (list->storage_mode) {
+    case IREE_VM_LIST_STORAGE_MODE_VALUE: {
+      target_type = list->element_type.value_type;
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_VARIANT: {
+      target_type = value->type;
+      break;
+    }
+    default:
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "list cannot store values");
+  }
+  iree_vm_value_t converted_value;
+  iree_vm_list_convert_value_type(value, target_type, &converted_value);
+  uintptr_t element_ptr = (uintptr_t)list->storage + i * list->element_size;
+  // Second pass: write the converted payload into the element slot.
+  switch (list->storage_mode) {
+    case IREE_VM_LIST_STORAGE_MODE_VALUE: {
+      // TODO(benvanik): #ifdef on LITTLE/BIG_ENDIAN and just memcpy.
+      switch (list->element_size) {
+        case 1:
+          *(int8_t*)element_ptr = converted_value.i8;
+          break;
+        case 2:
+          *(int16_t*)element_ptr = converted_value.i16;
+          break;
+        case 4:
+          *(int32_t*)element_ptr = converted_value.i32;
+          break;
+        case 8:
+          *(int64_t*)element_ptr = converted_value.i64;
+          break;
+      }
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_VARIANT: {
+      iree_vm_variant_t* variant = (iree_vm_variant_t*)element_ptr;
+      // Release a previously-held ref before overwriting with a value.
+      if (variant->type.ref_type) {
+        iree_vm_ref_release(&variant->ref);
+      }
+      variant->type.value_type = target_type;
+      variant->type.ref_type = IREE_VM_REF_TYPE_NULL;
+      memcpy(variant->value_storage, converted_value.value_storage,
+             sizeof(variant->value_storage));
+      break;
+    }
+    default:
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "list cannot store values");
+  }
+  return iree_ok_status();
+}
+
+// Appends |value| to the end of |list|, growing it by one element.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_push_value(iree_vm_list_t* list, const iree_vm_value_t* value) {
+  iree_host_size_t tail_index = iree_vm_list_size(list);
+  IREE_RETURN_IF_ERROR(iree_vm_list_resize(list, tail_index + 1));
+  return iree_vm_list_set_value(list, tail_index, value);
+}
+
+// Returns the raw object pointer of the ref element at |i| when it exists
+// and matches |type_descriptor|; otherwise returns NULL. No reference is
+// retained for the caller.
+IREE_API_EXPORT void* iree_vm_list_get_ref_deref(
+    const iree_vm_list_t* list, iree_host_size_t i,
+    const iree_vm_ref_type_descriptor_t* type_descriptor) {
+  iree_vm_ref_t ref = {0};
+  // Both lookup and type check must succeed; failures are consumed and
+  // reported as NULL.
+  if (!iree_status_is_ok(iree_status_consume_code(
+          iree_vm_list_get_ref_assign(list, i, &ref))) ||
+      !iree_status_is_ok(iree_status_consume_code(
+          iree_vm_ref_check(ref, type_descriptor->type)))) {
+    return NULL;
+  }
+  return ref.ptr;
+}
+
+// Fetches the ref element of |list| at |i| into |out_value|.
+// When |is_retain| is true the reference count is incremented; otherwise the
+// ref is assigned without ownership (as with iree_vm_ref_assign).
+static iree_status_t iree_vm_list_get_ref_assign_or_retain(
+    const iree_vm_list_t* list, iree_host_size_t i, bool is_retain,
+    iree_vm_ref_t* out_value) {
+  if (i >= list->count) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "index %zu out of bounds (%zu)", i, list->count);
+  }
+  uintptr_t element_ptr = (uintptr_t)list->storage + i * list->element_size;
+  switch (list->storage_mode) {
+    case IREE_VM_LIST_STORAGE_MODE_REF: {
+      iree_vm_ref_t* element_ref = (iree_vm_ref_t*)element_ptr;
+      if (is_retain) {
+        iree_vm_ref_retain(element_ref, out_value);
+      } else {
+        iree_vm_ref_assign(element_ref, out_value);
+      }
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_VARIANT: {
+      iree_vm_variant_t* variant = (iree_vm_variant_t*)element_ptr;
+      if (!iree_vm_type_def_is_ref(&variant->type)) {
+        return iree_make_status(IREE_STATUS_FAILED_PRECONDITION);
+      }
+      if (is_retain) {
+        iree_vm_ref_retain(&variant->ref, out_value);
+      } else {
+        iree_vm_ref_assign(&variant->ref, out_value);
+      }
+      break;
+    }
+    default:
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "list does not store refs");
+  }
+  return iree_ok_status();
+}
+
+// Gets the ref at |i| without retaining; |out_value| borrows the list's ref.
+IREE_API_EXPORT iree_status_t iree_vm_list_get_ref_assign(
+    const iree_vm_list_t* list, iree_host_size_t i, iree_vm_ref_t* out_value) {
+  return iree_vm_list_get_ref_assign_or_retain(list, i, /*is_retain=*/false,
+                                               out_value);
+}
+
+// Gets the ref at |i| with a retain; the caller owns the returned reference.
+IREE_API_EXPORT iree_status_t iree_vm_list_get_ref_retain(
+    const iree_vm_list_t* list, iree_host_size_t i, iree_vm_ref_t* out_value) {
+  return iree_vm_list_get_ref_assign_or_retain(list, i, /*is_retain=*/true,
+                                               out_value);
+}
+
+// Stores |value| as the ref element at |i|, either moving (|is_move|=true,
+// |value| is reset) or retaining (|is_move|=false, |value| untouched).
+static iree_status_t iree_vm_list_set_ref(iree_vm_list_t* list,
+                                          iree_host_size_t i, bool is_move,
+                                          iree_vm_ref_t* value) {
+  if (i >= list->count) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "index %zu out of bounds (%zu)", i, list->count);
+  }
+  uintptr_t element_ptr = (uintptr_t)list->storage + i * list->element_size;
+  switch (list->storage_mode) {
+    case IREE_VM_LIST_STORAGE_MODE_REF: {
+      // Typed ref lists enforce the declared element ref type.
+      iree_vm_ref_t* element_ref = (iree_vm_ref_t*)element_ptr;
+      IREE_RETURN_IF_ERROR(iree_vm_ref_retain_or_move_checked(
+          is_move, value, list->element_type.ref_type, element_ref));
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_VARIANT: {
+      iree_vm_variant_t* variant = (iree_vm_variant_t*)element_ptr;
+      // If the slot previously held a primitive value the ref field contains
+      // value bytes, not a live ref; zero it so retain_or_move below does not
+      // interpret garbage as a ref to release.
+      if (variant->type.value_type) {
+        memset(&variant->ref, 0, sizeof(variant->ref));
+      }
+      variant->type.value_type = IREE_VM_VALUE_TYPE_NONE;
+      variant->type.ref_type = value->type;
+      iree_vm_ref_retain_or_move(is_move, value, &variant->ref);
+      break;
+    }
+    default:
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "list cannot store refs");
+  }
+  return iree_ok_status();
+}
+
+// Stores |value| at |i| with a retain; the caller keeps its own reference.
+// The const is cast away because retain_or_move takes a mutable ref but does
+// not modify it in the retain case.
+IREE_API_EXPORT iree_status_t iree_vm_list_set_ref_retain(
+    iree_vm_list_t* list, iree_host_size_t i, const iree_vm_ref_t* value) {
+  return iree_vm_list_set_ref(list, i, /*is_move=*/false,
+                              (iree_vm_ref_t*)value);
+}
+
+// Appends |value| with a retain; the caller keeps its own reference.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_push_ref_retain(iree_vm_list_t* list, const iree_vm_ref_t* value) {
+  iree_host_size_t i = iree_vm_list_size(list);
+  IREE_RETURN_IF_ERROR(iree_vm_list_resize(list, i + 1));
+  return iree_vm_list_set_ref_retain(list, i, value);
+}
+
+// Stores |value| at |i| with move semantics; |value| is reset on success.
+IREE_API_EXPORT iree_status_t iree_vm_list_set_ref_move(iree_vm_list_t* list,
+                                                        iree_host_size_t i,
+                                                        iree_vm_ref_t* value) {
+  return iree_vm_list_set_ref(list, i, /*is_move=*/true, value);
+}
+
+// Appends |value| with move semantics; |value| is reset on success.
+IREE_API_EXPORT iree_status_t iree_vm_list_push_ref_move(iree_vm_list_t* list,
+                                                         iree_vm_ref_t* value) {
+  iree_host_size_t i = iree_vm_list_size(list);
+  IREE_RETURN_IF_ERROR(iree_vm_list_resize(list, i + 1));
+  return iree_vm_list_set_ref_move(list, i, value);
+}
+
+// Removes the first element, moving its ref into |out_value|, and shifts the
+// remaining elements down by one. O(n) in the element count.
+IREE_API_EXPORT iree_status_t iree_vm_list_pop_front_ref_move(
+    iree_vm_list_t* list, iree_vm_ref_t* out_value) {
+  iree_host_size_t list_size = iree_vm_list_size(list);
+  if (list_size == 0) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "cannot pop from an empty list");
+  }
+  // Assign (not retain) transfers logical ownership to |out_value|; the
+  // memmove below then overwrites the original slot without releasing it, so
+  // the refcount stays balanced.
+  IREE_RETURN_IF_ERROR(iree_vm_list_get_ref_assign(list, 0, out_value));
+  memmove(list->storage, (uint8_t*)list->storage + list->element_size,
+          (list_size - 1) * list->element_size);
+  --list->count;
+  // Zero the vacated tail slot so it does not alias the last live element.
+  memset((uint8_t*)list->storage + list->count * list->element_size, 0,
+         list->element_size);
+  return iree_ok_status();
+}
+
+// Reads the element at |i| as a variant regardless of storage mode.
+// NOTE(review): REF storage retains into out_value->ref while VARIANT
+// storage only assigns — confirm callers handle the differing ownership.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_get_variant(const iree_vm_list_t* list, iree_host_size_t i,
+                         iree_vm_variant_t* out_value) {
+  if (i >= list->count) {
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "index %zu out of bounds (%zu)", i, list->count);
+  }
+  uintptr_t element_ptr = (uintptr_t)list->storage + i * list->element_size;
+  switch (list->storage_mode) {
+    case IREE_VM_LIST_STORAGE_MODE_VALUE: {
+      out_value->type = list->element_type;
+      memcpy(out_value->value_storage, (void*)element_ptr, list->element_size);
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_REF: {
+      iree_vm_ref_t* element_ref = (iree_vm_ref_t*)element_ptr;
+      out_value->type.ref_type = element_ref->type;
+      out_value->type.value_type = IREE_VM_VALUE_TYPE_NONE;
+      iree_vm_ref_retain(element_ref, &out_value->ref);
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_VARIANT: {
+      iree_vm_variant_t* variant = (iree_vm_variant_t*)element_ptr;
+      out_value->type = variant->type;
+      if (iree_vm_type_def_is_ref(&variant->type)) {
+        iree_vm_ref_assign(&variant->ref, &out_value->ref);
+      } else {
+        memcpy(out_value->value_storage, variant->value_storage,
+               sizeof(variant->value_storage));
+      }
+      break;
+    }
+    default:
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION);
+  }
+  return iree_ok_status();
+}
+
+// Not yet implemented; always fails. Declared for API completeness so
+// push_variant below compiles against the final interface.
+IREE_API_EXPORT iree_status_t iree_vm_list_set_variant(
+    iree_vm_list_t* list, iree_host_size_t i, const iree_vm_variant_t* value) {
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "iree_vm_list_set_variant unimplemented");
+}
+
+// Appends |value| to |list|. Currently always fails (set_variant is
+// unimplemented) but the list will have grown by one element on return.
+IREE_API_EXPORT iree_status_t iree_vm_list_push_variant(
+    iree_vm_list_t* list, const iree_vm_variant_t* value) {
+  iree_host_size_t i = iree_vm_list_size(list);
+  IREE_RETURN_IF_ERROR(iree_vm_list_resize(list, i + 1));
+  return iree_vm_list_set_variant(list, i, value);
+}
+
+// Registers the vm.list ref type with the global type registry.
+// Idempotent: subsequent calls after a successful registration are no-ops.
+// NOTE(review): the registered-check is not synchronized — presumably this
+// is only called during single-threaded startup; confirm.
+iree_status_t iree_vm_list_register_types(void) {
+  if (iree_vm_list_descriptor.type != IREE_VM_REF_TYPE_NULL) {
+    // Already registered.
+    return iree_ok_status();
+  }
+
+  iree_vm_list_descriptor.destroy = iree_vm_list_destroy;
+  iree_vm_list_descriptor.offsetof_counter =
+      offsetof(iree_vm_list_t, ref_object.counter);
+  iree_vm_list_descriptor.type_name = iree_make_cstring_view("vm.list");
+  return iree_vm_ref_register_type(&iree_vm_list_descriptor);
+}
diff --git a/runtime/src/iree/vm/list.h b/runtime/src/iree/vm/list.h
new file mode 100644
index 0000000..bded73b
--- /dev/null
+++ b/runtime/src/iree/vm/list.h
@@ -0,0 +1,193 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_LIST_H_
+#define IREE_VM_LIST_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/vm/ref.h"
+#include "iree/vm/type_def.h"
+#include "iree/vm/value.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// A growable list that can hold primitive value types or ref objects or a mix.
+// This maps to the `!vm.list<...>` type in the VM IR and is designed to enable
+// flexible interop between hosting applications using the VM C API to invoke IR
+// and custom modules that need to pass arbitrary list-like data across the VM
+// ABI. It is not designed for efficiency: if you are performing large amounts
+// of work on the list type you should instead be representing that using the
+// HAL types so that you can get acceleration.
+//
+// This type has the same performance characteristics as std::vector; pushes
+// grow the capacity of the list and to ensure minimal wastage it is always
+// better to reserve the exact desired element count first.
+typedef struct iree_vm_list_t iree_vm_list_t;
+
+// Returns the size in bytes required to store a list with the given element
+// type and capacity. This storage size can be used to stack allocate or reserve
+// memory that is then used by iree_vm_list_initialize to avoid dynamic
+// allocations.
+IREE_API_EXPORT iree_host_size_t iree_vm_list_storage_size(
+    const iree_vm_type_def_t* element_type, iree_host_size_t capacity);
+
+// Initializes a statically-allocated list in the |storage| memory.
+// The storage capacity must be large enough to hold the list internals and
+// its contents which may vary across compilers/platforms/etc; use
+// iree_vm_list_storage_size to query the required capacity.
+//
+// Statically-allocated lists have their lifetime controlled by the caller and
+// must be deinitialized with iree_vm_list_deinitialize only when there are no
+// more users of the list.
+IREE_API_EXPORT iree_status_t iree_vm_list_initialize(
+    iree_byte_span_t storage, const iree_vm_type_def_t* element_type,
+    iree_host_size_t capacity, iree_vm_list_t** out_list);
+
+// Deinitializes a statically-allocated |list| previously initialized with
+// iree_vm_list_initialize. Aborts if there are still references remaining.
+IREE_API_EXPORT void iree_vm_list_deinitialize(iree_vm_list_t* list);
+
+// Creates a growable list containing the given |element_type|, which may either
+// be a primitive iree_vm_value_type_t value (like i32) or a ref type. When
+// storing ref types the list may either store a specific iree_vm_ref_type_t
+// and ensure that all elements set match the type or IREE_VM_REF_TYPE_ANY to
+// indicate that any ref type is allowed.
+//
+// |element_type| can be set to iree_vm_type_def_make_variant_type (or null) to
+// indicate that the list stores variants (each element can differ in type).
+IREE_API_EXPORT iree_status_t iree_vm_list_create(
+    const iree_vm_type_def_t* element_type, iree_host_size_t initial_capacity,
+    iree_allocator_t allocator, iree_vm_list_t** out_list);
+
+// Retains the given |list| for the caller.
+IREE_API_EXPORT void iree_vm_list_retain(iree_vm_list_t* list);
+
+// Releases the given |list| from the caller.
+IREE_API_EXPORT void iree_vm_list_release(iree_vm_list_t* list);
+
+// Returns the element type stored in the list.
+IREE_API_EXPORT iree_status_t iree_vm_list_element_type(
+    const iree_vm_list_t* list, iree_vm_type_def_t* out_element_type);
+
+// Returns the capacity of the list in elements.
+IREE_API_EXPORT iree_host_size_t
+iree_vm_list_capacity(const iree_vm_list_t* list);
+
+// Reserves storage for at least minimum_capacity elements. If the list already
+// has at least the specified capacity the operation is ignored.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_reserve(iree_vm_list_t* list, iree_host_size_t minimum_capacity);
+
+// Returns the current size of the list in elements.
+IREE_API_EXPORT iree_host_size_t iree_vm_list_size(const iree_vm_list_t* list);
+
+// Resizes the list to contain new_size elements. This will either truncate
+// the list if the existing size is greater than new_size or extend the list
+// with the default list value of 0 if storing primitives, null if refs, or
+// empty if variants.
+IREE_API_EXPORT iree_status_t iree_vm_list_resize(iree_vm_list_t* list,
+                                                  iree_host_size_t new_size);
+
+// Returns the value of the element at the given index.
+// Note that the value type may vary from element to element in variant lists
+// and callers should check the |out_value| type.
+IREE_API_EXPORT iree_status_t iree_vm_list_get_value(
+    const iree_vm_list_t* list, iree_host_size_t i, iree_vm_value_t* out_value);
+
+// Returns the value of the element at the given index. If the specified
+// |value_type| differs from the list storage type the value will be converted
+// using the value type semantics (such as sign/zero extend, etc).
+IREE_API_EXPORT iree_status_t iree_vm_list_get_value_as(
+    const iree_vm_list_t* list, iree_host_size_t i,
+    iree_vm_value_type_t value_type, iree_vm_value_t* out_value);
+
+// Sets the value of the element at the given index. If the specified |value|
+// type differs from the list storage type the value will be converted using the
+// value type semantics (such as sign/zero extend, etc).
+IREE_API_EXPORT iree_status_t iree_vm_list_set_value(
+    iree_vm_list_t* list, iree_host_size_t i, const iree_vm_value_t* value);
+
+// Pushes the value of the element to the end of the list.
+// If the specified |value| type differs from the list storage type the value
+// will be converted using the value type semantics (such as sign/zero extend,
+// etc).
+IREE_API_EXPORT iree_status_t
+iree_vm_list_push_value(iree_vm_list_t* list, const iree_vm_value_t* value);
+
+// Returns a dereferenced pointer to the given type if the element at the given
+// index matches the type. Returns NULL on error.
+IREE_API_EXPORT void* iree_vm_list_get_ref_deref(
+    const iree_vm_list_t* list, iree_host_size_t i,
+    const iree_vm_ref_type_descriptor_t* type_descriptor);
+
+// Returns the ref value of the element at the given index.
+// The ref will not be retained and must be retained by the caller to extend
+// its lifetime.
+IREE_API_EXPORT iree_status_t iree_vm_list_get_ref_assign(
+    const iree_vm_list_t* list, iree_host_size_t i, iree_vm_ref_t* out_value);
+
+// Returns the ref value of the element at the given index.
+// The ref will be retained and must be released by the caller.
+IREE_API_EXPORT iree_status_t iree_vm_list_get_ref_retain(
+    const iree_vm_list_t* list, iree_host_size_t i, iree_vm_ref_t* out_value);
+
+// Sets the ref value of the element at the given index, retaining a reference
+// in the list until the element is cleared or the list is disposed.
+IREE_API_EXPORT iree_status_t iree_vm_list_set_ref_retain(
+    iree_vm_list_t* list, iree_host_size_t i, const iree_vm_ref_t* value);
+
+// Pushes the ref value of the element to the end of the list, retaining a
+// reference in the list until the element is cleared or the list is disposed.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_push_ref_retain(iree_vm_list_t* list, const iree_vm_ref_t* value);
+
+// Sets the ref value of the element at the given index, moving ownership of the
+// |value| reference to the list.
+IREE_API_EXPORT iree_status_t iree_vm_list_set_ref_move(iree_vm_list_t* list,
+                                                        iree_host_size_t i,
+                                                        iree_vm_ref_t* value);
+
+// Pushes the ref value of the element to the end of the list, moving ownership
+// of the |value| reference to the list.
+IREE_API_EXPORT iree_status_t iree_vm_list_push_ref_move(iree_vm_list_t* list,
+                                                         iree_vm_ref_t* value);
+
+// Pops the front ref value from the list and transfers ownership to the caller.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_pop_front_ref_move(iree_vm_list_t* list, iree_vm_ref_t* out_value);
+
+// Returns the value of the element at the given index. If the element contains
+// a ref it will *not* be retained and the caller must retain it to extend its
+// lifetime.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_get_variant(const iree_vm_list_t* list, iree_host_size_t i,
+                         iree_vm_variant_t* out_value);
+
+// Sets the value of the element at the given index. If the specified |value|
+// type differs from the list storage type the value will be converted using the
+// value type semantics (such as sign/zero extend, etc). If the variant is a ref
+// then it will be retained.
+IREE_API_EXPORT iree_status_t iree_vm_list_set_variant(
+    iree_vm_list_t* list, iree_host_size_t i, const iree_vm_variant_t* value);
+
+// Pushes the value of the element to the end of the list. If the specified
+// |value| type differs from the list storage type the value will be converted
+// using the value type semantics (such as sign/zero extend, etc). If the
+// variant is a ref then it will be retained.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_push_variant(iree_vm_list_t* list, const iree_vm_variant_t* value);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+IREE_VM_DECLARE_TYPE_ADAPTERS(iree_vm_list, iree_vm_list_t);
+
+#endif  // IREE_VM_LIST_H_
diff --git a/runtime/src/iree/vm/list_test.cc b/runtime/src/iree/vm/list_test.cc
new file mode 100644
index 0000000..d1b43be
--- /dev/null
+++ b/runtime/src/iree/vm/list_test.cc
@@ -0,0 +1,460 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/list.h"
+
+#include <cstdint>
+#include <cstring>
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+#include "iree/vm/builtin_types.h"
+#include "iree/vm/ref_cc.h"
+
+// Ref-counted test type with a float payload; registered below as "AType" and
+// used as a stand-in for an arbitrary user-defined ref object.
+class A : public iree::vm::RefObject<A> {
+ public:
+  float data() const { return value_; }
+  void set_data(float value) { value_ = value; }
+
+ private:
+  float value_ = 1.0f;
+};
+static iree_vm_ref_type_descriptor_t test_a_descriptor = {0};
+IREE_VM_DECLARE_TYPE_ADAPTERS(test_a, A);
+IREE_VM_DEFINE_TYPE_ADAPTERS(test_a, A);
+
+// Ref-counted test type with an int payload; registered below as "BType" so
+// tests can distinguish it from A when checking type safety.
+class B : public iree::vm::RefObject<B> {
+ public:
+  int data() const { return value_; }
+  void set_data(int value) { value_ = value; }
+
+ private:
+  int value_ = 2;
+};
+static iree_vm_ref_type_descriptor_t test_b_descriptor = {0};
+IREE_VM_DECLARE_TYPE_ADAPTERS(test_b, B);
+IREE_VM_DEFINE_TYPE_ADAPTERS(test_b, B);
+
+namespace {
+
+using ::iree::Status;
+using ::iree::testing::status::StatusIs;
+
+// Fills in |descriptor| for ref type T under |type_name| and registers it
+// with the VM; repeated calls after a successful registration are no-ops.
+template <typename T>
+static void RegisterRefType(iree_vm_ref_type_descriptor_t* descriptor,
+                            const char* type_name) {
+  if (descriptor->type != IREE_VM_REF_TYPE_NULL) return;  // already registered
+  descriptor->type_name = iree_make_cstring_view(type_name);
+  descriptor->offsetof_counter = T::offsetof_counter();
+  descriptor->destroy = T::DirectDestroy;
+  IREE_CHECK_OK(iree_vm_ref_register_type(descriptor));
+}
+
+// Registers the A/B test ref types; idempotent via RegisterRefType's guard.
+static void RegisterRefTypes() {
+  RegisterRefType<A>(&test_a_descriptor, "AType");
+  RegisterRefType<B>(&test_b_descriptor, "BType");
+}
+
+// Allocates a new T, sets its payload to |value|, and wraps it in a ref.
+// Ownership of the allocation transfers to the returned ref.
+template <typename T, typename V>
+static iree_vm_ref_t MakeRef(V value) {
+  T* object = new T();
+  object->set_data(value);
+  iree_vm_ref_t ref = {0};
+  IREE_CHECK_OK(iree_vm_ref_wrap_assign(
+      object, iree::vm::ref_type_descriptor<T>::get()->type, &ref));
+  return ref;
+}
+
+// Test fixture that registers the VM builtin types and the A/B test ref types
+// once for the whole suite before any test cases run.
+class VMListTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    IREE_CHECK_OK(iree_vm_register_builtin_types());
+    RegisterRefTypes();
+  }
+};
+
+// Demonstrates basic usage of a primitive value list, equivalent to the IR
+// type `!vm.list<i32>`: create, resize, write each slot, then read back.
+TEST_F(VMListTest, UsageI32) {
+  constexpr iree_host_size_t kInitialCapacity = 123;
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_value_type(IREE_VM_VALUE_TYPE_I32);
+  iree_vm_list_t* list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, kInitialCapacity,
+                                     iree_allocator_system(), &list));
+
+  // The list must report back the exact element type it was created with.
+  iree_vm_type_def_t queried_element_type;
+  IREE_ASSERT_OK(iree_vm_list_element_type(list, &queried_element_type));
+  EXPECT_TRUE(iree_vm_type_def_is_value(&queried_element_type));
+  EXPECT_EQ(0,
+            memcmp(&element_type, &queried_element_type, sizeof(element_type)));
+  EXPECT_LE(kInitialCapacity, iree_vm_list_capacity(list));
+  EXPECT_EQ(0, iree_vm_list_size(list));
+
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 5));
+  EXPECT_EQ(5, iree_vm_list_size(list));
+
+  // Write each element's index into its slot...
+  for (iree_host_size_t index = 0; index < 5; ++index) {
+    iree_vm_value_t element = iree_vm_value_make_i32((int32_t)index);
+    IREE_ASSERT_OK(iree_vm_list_set_value(list, index, &element));
+  }
+
+  // ...and verify it reads back with the requested type and value.
+  for (iree_host_size_t index = 0; index < 5; ++index) {
+    iree_vm_value_t element;
+    IREE_ASSERT_OK(iree_vm_list_get_value_as(list, index,
+                                             IREE_VM_VALUE_TYPE_I32, &element));
+    EXPECT_EQ(IREE_VM_VALUE_TYPE_I32, element.type);
+    EXPECT_EQ(index, element.i32);
+  }
+
+  iree_vm_list_release(list);
+}
+
+// Tests simple ref object list usage, mainly just for demonstration.
+// Stores ref object type A elements only, equivalent to `!vm.list<!vm.ref<A>>`.
+TEST_F(VMListTest, UsageRef) {
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_ref_type(test_a_type_id());
+  iree_host_size_t initial_capacity = 123;
+  iree_vm_list_t* list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
+                                     iree_allocator_system(), &list));
+
+  // The queried element type should be the ref type the list was created with.
+  iree_vm_type_def_t queried_element_type;
+  IREE_ASSERT_OK(iree_vm_list_element_type(list, &queried_element_type));
+  EXPECT_TRUE(iree_vm_type_def_is_ref(&queried_element_type));
+  EXPECT_EQ(0,
+            memcmp(&element_type, &queried_element_type, sizeof(element_type)));
+  EXPECT_LE(initial_capacity, iree_vm_list_capacity(list));
+  EXPECT_EQ(0, iree_vm_list_size(list));
+
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 5));
+  EXPECT_EQ(5, iree_vm_list_size(list));
+
+  // set_ref_move transfers ownership of each new A to the list.
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_ref_t ref_a = MakeRef<A>((float)i);
+    IREE_ASSERT_OK(iree_vm_list_set_ref_move(list, i, &ref_a));
+  }
+
+  // get_ref_retain returns a new reference the caller must release.
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_ref_t ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_retain(list, i, &ref_a));
+    EXPECT_TRUE(test_a_isa(ref_a));
+    auto* a = test_a_deref(ref_a);
+    EXPECT_EQ(i, a->data());
+    iree_vm_ref_release(&ref_a);
+  }
+
+  iree_vm_list_release(list);
+}
+
+// Tests simple variant list usage, mainly just for demonstration.
+// Stores any heterogeneous element type, equivalent to `!vm.list<?>`.
+TEST_F(VMListTest, UsageVariant) {
+  iree_vm_type_def_t element_type = iree_vm_type_def_make_variant_type();
+  iree_host_size_t initial_capacity = 123;
+  iree_vm_list_t* list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
+                                     iree_allocator_system(), &list));
+
+  iree_vm_type_def_t queried_element_type;
+  IREE_ASSERT_OK(iree_vm_list_element_type(list, &queried_element_type));
+  EXPECT_TRUE(iree_vm_type_def_is_variant(&queried_element_type));
+  EXPECT_LE(initial_capacity, iree_vm_list_capacity(list));
+  EXPECT_EQ(0, iree_vm_list_size(list));
+
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 10));
+  EXPECT_EQ(10, iree_vm_list_size(list));
+
+  // Store primitive i32 values in [0, 5) and refs to A objects in [5, 10).
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_value_t value = iree_vm_value_make_i32((int32_t)i);
+    IREE_ASSERT_OK(iree_vm_list_set_value(list, i, &value));
+  }
+  for (iree_host_size_t i = 5; i < 10; ++i) {
+    iree_vm_ref_t ref_a = MakeRef<A>(static_cast<float>(i));
+    IREE_ASSERT_OK(iree_vm_list_set_ref_move(list, i, &ref_a));
+  }
+
+  // Read everything back with the accessor matching how each slot was set.
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_value_t value;
+    IREE_ASSERT_OK(
+        iree_vm_list_get_value_as(list, i, IREE_VM_VALUE_TYPE_I32, &value));
+    EXPECT_EQ(IREE_VM_VALUE_TYPE_I32, value.type);
+    EXPECT_EQ(i, value.i32);
+  }
+  for (iree_host_size_t i = 5; i < 10; ++i) {
+    iree_vm_ref_t ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_retain(list, i, &ref_a));
+    EXPECT_TRUE(test_a_isa(ref_a));
+    auto* a = test_a_deref(ref_a);
+    EXPECT_EQ(i, a->data());
+    iree_vm_ref_release(&ref_a);
+  }
+
+  iree_vm_list_release(list);
+}
+
+// Verifies iree_vm_list_reserve grows capacity and ignores no-op requests.
+TEST_F(VMListTest, Reserve) {
+  // Create with no initial capacity; implementations may still round this up.
+  iree_vm_type_def_t element_type = iree_vm_type_def_make_variant_type();
+  iree_vm_list_t* list = nullptr;
+  IREE_ASSERT_OK(
+      iree_vm_list_create(&element_type, 0, iree_allocator_system(), &list));
+  EXPECT_LE(0, iree_vm_list_capacity(list));
+  EXPECT_EQ(0, iree_vm_list_size(list));
+
+  // Growing the reservation may allocate storage but never affects size.
+  IREE_ASSERT_OK(iree_vm_list_reserve(list, 100));
+  iree_host_size_t reserved_capacity = iree_vm_list_capacity(list);
+  EXPECT_LE(100, reserved_capacity);
+
+  // Adding an element within the reserved capacity must not reallocate.
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 1));
+  EXPECT_EQ(1, iree_vm_list_size(list));
+  EXPECT_EQ(reserved_capacity, iree_vm_list_capacity(list));
+
+  // Reservations at or below the current capacity are no-ops.
+  IREE_ASSERT_OK(iree_vm_list_reserve(list, reserved_capacity));
+  EXPECT_EQ(reserved_capacity, iree_vm_list_capacity(list));
+
+  iree_vm_list_release(list);
+}
+
+// Tests the behavior of resize for truncation and extension on primitives.
+TEST_F(VMListTest, ResizeI32) {
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_value_type(IREE_VM_VALUE_TYPE_I32);
+  iree_host_size_t initial_capacity = 4;
+  iree_vm_list_t* list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
+                                     iree_allocator_system(), &list));
+  EXPECT_LE(initial_capacity, iree_vm_list_capacity(list));
+  EXPECT_EQ(0, iree_vm_list_size(list));
+
+  // Extend and zero-initialize: new primitive elements must read back as 0.
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 5));
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_value_t value;
+    IREE_ASSERT_OK(
+        iree_vm_list_get_value_as(list, i, IREE_VM_VALUE_TYPE_I32, &value));
+    EXPECT_EQ(0, value.i32);
+  }
+
+  // Overwrite with [0, 5).
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_value_t value = iree_vm_value_make_i32((int32_t)i);
+    IREE_ASSERT_OK(iree_vm_list_set_value(list, i, &value));
+  }
+
+  // Truncate to [0, 2) and then extend again.
+  // This ensures that we test the primitive clearing path during cleanup:
+  // [int, int, int, int, int]
+  //            |___________| <- truncation region
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 2));
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 5));
+
+  // Ensure that elements 2+ are zeroed after having been reset while 0 and 1
+  // are still valid as before.
+  for (iree_host_size_t i = 0; i < 2; ++i) {
+    iree_vm_value_t value;
+    IREE_ASSERT_OK(
+        iree_vm_list_get_value_as(list, i, IREE_VM_VALUE_TYPE_I32, &value));
+    EXPECT_EQ(i, value.i32);
+  }
+  for (iree_host_size_t i = 2; i < 5; ++i) {
+    iree_vm_value_t value;
+    IREE_ASSERT_OK(
+        iree_vm_list_get_value_as(list, i, IREE_VM_VALUE_TYPE_I32, &value));
+    EXPECT_EQ(0, value.i32);
+  }
+
+  iree_vm_list_release(list);
+}
+
+// Tests the behavior of resize for truncation and extension on refs.
+TEST_F(VMListTest, ResizeRef) {
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_ref_type(test_a_type_id());
+  iree_host_size_t initial_capacity = 4;
+  iree_vm_list_t* list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
+                                     iree_allocator_system(), &list));
+  EXPECT_LE(initial_capacity, iree_vm_list_capacity(list));
+  EXPECT_EQ(0, iree_vm_list_size(list));
+
+  // Extend and zero-initialize: new ref elements must read back as null.
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 5));
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_ref_t ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_assign(list, i, &ref_a));
+    EXPECT_TRUE(iree_vm_ref_is_null(&ref_a));
+  }
+
+  // Overwrite with [0, 5).
+  // set_ref_move transfers ownership of each new A to the list.
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_ref_t ref_a = MakeRef<A>((float)i);
+    IREE_ASSERT_OK(iree_vm_list_set_ref_move(list, i, &ref_a));
+  }
+
+  // Truncate to [0, 2) and then extend again.
+  // This ensures that we test the ref path during cleanup:
+  // [ref, ref, ref, ref, ref]
+  //            |___________| <- truncation region
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 2));
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 5));
+
+  // Ensure that elements 2+ are reset after having been reset while 0 and 1
+  // are still valid as before.
+  for (iree_host_size_t i = 0; i < 2; ++i) {
+    iree_vm_ref_t ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_retain(list, i, &ref_a));
+    EXPECT_TRUE(test_a_isa(ref_a));
+    auto* a = test_a_deref(ref_a);
+    EXPECT_EQ(i, a->data());
+    iree_vm_ref_release(&ref_a);
+  }
+  for (iree_host_size_t i = 2; i < 5; ++i) {
+    iree_vm_ref_t ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_assign(list, i, &ref_a));
+    EXPECT_TRUE(iree_vm_ref_is_null(&ref_a));
+  }
+
+  iree_vm_list_release(list);
+}
+
+// Tests the behavior of resize for truncation and extension on variants.
+TEST_F(VMListTest, ResizeVariant) {
+  iree_vm_type_def_t element_type = iree_vm_type_def_make_variant_type();
+  iree_host_size_t initial_capacity = 4;
+  iree_vm_list_t* list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
+                                     iree_allocator_system(), &list));
+  EXPECT_LE(initial_capacity, iree_vm_list_capacity(list));
+  EXPECT_EQ(0, iree_vm_list_size(list));
+
+  // Extend and zero-initialize: new elements must read back as empty variants.
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 5));
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_variant_t value = iree_vm_variant_empty();
+    IREE_ASSERT_OK(iree_vm_list_get_variant(list, i, &value));
+    EXPECT_TRUE(iree_vm_variant_is_empty(value));
+  }
+
+  // Overwrite with [0, 5) in mixed types.
+  // Slots [0, 4) hold refs to A and slot 4 holds a primitive i32.
+  for (iree_host_size_t i = 0; i < 4; ++i) {
+    iree_vm_ref_t ref_a = MakeRef<A>((float)i);
+    IREE_ASSERT_OK(iree_vm_list_set_ref_move(list, i, &ref_a));
+  }
+  for (iree_host_size_t i = 4; i < 5; ++i) {
+    iree_vm_value_t value = iree_vm_value_make_i32((int32_t)i);
+    IREE_ASSERT_OK(iree_vm_list_set_value(list, i, &value));
+  }
+
+  // Truncate to [0, 2) and then extend again.
+  // This ensures that we test the variant path during cleanup:
+  // [ref, ref, ref, ref, int]
+  //            |___________| <- truncation region
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 2));
+  IREE_ASSERT_OK(iree_vm_list_resize(list, 5));
+
+  // Ensure that elements 2+ are reset after having been reset while 0 and 1
+  // are still valid as before.
+  for (iree_host_size_t i = 0; i < 2; ++i) {
+    iree_vm_ref_t ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_retain(list, i, &ref_a));
+    EXPECT_TRUE(test_a_isa(ref_a));
+    auto* a = test_a_deref(ref_a);
+    EXPECT_EQ(i, a->data());
+    iree_vm_ref_release(&ref_a);
+  }
+  for (iree_host_size_t i = 2; i < 5; ++i) {
+    iree_vm_variant_t value = iree_vm_variant_empty();
+    IREE_ASSERT_OK(iree_vm_list_get_variant(list, i, &value));
+    EXPECT_TRUE(iree_vm_variant_is_empty(value));
+  }
+
+  iree_vm_list_release(list);
+}
+
+// TODO(benvanik): test value get/set.
+
+// TODO(benvanik): test value conversion.
+
+// TODO(benvanik): test ref get/set.
+
+// Tests pushing and popping ref objects.
+TEST_F(VMListTest, PushPopRef) {
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_ref_type(test_a_type_id());
+  iree_host_size_t initial_capacity = 4;
+  iree_vm_list_t* list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
+                                     iree_allocator_system(), &list));
+  EXPECT_LE(initial_capacity, iree_vm_list_capacity(list));
+  EXPECT_EQ(0, iree_vm_list_size(list));
+
+  // Pops when empty must fail with OUT_OF_RANGE.
+  iree_vm_ref_t empty_ref{0};
+  EXPECT_THAT(Status(iree_vm_list_pop_front_ref_move(list, &empty_ref)),
+              StatusIs(iree::StatusCode::kOutOfRange));
+
+  // Push back [0, 5).
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_ref_t ref_a = MakeRef<A>((float)i);
+    IREE_ASSERT_OK(iree_vm_list_push_ref_move(list, &ref_a));
+  }
+
+  // Pop the first two [0, 1] and leave [2, 5).
+  // This ensures that we test the ref path during cleanup:
+  // [ref, ref, ref, ref, ref]
+  //  |______| <- popped region
+  // Each pop transfers ownership of the popped ref to the caller.
+  for (iree_host_size_t i = 0; i < 2; ++i) {
+    iree_vm_ref_t ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_pop_front_ref_move(list, &ref_a));
+    EXPECT_TRUE(test_a_isa(ref_a));
+    auto* a = test_a_deref(ref_a);
+    EXPECT_EQ(i, a->data());
+    iree_vm_ref_release(&ref_a);
+  }
+
+  // Ensure that elements 2+ are valid but now at offset 0.
+  for (iree_host_size_t i = 2; i < 5; ++i) {
+    iree_vm_ref_t ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_retain(list, i - 2, &ref_a));
+    EXPECT_TRUE(test_a_isa(ref_a));
+    auto* a = test_a_deref(ref_a);
+    EXPECT_EQ(i, a->data());
+    iree_vm_ref_release(&ref_a);
+  }
+
+  // Push back two more to get [2, 7).
+  for (iree_host_size_t i = 5; i < 7; ++i) {
+    iree_vm_ref_t ref_a = MakeRef<A>((float)i);
+    IREE_ASSERT_OK(iree_vm_list_push_ref_move(list, &ref_a));
+  }
+
+  // Ensure the new elements got added to the end.
+  for (iree_host_size_t i = 2; i < 7; ++i) {
+    iree_vm_ref_t ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_retain(list, i - 2, &ref_a));
+    EXPECT_TRUE(test_a_isa(ref_a));
+    auto* a = test_a_deref(ref_a);
+    EXPECT_EQ(i, a->data());
+    iree_vm_ref_release(&ref_a);
+  }
+
+  iree_vm_list_release(list);
+}
+
+// TODO(benvanik): test primitive variant get/set.
+
+// TODO(benvanik): test ref variant get/set.
+
+}  // namespace
diff --git a/runtime/src/iree/vm/module.c b/runtime/src/iree/vm/module.c
new file mode 100644
index 0000000..144d200
--- /dev/null
+++ b/runtime/src/iree/vm/module.c
@@ -0,0 +1,347 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/module.h"
+
+#include <string.h>
+
+#include "iree/base/internal/atomics.h"
+#include "iree/base/tracing.h"
+#include "iree/vm/ref.h"
+#include "iree/vm/stack.h"
+
+// Splits |signature|'s calling convention string into its argument and result
+// fragments, e.g. `0iCiD_rr` -> arguments `iCiD`, results `rr`.
+// Both out params are always zeroed first; an empty cconv means `()->()`.
+IREE_API_EXPORT iree_status_t iree_vm_function_call_get_cconv_fragments(
+    const iree_vm_function_signature_t* signature,
+    iree_string_view_t* out_arguments, iree_string_view_t* out_results) {
+  memset(out_arguments, 0, sizeof(*out_arguments));
+  memset(out_results, 0, sizeof(*out_results));
+  iree_string_view_t cconv = signature->calling_convention;
+  if (!cconv.size) {
+    // No cconv string, so function is `()->()`.
+    return iree_ok_status();
+  } else if (cconv.data[0] != '0') {
+    // Only version-0 cconv strings are understood by this implementation.
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "unsupported cconv version %c", cconv.data[0]);
+  }
+  // Drop the leading version char, then split on the `_` separating arguments
+  // from results; with no `_` present the entire body is the arguments.
+  iree_string_view_t cconv_body = iree_string_view_substr(cconv, 1, INTPTR_MAX);
+  if (iree_string_view_split(cconv_body, '_', out_arguments, out_results) ==
+      -1) {
+    *out_arguments = cconv_body;
+  }
+  return iree_ok_status();
+}
+
+// Counts the value slots declared in |cconv_fragment| (e.g. `iIr` -> 3).
+// Void types are ignored. Elements inside a span (`C...D`) are counted once
+// per declared type, not per runtime element.
+// Fails with UNIMPLEMENTED on any unrecognized cconv character.
+static iree_status_t iree_vm_function_call_count_fragment_values(
+    iree_string_view_t cconv_fragment, iree_host_size_t* out_count) {
+  IREE_ASSERT_ARGUMENT(out_count);
+  *out_count = 0;
+  iree_host_size_t count = 0;
+  for (iree_host_size_t i = 0; i < cconv_fragment.size; ++i) {
+    switch (cconv_fragment.data[i]) {
+      case IREE_VM_CCONV_TYPE_VOID:
+        break;
+      case IREE_VM_CCONV_TYPE_I32:
+      case IREE_VM_CCONV_TYPE_F32:
+      case IREE_VM_CCONV_TYPE_I64:
+      case IREE_VM_CCONV_TYPE_F64:
+      case IREE_VM_CCONV_TYPE_REF:
+        ++count;
+        break;
+      case IREE_VM_CCONV_TYPE_SPAN_START: {
+        // Inner loop consumes the span contents up to the closing `D`,
+        // advancing the shared index |i| past the whole span.
+        for (i = i + 1; i < cconv_fragment.size &&
+                        cconv_fragment.data[i] != IREE_VM_CCONV_TYPE_SPAN_END;
+             ++i) {
+          switch (cconv_fragment.data[i]) {
+            case IREE_VM_CCONV_TYPE_VOID:
+              break;
+            case IREE_VM_CCONV_TYPE_I32:
+            case IREE_VM_CCONV_TYPE_F32:
+            case IREE_VM_CCONV_TYPE_I64:
+            case IREE_VM_CCONV_TYPE_F64:
+            case IREE_VM_CCONV_TYPE_REF:
+              ++count;
+              break;
+            default:
+              return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                      "unsupported cconv span type %c",
+                                      cconv_fragment.data[i]);
+          }
+        }
+      } break;
+      default:
+        return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                "unsupported cconv type %c",
+                                cconv_fragment.data[i]);
+    }
+  }
+  *out_count = count;
+  return iree_ok_status();
+}
+
+// Counts the declared argument and result values of |signature| by splitting
+// its cconv string and counting each fragment independently.
+// Both out params are zeroed up front so they are defined even on failure.
+IREE_API_EXPORT iree_status_t iree_vm_function_call_count_arguments_and_results(
+    const iree_vm_function_signature_t* signature,
+    iree_host_size_t* out_argument_count, iree_host_size_t* out_result_count) {
+  IREE_ASSERT_ARGUMENT(signature);
+  IREE_ASSERT_ARGUMENT(out_argument_count);
+  IREE_ASSERT_ARGUMENT(out_result_count);
+  *out_argument_count = 0;
+  *out_result_count = 0;
+  iree_string_view_t arguments, results;
+  IREE_RETURN_IF_ERROR(iree_vm_function_call_get_cconv_fragments(
+      signature, &arguments, &results));
+  IREE_RETURN_IF_ERROR(iree_vm_function_call_count_fragment_values(
+      arguments, out_argument_count));
+  IREE_RETURN_IF_ERROR(
+      iree_vm_function_call_count_fragment_values(results, out_result_count));
+  return iree_ok_status();
+}
+
+// Returns true if |cconv| declares at least one variadic span (a `C` char
+// anywhere in the string).
+IREE_API_EXPORT bool iree_vm_function_call_is_variadic_cconv(
+    iree_string_view_t cconv) {
+  return iree_string_view_find_char(cconv, IREE_VM_CCONV_TYPE_SPAN_START, 0) !=
+         IREE_STRING_VIEW_NPOS;
+}
+
+// Computes the byte size of the marshaling buffer for |cconv_fragment|.
+// Span counts come from |segment_size_list|; each span contributes a 4-byte
+// count prefix plus (per-element size * runtime count).
+// NOTE: |out_required_size| is only written on the success path; callers must
+// check the returned status before reading it.
+IREE_API_EXPORT iree_status_t iree_vm_function_call_compute_cconv_fragment_size(
+    iree_string_view_t cconv_fragment,
+    const iree_vm_register_list_t* segment_size_list,
+    iree_host_size_t* out_required_size) {
+  iree_host_size_t required_size = 0;
+  // |seg_i| advances once per top-level cconv char (spans consume one entry
+  // of |segment_size_list| even though they cover multiple chars).
+  for (iree_host_size_t i = 0, seg_i = 0; i < cconv_fragment.size;
+       ++i, ++seg_i) {
+    switch (cconv_fragment.data[i]) {
+      case IREE_VM_CCONV_TYPE_VOID:
+        break;
+      case IREE_VM_CCONV_TYPE_I32:
+      case IREE_VM_CCONV_TYPE_F32:
+        required_size += sizeof(int32_t);
+        break;
+      case IREE_VM_CCONV_TYPE_I64:
+      case IREE_VM_CCONV_TYPE_F64:
+        required_size += sizeof(int64_t);
+        break;
+      case IREE_VM_CCONV_TYPE_REF:
+        required_size += sizeof(iree_vm_ref_t);
+        break;
+      case IREE_VM_CCONV_TYPE_SPAN_START: {
+        if (IREE_UNLIKELY(!segment_size_list) ||
+            IREE_UNLIKELY(seg_i >= segment_size_list->size)) {
+          return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                  "variadic argument found but segment size "
+                                  "list is missing/underflowed");
+        }
+        iree_host_size_t span_count = segment_size_list->registers[seg_i];
+        required_size += sizeof(int32_t);  // count
+        // Sum the size of one span tuple, then scale by the runtime count.
+        iree_host_size_t span_size = 0;
+        for (i = i + 1; i < cconv_fragment.size &&
+                        cconv_fragment.data[i] != IREE_VM_CCONV_TYPE_SPAN_END;
+             ++i) {
+          switch (cconv_fragment.data[i]) {
+            case IREE_VM_CCONV_TYPE_VOID:
+              break;
+            case IREE_VM_CCONV_TYPE_I32:
+            case IREE_VM_CCONV_TYPE_F32:
+              span_size += sizeof(int32_t);
+              break;
+            case IREE_VM_CCONV_TYPE_I64:
+            case IREE_VM_CCONV_TYPE_F64:
+              span_size += sizeof(int64_t);
+              break;
+            case IREE_VM_CCONV_TYPE_REF:
+              span_size += sizeof(iree_vm_ref_t);
+              break;
+            default:
+              return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                      "unsupported cconv span type %c",
+                                      cconv_fragment.data[i]);
+          }
+        }
+        required_size += span_size * span_count;
+      } break;
+      default:
+        return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                "unsupported cconv type %c",
+                                cconv_fragment.data[i]);
+    }
+  }
+  *out_required_size = required_size;
+  return iree_ok_status();
+}
+
+// Releases refs left in |call|'s argument/result buffers by walking the
+// version-0 cconv string in |signature| and releasing each REF slot in-place;
+// primitive slots are just skipped over.
+// NOTE: this does not handle variadic spans (`C`/`D` chars advance |p| by
+// nothing) — presumably callers using this never have variadic refs; confirm.
+IREE_API_EXPORT void iree_vm_function_call_release(
+    iree_vm_function_call_t* call,
+    const iree_vm_function_signature_t* signature) {
+  // NOTE(review): this early-outs when EITHER buffer is empty; if arguments
+  // are empty but results hold refs (or vice versa) those refs would not be
+  // released here — confirm callers never hit that combination.
+  if (!call->arguments.data_length || !call->results.data_length) {
+    return;
+  }
+  iree_string_view_t cconv = signature->calling_convention;
+  // No cconv string or unsupported version: nothing to walk.
+  if (cconv.size == 0 || cconv.data[0] != '0') return;
+  uint8_t* p = call->arguments.data;
+  for (iree_host_size_t i = 1; i < cconv.size; ++i) {
+    char c = cconv.data[i];
+    if (c == '_') {
+      // Switch to results.
+      p = call->results.data;
+    }
+    switch (c) {
+      case IREE_VM_CCONV_TYPE_VOID:
+        break;
+      case IREE_VM_CCONV_TYPE_I32:
+      case IREE_VM_CCONV_TYPE_F32:
+        p += sizeof(int32_t);
+        break;
+      case IREE_VM_CCONV_TYPE_I64:
+      case IREE_VM_CCONV_TYPE_F64:
+        p += sizeof(int64_t);
+        break;
+      case IREE_VM_CCONV_TYPE_REF:
+        iree_vm_ref_release((iree_vm_ref_t*)p);
+        p += sizeof(iree_vm_ref_t);
+        break;
+    }
+  }
+}
+
+// Zeroes |module|, stores the implementation |self| pointer, and initializes
+// the ref count to 1. Implementations populate the function pointers after
+// this returns.
+IREE_API_EXPORT iree_status_t
+iree_vm_module_initialize(iree_vm_module_t* module, void* self) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  memset(module, 0, sizeof(iree_vm_module_t));
+  module->self = self;
+  iree_atomic_ref_count_init(&module->ref_count);
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Increments the module reference count; a NULL |module| is a no-op.
+IREE_API_EXPORT void iree_vm_module_retain(iree_vm_module_t* module) {
+  if (module) {
+    iree_atomic_ref_count_inc(&module->ref_count);
+  }
+}
+
+// Decrements the module reference count and destroys the implementation when
+// the last reference is dropped (the `== 1` compare implies the dec returns
+// the pre-decrement value — matches the atomics helper's contract).
+// A NULL |module| is a no-op.
+IREE_API_EXPORT void iree_vm_module_release(iree_vm_module_t* module) {
+  if (module && iree_atomic_ref_count_dec(&module->ref_count) == 1) {
+    module->destroy(module->self);
+  }
+}
+
+// Returns the implementation-provided module name, or the literal "null" for
+// a NULL |module| (useful in log/error paths).
+IREE_API_EXPORT iree_string_view_t
+iree_vm_module_name(const iree_vm_module_t* module) {
+  if (!module) {
+    return iree_make_cstring_view("null");
+  }
+  return module->name(module->self);
+}
+
+// Returns the implementation-provided module signature, or an all-zero
+// signature for a NULL |module|.
+IREE_API_EXPORT iree_vm_module_signature_t
+iree_vm_module_signature(const iree_vm_module_t* module) {
+  if (!module) {
+    iree_vm_module_signature_t empty;
+    memset(&empty, 0, sizeof(empty));
+    return empty;
+  }
+  return module->signature(module->self);
+}
+
+// Thin forwarder to the implementation's lookup_function.
+// NOTE: unlike iree_vm_module_name/signature there is no NULL-module guard
+// here; callers must pass a valid module.
+IREE_API_EXPORT iree_status_t iree_vm_module_lookup_function_by_name(
+    const iree_vm_module_t* module, iree_vm_function_linkage_t linkage,
+    iree_string_view_t name, iree_vm_function_t* out_function) {
+  return module->lookup_function(module->self, linkage, name, out_function);
+}
+
+// Thin forwarder to the implementation's get_function, requesting only the
+// function reference (name/signature outputs suppressed).
+IREE_API_EXPORT iree_status_t iree_vm_module_lookup_function_by_ordinal(
+    const iree_vm_module_t* module, iree_vm_function_linkage_t linkage,
+    iree_host_size_t ordinal, iree_vm_function_t* out_function) {
+  return module->get_function(module->self, linkage, ordinal, out_function,
+                              /*out_name=*/NULL,
+                              /*out_signature=*/NULL);
+}
+
+// Resolves a stack frame to a source location via the implementation's
+// optional resolve_source_location hook; the out param is zeroed first and
+// UNAVAILABLE is returned when the module carries no debug info support.
+IREE_API_EXPORT iree_status_t iree_vm_module_resolve_source_location(
+    const iree_vm_module_t* module, iree_vm_stack_frame_t* frame,
+    iree_vm_source_location_t* out_source_location) {
+  IREE_ASSERT_ARGUMENT(module);
+  IREE_ASSERT_ARGUMENT(frame);
+  IREE_ASSERT_ARGUMENT(out_source_location);
+  memset(out_source_location, 0, sizeof(*out_source_location));
+  if (module->resolve_source_location) {
+    return module->resolve_source_location(module->self, frame,
+                                           out_source_location);
+  }
+  return iree_status_from_code(IREE_STATUS_UNAVAILABLE);
+}
+
+// Formats |source_location| into |builder| via its format callback.
+// Returns UNAVAILABLE when the location is NULL or carries no formatter
+// (e.g. a zeroed iree_vm_source_location_t from a failed resolve).
+IREE_API_EXPORT iree_status_t
+iree_vm_source_location_format(iree_vm_source_location_t* source_location,
+                               iree_vm_source_location_format_flags_t flags,
+                               iree_string_builder_t* builder) {
+  IREE_ASSERT_ARGUMENT(builder);
+  if (!source_location || !source_location->format) {
+    return iree_status_from_code(IREE_STATUS_UNAVAILABLE);
+  }
+  return source_location->format(source_location->self, source_location->data,
+                                 flags, builder);
+}
+
+// Queries the owning module for the function's name.
+// On lookup failure the status is swallowed and the literal "<error>" is
+// returned so callers formatting messages never see a garbage view.
+IREE_API_EXPORT iree_string_view_t
+iree_vm_function_name(const iree_vm_function_t* function) {
+  iree_string_view_t name;
+  iree_status_t status = function->module->get_function(
+      function->module->self, function->linkage, function->ordinal,
+      /*out_function=*/NULL,
+      /*out_name=*/&name,
+      /*out_signature=*/NULL);
+  if (!iree_status_is_ok(status)) {
+    iree_status_ignore(status);
+    return iree_make_cstring_view("<error>");
+  }
+  return name;
+}
+
+// Queries the owning module for the function's signature.
+// Errors are ignored; the zero-initialized signature (empty cconv) is
+// returned on failure.
+IREE_API_EXPORT iree_vm_function_signature_t
+iree_vm_function_signature(const iree_vm_function_t* function) {
+  iree_vm_function_signature_t signature;
+  memset(&signature, 0, sizeof(signature));
+  IREE_IGNORE_ERROR(function->module->get_function(
+      function->module->self, function->linkage, function->ordinal,
+      /*out_function=*/NULL,
+      /*out_name=*/NULL,
+      /*out_signature=*/&signature));
+  return signature;
+}
+
+// Linearly scans the function's reflection attributes for |key|.
+// Returns the matching value, or an empty view when reflection is not
+// supported by the module or the key is absent. The scan stops at the first
+// non-OK status from the module (the end-of-attributes signal).
+IREE_API_EXPORT iree_string_view_t iree_vm_function_reflection_attr(
+    const iree_vm_function_t* function, iree_string_view_t key) {
+  iree_vm_module_t* module = function->module;
+  if (!module->get_function_reflection_attr) {
+    return iree_string_view_empty();
+  }
+  for (int index = 0;; ++index) {
+    iree_string_view_t index_key, index_value;
+    iree_status_t status = module->get_function_reflection_attr(
+        module->self, function->linkage, function->ordinal, index, &index_key,
+        &index_value);
+    if (!iree_status_is_ok(status)) {
+      iree_status_ignore(status);
+      break;
+    }
+    if (iree_string_view_equal(key, index_key)) {
+      return index_value;
+    }
+  }
+  return iree_string_view_empty();
+}
+
+// Indexed reflection attribute access (deprecated per the header's TODO).
+// Returns NOT_FOUND when the module does not implement reflection; otherwise
+// forwards directly, so out-of-range |index| errors come from the module.
+IREE_API_EXPORT iree_status_t iree_vm_get_function_reflection_attr(
+    iree_vm_function_t function, iree_host_size_t index,
+    iree_string_view_t* key, iree_string_view_t* value) {
+  if (!function.module->get_function_reflection_attr) {
+    return iree_make_status(IREE_STATUS_NOT_FOUND,
+                            "reflection not available for the given module");
+  }
+  return function.module->get_function_reflection_attr(
+      function.module->self, function.linkage, function.ordinal, index, key,
+      value);
+}
diff --git a/runtime/src/iree/vm/module.h b/runtime/src/iree/vm/module.h
new file mode 100644
index 0000000..55c858e
--- /dev/null
+++ b/runtime/src/iree/vm/module.h
@@ -0,0 +1,498 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_MODULE_H_
+#define IREE_VM_MODULE_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/alignment.h"
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/base/string_builder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef struct iree_vm_module_t iree_vm_module_t;
+typedef struct iree_vm_stack_t iree_vm_stack_t;
+typedef struct iree_vm_stack_frame_t iree_vm_stack_frame_t;
+
+//===----------------------------------------------------------------------===//
+// Module / function reflection
+//===----------------------------------------------------------------------===//
+
+// A key-value pair of module/function reflection information.
+typedef struct iree_vm_reflection_attr_t {
+  // Attribute name used for lookup.
+  iree_string_view_t key;
+  // Attribute value associated with |key|.
+  iree_string_view_t value;
+} iree_vm_reflection_attr_t;
+
+// Describes the type of a function reference.
+typedef enum iree_vm_function_linkage_e {
+  // Function is internal to the module and may not be reflectable.
+  IREE_VM_FUNCTION_LINKAGE_INTERNAL = 0,
+  // Function is an import from another module.
+  IREE_VM_FUNCTION_LINKAGE_IMPORT = 1,
+  // Function is an export from the module.
+  IREE_VM_FUNCTION_LINKAGE_EXPORT = 2,
+  // Function is an import from another module that may be unavailable.
+  IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL = 3,
+  // TODO(#1979): add linkage types for well-known functions like __init.
+} iree_vm_function_linkage_t;
+
+// A function reference that can be used with the iree_vm_function_* methods.
+// These should be treated as opaque and the accessor functions should be used
+// instead.
+//
+// The register counts specify required internal storage used for VM for stack
+// frame management and debugging. They must at least be able to contain all
+// entry arguments for the function. The counts may be omitted if the function
+// will not be referenced by a VM stack frame.
+typedef struct iree_vm_function_t {
+  // Module the function is contained within.
+  iree_vm_module_t* module;
+  // Linkage of the function. Note that IREE_VM_FUNCTION_LINKAGE_INTERNAL
+  // functions may be missing reflection information.
+  uint16_t linkage;
+  // Ordinal within the module in the linkage scope.
+  uint16_t ordinal;
+} iree_vm_function_t;
+static_assert(sizeof(iree_vm_function_t) <= 3 * sizeof(void*),
+              "Must remain small as stored on the stack");
+
+// Returns true if the |function| is null (didn't exist, etc).
+// Note a zero-initialized iree_vm_function_t compares as null.
+static inline bool iree_vm_function_is_null(iree_vm_function_t function) {
+  return function.module == NULL;
+}
+
+// Describes the expected calling convention and arguments/results of a
+// function.
+typedef struct iree_vm_function_signature_t {
+  // The VM calling convention declaration used to marshal arguments and
+  // results into and out of the function.
+  // Optional for imports and internal functions but required for exports.
+  //
+  // Format:
+  // - '0': version 0 prefix
+  // - Zero or more arguments:
+  //   - 'i': int32_t integer (i32)
+  //   - 'I': int64_t integer (i64)
+  //   - 'r': ref-counted type pointer (!vm.ref<?>)
+  //   - 'C' ... 'D': variadic list of flattened tuples of a specified type
+  // - EOL or '_'
+  // - Zero or more results:
+  //   - 'i' or 'I'
+  //   - 'r'
+  //
+  // Examples:
+  //   `0` or `0_`: () -> ()
+  //   `0i` or `0i_`: (i32) -> ()
+  //   `0iiCiiD_i`: (i32, i32, tuple<i32, i32>...) -> i32
+  //   `0irCirD_r`: (i32, !vm.ref<?>, tuple<i32, !vm.ref<?>>) -> !vm.ref<?>
+  //
+  // Users of this field must verify the version prefix in the first byte before
+  // using the declaration.
+  iree_string_view_t calling_convention;
+} iree_vm_function_signature_t;
+
+// Describes the imports, exports, and capabilities of a module.
+typedef struct iree_vm_module_signature_t {
+  // Total number of imported functions.
+  iree_host_size_t import_function_count;
+  // Total number of exported functions.
+  iree_host_size_t export_function_count;
+  // Total number of internal functions, if debugging info is present and they
+  // can be queried.
+  iree_host_size_t internal_function_count;
+} iree_vm_module_signature_t;
+
+// Internal storage for the module state.
+// Thread-compatible; it's expected that only one thread at a time is executing
+// VM functions and accessing this state.
+typedef struct iree_vm_module_state_t iree_vm_module_state_t;
+
+//===----------------------------------------------------------------------===//
+// Function calls and coroutines
+//===----------------------------------------------------------------------===//
+
+// A variable-length list of registers.
+//
+// This structure is an overlay for the bytecode that is serialized in a
+// matching format, though it can be stack allocated as needed.
+//
+// TODO(benvanik): this should be made private to the bytecode module, but is
+// used for toll-free variadic argument lists here. We could just define an
+// identical structure (and static_assert) to at least rename it to something
+// sensible (iree_vm_segment_size_list_t).
+typedef struct iree_vm_register_list_t {
+  uint16_t size;
+  uint16_t registers[];
+} iree_vm_register_list_t;
+static_assert(iree_alignof(iree_vm_register_list_t) == 2,
+              "expecting byte alignment (to avoid padding)");
+static_assert(offsetof(iree_vm_register_list_t, registers) == 2,
+              "expect no padding in the struct");
+
+// Function call data.
+//
+// Arguments and results are encoded following a standard format shared across
+// all module types. This allows implementations that have different storage
+// types (such as physical machine registers vs. virtual registers) to use the
+// same cross-module calling convention.
+//
+// Callees can assume that callers have properly allocated and setup the
+// argument and result buffers and need not verify them. This works only because
+// the calling convention format is directly queried from the callee module.
+//
+// Encoding:
+// - each int is encoded as a 4-byte aligned value
+// - each ref is encoded as a 4-byte aligned iree_vm_ref_t value
+// - variadic tuples are encoded as a 4-byte count prefix and the tuple values
+//
+// For example, (i32, tuple<!vm.ref<?>, i32>..., i32) is encoded as:
+//    4b: i32
+//    4b: tuple count
+//    repeated:
+//      8b-16b: iree_vm_ref_t
+//      4b: i32
+//    4b: i32
+//
+// Example sequence:
+//  1. ModuleA wants to call SomeFunction from ModuleB
+//  2. ModuleA imports SomeFunction from ModuleB and gets its
+//     iree_vm_function_signature_t during import resolution
+//  3. ModuleA checks that it understands/supports that calling convention
+//     with error handling if needed (e.g. if ModuleB is newer and uses a newer
+//     version that ModuleA wasn't compiled knowing about, or ModuleB is ancient
+//     and uses a deprecated version that ModuleA has already dropped)
+//  4. ModuleA prepares argument and result buffers according to the calling
+//     convention defined by ModuleB and calls SomeFunction
+//  5. ModuleB handles the call, trusting that the input and output buffers are
+//     as expected
+//
+// NOTE: we could switch to using libffi, but I didn't want to require that for
+// all uses and didn't want to enable the issues that can arise when crossing
+// device boundaries. With what we have here we can rather easily serialize the
+// argument/result buffers and map them between independent address spaces.
+// Instead, implementing a native_module-alike of libffi_module would be a
+// better layering for callee modules.
+typedef struct iree_vm_function_call_t {
+  // Function to call.
+  iree_vm_function_t function;
+
+  // Argument buffer in the format described above.
+  // This is only read on beginning the function and need not live beyond that.
+  //
+  // Refs contained are retained by the caller and callees must retain them if
+  // they need them to live beyond the call.
+  iree_byte_span_t arguments;
+
+  // Storage for the result buffer; assumed undefined and then populated with
+  // data in a format described above. This is required for both the beginning
+  // of function invocation as well as each resume (as any may actually return
+  // control flow).
+  //
+  // Refs contained will be retained in the results buffer and callers must
+  // either move or release them upon return from the call.
+  iree_byte_span_t results;
+} iree_vm_function_call_t;
+
+// Character codes used in version-0 calling convention strings.
+// See iree_vm_function_signature_t::calling_convention for the full format.
+#define IREE_VM_CCONV_TYPE_VOID 'v'
+#define IREE_VM_CCONV_TYPE_I32 'i'
+#define IREE_VM_CCONV_TYPE_I64 'I'
+#define IREE_VM_CCONV_TYPE_F32 'f'
+#define IREE_VM_CCONV_TYPE_F64 'F'
+#define IREE_VM_CCONV_TYPE_REF 'r'
+#define IREE_VM_CCONV_TYPE_SPAN_START 'C'
+#define IREE_VM_CCONV_TYPE_SPAN_END 'D'
+
+// Returns the arguments and results fragments from the function signature.
+// Either may be empty if they have no values.
+//
+// Example:
+//  ``          -> arguments = ``, results = ``
+//  `0`         -> arguments = ``, results = ``
+//  `0v`        -> arguments = ``, results = ``
+//  `0ri`       -> arguments = `ri`, results = ``
+//  `0_ir`      -> arguments = ``, results = `ir`
+//  `0v_ir`     -> arguments = ``, results = `ir`
+//  `0iCiD_rr`  -> arguments = `iCiD`, results = `rr`
+IREE_API_EXPORT iree_status_t iree_vm_function_call_get_cconv_fragments(
+    const iree_vm_function_signature_t* signature,
+    iree_string_view_t* out_arguments, iree_string_view_t* out_results);
+
+// Returns true if the given cconv contains one or more variadic types.
+IREE_API_EXPORT bool iree_vm_function_call_is_variadic_cconv(
+    iree_string_view_t cconv);
+
+// Counts the total number of arguments and results of a function.
+IREE_API_EXPORT iree_status_t iree_vm_function_call_count_arguments_and_results(
+    const iree_vm_function_signature_t* signature,
+    iree_host_size_t* out_argument_count, iree_host_size_t* out_result_count);
+
+// Returns the required size, in bytes, to store the data in the given cconv
+// fragment (like `iICriDr`).
+//
+// The provided |segment_size_list| is used for variadic arguments/results. Each
+// entry represents one of the top level arguments with spans being flattened.
+IREE_API_EXPORT iree_status_t iree_vm_function_call_compute_cconv_fragment_size(
+    iree_string_view_t cconv_fragment,
+    const iree_vm_register_list_t* segment_size_list,
+    iree_host_size_t* out_required_size);
+
+// Releases any retained refs within the call (either arguments or results).
+// This needs only be called if a call fails as implementations are required to
+// clean up the arguments as they are marshaled in and callers are required to
+// clean up the results as they are marshaled out.
+IREE_API_EXPORT void iree_vm_function_call_release(
+    iree_vm_function_call_t* call,
+    const iree_vm_function_signature_t* signature);
+
+// Results of an iree_vm_module_execute request.
+typedef struct iree_vm_execution_result_t {
+  // TODO(benvanik): yield information.
+  // Yield modes:
+  // - yield (yield instruction)
+  // - await (with 1+ wait handles)
+  // - break
+  int reserved;
+} iree_vm_execution_result_t;
+
+//===----------------------------------------------------------------------===//
+// Source locations
+//===----------------------------------------------------------------------===//
+
+// An opaque offset into a source map that a source resolver can calculate.
+// Do not assume that iree_vm_source_offset_t+1 means the next byte offset as
+// backends are free to treat these as everything from pointers to machine code
+// to hash codes.
+typedef int64_t iree_vm_source_offset_t;
+
+// Controls how source locations are formatted into strings.
+enum iree_vm_source_location_format_flag_bits_e {
+  IREE_VM_SOURCE_LOCATION_FORMAT_FLAG_NONE = 0u,
+  // Only formats a single line (excluding \n) for the source location, even
+  // if the full location information (such as a backtrace) is available.
+  IREE_VM_SOURCE_LOCATION_FORMAT_FLAG_SINGLE_LINE = 1u << 0,
+};
+typedef uint32_t iree_vm_source_location_format_flags_t;
+
+// Source location interface.
+typedef struct iree_vm_source_location_t {
+  IREE_API_UNSTABLE
+
+  // Implementation-specified fields. Do not use directly.
+  void* self;
+  uint64_t data[2];
+
+  iree_status_t(IREE_API_PTR* format)(
+      void* self, uint64_t data[2],
+      iree_vm_source_location_format_flags_t flags,
+      iree_string_builder_t* builder);
+} iree_vm_source_location_t;
+
+// Formats the |source_location| to its canonical string form.
+IREE_API_EXPORT iree_status_t
+iree_vm_source_location_format(iree_vm_source_location_t* source_location,
+                               iree_vm_source_location_format_flags_t flags,
+                               iree_string_builder_t* builder);
+
+//===----------------------------------------------------------------------===//
+// iree_vm_module_t
+//===----------------------------------------------------------------------===//
+
+// Indicates an event that can be signaled in modules from the hosting program.
+typedef enum iree_vm_signal_e {
+  // Program is resuming from a suspended state.
+  // Modules may reallocate memory for pools and caches.
+  //
+  // Modules are walked in registration order (A->B->C).
+  IREE_VM_SIGNAL_RESUME = 0,
+
+  // Program is entering a suspended state.
+  // Modules should drop any transient memory that is possible to reallocate
+  // upon resume.
+  //
+  // Modules are walked in reverse registration order (C->B->A).
+  IREE_VM_SIGNAL_SUSPEND = 1,
+
+  // Program has received a low memory alert.
+  // Modules must aggressively drop all possible memory even if expensive to
+  // rematerialize it. On some platforms this is sent as a threat that if
+  // sufficient memory is not unwired/freed ASAP the process will be killed.
+  //
+  // Modules are walked in reverse registration order (C->B->A).
+  IREE_VM_SIGNAL_LOW_MEMORY = 2,
+} iree_vm_signal_t;
+
+// Defines an interface that can be used to reflect and execute functions on a
+// module.
+//
+// Module implementations must be thread-safe as lookups and executions may
+// occur in any order from any thread.
+// TODO(benvanik): version this interface.
+typedef struct iree_vm_module_t {
+  IREE_API_UNSTABLE
+
+  void* self;
+  iree_atomic_ref_count_t ref_count;
+
+  // Destroys |self| when all references to the module have been released.
+  void(IREE_API_PTR* destroy)(void* self);
+
+  // Returns the name of the module (used during resolution).
+  iree_string_view_t(IREE_API_PTR* name)(void* self);
+
+  // Returns the reflected signature of the module.
+  iree_vm_module_signature_t(IREE_API_PTR* signature)(void* self);
+
+  // Gets one or more pieces of function information:
+  // - |out_function| set to the function reference.
+  // - |out_name| set to the function name.
+  // - |out_signature| set to the function signature.
+  iree_status_t(IREE_API_PTR* get_function)(
+      void* self, iree_vm_function_linkage_t linkage, iree_host_size_t ordinal,
+      iree_vm_function_t* out_function, iree_string_view_t* out_name,
+      iree_vm_function_signature_t* out_signature);
+
+  // Looks up a function with the given name and linkage in the module.
+  // This may perform a linear scan and results should be cached.
+  iree_status_t(IREE_API_PTR* lookup_function)(
+      void* self, iree_vm_function_linkage_t linkage, iree_string_view_t name,
+      iree_vm_function_t* out_function);
+
+  // Resolves a stack |frame| from the module to a |out_source_location|, if
+  // debug information is available.
+  iree_status_t(IREE_API_PTR* resolve_source_location)(
+      void* self, iree_vm_stack_frame_t* frame,
+      iree_vm_source_location_t* out_source_location);
+
+  // Allocates module state data.
+  iree_status_t(IREE_API_PTR* alloc_state)(
+      void* self, iree_allocator_t allocator,
+      iree_vm_module_state_t** out_module_state);
+
+  // Frees module state data.
+  void(IREE_API_PTR* free_state)(void* self,
+                                 iree_vm_module_state_t* module_state);
+
+  // Resolves the import with the given ordinal to |function|.
+  // The function is guaranteed to remain valid for the lifetime of the module
+  // state.
+  iree_status_t(IREE_API_PTR* resolve_import)(
+      void* self, iree_vm_module_state_t* module_state,
+      iree_host_size_t ordinal, const iree_vm_function_t* function,
+      const iree_vm_function_signature_t* signature);
+
+  // Notifies the module of a system signal.
+  iree_status_t(IREE_API_PTR* notify)(void* self,
+                                      iree_vm_module_state_t* module_state,
+                                      iree_vm_signal_t signal);
+
+  // Begins a function call with the given |call| arguments.
+  // Execution may yield in the case of asynchronous code and require one or
+  // more calls to the resume method to complete.
+  iree_status_t(IREE_API_PTR* begin_call)(
+      void* self, iree_vm_stack_t* stack, const iree_vm_function_call_t* call,
+      iree_vm_execution_result_t* out_result);
+
+  // Resumes execution of a previously-yielded call.
+  iree_status_t(IREE_API_PTR* resume_call)(
+      void* self, iree_vm_stack_t* stack,
+      iree_vm_execution_result_t* out_result);
+
+  // TODO(benvanik): move this/refactor.
+  // Gets a reflection attribute for a function by index.
+  // The returned key and value strings are guaranteed valid for the life
+  // of the module. Note that not all modules and functions have reflection
+  // attributes.
+  // Returns IREE_STATUS_NOT_FOUND if index >= the number of attributes for
+  // the function.
+  // See: docs/developers/design_docs/function_abi.md
+  iree_status_t(IREE_API_PTR* get_function_reflection_attr)(
+      void* self, iree_vm_function_linkage_t linkage, iree_host_size_t ordinal,
+      iree_host_size_t index, iree_string_view_t* key,
+      iree_string_view_t* value);
+} iree_vm_module_t;
+
+// Initializes the interface of a module handle.
+// This should be called by module implementations after they allocate
+// themselves to properly initialize the module interface prior to populating
+// interface function pointers. This ensures that version adaptation can be
+// performed by the library as needed.
+// TODO(benvanik): version/module size.
+IREE_API_EXPORT iree_status_t
+iree_vm_module_initialize(iree_vm_module_t* module, void* self);
+
+// Retains the given |module| for the caller.
+IREE_API_EXPORT void iree_vm_module_retain(iree_vm_module_t* module);
+
+// Releases the given |module| from the caller.
+IREE_API_EXPORT void iree_vm_module_release(iree_vm_module_t* module);
+
+// Returns the name of the module (used during resolution).
+IREE_API_EXPORT iree_string_view_t
+iree_vm_module_name(const iree_vm_module_t* module);
+
+// Returns the signature of the module describing the contents.
+IREE_API_EXPORT iree_vm_module_signature_t
+iree_vm_module_signature(const iree_vm_module_t* module);
+
+// Looks up a function with the given name and linkage in the |module|.
+// This may perform a linear scan and results should be cached.
+IREE_API_EXPORT iree_status_t iree_vm_module_lookup_function_by_name(
+    const iree_vm_module_t* module, iree_vm_function_linkage_t linkage,
+    iree_string_view_t name, iree_vm_function_t* out_function);
+
+// Looks up a function with the given ordinal and linkage in the |module|.
+IREE_API_EXPORT iree_status_t iree_vm_module_lookup_function_by_ordinal(
+    const iree_vm_module_t* module, iree_vm_function_linkage_t linkage,
+    iree_host_size_t ordinal, iree_vm_function_t* out_function);
+
+// Resolves a stack |frame| from the module to a |out_source_location|, if
+// debug information is available.
+IREE_API_EXPORT iree_status_t iree_vm_module_resolve_source_location(
+    const iree_vm_module_t* module, iree_vm_stack_frame_t* frame,
+    iree_vm_source_location_t* out_source_location);
+
+// Returns the name of the given function or empty string if not available.
+IREE_API_EXPORT iree_string_view_t
+iree_vm_function_name(const iree_vm_function_t* function);
+
+// Returns the signature of the function if reflection metadata is available.
+IREE_API_EXPORT iree_vm_function_signature_t
+iree_vm_function_signature(const iree_vm_function_t* function);
+
+// Returns a value for the given reflection attribute |key|, if found.
+// Returns the empty string if the reflection data in general or the specific
+// key is not found.
+//
+// See: docs/developers/design_docs/function_abi.md for documentation on the
+// ABI.
+IREE_API_EXPORT iree_string_view_t iree_vm_function_reflection_attr(
+    const iree_vm_function_t* function, iree_string_view_t key);
+
+// TODO(#1979): remove this and use iree_vm_function_reflection_attr.
+// Gets a reflection attribute for a function by index.
+// The returned key and value strings are guaranteed valid for the life
+// of the module. Note that not all modules and functions have reflection
+// attributes.
+// Returns IREE_STATUS_NOT_FOUND if index >= the number of attributes for
+// the function.
+// See: docs/developers/design_docs/function_abi.md
+IREE_API_EXPORT iree_status_t iree_vm_get_function_reflection_attr(
+    iree_vm_function_t function, iree_host_size_t index,
+    iree_string_view_t* key, iree_string_view_t* value);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_MODULE_H_
diff --git a/runtime/src/iree/vm/module_impl_emitc.c b/runtime/src/iree/vm/module_impl_emitc.c
new file mode 100644
index 0000000..c03694c
--- /dev/null
+++ b/runtime/src/iree/vm/module_impl_emitc.c
@@ -0,0 +1,7 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// EMITC_IMPLEMENTATION is expected to be defined by the build system (e.g.
+// -DEMITC_IMPLEMENTATION="...") and names the emitc-generated module source;
+// this file exists only to give that generated source a translation unit.
+// NOTE(review): confirm where the build defines the macro.
+#include EMITC_IMPLEMENTATION
diff --git a/runtime/src/iree/vm/native_module.c b/runtime/src/iree/vm/native_module.c
new file mode 100644
index 0000000..eff076a
--- /dev/null
+++ b/runtime/src/iree/vm/native_module.c
@@ -0,0 +1,449 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/native_module.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/vm/stack.h"
+
+// Native module implementation allocated for all modules.
+typedef struct iree_vm_native_module_t {
+  // Interface containing default function pointers.
+  // base_interface.self will be the self pointer to iree_vm_native_module_t.
+  //
+  // Must be first in the struct as we dereference the interface to find our
+  // members below.
+  iree_vm_module_t base_interface;
+
+  // Interface with optional user-provided function pointers.
+  // Copied by value from the |interface| passed to create/initialize; each
+  // thunk below consults this first and falls back to a default behavior.
+  iree_vm_module_t user_interface;
+
+  // The self passed to user_interface functions: the user-provided
+  // user_interface.self when one was supplied, and this module otherwise.
+  void* self;
+
+  // Allocator this module was allocated with and must be freed with.
+  iree_allocator_t allocator;
+
+  // Module descriptor used for reflection.
+  const iree_vm_native_module_descriptor_t* descriptor;
+} iree_vm_native_module_t;
+
+// Returns the byte size callers must reserve when embedding a native module
+// and setting it up in-place via iree_vm_native_module_initialize.
+IREE_API_EXPORT iree_host_size_t iree_vm_native_module_size(void) {
+  return sizeof(iree_vm_native_module_t);
+}
+
+// Validates |module_descriptor| invariants. In release builds (NDEBUG) this is
+// a no-op: native modules are compiled in, so their tables are trusted.
+#if defined(NDEBUG)
+static iree_status_t iree_vm_native_module_verify_descriptor(
+    const iree_vm_native_module_descriptor_t* module_descriptor) {
+  return iree_ok_status();
+}
+#else
+static iree_status_t iree_vm_native_module_verify_descriptor(
+    const iree_vm_native_module_descriptor_t* module_descriptor) {
+  // Verify the export table is sorted by name. This will help catch issues with
+  // people appending to tables instead of inserting in the proper order.
+  // Sorted order is required by the binary search in
+  // iree_vm_native_module_lookup_function.
+  for (iree_host_size_t i = 1; i < module_descriptor->export_count; ++i) {
+    iree_string_view_t prev_export_name =
+        module_descriptor->exports[i - 1].local_name;
+    iree_string_view_t export_name = module_descriptor->exports[i].local_name;
+    int cmp = iree_string_view_compare(prev_export_name, export_name);
+    if (IREE_UNLIKELY(cmp >= 0)) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "module export table is not sorted by name "
+                              "(export %zu ('%.*s') >= %zu ('%.*s'))",
+                              i - 1, (int)prev_export_name.size,
+                              prev_export_name.data, i, (int)export_name.size,
+                              export_name.data);
+    }
+  }
+  return iree_ok_status();
+}
+#endif  // NDEBUG
+
+// Tears down the module: runs the optional user destroy hook and then frees
+// the module storage.
+static void IREE_API_PTR iree_vm_native_module_destroy(void* self) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  // Copy the allocator to a local first: freeing |module| below invalidates
+  // the struct that holds it.
+  iree_allocator_t allocator = module->allocator;
+
+  // Destroy the optional user-provided self.
+  if (module->user_interface.destroy) {
+    module->user_interface.destroy(module->self);
+  }
+
+  // NOTE(review): for modules set up via iree_vm_native_module_initialize this
+  // frees caller-managed storage with the allocator captured there — confirm
+  // such callers pass an allocator whose free is appropriate for the storage.
+  iree_allocator_free(allocator, module);
+}
+
+// Returns the module name, preferring a user-provided override and otherwise
+// reporting the name from the descriptor tables.
+static iree_string_view_t IREE_API_PTR iree_vm_native_module_name(void* self) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  return module->user_interface.name
+             ? module->user_interface.name(module->self)
+             : module->descriptor->module_name;
+}
+
+// Returns the module signature, preferring a user-provided override and
+// otherwise deriving the counts from the descriptor tables.
+static iree_vm_module_signature_t IREE_API_PTR
+iree_vm_native_module_signature(void* self) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  if (module->user_interface.signature) {
+    return module->user_interface.signature(module->self);
+  }
+  // Designated initializer zero-fills all remaining fields; native modules
+  // track no internal functions.
+  iree_vm_module_signature_t signature = {
+      .import_function_count = module->descriptor->import_count,
+      .export_function_count = module->descriptor->export_count,
+      .internal_function_count = 0,
+  };
+  return signature;
+}
+
+// Resolves import metadata for |ordinal| from the descriptor tables.
+// Any of |out_function|/|out_name|/|out_signature| may be NULL when the caller
+// does not need that piece of information.
+static iree_status_t IREE_API_PTR iree_vm_native_module_get_import_function(
+    iree_vm_native_module_t* module, iree_host_size_t ordinal,
+    iree_vm_function_t* out_function, iree_string_view_t* out_name,
+    iree_vm_function_signature_t* out_signature) {
+  if (IREE_UNLIKELY(ordinal >= module->descriptor->import_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "import ordinal out of range (0 < %zu < %zu)",
+                            ordinal, module->descriptor->import_count);
+  }
+  const iree_vm_native_import_descriptor_t* import_descriptor =
+      &module->descriptor->imports[ordinal];
+  if (out_function) {
+    out_function->module = &module->base_interface;
+    // Map the OPTIONAL descriptor flag onto the optional import linkage.
+    out_function->linkage = iree_all_bits_set(import_descriptor->flags,
+                                              IREE_VM_NATIVE_IMPORT_OPTIONAL)
+                                ? IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL
+                                : IREE_VM_FUNCTION_LINKAGE_IMPORT;
+    out_function->ordinal = (uint16_t)ordinal;
+  }
+  if (out_name) {
+    *out_name = import_descriptor->full_name;
+  }
+  // TODO(#1979): signature queries when info is useful.
+  return iree_ok_status();
+}
+
+// Resolves export metadata for |ordinal| from the descriptor tables.
+// Any of |out_function|/|out_name|/|out_signature| may be NULL when the caller
+// does not need that piece of information.
+static iree_status_t IREE_API_PTR iree_vm_native_module_get_export_function(
+    iree_vm_native_module_t* module, iree_host_size_t ordinal,
+    iree_vm_function_t* out_function, iree_string_view_t* out_name,
+    iree_vm_function_signature_t* out_signature) {
+  if (IREE_UNLIKELY(ordinal >= module->descriptor->export_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "export ordinal out of range (0 < %zu < %zu)",
+                            ordinal, module->descriptor->export_count);
+  }
+  const iree_vm_native_export_descriptor_t* export_descriptor =
+      &module->descriptor->exports[ordinal];
+  // Populate only the outputs the caller asked for.
+  if (out_function) {
+    out_function->module = &module->base_interface;
+    out_function->linkage = IREE_VM_FUNCTION_LINKAGE_EXPORT;
+    out_function->ordinal = (uint16_t)ordinal;
+  }
+  if (out_name) {
+    *out_name = export_descriptor->local_name;
+  }
+  if (out_signature) {
+    out_signature->calling_convention = export_descriptor->calling_convention;
+  }
+  return iree_ok_status();
+}
+
+// Routes a function metadata query to the user override (if any) or to the
+// import/export resolvers; internal functions are not supported.
+static iree_status_t IREE_API_PTR iree_vm_native_module_get_function(
+    void* self, iree_vm_function_linkage_t linkage, iree_host_size_t ordinal,
+    iree_vm_function_t* out_function, iree_string_view_t* out_name,
+    iree_vm_function_signature_t* out_signature) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  // Zero all requested outputs up front so failures never leak stale data.
+  if (out_function) memset(out_function, 0, sizeof(*out_function));
+  if (out_name) memset(out_name, 0, sizeof(*out_name));
+  if (out_signature) memset(out_signature, 0, sizeof(*out_signature));
+  if (module->user_interface.get_function) {
+    return module->user_interface.get_function(
+        module->self, linkage, ordinal, out_function, out_name, out_signature);
+  }
+  if (linkage == IREE_VM_FUNCTION_LINKAGE_IMPORT ||
+      linkage == IREE_VM_FUNCTION_LINKAGE_IMPORT_OPTIONAL) {
+    return iree_vm_native_module_get_import_function(
+        module, ordinal, out_function, out_name, out_signature);
+  }
+  if (linkage == IREE_VM_FUNCTION_LINKAGE_EXPORT) {
+    return iree_vm_native_module_get_export_function(
+        module, ordinal, out_function, out_name, out_signature);
+  }
+  return iree_make_status(
+      IREE_STATUS_UNIMPLEMENTED,
+      "native modules do not support internal function queries");
+}
+
+// Function-level reflection query thunk; only functional when the user
+// interface supplies an implementation.
+static iree_status_t IREE_API_PTR
+iree_vm_native_module_get_function_reflection_attr(
+    void* self, iree_vm_function_linkage_t linkage, iree_host_size_t ordinal,
+    iree_host_size_t index, iree_string_view_t* key,
+    iree_string_view_t* value) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  if (module->user_interface.get_function_reflection_attr) {
+    return module->user_interface.get_function_reflection_attr(
+        module->self, linkage, ordinal, index, key, value);
+  }
+  // TODO(benvanik): implement native module reflection.
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "reflection not yet implemented");
+}
+
+// Looks up an exported function by |name| via binary search over the export
+// table. Requires the exports to be sorted by name, which debug builds verify
+// in iree_vm_native_module_verify_descriptor.
+static iree_status_t IREE_API_PTR iree_vm_native_module_lookup_function(
+    void* self, iree_vm_function_linkage_t linkage, iree_string_view_t name,
+    iree_vm_function_t* out_function) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  memset(out_function, 0, sizeof(*out_function));
+  if (module->user_interface.lookup_function) {
+    return module->user_interface.lookup_function(module->self, linkage, name,
+                                                  out_function);
+  }
+
+  if (IREE_UNLIKELY(linkage != IREE_VM_FUNCTION_LINKAGE_EXPORT)) {
+    // NOTE: we could support imports if required.
+    return iree_make_status(
+        IREE_STATUS_UNIMPLEMENTED,
+        "native modules do not support import/internal function queries");
+  }
+
+  // Binary search through the export descriptors.
+  // NOTE: signed ptrdiff_t math so that an empty export table produces
+  // max_ordinal == -1 and the loop body is skipped entirely.
+  ptrdiff_t min_ordinal = 0;
+  ptrdiff_t max_ordinal = module->descriptor->export_count - 1;
+  const iree_vm_native_export_descriptor_t* exports =
+      module->descriptor->exports;
+  while (min_ordinal <= max_ordinal) {
+    ptrdiff_t ordinal = (min_ordinal + max_ordinal) / 2;
+    int cmp = iree_string_view_compare(exports[ordinal].local_name, name);
+    if (cmp == 0) {
+      // Found: let get_function fill in the canonical function reference.
+      return iree_vm_native_module_get_function(self, linkage, ordinal,
+                                                out_function, NULL, NULL);
+    } else if (cmp < 0) {
+      min_ordinal = ordinal + 1;
+    } else {
+      max_ordinal = ordinal - 1;
+    }
+  }
+  return iree_make_status(
+      IREE_STATUS_NOT_FOUND, "no function %.*s.%.*s exported by module",
+      (int)module->descriptor->module_name.size,
+      module->descriptor->module_name.data, (int)name.size, name.data);
+}
+
+// Allocates per-context module state. Native modules are stateless by default;
+// implementations opt in by providing user_interface.alloc_state.
+static iree_status_t IREE_API_PTR
+iree_vm_native_module_alloc_state(void* self, iree_allocator_t allocator,
+                                  iree_vm_module_state_t** out_module_state) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  *out_module_state = NULL;
+  return module->user_interface.alloc_state
+             ? module->user_interface.alloc_state(module->self, allocator,
+                                                  out_module_state)
+             : iree_ok_status();
+}
+
+// Frees per-context module state previously produced by alloc_state.
+static void IREE_API_PTR iree_vm_native_module_free_state(
+    void* self, iree_vm_module_state_t* module_state) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  if (module->user_interface.free_state) {
+    module->user_interface.free_state(module->self, module_state);
+    return;
+  }
+  // No-op in the default implementation.
+  // The default alloc_state produced no state, so anything non-NULL here
+  // indicates a mismatched alloc/free pairing.
+  // TODO(#2843): IREE_DCHECK_EQ(NULL, module_state);
+  assert(!module_state);
+}
+
+// Binds import |ordinal| to |function|. The default implementation rejects all
+// imports; modules that declare imports must supply resolve_import in the user
+// interface.
+static iree_status_t IREE_API_PTR iree_vm_native_module_resolve_import(
+    void* self, iree_vm_module_state_t* module_state, iree_host_size_t ordinal,
+    const iree_vm_function_t* function,
+    const iree_vm_function_signature_t* signature) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  if (module->user_interface.resolve_import) {
+    return module->user_interface.resolve_import(module->self, module_state,
+                                                 ordinal, function, signature);
+  }
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "native module does not support imports");
+}
+
+// Delivers a lifecycle signal to the module. Signals are optional: without a
+// user hook they are simply acknowledged as OK.
+static iree_status_t IREE_API_PTR iree_vm_native_module_notify(
+    void* self, iree_vm_module_state_t* module_state, iree_vm_signal_t signal) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  return module->user_interface.notify
+             ? module->user_interface.notify(module->self, module_state, signal)
+             : iree_ok_status();
+}
+
+// Default call entry point: validates the requested export, enters a VM stack
+// frame, and dispatches through the descriptor's shim/target function pair.
+static iree_status_t IREE_API_PTR iree_vm_native_module_begin_call(
+    void* self, iree_vm_stack_t* stack, const iree_vm_function_call_t* call,
+    iree_vm_execution_result_t* out_result) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  if (IREE_UNLIKELY(call->function.linkage !=
+                    IREE_VM_FUNCTION_LINKAGE_EXPORT) ||
+      IREE_UNLIKELY(call->function.ordinal >=
+                    module->descriptor->export_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "function ordinal out of bounds: 0 < %u < %zu",
+                            call->function.ordinal,
+                            module->descriptor->export_count);
+  }
+  if (module->user_interface.begin_call) {
+    return module->user_interface.begin_call(module->self, stack, call,
+                                             out_result);
+  }
+
+  // NOTE: VM stack is currently unused. We could stash things here for the
+  // debugger or use it for coroutine state.
+  iree_host_size_t frame_size = 0;
+
+  iree_vm_stack_frame_t* callee_frame = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_stack_function_enter(
+      stack, &call->function, IREE_VM_STACK_FRAME_NATIVE, frame_size,
+      /*frame_cleanup_fn=*/NULL, &callee_frame));
+
+  // Call the target function using the shim.
+  const iree_vm_native_function_ptr_t* function_ptr =
+      &module->descriptor->functions[call->function.ordinal];
+  iree_vm_module_state_t* module_state = callee_frame->module_state;
+  iree_status_t status = function_ptr->shim(stack, call, function_ptr->target,
+                                            module, module_state, out_result);
+  // NOTE(review): on failure the callee frame is not popped before returning —
+  // presumably deliberate so the failing frame stays visible on the stack;
+  // confirm against iree_vm_stack semantics.
+  if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+#if IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS
+    // Attach 'module.function' context to the status to aid debugging;
+    // compiled out when status annotations are disabled.
+    iree_string_view_t module_name IREE_ATTRIBUTE_UNUSED =
+        iree_vm_native_module_name(module);
+    iree_string_view_t function_name IREE_ATTRIBUTE_UNUSED =
+        iree_string_view_empty();
+    iree_status_ignore(iree_vm_native_module_get_export_function(
+        module, call->function.ordinal, NULL, &function_name, NULL));
+    return iree_status_annotate_f(status,
+                                  "while invoking native function %.*s.%.*s",
+                                  (int)module_name.size, module_name.data,
+                                  (int)function_name.size, function_name.data);
+#else
+    return status;
+#endif  // IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS
+  }
+
+  return iree_vm_stack_function_leave(stack);
+}
+
+// Resumes a previously-yielded call. The default implementation never yields,
+// so resume is only meaningful when the user interface provides it.
+static iree_status_t IREE_API_PTR
+iree_vm_native_module_resume_call(void* self, iree_vm_stack_t* stack,
+                                  iree_vm_execution_result_t* out_result) {
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
+  if (module->user_interface.resume_call) {
+    return module->user_interface.resume_call(module->self, stack, out_result);
+  }
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                          "native module does not support resume");
+}
+
+// Allocates storage for a native module with |allocator| and initializes it.
+// NOTE(review): the interface/descriptor validation and verify call below are
+// repeated inside iree_vm_native_module_initialize; harmless but could be
+// consolidated there.
+IREE_API_EXPORT iree_status_t iree_vm_native_module_create(
+    const iree_vm_module_t* interface,
+    const iree_vm_native_module_descriptor_t* module_descriptor,
+    iree_allocator_t allocator, iree_vm_module_t** out_module) {
+  IREE_ASSERT_ARGUMENT(out_module);
+  *out_module = NULL;
+
+  // Either a user begin_call or a function pointer table must be present so
+  // calls have somewhere to go.
+  if (IREE_UNLIKELY(!interface->begin_call) &&
+      IREE_UNLIKELY(!module_descriptor->functions)) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "native modules must provide call support or function pointers");
+  } else if (IREE_UNLIKELY(!interface->begin_call) &&
+             IREE_UNLIKELY(module_descriptor->export_count !=
+                           module_descriptor->function_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "native modules using the default call support "
+                            "must have 1:1 exports:function pointers");
+  }
+
+  // Perform some optional debug-only verification of the descriptor.
+  // Since native modules are designed to be compiled in we don't need to do
+  // this in release builds.
+  IREE_RETURN_IF_ERROR(
+      iree_vm_native_module_verify_descriptor(module_descriptor));
+
+  // TODO(benvanik): invert allocation such that caller allocates and we init.
+  // This would avoid the need for any dynamic memory allocation in the common
+  // case as the outer user module interface could nest us. Note that we'd need
+  // to expose this via a query_size function so that we could adjust the size
+  // of our storage independent of the definition of the user module.
+  iree_vm_native_module_t* module = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, sizeof(*module), (void**)&module));
+
+  iree_status_t status = iree_vm_native_module_initialize(
+      interface, module_descriptor, allocator, (iree_vm_module_t*)module);
+  if (!iree_status_is_ok(status)) {
+    // Initialization failed before the module could own its storage; release
+    // it here to avoid leaking.
+    iree_allocator_free(allocator, module);
+    return status;
+  }
+
+  *out_module = &module->base_interface;
+  return iree_ok_status();
+}
+
+// Initializes caller-provided |base_module| storage (which must be at least
+// iree_vm_native_module_size() bytes) without allocating.
+// NOTE(review): |allocator| is captured and later used by destroy to free the
+// storage — callers with static/embedded storage must pass an allocator whose
+// free is a no-op for it; confirm expected usage.
+IREE_API_EXPORT iree_status_t iree_vm_native_module_initialize(
+    const iree_vm_module_t* interface,
+    const iree_vm_native_module_descriptor_t* module_descriptor,
+    iree_allocator_t allocator, iree_vm_module_t* base_module) {
+  IREE_ASSERT_ARGUMENT(interface);
+  IREE_ASSERT_ARGUMENT(module_descriptor);
+  IREE_ASSERT_ARGUMENT(base_module);
+  iree_vm_native_module_t* module = (iree_vm_native_module_t*)base_module;
+
+  // Either a user begin_call or a function pointer table must be present so
+  // calls have somewhere to go.
+  if (IREE_UNLIKELY(!interface->begin_call) &&
+      IREE_UNLIKELY(!module_descriptor->functions)) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "native modules must provide call support or function pointers");
+  } else if (IREE_UNLIKELY(!interface->begin_call) &&
+             IREE_UNLIKELY(module_descriptor->export_count !=
+                           module_descriptor->function_count)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "native modules using the default call support "
+                            "must have 1:1 exports:function pointers");
+  }
+
+  // Perform some optional debug-only verification of the descriptor.
+  // Since native modules are designed to be compiled in we don't need to do
+  // this in release builds.
+  IREE_RETURN_IF_ERROR(
+      iree_vm_native_module_verify_descriptor(module_descriptor));
+  module->allocator = allocator;
+  module->descriptor = module_descriptor;
+
+  // TODO(benvanik): version interface and copy only valid bytes.
+  // Snapshot the user interface by value; the |interface| struct itself need
+  // not outlive this call.
+  memcpy(&module->user_interface, interface, sizeof(*interface));
+  module->self =
+      module->user_interface.self ? module->user_interface.self : module;
+
+  // Base interface that routes through our thunks.
+  iree_vm_module_initialize(&module->base_interface, module);
+  module->base_interface.destroy = iree_vm_native_module_destroy;
+  module->base_interface.name = iree_vm_native_module_name;
+  module->base_interface.signature = iree_vm_native_module_signature;
+  module->base_interface.get_function = iree_vm_native_module_get_function;
+  module->base_interface.get_function_reflection_attr =
+      iree_vm_native_module_get_function_reflection_attr;
+  module->base_interface.lookup_function =
+      iree_vm_native_module_lookup_function;
+  module->base_interface.alloc_state = iree_vm_native_module_alloc_state;
+  module->base_interface.free_state = iree_vm_native_module_free_state;
+  module->base_interface.resolve_import = iree_vm_native_module_resolve_import;
+  module->base_interface.notify = iree_vm_native_module_notify;
+  module->base_interface.begin_call = iree_vm_native_module_begin_call;
+  module->base_interface.resume_call = iree_vm_native_module_resume_call;
+
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree/vm/native_module.h b/runtime/src/iree/vm/native_module.h
new file mode 100644
index 0000000..a569452
--- /dev/null
+++ b/runtime/src/iree/vm/native_module.h
@@ -0,0 +1,136 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// NOTE: native_module_test.h contains documented examples of how to use this!
+
+#ifndef IREE_VM_NATIVE_MODULE_H_
+#define IREE_VM_NATIVE_MODULE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/vm/module.h"
+#include "iree/vm/stack.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Bitmask flags controlling how an import is resolved.
+// Only the OPTIONAL bit is consulted by the default implementation in
+// native_module.c, where it maps to the optional import linkage.
+enum iree_vm_native_import_flag_bits_e {
+  IREE_VM_NATIVE_IMPORT_REQUIRED = 1u << 0,
+  IREE_VM_NATIVE_IMPORT_OPTIONAL = 1u << 1,
+};
+typedef uint32_t iree_vm_native_import_flags_t;
+
+// Describes an imported native function in a native module.
+// All of this information is assumed read-only and will be referenced for the
+// lifetime of any module created with the descriptor.
+typedef struct iree_vm_native_import_descriptor_t {
+  // Flags controlling import resolution.
+  iree_vm_native_import_flags_t flags;
+  // Fully-qualified function name (for example, 'other_module.foo').
+  iree_string_view_t full_name;
+} iree_vm_native_import_descriptor_t;
+
+// Describes an exported native function in a native module.
+// All of this information is assumed read-only and will be referenced for the
+// lifetime of any module created with the descriptor.
+typedef struct iree_vm_native_export_descriptor_t {
+  // Module-local function name (for example, 'foo' for function 'module.foo').
+  iree_string_view_t local_name;
+
+  // Calling convention string; see iree/vm/module.h for details.
+  iree_string_view_t calling_convention;
+
+  // An optional list of function-level reflection attributes.
+  iree_host_size_t reflection_attr_count;
+  const iree_vm_reflection_attr_t* reflection_attrs;
+} iree_vm_native_export_descriptor_t;
+
+// Implementation function invoked by a shim with the module and its
+// per-context state.
+typedef iree_status_t(IREE_API_PTR* iree_vm_native_function_target_t)(
+    iree_vm_stack_t* stack, void* module, void* module_state);
+
+// Adapter that bridges the VM calling convention in |call| to a target
+// function, invoking |target_fn| on the caller's behalf.
+typedef iree_status_t(IREE_API_PTR* iree_vm_native_function_shim_t)(
+    iree_vm_stack_t* stack, const iree_vm_function_call_t* call,
+    iree_vm_native_function_target_t target_fn, void* module,
+    void* module_state, iree_vm_execution_result_t* out_result);
+
+// An entry in the function pointer table.
+typedef struct iree_vm_native_function_ptr_t {
+  // A shim function that takes the VM ABI and maps it to the target ABI.
+  iree_vm_native_function_shim_t shim;
+  // Target function passed to the shim.
+  iree_vm_native_function_target_t target;
+} iree_vm_native_function_ptr_t;
+
+// Describes a native module implementation by way of descriptor tables.
+// All of this information is assumed read-only and will be referenced for the
+// lifetime of any module created with the descriptor.
+//
+// The common native module code will use this descriptor to return metadata on
+// query, lookup exported functions, and call module-provided implementation
+// functions for state and call management.
+typedef struct iree_vm_native_module_descriptor_t {
+  IREE_API_UNSTABLE
+
+  // Name of the module prefixed on all exported functions.
+  iree_string_view_t module_name;
+
+  // All imported function descriptors.
+  // interface.resolve_import will be called for each import.
+  // Imports must be in order sorted by name compatible with
+  // iree_string_view_compare.
+  iree_host_size_t import_count;
+  const iree_vm_native_import_descriptor_t* imports;
+
+  // All exported function descriptors.
+  // Exports must be in order sorted by name compatible with
+  // iree_string_view_compare (required for the binary-search lookup).
+  iree_host_size_t export_count;
+  const iree_vm_native_export_descriptor_t* exports;
+
+  // All function shims and target function pointers.
+  // These must match 1:1 with the exports if using the default begin_call
+  // implementation and are optional if overriding begin_call.
+  iree_host_size_t function_count;
+  const iree_vm_native_function_ptr_t* functions;
+
+  // An optional list of module-level reflection attributes.
+  iree_host_size_t reflection_attr_count;
+  const iree_vm_reflection_attr_t* reflection_attrs;
+} iree_vm_native_module_descriptor_t;
+
+// Returns the size, in bytes, of the allocation required for native modules.
+// Callers may allocate more memory if they need additional storage.
+IREE_API_EXPORT iree_host_size_t iree_vm_native_module_size(void);
+
+// Creates a new native module with the metadata tables in |descriptor|.
+// These tables will be used for reflection and function lookup, and the
+// provided function pointers will be called when state needs to be managed or
+// exported functions need to be called.
+//
+// An implementation |interface| providing functions for state management and
+// function calls can be provided to override default implementations of
+// functions. The structure will be copied and the self pointer will be passed
+// to all |interface| functions.
+//
+// The provided |descriptor| will be referenced by the created module and must
+// be kept live for the lifetime of the module.
+IREE_API_EXPORT iree_status_t iree_vm_native_module_create(
+    const iree_vm_module_t* interface,
+    const iree_vm_native_module_descriptor_t* module_descriptor,
+    iree_allocator_t allocator, iree_vm_module_t** out_module);
+
+// Initializes a caller-allocated native module in-place. |module| must point
+// to at least iree_vm_native_module_size() bytes of storage. Behaves like
+// iree_vm_native_module_create but performs no allocation; see that function
+// for the |interface| and |module_descriptor| contract.
+IREE_API_EXPORT iree_status_t iree_vm_native_module_initialize(
+    const iree_vm_module_t* interface,
+    const iree_vm_native_module_descriptor_t* module_descriptor,
+    iree_allocator_t allocator, iree_vm_module_t* module);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_NATIVE_MODULE_H_
diff --git a/runtime/src/iree/vm/native_module_benchmark.cc b/runtime/src/iree/vm/native_module_benchmark.cc
new file mode 100644
index 0000000..14da29b
--- /dev/null
+++ b/runtime/src/iree/vm/native_module_benchmark.cc
@@ -0,0 +1,19 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "benchmark/benchmark.h"
+#include "iree/base/api.h"
+#include "iree/base/logging.h"
+#include "iree/vm/module.h"
+#include "iree/vm/native_module.h"
+#include "iree/vm/native_module_test.h"
+#include "iree/vm/stack.h"
+
+namespace {
+
+// TODO(benvanik): native module benchmarks.
+
+}  // namespace
diff --git a/runtime/src/iree/vm/native_module_cc.h b/runtime/src/iree/vm/native_module_cc.h
new file mode 100644
index 0000000..015fdb8
--- /dev/null
+++ b/runtime/src/iree/vm/native_module_cc.h
@@ -0,0 +1,263 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_NATIVE_MODULE_CC_H_
+#define IREE_VM_NATIVE_MODULE_CC_H_
+
+#include <cstring>
+#include <memory>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/span.h"
+#include "iree/base/status_cc.h"
+#include "iree/vm/module.h"
+#include "iree/vm/native_module_packing.h"  // IWYU pragma: export
+#include "iree/vm/stack.h"
+
+#ifndef __cplusplus
+#error "This header is meant for use with C++ module implementations."
+#endif  // __cplusplus
+
+namespace iree {
+namespace vm {
+
+// A native module as exported to the VM dynamic module linking API.
+// This allows easy wrapping of C++ module implementations and removes a
+// majority of the boilerplate required with marshaling args/results out/in of
+// the VM via the ABI.
+//
+// Functions are defined on the State type as member functions returning either
+// Status or StatusOr. Arguments are passed as primitive types (int32_t),
+// wrapped ref objects (vm::ref<my_type_t>&), or some nesting of std::array,
+// std::tuple, and std::span to match fixed-length arrays of the same type,
+// tuples of mixed types, or dynamic arrays (variadic arguments). Results may be
+// returned as either their type or an std::tuple/std::array of types.
+//
+// Usage:
+//   // Per-context module state that must only be thread-compatible.
+//   // Define
+//   struct MyState final {
+//     StatusOr<std::tuple<int32_t, int32_t>> MyMethod1(vm::ref<my_type_t> t);
+//   };
+//
+//   // Table of functions mapped to their name in the IR.
+//   static const vm::NativeFunction<MyState> kMyFunctions[] = {
+//     vm::MakeNativeFunction("my_method_1", &MyState::MyMethod1),
+//   };
+//
+//   // The outer module wrapper shared across contexts.
+//   // Must be thread-safe.
+//   struct MyModule : public NativeModule<MyState> {
+//     StatusOr<std::unique_ptr<MyState>> CreateState(iree_allocator_t) {
+//       // You could pass in thread-safe shared resources to MyState.
+//       return std::make_unique<MyState>();
+//     }
+//   };
+//
+//   // Creates the module and exposes it as a C interface.
+//   // Ownership transfers to the caller.
+//   iree_vm_module_t* create_my_module(iree_allocator_t allocator) {
+//     return std::make_unique<MyModule>("my_module", allocator,
+//         std::span{kCustomModuleFunctions}).release()->interface();
+//   }
+template <typename State>
+class NativeModule {
+ public:
+  // Wires the C module function table up to the static trampolines below so
+  // the VM can call back into this instance. |name| and |dispatch_table| are
+  // referenced (not copied) and must outlive the module.
+  NativeModule(const char* name, iree_allocator_t allocator,
+               iree::span<const NativeFunction<State>> dispatch_table)
+      : name_(name), allocator_(allocator), dispatch_table_(dispatch_table) {
+    IREE_CHECK_OK(iree_vm_module_initialize(&interface_, this));
+    interface_.destroy = NativeModule::ModuleDestroy;
+    interface_.name = NativeModule::ModuleName;
+    interface_.signature = NativeModule::ModuleSignature;
+    interface_.get_function = NativeModule::ModuleGetFunction;
+    interface_.lookup_function = NativeModule::ModuleLookupFunction;
+    interface_.alloc_state = NativeModule::ModuleAllocState;
+    interface_.free_state = NativeModule::ModuleFreeState;
+    interface_.resolve_import = NativeModule::ModuleResolveImport;
+    interface_.notify = NativeModule::ModuleNotify;
+    interface_.begin_call = NativeModule::ModuleBeginCall;
+  }
+
+  virtual ~NativeModule() = default;
+
+  // C API module interface bound to this NativeModule instance.
+  iree_vm_module_t* interface() { return &interface_; }
+
+ protected:
+  // Creates a new per-context module State holder.
+  virtual StatusOr<std::unique_ptr<State>> CreateState(
+      iree_allocator_t allocator) = 0;
+
+  // Notifies the module a signal has been raised.
+  virtual Status Notify(State* state, iree_vm_signal_t signal) {
+    return OkStatus();
+  }
+
+ private:
+  // The C API hands back the |self| pointer registered in the constructor;
+  // these recover the typed objects from it.
+  static NativeModule* FromModulePointer(void* self) {
+    return reinterpret_cast<NativeModule*>(self);
+  }
+  static State* FromStatePointer(void* self) {
+    return reinterpret_cast<State*>(self);
+  }
+
+  static void ModuleDestroy(void* self) { delete FromModulePointer(self); }
+
+  static iree_string_view_t ModuleName(void* self) {
+    auto* module = FromModulePointer(self);
+    return iree_make_cstring_view(module->name_);
+  }
+
+  // Only exports are reported: modules wrapped this way carry no imports and
+  // no internal functions.
+  static iree_vm_module_signature_t ModuleSignature(void* self) {
+    auto* module = FromModulePointer(self);
+    iree_vm_module_signature_t signature = {0};
+    signature.import_function_count = 0;
+    signature.export_function_count = module->dispatch_table_.size();
+    signature.internal_function_count = 0;
+    return signature;
+  }
+
+  // Reflects the exported function at |ordinal| into the optional outputs.
+  static iree_status_t ModuleGetFunction(
+      void* self, iree_vm_function_linkage_t linkage, iree_host_size_t ordinal,
+      iree_vm_function_t* out_function, iree_string_view_t* out_name,
+      iree_vm_function_signature_t* out_signature) {
+    // Pre-clear all outputs so failures never leave stale data behind.
+    if (out_function) {
+      std::memset(out_function, 0, sizeof(*out_function));
+    }
+    if (out_name) {
+      out_name->data = nullptr;
+      out_name->size = 0;
+    }
+    if (out_signature) {
+      std::memset(out_signature, 0, sizeof(*out_signature));
+    }
+    auto* module = FromModulePointer(self);
+    // FIX: was `ordinal > size()`, which allowed ordinal == size() through
+    // and indexed one past the end of the dispatch table below.
+    if (IREE_UNLIKELY(ordinal >= module->dispatch_table_.size())) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "function out of bounds: 0 < %zu < %zu", ordinal,
+                              module->dispatch_table_.size());
+    }
+    const auto& dispatch_function = module->dispatch_table_[ordinal];
+    if (out_function) {
+      out_function->module = module->interface();
+      out_function->linkage = IREE_VM_FUNCTION_LINKAGE_EXPORT;
+      out_function->ordinal = static_cast<uint16_t>(ordinal);
+    }
+    if (out_name) {
+      *out_name = dispatch_function.name;
+    }
+    if (out_signature) {
+      out_signature->calling_convention = dispatch_function.cconv;
+    }
+    return iree_ok_status();
+  }
+
+  // Linear scan of the dispatch table by name; tables are expected to be
+  // small enough that this is fine.
+  static iree_status_t ModuleLookupFunction(void* self,
+                                            iree_vm_function_linkage_t linkage,
+                                            iree_string_view_t name,
+                                            iree_vm_function_t* out_function) {
+    IREE_ASSERT_ARGUMENT(out_function);
+    std::memset(out_function, 0, sizeof(*out_function));
+    if (IREE_UNLIKELY(!name.data || !name.size)) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "function name empty");
+    }
+
+    auto* module = FromModulePointer(self);
+    out_function->module = module->interface();
+    out_function->linkage = IREE_VM_FUNCTION_LINKAGE_EXPORT;
+    // size_t index (was int) avoids the signed/unsigned comparison and makes
+    // the narrowing to the ordinal field explicit, matching ModuleGetFunction.
+    for (size_t i = 0; i < module->dispatch_table_.size(); ++i) {
+      if (iree_string_view_equal(name, module->dispatch_table_[i].name)) {
+        out_function->ordinal = static_cast<uint16_t>(i);
+        return iree_ok_status();
+      }
+    }
+    return iree_make_status(IREE_STATUS_NOT_FOUND, "function %.*s not exported",
+                            (int)name.size, name.data);
+  }
+
+  // Allocates per-context state via the subclass CreateState and transfers
+  // ownership to the VM as an opaque iree_vm_module_state_t.
+  static iree_status_t ModuleAllocState(
+      void* self, iree_allocator_t allocator,
+      iree_vm_module_state_t** out_module_state) {
+    IREE_ASSERT_ARGUMENT(out_module_state);
+    *out_module_state = nullptr;
+
+    auto* module = FromModulePointer(self);
+    IREE_ASSIGN_OR_RETURN(auto module_state, module->CreateState(allocator));
+
+    *out_module_state =
+        reinterpret_cast<iree_vm_module_state_t*>(module_state.release());
+    return iree_ok_status();
+  }
+
+  static void ModuleFreeState(void* self,
+                              iree_vm_module_state_t* module_state) {
+    if (module_state) delete FromStatePointer(module_state);
+  }
+
+  static iree_status_t ModuleResolveImport(
+      void* self, iree_vm_module_state_t* module_state,
+      iree_host_size_t ordinal, const iree_vm_function_t* function,
+      const iree_vm_function_signature_t* signature) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "C++ API does not support imports");
+  }
+
+  static iree_status_t ModuleNotify(void* self,
+                                    iree_vm_module_state_t* module_state,
+                                    iree_vm_signal_t signal) {
+    auto* module = FromModulePointer(self);
+    return module->Notify(FromStatePointer(module_state), signal);
+  }
+
+  // Enters a VM stack frame, dispatches to the bound NativeFunction, and
+  // annotates any failure with the module.function name for diagnostics.
+  static iree_status_t ModuleBeginCall(void* self, iree_vm_stack_t* stack,
+                                       const iree_vm_function_call_t* call,
+                                       iree_vm_execution_result_t* out_result) {
+    IREE_ASSERT_ARGUMENT(out_result);
+    std::memset(out_result, 0, sizeof(*out_result));
+    auto* module = FromModulePointer(self);
+    if (IREE_UNLIKELY(call->function.ordinal >=
+                      module->dispatch_table_.size())) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "function ordinal out of bounds: 0 < %u < %zu",
+                              call->function.ordinal,
+                              module->dispatch_table_.size());
+    }
+    const auto& info = module->dispatch_table_[call->function.ordinal];
+
+    // NOTE: VM stack is currently unused. We could stash things here for the
+    // debugger or use it for coroutine state.
+    iree_host_size_t frame_size = 0;
+
+    iree_vm_stack_frame_t* callee_frame = NULL;
+    IREE_RETURN_IF_ERROR(iree_vm_stack_function_enter(
+        stack, &call->function, IREE_VM_STACK_FRAME_NATIVE, frame_size,
+        /*frame_cleanup_fn=*/nullptr, &callee_frame));
+
+    auto* state = FromStatePointer(callee_frame->module_state);
+    iree_status_t status = info.call(info.ptr, state, stack, call, out_result);
+    if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+      status = iree_status_annotate_f(
+          status, "while invoking C++ function %s.%.*s", module->name_,
+          (int)info.name.size, info.name.data);
+      return status;
+    }
+
+    return iree_vm_stack_function_leave(stack);
+  }
+
+  const char* name_;
+  const iree_allocator_t allocator_;
+  iree_vm_module_t interface_;
+
+  const iree::span<const NativeFunction<State>> dispatch_table_;
+};
+
+}  // namespace vm
+}  // namespace iree
+
+#endif  // IREE_VM_NATIVE_MODULE_CC_H_
diff --git a/runtime/src/iree/vm/native_module_packing.h b/runtime/src/iree/vm/native_module_packing.h
new file mode 100644
index 0000000..db98523
--- /dev/null
+++ b/runtime/src/iree/vm/native_module_packing.h
@@ -0,0 +1,705 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_MODULE_ABI_PACKING_H_
+#define IREE_VM_MODULE_ABI_PACKING_H_
+
+#include <memory>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/span.h"
+#include "iree/base/status_cc.h"
+#include "iree/vm/builtin_types.h"
+#include "iree/vm/module.h"
+#include "iree/vm/ref.h"
+#include "iree/vm/ref_cc.h"
+#include "iree/vm/stack.h"
+
+// std::string_view is available starting in C++17.
+// Prior to that only IREE's C iree_string_view_t is available.
+#if defined(__has_include)
+#if __has_include(<string_view>) && __cplusplus >= 201703L
+#define IREE_HAVE_STD_STRING_VIEW 1
+#include <string_view>
+#endif  // __has_include(<string_view>)
+#endif  // __has_include
+
+namespace iree {
+namespace vm {
+namespace packing {
+
+namespace impl {
+
+// Workaround required to ensure proper evaluation order of parameter packs.
+// MSVC (and other compilers, like clang-cl in MSVC compat mode) may evaluate
+// parameter pack function arguments in any order. This shim allows us to expand
+// the parameter pack inside of an initializer list, which unlike function
+// arguments must be evaluated by the compiler in the order the elements appear
+// in the list.
+//
+// Example:
+//  impl::order_sequence{(ExpandedAction(), 0)...};
+//
+// More information:
+// https://stackoverflow.com/questions/29194858/order-of-function-calls-in-variadic-template-expansion
+struct order_sequence {
+  // Swallows any number of arguments of any type; only the left-to-right
+  // evaluation guarantee of the enclosing braced-init-list matters.
+  template <typename... T>
+  order_sequence(T&&...) {}
+};
+
+// Coming in C++20, but not widely available yet.
+template <class T>
+struct remove_cvref {
+  typedef std::remove_cv_t<std::remove_reference_t<T>> type;
+};
+
+}  // namespace impl
+
+// SFINAE helpers: enabled iff T is (or is not) a "primitive" for marshaling
+// purposes, i.e. an arithmetic or enum type.
+template <typename T>
+using enable_if_primitive =
+    typename std::enable_if<std::is_arithmetic<T>::value ||
+                            std::is_enum<T>::value>::type;
+template <typename T>
+using enable_if_not_primitive = typename std::enable_if<!(
+    std::is_arithmetic<T>::value || std::is_enum<T>::value)>::type;
+
+//===----------------------------------------------------------------------===//
+// Compile-time string literals
+//===----------------------------------------------------------------------===//
+
+// Compile-time constant string.
+// This allows us to concat string literals and produce a single flattened
+// char[] containing the results. Includes a \0 so the character storage is
+// length N + 1 and can be accessed as a c_str.
+//
+// Use the `literal` helper function to define a const string literal without
+// needing the size.
+//
+// Example:
+//  // produces: const_string<2>("ab")
+//  constexpr const auto str = literal("a") + literal("b");
+template <size_t N>
+class const_string {
+ public:
+  // Constructs from a string literal of exactly N characters (+ NUL).
+  constexpr const_string(const char (&data)[N + 1])
+      : const_string(data, std::make_index_sequence<N>()) {}
+  // Concatenating constructor: flattens |lhs| (N1 chars) and |rhs| (N - N1
+  // chars) into this N-char storage.
+  template <size_t N1, typename std::enable_if<(N1 <= N), bool>::type = true>
+  constexpr const_string(const const_string<N1>& lhs,
+                         const const_string<N - N1>& rhs)
+      : const_string{lhs, rhs, std::make_index_sequence<N1>{},
+                     std::make_index_sequence<N - N1>{}} {}
+
+  // Length excluding the trailing NUL; data()/c_str() are NUL-terminated.
+  constexpr std::size_t size() const { return N; }
+  constexpr const char* data() const { return data_; }
+  constexpr const char* c_str() const { return data_; }
+  constexpr operator const char*() const { return data_; }
+  constexpr char operator[](size_t i) const { return data_[i]; }
+
+ private:
+  // Delegated constructors: expand the index packs so data_ can be filled via
+  // aggregate member-initialization (required in a constexpr context).
+  template <size_t... PACK>
+  constexpr const_string(const char (&data)[N + 1],
+                         std::index_sequence<PACK...>)
+      : data_{data[PACK]..., '\0'} {}
+  template <size_t N1, size_t... PACK1, size_t... PACK2>
+  constexpr const_string(const const_string<N1>& lhs,
+                         const const_string<N - N1>& rhs,
+                         std::index_sequence<PACK1...>,
+                         std::index_sequence<PACK2...>)
+      : data_{lhs[PACK1]..., rhs[PACK2]..., '\0'} {}
+
+  const char data_[N + 1];
+};
+
+// Compile-time concatenation producing a new flattened literal of the
+// combined length.
+template <size_t N1, size_t N2>
+constexpr auto operator+(const const_string<N1>& lhs,
+                         const const_string<N2>& rhs) {
+  return const_string<N1 + N2>(lhs, rhs);
+}
+
+// Defines a compile-time constant string literal.
+// The array length includes the NUL, hence N_PLUS_1 - 1.
+template <size_t N_PLUS_1>
+constexpr auto literal(const char (&data)[N_PLUS_1]) {
+  return const_string<N_PLUS_1 - 1>(data);
+}
+
+constexpr auto concat_impl() { return literal(""); }
+template <typename T>
+constexpr auto concat_impl(const T& lhs) {
+  return lhs;
+}
+template <typename T, typename... Ts>
+constexpr auto concat_impl(const T& lhs, const Ts&... s) {
+  return lhs + concat_impl(s...);
+}
+
+// Concatenates one or more const_string values into a new const_string.
+//
+// Example:
+//  constexpr const auto abc = concat_literals(literal("a"),
+//                                             literal("b"),
+//                                             literal("c"));
+template <typename... Ts>
+constexpr auto concat_literals(const Ts&... s) {
+  return concat_impl(s...);
+}
+
+template <size_t C, typename T>
+struct splat_impl {
+  static constexpr auto apply(const T& v) {
+    return concat_literals(v, splat_impl<C - 1, T>::apply(v));
+  }
+};
+template <typename T>
+struct splat_impl<1, T> {
+  static constexpr auto apply(const T& v) { return v; }
+};
+
+// Splats a single const_string value C times.
+//
+// Example:
+//  constexpr const auto aaa = splat_literal<3>(literal("a"));
+template <size_t C, typename T>
+constexpr auto splat_literal(const T& v) {
+  return splat_impl<C, T>::apply(v);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling convention format generation
+//===----------------------------------------------------------------------===//
+// Prototyped here: https://godbolt.org/z/Tvhh7M
+
+// Maps a C++ parameter/result type to its calling-convention character(s).
+template <typename T>
+struct cconv_map;
+
+// Default: any unspecialized type marshals as a 32-bit value ("i").
+template <typename T>
+struct cconv_map {
+  static constexpr const auto conv_chars = literal("i");
+};
+
+// 64-bit integers marshal as "I".
+template <>
+struct cconv_map<int64_t> {
+  static constexpr const auto conv_chars = literal("I");
+};
+template <>
+struct cconv_map<uint64_t> {
+  static constexpr const auto conv_chars = literal("I");
+};
+
+// All ref-based types — opaque refs, typed refs, and string views (which are
+// carried as buffer refs) — marshal as "r".
+template <>
+struct cconv_map<opaque_ref> {
+  static constexpr const auto conv_chars = literal("r");
+};
+template <typename T>
+struct cconv_map<ref<T>> {
+  static constexpr const auto conv_chars = literal("r");
+};
+template <>
+struct cconv_map<iree_string_view_t> {
+  static constexpr const auto conv_chars = literal("r");
+};
+#if defined(IREE_HAVE_STD_STRING_VIEW)
+template <>
+struct cconv_map<std::string_view> {
+  static constexpr const auto conv_chars = literal("r");
+};
+#endif  // IREE_HAVE_STD_STRING_VIEW
+
+// Fixed-length arrays repeat the element's characters S times.
+template <typename U, size_t S>
+struct cconv_map<std::array<U, S>> {
+  static constexpr const auto conv_chars = splat_literal<S>(
+      cconv_map<typename impl::remove_cvref<U>::type>::conv_chars);
+};
+
+// Tuples flatten to the concatenation of their element characters.
+template <typename... Ts>
+struct cconv_map<std::tuple<Ts...>> {
+  static constexpr const auto conv_chars = concat_literals(
+      cconv_map<typename impl::remove_cvref<Ts>::type>::conv_chars...);
+};
+
+// Variadic spans wrap the element characters in "C"..."D" delimiters.
+template <typename U>
+struct cconv_map<iree::span<U>> {
+  static constexpr const auto conv_chars = concat_literals(
+      literal("C"), cconv_map<typename impl::remove_cvref<U>::type>::conv_chars,
+      literal("D"));
+};
+
+// Builds the full calling-convention string "0<params>_<results>" at compile
+// time and exposes it as a static iree_string_view_t. Specializations handle
+// the empty-parameter and void-result combinations.
+template <typename Result, size_t ParamsCount, typename... Params>
+struct cconv_storage {
+  static const iree_string_view_t value() {
+    // Function-local statics so the flattened char storage has a single
+    // address per instantiation.
+    static constexpr const auto value = concat_literals(
+        literal("0"),
+        concat_literals(
+            cconv_map<
+                typename impl::remove_cvref<Params>::type>::conv_chars...),
+        literal("_"),
+        concat_literals(
+            cconv_map<typename impl::remove_cvref<Result>::type>::conv_chars));
+    static constexpr const auto str =
+        iree_string_view_t{value.data(), value.size()};
+    return str;
+  }
+};
+
+// No parameters: "0v_<results>".
+template <typename Result>
+struct cconv_storage<Result, 0> {
+  static const iree_string_view_t value() {
+    static constexpr const auto value = concat_literals(
+        literal("0v_"),
+        concat_literals(
+            cconv_map<typename impl::remove_cvref<Result>::type>::conv_chars));
+    static constexpr const auto str =
+        iree_string_view_t{value.data(), value.size()};
+    return str;
+  }
+};
+
+// Void result: "0<params>_v".
+template <size_t ParamsCount, typename... Params>
+struct cconv_storage_void {
+  static const iree_string_view_t value() {
+    static constexpr const auto value = concat_literals(
+        literal("0"),
+        concat_literals(
+            cconv_map<
+                typename impl::remove_cvref<Params>::type>::conv_chars...),
+        literal("_v"));
+    static constexpr const auto str =
+        iree_string_view_t{value.data(), value.size()};
+    return str;
+  }
+};
+
+// No parameters and void result: "0v_v".
+template <>
+struct cconv_storage_void<0> {
+  static const iree_string_view_t value() {
+    static constexpr const auto value = concat_literals(literal("0v_v"));
+    static constexpr const auto str =
+        iree_string_view_t{value.data(), value.size()};
+    return str;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Parameter unpacking
+//===----------------------------------------------------------------------===//
+
+// TODO(benvanik): see if we can't use `extern template` to share
+// implementations of these and prevent code bloat across many modules.
+// We can also try some non-templated base functions (like "UnpackI32") that the
+// templated ones simply wrap with type casts.
+
+namespace impl {
+
+using params_ptr_t = uint8_t*;
+
+template <typename T, typename EN = void>
+struct ParamUnpack;
+template <>
+struct ParamUnpack<opaque_ref>;
+template <typename T>
+struct ParamUnpack<ref<T>>;
+template <typename T>
+struct ParamUnpack<const ref<T>>;
+template <>
+struct ParamUnpack<iree_string_view_t>;
+#if defined(IREE_HAVE_STD_STRING_VIEW)
+template <>
+struct ParamUnpack<std::string_view>;
+#endif  // IREE_HAVE_STD_STRING_VIEW
+template <typename U, size_t S>
+struct ParamUnpack<std::array<U, S>>;
+template <typename... Ts>
+struct ParamUnpack<std::tuple<Ts...>>;
+template <typename U>
+struct ParamUnpack<iree::span<U>, enable_if_not_primitive<U>>;
+template <typename U>
+struct ParamUnpack<iree::span<U>, enable_if_primitive<U>>;
+
+struct Unpacker {
+  // Unpacks the packed argument buffer |storage| into a tuple of the storage
+  // types declared by each parameter's ParamUnpack specialization. Fails if
+  // any element fails to unpack or if the buffer was not consumed exactly.
+  template <typename... Ts>
+  static StatusOr<std::tuple<typename impl::ParamUnpack<
+      typename std::remove_reference<Ts>::type>::storage_type...>>
+  LoadSequence(iree_byte_span_t storage) {
+    auto params = std::make_tuple(
+        typename impl::ParamUnpack<
+            typename impl::remove_cvref<Ts>::type>::storage_type()...);
+    Status status;
+    params_ptr_t ptr = storage.data;
+    ApplyLoad<Ts...>(status, ptr, params,
+                     std::make_index_sequence<sizeof...(Ts)>());
+    IREE_RETURN_IF_ERROR(std::move(status));
+    // Each Load advanced |ptr|; anything short of or past |limit| means the
+    // caller's cconv disagrees with the declared parameter types.
+    params_ptr_t limit = storage.data + storage.data_length;
+    if (IREE_UNLIKELY(ptr != limit)) {
+      // NOTE(review): %zu formats a signed intptr_t difference here; consider
+      // casting to size_t — confirm against the status formatter's rules.
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "argument buffer unpacking failure; consumed %zu of %zu bytes",
+          (reinterpret_cast<intptr_t>(ptr) -
+           reinterpret_cast<intptr_t>(storage.data)),
+          storage.data_length);
+    }
+    return std::move(params);
+  }
+
+ private:
+  // Expands the pack inside a braced-init-list (via order_sequence) so the
+  // per-parameter Loads are evaluated strictly left to right.
+  template <typename... Ts, typename T, size_t... I>
+  static void ApplyLoad(Status& status, params_ptr_t& ptr, T&& params,
+                        std::index_sequence<I...>) {
+    impl::order_sequence{
+        (impl::ParamUnpack<typename impl::remove_cvref<
+             typename std::tuple_element<I, std::tuple<Ts...>>::type>::type>::
+             Load(status, ptr, std::get<I>(params)),
+         0)...};
+  }
+};
+
+// Common primitive types (`i32`, `i64`, `f32`, enums, etc).
+template <typename T>
+struct ParamUnpack<T, enable_if_primitive<T>> {
+  using storage_type = T;
+  // Reads the value in place from the packed buffer and advances |ptr| by
+  // the element size. NOTE(review): assumes the buffer layout matches host
+  // alignment/endianness — confirm against the VM packing rules.
+  static void Load(Status& status, params_ptr_t& ptr, storage_type& out_param) {
+    out_param = *reinterpret_cast<const T*>(ptr);
+    ptr += sizeof(T);
+  }
+};
+
+// An opaque ref type (`vm.ref<?>`), possibly null.
+template <>
+struct ParamUnpack<opaque_ref> {
+  using storage_type = opaque_ref;
+  // Retains the incoming ref into |out_param|; the source register is left
+  // untouched.
+  static void Load(Status& status, params_ptr_t& ptr, storage_type& out_param) {
+    iree_vm_ref_retain(reinterpret_cast<iree_vm_ref_t*>(ptr), &out_param);
+    ptr += sizeof(iree_vm_ref_t);
+  }
+};
+
+// A `vm.ref<T>` type, possibly null.
+// Ownership is transferred to the parameter.
+template <typename T>
+struct ParamUnpack<ref<T>> {
+  using storage_type = ref<T>;
+  static void Load(Status& status, params_ptr_t& ptr, storage_type& out_param) {
+    auto* reg_ptr = reinterpret_cast<iree_vm_ref_t*>(ptr);
+    ptr += sizeof(iree_vm_ref_t);
+    if (reg_ptr->type == ref_type_descriptor<T>::get()->type) {
+      // Retain into out_param then clear the source register.
+      // NOTE(review): assumes the register's own reference is reclaimed by
+      // this memset per the VM argument-buffer ownership convention — confirm.
+      out_param = vm::retain_ref(reinterpret_cast<T*>(reg_ptr->ptr));
+      memset(reg_ptr, 0, sizeof(*reg_ptr));
+    } else if (IREE_UNLIKELY(reg_ptr->type != IREE_VM_REF_TYPE_NULL)) {
+      // Type mismatch: report both the actual and expected type names.
+      status =
+          iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                           "parameter contains a reference to the wrong type; "
+                           "have %.*s but expected %.*s",
+                           (int)iree_vm_ref_type_name(reg_ptr->type).size,
+                           iree_vm_ref_type_name(reg_ptr->type).data,
+                           (int)ref_type_descriptor<T>::get()->type_name.size,
+                           ref_type_descriptor<T>::get()->type_name.data);
+    } else {
+      // Null ref is allowed and yields an empty ref.
+      out_param = {};
+    }
+  }
+};
+
+// A const-qualified `vm.ref<T>` parameter unpacks exactly like the mutable
+// form (same storage_type, same Load); inherit the implementation rather than
+// duplicating it. (Resolves the previous TODO to merge the two.)
+template <typename T>
+struct ParamUnpack<const ref<T>> : public ParamUnpack<ref<T>> {};
+
+// An `util.byte_buffer` containing a string.
+// The string view is aliased directly into the underlying byte buffer.
+template <>
+struct ParamUnpack<iree_string_view_t> {
+  using storage_type = iree_string_view_t;
+  // NOTE: the resulting view borrows the buffer's memory; it is only valid
+  // while the underlying iree_vm_buffer_t stays live.
+  static void Load(Status& status, params_ptr_t& ptr, storage_type& out_param) {
+    auto* reg_ptr = reinterpret_cast<iree_vm_ref_t*>(ptr);
+    ptr += sizeof(iree_vm_ref_t);
+    if (reg_ptr->type == ref_type_descriptor<iree_vm_buffer_t>::get()->type) {
+      auto byte_span = reinterpret_cast<iree_vm_buffer_t*>(reg_ptr->ptr)->data;
+      out_param = iree_make_string_view(
+          reinterpret_cast<const char*>(byte_span.data), byte_span.data_length);
+    } else if (IREE_UNLIKELY(reg_ptr->type != IREE_VM_REF_TYPE_NULL)) {
+      status = iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "parameter contains a reference to the wrong type; "
+          "have %.*s but expected %.*s",
+          (int)iree_vm_ref_type_name(reg_ptr->type).size,
+          iree_vm_ref_type_name(reg_ptr->type).data,
+          (int)ref_type_descriptor<iree_vm_buffer_t>::get()->type_name.size,
+          ref_type_descriptor<iree_vm_buffer_t>::get()->type_name.data);
+    } else {
+      // NOTE: empty string is allowed here!
+      out_param = iree_string_view_empty();
+    }
+  }
+};
+#if defined(IREE_HAVE_STD_STRING_VIEW)
+// std::string_view variant of the above; identical semantics, C++17-only.
+template <>
+struct ParamUnpack<std::string_view> {
+  using storage_type = std::string_view;
+  static void Load(Status& status, params_ptr_t& ptr, storage_type& out_param) {
+    auto* reg_ptr = reinterpret_cast<iree_vm_ref_t*>(ptr);
+    ptr += sizeof(iree_vm_ref_t);
+    if (reg_ptr->type == ref_type_descriptor<iree_vm_buffer_t>::get()->type) {
+      auto byte_span = reinterpret_cast<iree_vm_buffer_t*>(reg_ptr->ptr)->data;
+      out_param = std::string_view{
+          reinterpret_cast<const char*>(byte_span.data), byte_span.data_length};
+    } else if (IREE_UNLIKELY(reg_ptr->type != IREE_VM_REF_TYPE_NULL)) {
+      status = iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "parameter contains a reference to the wrong type; "
+          "have %.*s but expected %.*s",
+          (int)iree_vm_ref_type_name(reg_ptr->type).size,
+          iree_vm_ref_type_name(reg_ptr->type).data,
+          (int)ref_type_descriptor<iree_vm_buffer_t>::get()->type_name.size,
+          ref_type_descriptor<iree_vm_buffer_t>::get()->type_name.data);
+    } else {
+      // NOTE: empty string is allowed here!
+      out_param = {};
+    }
+  }
+};
+#endif  // IREE_HAVE_STD_STRING_VIEW
+
+// Arrays are C++ ABI only representing a fixed repeated field (`i32, i32`).
+template <typename U, size_t S>
+struct ParamUnpack<std::array<U, S>> {
+  using element_type = typename impl::remove_cvref<U>::type;
+  using storage_type = std::array<element_type, S>;
+  // Unpacks S consecutive elements; each delegates to the element-type
+  // unpacker which advances |ptr| by that element's packed size.
+  static void Load(Status& status, params_ptr_t& ptr, storage_type& out_param) {
+    for (size_t i = 0; i < S; ++i) {
+      // FIX: was the unqualified `ParamUnpack::Load`, which the injected
+      // class name resolves to this array specialization itself — its Load
+      // takes the whole array, so passing an element could not compile once
+      // instantiated. Dispatch to the element unpacker instead.
+      ParamUnpack<element_type>::Load(status, ptr, out_param[i]);
+    }
+  }
+};
+
+// Tuples (`tuple<i32, i64>`) expand to just their flattened contents.
+template <typename... Ts>
+struct ParamUnpack<std::tuple<Ts...>> {
+  using storage_type = std::tuple<typename impl::remove_cvref<Ts>::type...>;
+  static void Load(Status& status, params_ptr_t& ptr, storage_type& out_param) {
+    UnpackTuple(status, ptr, out_param,
+                std::make_index_sequence<sizeof...(Ts)>());
+  }
+  // Expands the element loads inside a braced-init-list (order_sequence) so
+  // they run strictly left to right, matching the packed layout.
+  template <size_t... I>
+  static void UnpackTuple(Status& status, params_ptr_t& ptr,
+                          storage_type& params, std::index_sequence<I...>) {
+    impl::order_sequence{
+        (ParamUnpack<typename std::tuple_element<I, std::tuple<Ts...>>::type>::
+             Load(status, ptr, std::get<I>(params)),
+         0)...};
+  }
+};
+
+// Complex variadic span (like `tuple<i32, tuple<ref<...>, i64>>...`).
+// We need to allocate storage here so that we can marshal the element type out.
+// In the future we could check that all subelements are primitives and alias if
+// the host machine endianness is the same.
+template <typename U>
+struct ParamUnpack<iree::span<U>, enable_if_not_primitive<U>> {
+  using element_type = typename impl::remove_cvref<U>::type;
+  // Owning storage: elements are copied out of the packed buffer.
+  using storage_type = std::vector<element_type>;
+  static void Load(Status& status, params_ptr_t& ptr, storage_type& out_param) {
+    // Variadic segments are prefixed with an i32 element count.
+    iree_host_size_t count = *reinterpret_cast<const int32_t*>(ptr);
+    ptr += sizeof(int32_t);
+    out_param.resize(count);
+    for (iree_host_size_t i = 0; i < count; ++i) {
+      ParamUnpack<element_type>::Load(status, ptr, out_param[i]);
+    }
+  }
+};
+
+// Simple primitive variadic span (like `i32...`). We can alias directly into
+// the argument buffer so long as endianness matches.
+template <typename U>
+struct ParamUnpack<iree::span<U>, enable_if_primitive<U>> {
+  using element_type = U;
+  // Non-owning storage: the span borrows the argument buffer, so it is only
+  // valid for the duration of the call.
+  using storage_type = iree::span<const element_type>;
+  static void Load(Status& status, params_ptr_t& ptr, storage_type& out_param) {
+    iree_host_size_t count = *reinterpret_cast<const int32_t*>(ptr);
+    ptr += sizeof(int32_t);
+    out_param =
+        iree::span<U>(reinterpret_cast<const element_type*>(ptr), count);
+    ptr += sizeof(element_type) * count;
+  }
+};
+
+}  // namespace impl
+
+//===----------------------------------------------------------------------===//
+// Result packing
+//===----------------------------------------------------------------------===//
+
+namespace impl {
+
+// Cursor into the packed results buffer; each Store advances it.
+using result_ptr_t = uint8_t*;
+
+// Default: primitives are written in place.
+template <typename T>
+struct ResultPack {
+  static void Store(result_ptr_t& ptr, T value) {
+    *reinterpret_cast<T*>(ptr) = value;
+    ptr += sizeof(T);
+  }
+};
+
+// Opaque refs move their reference into the results buffer.
+template <>
+struct ResultPack<opaque_ref> {
+  static void Store(result_ptr_t& ptr, opaque_ref value) {
+    iree_vm_ref_move(value.get(), reinterpret_cast<iree_vm_ref_t*>(ptr));
+    ptr += sizeof(iree_vm_ref_t);
+  }
+};
+
+// Typed refs release ownership into the results buffer.
+template <typename T>
+struct ResultPack<ref<T>> {
+  static void Store(result_ptr_t& ptr, ref<T> value) {
+    // NOTE(review): argument evaluation order is unspecified, so this relies
+    // on value.type() not depending on the pointer released by
+    // value.release() — confirm ref<T>::type() only reads the static type
+    // descriptor.
+    iree_vm_ref_wrap_assign(value.release(), value.type(),
+                            reinterpret_cast<iree_vm_ref_t*>(ptr));
+    ptr += sizeof(iree_vm_ref_t);
+  }
+};
+
+// Forward declarations for aggregate result packers (arrays and tuples may
+// nest each other).
+template <typename U, size_t S>
+struct ResultPack<std::array<U, S>>;
+template <typename... Ts>
+struct ResultPack<std::tuple<Ts...>>;
+
+// Packs each array element in index order using the element packer.
+template <typename U, size_t S>
+struct ResultPack<std::array<U, S>> {
+  static void Store(result_ptr_t& ptr, std::array<U, S> value) {
+    for (size_t i = 0; i < S; ++i) {
+      ResultPack<U>::Store(ptr, std::move(value[i]));
+    }
+  }
+};
+
+// Packs each tuple element in declaration order using its element packer.
+template <typename... Ts>
+struct ResultPack<std::tuple<Ts...>> {
+  static void Store(result_ptr_t& ptr, std::tuple<Ts...> results) {
+    PackTuple(ptr, results, std::make_index_sequence<sizeof...(Ts)>());
+  }
+  template <typename... T, size_t... I>
+  static inline void PackTuple(result_ptr_t& ptr, std::tuple<T...>& value,
+                               std::index_sequence<I...>) {
+    // The braced order_sequence initializer forces left-to-right evaluation
+    // of the pack expansion so elements are stored in tuple order.
+    impl::order_sequence{
+        (ResultPack<typename std::tuple_element<I, std::tuple<T...>>::type>::
+             Store(ptr, std::move(std::get<I>(value))),
+         0)...};
+  }
+};
+
+}  // namespace impl
+
+//===----------------------------------------------------------------------===//
+// Function wrapping
+//===----------------------------------------------------------------------===//
+
+// Adapts a C++ member function `StatusOr<Results>(Params...)` to the VM
+// calling convention: unpacks arguments from |call->arguments|, invokes the
+// method on |self|, and packs the returned results into |call->results|.
+// |stack| and |out_result| are not used by this dispatch path.
+template <typename Owner, typename Results, typename... Params>
+struct DispatchFunctor {
+  using FnPtr = StatusOr<Results> (Owner::*)(Params...);
+
+  // |ptr| is the type-erased member function pointer stored in the function
+  // table; it is cast back to the concrete FnPtr before invocation.
+  static Status Call(void (Owner::*ptr)(), Owner* self, iree_vm_stack_t* stack,
+                     const iree_vm_function_call_t* call,
+                     iree_vm_execution_result_t* out_result) {
+    // Marshal arguments into types/locals we can forward to the function.
+    IREE_ASSIGN_OR_RETURN(
+        auto params, impl::Unpacker::LoadSequence<Params...>(call->arguments));
+
+    // Call the target function with the params.
+    IREE_ASSIGN_OR_RETURN(
+        auto results,
+        ApplyFn(reinterpret_cast<FnPtr>(ptr), self, std::move(params),
+                std::make_index_sequence<sizeof...(Params)>()));
+
+    // Marshal call results back into the ABI results buffer.
+    impl::result_ptr_t result_ptr = call->results.data;
+    impl::ResultPack<Results>::Store(result_ptr, std::move(results));
+
+    return OkStatus();
+  }
+
+  // Expands the unpacked parameter tuple into a member function call.
+  template <typename T, size_t... I>
+  static StatusOr<Results> ApplyFn(FnPtr ptr, Owner* self, T&& params,
+                                   std::index_sequence<I...>) {
+    return (self->*ptr)(std::move(std::get<I>(params))...);
+  }
+};
+
+// A DispatchFunctor specialization for methods with no return values.
+// Arguments are unpacked the same way but nothing is written to the results
+// buffer and the method's Status is returned directly.
+template <typename Owner, typename... Params>
+struct DispatchFunctorVoid {
+  using FnPtr = Status (Owner::*)(Params...);
+
+  static Status Call(void (Owner::*ptr)(), Owner* self, iree_vm_stack_t* stack,
+                     const iree_vm_function_call_t* call,
+                     iree_vm_execution_result_t* out_result) {
+    IREE_ASSIGN_OR_RETURN(
+        auto params, impl::Unpacker::LoadSequence<Params...>(call->arguments));
+    return ApplyFn(reinterpret_cast<FnPtr>(ptr), self, std::move(params),
+                   std::make_index_sequence<sizeof...(Params)>());
+  }
+
+  // Expands the unpacked parameter tuple into a member function call.
+  template <typename T, size_t... I>
+  static Status ApplyFn(FnPtr ptr, Owner* self, T&& params,
+                        std::index_sequence<I...>) {
+    return (self->*ptr)(std::move(std::get<I>(params))...);
+  }
+};
+
+}  // namespace packing
+
+// An entry in a native module's function table: the exported symbol name, its
+// VM calling-convention string, the type-erased member function pointer, and
+// the dispatch thunk that knows how to cast and invoke it.
+template <typename Owner>
+struct NativeFunction {
+  iree_string_view_t name;
+  iree_string_view_t cconv;
+  void (Owner::*const ptr)();
+  Status (*const call)(void (Owner::*ptr)(), Owner* self,
+                       iree_vm_stack_t* stack,
+                       const iree_vm_function_call_t* call,
+                       iree_vm_execution_result_t* out_result);
+};
+
+// Builds a NativeFunction table entry for a method returning StatusOr<Result>.
+// The cconv string is produced at compile time from Result/Params.
+template <typename Owner, typename Result, typename... Params>
+constexpr NativeFunction<Owner> MakeNativeFunction(
+    const char* name, StatusOr<Result> (Owner::*fn)(Params...)) {
+  using dispatch_functor_t = packing::DispatchFunctor<Owner, Result, Params...>;
+  return {iree_make_cstring_view(name),
+          packing::cconv_storage<Result, sizeof...(Params), Params...>::value(),
+          (void (Owner::*)())fn, &dispatch_functor_t::Call};
+}
+
+// Builds a NativeFunction table entry for a method returning only a Status
+// (the cconv declares zero results).
+template <typename Owner, typename... Params>
+constexpr NativeFunction<Owner> MakeNativeFunction(
+    const char* name, Status (Owner::*fn)(Params...)) {
+  using dispatch_functor_t = packing::DispatchFunctorVoid<Owner, Params...>;
+  return {iree_make_cstring_view(name),
+          packing::cconv_storage_void<sizeof...(Params), Params...>::value(),
+          (void (Owner::*)())fn, &dispatch_functor_t::Call};
+}
+
+}  // namespace vm
+}  // namespace iree
+
+#endif  // IREE_VM_MODULE_ABI_PACKING_H_
diff --git a/runtime/src/iree/vm/native_module_test.cc b/runtime/src/iree/vm/native_module_test.cc
new file mode 100644
index 0000000..84202d0
--- /dev/null
+++ b/runtime/src/iree/vm/native_module_test.cc
@@ -0,0 +1,110 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/native_module_test.h"
+
+#include <vector>
+
+#include "iree/base/status_cc.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+#include "iree/vm/context.h"
+#include "iree/vm/instance.h"
+#include "iree/vm/invocation.h"
+#include "iree/vm/list.h"
+#include "iree/vm/ref_cc.h"
+#include "iree/vm/value.h"
+
+namespace iree {
+namespace {
+
+// Test suite that uses module_a and module_b defined in native_module_test.h.
+// Both modules are put in a context and the module_b.entry function can be
+// executed with RunFunction.
+class VMNativeModuleTest : public ::testing::Test {
+ protected:
+  // Creates the VM instance and a context containing module_a + module_b.
+  // Uses IREE_CHECK_OK (aborting on failure) since gtest SetUp cannot
+  // propagate a status.
+  virtual void SetUp() {
+    IREE_CHECK_OK(iree_vm_instance_create(iree_allocator_system(), &instance_));
+
+    // Create both modules shared instances. These are generally immutable and
+    // can be shared by multiple contexts.
+    iree_vm_module_t* module_a = nullptr;
+    IREE_CHECK_OK(module_a_create(iree_allocator_system(), &module_a));
+    iree_vm_module_t* module_b = nullptr;
+    IREE_CHECK_OK(module_b_create(iree_allocator_system(), &module_b));
+
+    // Create the context with both modules and perform runtime linkage.
+    // Imports from module_a -> module_b will be resolved and per-context state
+    // will be allocated.
+    std::vector<iree_vm_module_t*> modules = {module_a, module_b};
+    IREE_CHECK_OK(iree_vm_context_create_with_modules(
+        instance_, IREE_VM_CONTEXT_FLAG_NONE, modules.data(), modules.size(),
+        iree_allocator_system(), &context_));
+
+    // No longer need the modules as the context retains them.
+    iree_vm_module_release(module_a);
+    iree_vm_module_release(module_b);
+  }
+
+  // Releases the context before the instance that it was created against.
+  virtual void TearDown() {
+    iree_vm_context_release(context_);
+    iree_vm_instance_release(instance_);
+  }
+
+  // Invokes the module_b.entry function with a single i32 |arg0| and returns
+  // its single i32 result.
+  // NOTE(review): |function_name| is currently ignored — the entry point is
+  // hard-coded to "module_b.entry" below; either use the parameter or drop it.
+  StatusOr<int32_t> RunFunction(iree_string_view_t function_name,
+                                int32_t arg0) {
+    // Lookup the entry function. This can be cached in an application if
+    // multiple calls will be made.
+    iree_vm_function_t function;
+    IREE_RETURN_IF_ERROR(
+        iree_vm_context_resolve_function(
+            context_, iree_make_cstring_view("module_b.entry"), &function),
+        "unable to resolve entry point");
+
+    // Setup I/O lists and pass in the argument. The result list will be
+    // populated upon return.
+    vm::ref<iree_vm_list_t> input_list;
+    IREE_RETURN_IF_ERROR(iree_vm_list_create(
+        /*element_type=*/nullptr, 1, iree_allocator_system(), &input_list));
+    auto arg0_value = iree_vm_value_make_i32(arg0);
+    IREE_RETURN_IF_ERROR(
+        iree_vm_list_push_value(input_list.get(), &arg0_value));
+    vm::ref<iree_vm_list_t> output_list;
+    IREE_RETURN_IF_ERROR(iree_vm_list_create(
+        /*element_type=*/nullptr, 1, iree_allocator_system(), &output_list));
+
+    // Invoke the entry function to do our work. Runs synchronously.
+    IREE_RETURN_IF_ERROR(
+        iree_vm_invoke(context_, function, IREE_VM_INVOCATION_FLAG_NONE,
+                       /*policy=*/nullptr, input_list.get(), output_list.get(),
+                       iree_allocator_system()));
+
+    // Load the output result.
+    iree_vm_value_t ret0_value;
+    IREE_RETURN_IF_ERROR(
+        iree_vm_list_get_value(output_list.get(), 0, &ret0_value));
+    return ret0_value.i32;
+  }
+
+ private:
+  // Owned VM instance; released in TearDown.
+  iree_vm_instance_t* instance_ = nullptr;
+  // Owned context holding module_a/module_b state; released in TearDown.
+  iree_vm_context_t* context_ = nullptr;
+};
+
+// The expected values reflect module_b's per-context counter persisting
+// across calls: each call returns (counter += arg + 1) - 1, so
+// 1 -> 1, then 2 -> 4, then 3 -> 8.
+TEST_F(VMNativeModuleTest, Example) {
+  IREE_ASSERT_OK_AND_ASSIGN(
+      int32_t v0, RunFunction(iree_make_cstring_view("module_b.entry"), 1));
+  ASSERT_EQ(v0, 1);
+  IREE_ASSERT_OK_AND_ASSIGN(
+      int32_t v1, RunFunction(iree_make_cstring_view("module_b.entry"), 2));
+  ASSERT_EQ(v1, 4);
+  IREE_ASSERT_OK_AND_ASSIGN(
+      int32_t v2, RunFunction(iree_make_cstring_view("module_b.entry"), 3));
+  ASSERT_EQ(v2, 8);
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/vm/native_module_test.h b/runtime/src/iree/vm/native_module_test.h
new file mode 100644
index 0000000..4585223
--- /dev/null
+++ b/runtime/src/iree/vm/native_module_test.h
@@ -0,0 +1,307 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/vm/context.h"
+#include "iree/vm/instance.h"
+#include "iree/vm/module.h"
+#include "iree/vm/native_module.h"
+#include "iree/vm/ref.h"
+#include "iree/vm/stack.h"
+
+// Wrapper for calling the import functions with type (i32)->i32.
+// NOTE: we should have some common ones prebuilt or can generate and rely on
+// LTO to strip duplicates across the entire executable.
+// TODO(benvanik): generate/export these shims/call functions in stack.h.
+// Synchronously invokes |import| on |stack| with a single i32 argument,
+// writing the single i32 result to |out_ret0|.
+static iree_status_t call_import_i32_i32(iree_vm_stack_t* stack,
+                                         const iree_vm_function_t* import,
+                                         int32_t arg0, int32_t* out_ret0) {
+  // Argument/result buffers alias the locals directly; no marshaling copies.
+  iree_vm_function_call_t call;
+  call.function = *import;
+  call.arguments = iree_make_byte_span(&arg0, sizeof(arg0));
+  call.results = iree_make_byte_span(out_ret0, sizeof(*out_ret0));
+
+  iree_vm_execution_result_t result;
+  memset(&result, 0, sizeof(result));
+  return import->module->begin_call(import->module, stack, &call, &result);
+}
+
+typedef iree_status_t (*call_i32_i32_t)(iree_vm_stack_t* stack,
+                                        void* module_ptr, void* module_state,
+                                        int32_t arg0, int32_t* out_ret0);
+
+// Wrapper for calling a |target_fn| C function from the VM ABI.
+// It's optional to bounce through like this; if the function can more
+// efficiently directly access the arguments from the |call| then it can do so.
+// This approach is most useful when the function may also be exported/used by
+// non-VM code or may be internally referenced using a target-specific ABI.
+// TODO(benvanik): generate/export these shims/call functions in stack.h.
+// Shim translating a VM call with cconv (i32)->i32 into a plain C call of
+// |target_fn|; |module|/|module_state| are passed through untouched.
+static iree_status_t call_shim_i32_i32(iree_vm_stack_t* stack,
+                                       const iree_vm_function_call_t* call,
+                                       call_i32_i32_t target_fn, void* module,
+                                       void* module_state,
+                                       iree_vm_execution_result_t* out_result) {
+  // We can use structs to allow compiler-controlled indexing optimizations,
+  // though this won't work for variadic cases.
+  // TODO(benvanik): packed attributes.
+  typedef struct {
+    int32_t arg0;
+  } args_t;
+  typedef struct {
+    int32_t ret0;
+  } results_t;
+
+  // Overlay the typed views onto the raw argument/result buffers.
+  const args_t* args = (const args_t*)call->arguments.data;
+  results_t* results = (results_t*)call->results.data;
+
+  // For simple cases like this (zero or 1 result) we can tail-call.
+  return target_fn(stack, module, module_state, args->arg0, &results->ret0);
+}
+
+//===----------------------------------------------------------------------===//
+// module_a
+//===----------------------------------------------------------------------===//
+// This simple stateless module exports two functions that can be imported by
+// other modules or called directly by the user. When no imports, custom types,
+// or per-context state is required this simplifies module definitions.
+//
+// module_b below imports these functions and demonstrates a more complex module
+// with state.
+
+typedef struct module_a_t module_a_t;
+typedef struct module_a_state_t module_a_state_t;
+
+// vm.import @module_a.add_1(%arg0 : i32) -> i32
+// Stateless; |module| and |module_state| are unused.
+static iree_status_t module_a_add_1(iree_vm_stack_t* stack, module_a_t* module,
+                                    module_a_state_t* module_state,
+                                    int32_t arg0, int32_t* out_ret0) {
+  // Add 1 to arg0 and return.
+  *out_ret0 = arg0 + 1;
+  return iree_ok_status();
+}
+
+// vm.import @module_a.sub_1(%arg0 : i32) -> i32
+// Stateless; |module| and |module_state| are unused.
+static iree_status_t module_a_sub_1(iree_vm_stack_t* stack, module_a_t* module,
+                                    module_a_state_t* module_state,
+                                    int32_t arg0, int32_t* out_ret0) {
+  // Subtract 1 from arg0 and return.
+  // NOTE(review): an earlier comment said "Fail if < 0" but no such check is
+  // implemented — this always returns OK; confirm intended behavior.
+  *out_ret0 = arg0 - 1;
+  return iree_ok_status();
+}
+
+// Export table for module_a; cconv "0i_i" declares (i32) -> i32.
+static const iree_vm_native_export_descriptor_t module_a_exports_[] = {
+    {iree_make_cstring_view("add_1"), iree_make_cstring_view("0i_i"), 0, NULL},
+    {iree_make_cstring_view("sub_1"), iree_make_cstring_view("0i_i"), 0, NULL},
+};
+// Shim + target function pointers; entries must match module_a_exports_ 1:1.
+static const iree_vm_native_function_ptr_t module_a_funcs_[] = {
+    {(iree_vm_native_function_shim_t)call_shim_i32_i32,
+     (iree_vm_native_function_target_t)module_a_add_1},
+    {(iree_vm_native_function_shim_t)call_shim_i32_i32,
+     (iree_vm_native_function_target_t)module_a_sub_1},
+};
+static_assert(IREE_ARRAYSIZE(module_a_funcs_) ==
+                  IREE_ARRAYSIZE(module_a_exports_),
+              "function pointer table must be 1:1 with exports");
+// Module descriptor: no imports, no attributes — exports only.
+static const iree_vm_native_module_descriptor_t module_a_descriptor_ = {
+    iree_make_cstring_view("module_a"),
+    0,
+    NULL,
+    IREE_ARRAYSIZE(module_a_exports_),
+    module_a_exports_,
+    IREE_ARRAYSIZE(module_a_funcs_),
+    module_a_funcs_,
+    0,
+    NULL,
+};
+
+// Creates a module_a instance with the default native-module behavior.
+static iree_status_t module_a_create(iree_allocator_t allocator,
+                                     iree_vm_module_t** out_module) {
+  // NOTE: this module has neither shared or per-context module state.
+  iree_vm_module_t interface;
+  IREE_RETURN_IF_ERROR(iree_vm_module_initialize(&interface, NULL));
+  return iree_vm_native_module_create(&interface, &module_a_descriptor_,
+                                      allocator, out_module);
+}
+
+//===----------------------------------------------------------------------===//
+// module_b
+//===----------------------------------------------------------------------===//
+// A more complex module that holds state for resolved types (shared across
+// all instances), imported functions (stored per-context), per-context user
+// data, and reflection metadata.
+
+// Forward declarations so the function signatures below can reference them.
+typedef struct module_b_t module_b_t;
+typedef struct module_b_state_t module_b_state_t;
+
+// Stores shared state across all instances of the module.
+// This should generally be treated as read-only and if mutation is possible
+// then users must synchronize themselves.
+typedef struct module_b_t {
+  // Allocator the module must be freed with and that can be used for any other
+  // shared dynamic allocations.
+  iree_allocator_t allocator;
+  // Resolved types; these never change once queried and are safe to store on
+  // the shared structure to avoid needing to look them up again.
+  // Slot [0] holds the vm.buffer type resolved in module_b_create.
+  const iree_vm_ref_type_descriptor_t* types[1];
+} module_b_t;
+
+// Stores per-context state; at the minimum imports, but possibly other user
+// state data. No synchronization is required as the VM will not call functions
+// with the same state from multiple threads concurrently.
+typedef struct module_b_state_t {
+  // Allocator the state must be freed with and that can be used for any other
+  // per-context dynamic allocations.
+  iree_allocator_t allocator;
+  // Resolved import functions matching 1:1 with the module import descriptors.
+  iree_vm_function_t imports[2];
+  // Example user data stored per-state; accumulated by module_b_entry.
+  int counter;
+} module_b_state_t;
+
+// Frees the shared module; by this point all per-context states have been
+// freed and no more shared data is required.
+// Frees the shared module structure using the allocator it was created with.
+static void IREE_API_PTR module_b_destroy(void* self) {
+  module_b_t* module = (module_b_t*)self;
+  iree_allocator_free(module->allocator, module);
+}
+
+// Allocates per-context state, which stores resolved import functions and any
+// other non-shared user state.
+// Allocates zero-initialized per-context state; imports are populated later
+// via module_b_resolve_import.
+static iree_status_t IREE_API_PTR
+module_b_alloc_state(void* self, iree_allocator_t allocator,
+                     iree_vm_module_state_t** out_module_state) {
+  module_b_state_t* state = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, sizeof(*state), (void**)&state));
+  memset(state, 0, sizeof(*state));
+  state->allocator = allocator;
+  *out_module_state = (iree_vm_module_state_t*)state;
+  return iree_ok_status();
+}
+
+// Frees the per-context state.
+// Frees the per-context state with the allocator captured at alloc time.
+static void IREE_API_PTR
+module_b_free_state(void* self, iree_vm_module_state_t* module_state) {
+  module_b_state_t* state = (module_b_state_t*)module_state;
+  iree_allocator_free(state->allocator, state);
+}
+
+// Called once per import function so the module can store the function ref.
+// Called once per import function so the module can store the function ref.
+// NOTE(review): |ordinal| indexes state->imports directly with no bounds
+// check — presumably the VM guarantees ordinal < import count; confirm.
+static iree_status_t IREE_API_PTR module_b_resolve_import(
+    void* self, iree_vm_module_state_t* module_state, iree_host_size_t ordinal,
+    const iree_vm_function_t* function,
+    const iree_vm_function_signature_t* signature) {
+  module_b_state_t* state = (module_b_state_t*)module_state;
+  state->imports[ordinal] = *function;
+  return iree_ok_status();
+}
+
+// Our actual function. Here we directly access the registers but one could also
+// use this as a trampoline into user code with a native signature (such as
+// fetching the args, calling the function as a normal C function, and stashing
+// back the results).
+//
+// vm.import @module_b.entry(%arg0 : i32) -> i32
+// vm.import @module_b.entry(%arg0 : i32) -> i32
+// Computes: counter += module_a.add_1(arg0); returns
+// module_a.sub_1(counter), i.e. the accumulated counter minus one.
+static iree_status_t module_b_entry(iree_vm_stack_t* stack, module_b_t* module,
+                                    module_b_state_t* module_state,
+                                    int32_t arg0, int32_t* out_ret0) {
+  // NOTE: if we needed to use ref types here we have them under module->types.
+  assert(module->types[0]);
+
+  // Call module_a.add_1 (import ordinal 0); arg0 becomes arg0 + 1.
+  IREE_RETURN_IF_ERROR(
+      call_import_i32_i32(stack, &module_state->imports[0], arg0, &arg0));
+
+  // Increment per-context state (persists across calls). No need for a mutex as
+  // only one thread can be using the per-context state at a time.
+  module_state->counter += arg0;
+  int32_t ret0 = module_state->counter;
+
+  // Call module_a.sub_1 (import ordinal 1); ret0 becomes ret0 - 1.
+  IREE_RETURN_IF_ERROR(
+      call_import_i32_i32(stack, &module_state->imports[1], ret0, &ret0));
+
+  *out_ret0 = ret0;
+  return iree_ok_status();
+}
+
+// Table of exported function pointers. Note that this table could be read-only
+// (like here) or shared/per-context to allow exposing different functions based
+// on versions, access rights, etc.
+static const iree_vm_native_function_ptr_t module_b_funcs_[] = {
+    {(iree_vm_native_function_shim_t)call_shim_i32_i32,
+     (iree_vm_native_function_target_t)module_b_entry},
+};
+
+// Imports resolved at context creation; ordinals match module_b_state_t
+// imports[0]/imports[1].
+static const iree_vm_native_import_descriptor_t module_b_imports_[] = {
+    {IREE_VM_NATIVE_IMPORT_REQUIRED, iree_make_cstring_view("module_a.add_1")},
+    {IREE_VM_NATIVE_IMPORT_REQUIRED, iree_make_cstring_view("module_a.sub_1")},
+};
+static_assert(IREE_ARRAYSIZE(module_b_state_t::imports) ==
+                  IREE_ARRAYSIZE(module_b_imports_),
+              "import storage must be able to hold all imports");
+// Example reflection attributes attached to the entry export.
+static const iree_vm_reflection_attr_t module_b_entry_attrs_[] = {
+    {iree_make_cstring_view("key1"), iree_make_cstring_view("value1")},
+};
+// Export table; cconv "0i_i" declares (i32) -> i32.
+static const iree_vm_native_export_descriptor_t module_b_exports_[] = {
+    {iree_make_cstring_view("entry"), iree_make_cstring_view("0i_i"),
+     IREE_ARRAYSIZE(module_b_entry_attrs_), module_b_entry_attrs_},
+};
+static_assert(IREE_ARRAYSIZE(module_b_funcs_) ==
+                  IREE_ARRAYSIZE(module_b_exports_),
+              "function pointer table must be 1:1 with exports");
+static const iree_vm_native_module_descriptor_t module_b_descriptor_ = {
+    iree_make_cstring_view("module_b"),
+    IREE_ARRAYSIZE(module_b_imports_),
+    module_b_imports_,
+    IREE_ARRAYSIZE(module_b_exports_),
+    module_b_exports_,
+    IREE_ARRAYSIZE(module_b_funcs_),
+    module_b_funcs_,
+    0,
+    NULL,
+};
+
+// Creates a module_b instance: allocates the shared structure, resolves the
+// vm.buffer type once, and installs the custom lifecycle/import hooks.
+// On any failure the partially-initialized module is freed before returning.
+static iree_status_t module_b_create(iree_allocator_t allocator,
+                                     iree_vm_module_t** out_module) {
+  // Allocate shared module state.
+  module_b_t* module = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, sizeof(*module), (void**)&module));
+  memset(module, 0, sizeof(*module));
+  module->allocator = allocator;
+
+  // Resolve types used by the module once so that we can share it across all
+  // instances of the module.
+  module->types[0] =
+      iree_vm_ref_lookup_registered_type(iree_make_cstring_view("vm.buffer"));
+  if (!module->types[0]) {
+    iree_allocator_free(allocator, module);
+    return iree_make_status(
+        IREE_STATUS_NOT_FOUND,
+        "required type vm.buffer not registered with the type system");
+  }
+
+  // Setup the interface with the functions we implement ourselves. Any function
+  // we omit will be handled by the base native module.
+  iree_vm_module_t interface;
+  iree_status_t status = iree_vm_module_initialize(&interface, module);
+  if (!iree_status_is_ok(status)) {
+    iree_allocator_free(allocator, module);
+    return status;
+  }
+  interface.destroy = module_b_destroy;
+  interface.alloc_state = module_b_alloc_state;
+  interface.free_state = module_b_free_state;
+  interface.resolve_import = module_b_resolve_import;
+  return iree_vm_native_module_create(&interface, &module_b_descriptor_,
+                                      allocator, out_module);
+}
diff --git a/runtime/src/iree/vm/ops.h b/runtime/src/iree/vm/ops.h
new file mode 100644
index 0000000..8395f83
--- /dev/null
+++ b/runtime/src/iree/vm/ops.h
@@ -0,0 +1,365 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_OPS_H_
+#define IREE_VM_OPS_H_
+
+#include <math.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/vm/value.h"
+
+//===------------------------------------------------------------------===//
+// Globals
+//===------------------------------------------------------------------===//
+
+// Loads an i32 global stored at |byte_offset| within the rwdata |base|.
+static inline int32_t vm_global_load_i32(uint8_t* base, uint32_t byte_offset) {
+  const int32_t* global_ptr = (const int32_t*)(base + byte_offset);
+  return *global_ptr;
+}
+
+// Stores an i32 global at |byte_offset| within the rwdata |base|.
+static inline void vm_global_store_i32(uint8_t* base, uint32_t byte_offset,
+                                       int32_t value) {
+  int32_t* global_ptr = (int32_t*)(base + byte_offset);
+  *global_ptr = value;
+}
+
+//===------------------------------------------------------------------===//
+// Conditional assignment
+//===------------------------------------------------------------------===//
+
+// Returns |true_value| when condition is nonzero, else |false_value|.
+static inline int32_t vm_select_i32(int32_t condition, int32_t true_value,
+                                    int32_t false_value) {
+  return condition ? true_value : false_value;
+}
+
+//===------------------------------------------------------------------===//
+// Native integer arithmetic
+//===------------------------------------------------------------------===//
+
+// i32 arithmetic/bitwise ops. Unsigned variants cast through uint32_t to get
+// well-defined wrapping/unsigned semantics; division/remainder by zero is not
+// checked here (the VM dispatcher is presumably responsible — confirm).
+static inline int32_t vm_add_i32(int32_t lhs, int32_t rhs) { return lhs + rhs; }
+static inline int32_t vm_sub_i32(int32_t lhs, int32_t rhs) { return lhs - rhs; }
+static inline int32_t vm_mul_i32(int32_t lhs, int32_t rhs) { return lhs * rhs; }
+static inline int32_t vm_div_i32s(int32_t lhs, int32_t rhs) {
+  return lhs / rhs;
+}
+static inline int32_t vm_div_i32u(int32_t lhs, int32_t rhs) {
+  return (int32_t)(((uint32_t)lhs) / ((uint32_t)rhs));
+}
+static inline int32_t vm_rem_i32s(int32_t lhs, int32_t rhs) {
+  return lhs % rhs;
+}
+static inline int32_t vm_rem_i32u(int32_t lhs, int32_t rhs) {
+  return (int32_t)(((uint32_t)lhs) % ((uint32_t)rhs));
+}
+// Fused multiply-add (a * b + c) in integer arithmetic.
+static inline int32_t vm_fma_i32(int32_t a, int32_t b, int32_t c) {
+  return a * b + c;
+}
+static inline int32_t vm_not_i32(int32_t operand) {
+  return (int32_t)(~((uint32_t)operand));
+}
+static inline int32_t vm_and_i32(int32_t lhs, int32_t rhs) { return lhs & rhs; }
+static inline int32_t vm_or_i32(int32_t lhs, int32_t rhs) { return lhs | rhs; }
+static inline int32_t vm_xor_i32(int32_t lhs, int32_t rhs) { return lhs ^ rhs; }
+
+//===------------------------------------------------------------------===//
+// Casting and type conversion/emulation
+//===------------------------------------------------------------------===//
+
+// Truncation keeps the low bits zero-extended into the i32 return; extension
+// ops sign- or zero-extend per the s/u suffix.
+static inline int32_t vm_trunc_i32i8(int32_t operand) {
+  return (uint8_t)((uint32_t)operand);
+}
+static inline int32_t vm_trunc_i32i16(int32_t operand) {
+  return (uint16_t)((uint32_t)operand);
+}
+static inline int32_t vm_ext_i8i32s(int32_t operand) {
+  return (int32_t)((int8_t)operand);
+}
+static inline int32_t vm_ext_i8i32u(int32_t operand) {
+  return (uint32_t)((uint8_t)operand);
+}
+static inline int32_t vm_ext_i16i32s(int32_t operand) {
+  return (int32_t)((int16_t)operand);
+}
+static inline int32_t vm_ext_i16i32u(int32_t operand) {
+  return (uint32_t)((uint16_t)operand);
+}
+
+//===------------------------------------------------------------------===//
+// Native bitwise shifts and rotates
+//===------------------------------------------------------------------===//
+
+// Shift amounts are masked to [0, 31] so out-of-range amounts wrap rather
+// than hitting C undefined behavior for >= width shifts.
+// NOTE(review): shl of a negative operand is still UB in C prior to C++20
+// semantics — presumably acceptable on supported compilers; confirm.
+static inline int32_t vm_shl_i32(int32_t operand, int32_t amount) {
+  amount &= 0x1F;
+  return (int32_t)(operand << amount);
+}
+// Arithmetic (sign-propagating) right shift.
+static inline int32_t vm_shr_i32s(int32_t operand, int32_t amount) {
+  amount &= 0x1F;
+  return (int32_t)(operand >> amount);
+}
+// Logical (zero-filling) right shift.
+static inline int32_t vm_shr_i32u(int32_t operand, int32_t amount) {
+  amount &= 0x1F;
+  return (int32_t)(((uint32_t)operand) >> amount);
+}
+
+//===------------------------------------------------------------------===//
+// Comparison ops
+//===------------------------------------------------------------------===//
+
+static inline int32_t vm_cmp_eq_i32(int32_t lhs, int32_t rhs) {
+  return (lhs == rhs) ? 1 : 0;
+}
+static inline int32_t vm_cmp_ne_i32(int32_t lhs, int32_t rhs) {
+  return (lhs != rhs) ? 1 : 0;
+}
+static inline int32_t vm_cmp_lt_i32s(int32_t lhs, int32_t rhs) {
+  return (lhs < rhs) ? 1 : 0;
+}
+static inline int32_t vm_cmp_lt_i32u(int32_t lhs, int32_t rhs) {
+  return (((uint32_t)lhs) < ((uint32_t)rhs)) ? 1 : 0;
+}
+static inline int32_t vm_cmp_nz_i32(int32_t operand) {
+  return (operand != 0) ? 1 : 0;
+}
+static inline int32_t vm_cmp_eq_ref(iree_vm_ref_t* lhs, iree_vm_ref_t* rhs) {
+  return iree_vm_ref_equal(lhs, rhs) ? 1 : 0;
+}
+static inline int32_t vm_cmp_ne_ref(iree_vm_ref_t* lhs, iree_vm_ref_t* rhs) {
+  return (!iree_vm_ref_equal(lhs, rhs)) ? 1 : 0;
+}
+static inline int32_t vm_cmp_nz_ref(iree_vm_ref_t* operand) {
+  return (operand->ptr != NULL) ? 1 : 0;
+}
+
+//===------------------------------------------------------------------===//
+// ExtI64: Globals
+//===------------------------------------------------------------------===//
+
+// i64 counterparts of the global load/store/select ops above.
+static inline int64_t vm_global_load_i64(uint8_t* base, uint32_t byte_offset) {
+  const int64_t* global_ptr = (const int64_t*)(base + byte_offset);
+  return *global_ptr;
+}
+
+static inline void vm_global_store_i64(uint8_t* base, uint32_t byte_offset,
+                                       int64_t value) {
+  int64_t* global_ptr = (int64_t*)(base + byte_offset);
+  *global_ptr = value;
+}
+
+//===------------------------------------------------------------------===//
+// ExtI64: Conditional assignment
+//===------------------------------------------------------------------===//
+
+// Condition remains i32; the selected values are i64.
+static inline int64_t vm_select_i64(int32_t condition, int64_t true_value,
+                                    int64_t false_value) {
+  return condition ? true_value : false_value;
+}
+
+//===------------------------------------------------------------------===//
+// ExtI64: Native integer arithmetic ops
+//===------------------------------------------------------------------===//
+
+// i64 arithmetic/bitwise ops; mirrors the i32 set with uint64_t casts for the
+// unsigned variants.
+static inline int64_t vm_add_i64(int64_t lhs, int64_t rhs) { return lhs + rhs; }
+static inline int64_t vm_sub_i64(int64_t lhs, int64_t rhs) { return lhs - rhs; }
+static inline int64_t vm_mul_i64(int64_t lhs, int64_t rhs) { return lhs * rhs; }
+static inline int64_t vm_div_i64s(int64_t lhs, int64_t rhs) {
+  return lhs / rhs;
+}
+static inline int64_t vm_div_i64u(int64_t lhs, int64_t rhs) {
+  return (int64_t)(((uint64_t)lhs) / ((uint64_t)rhs));
+}
+static inline int64_t vm_rem_i64s(int64_t lhs, int64_t rhs) {
+  return lhs % rhs;
+}
+static inline int64_t vm_rem_i64u(int64_t lhs, int64_t rhs) {
+  return (int64_t)(((uint64_t)lhs) % ((uint64_t)rhs));
+}
+// Fused multiply-add (a * b + c) in integer arithmetic.
+static inline int64_t vm_fma_i64(int64_t a, int64_t b, int64_t c) {
+  return a * b + c;
+}
+static inline int64_t vm_not_i64(int64_t operand) {
+  return (int64_t)(~((uint64_t)operand));
+}
+static inline int64_t vm_and_i64(int64_t lhs, int64_t rhs) { return lhs & rhs; }
+static inline int64_t vm_or_i64(int64_t lhs, int64_t rhs) { return lhs | rhs; }
+static inline int64_t vm_xor_i64(int64_t lhs, int64_t rhs) { return lhs ^ rhs; }
+
+//===------------------------------------------------------------------===//
+// ExtI64: Casting and type conversion/emulation
+//===------------------------------------------------------------------===//
+
+// Truncation returns the low 32 bits; extension ops sign- or zero-extend per
+// the s/u suffix.
+static inline int32_t vm_trunc_i64i32(int64_t operand) {
+  return (uint32_t)((uint64_t)operand);
+}
+static inline int64_t vm_ext_i32i64s(int32_t operand) {
+  return (int64_t)((int32_t)operand);
+}
+static inline int64_t vm_ext_i32i64u(int32_t operand) {
+  return (uint64_t)((uint32_t)operand);
+}
+
+//===------------------------------------------------------------------===//
+// ExtI64: Native bitwise shifts and rotates
+//===------------------------------------------------------------------===//
+
+// Shift amounts are masked to [0, 63]; s = arithmetic, u = logical shift.
+static inline int64_t vm_shl_i64(int64_t operand, int32_t amount) {
+  amount &= 0x3F;
+  return (int64_t)(operand << amount);
+}
+static inline int64_t vm_shr_i64s(int64_t operand, int32_t amount) {
+  amount &= 0x3F;
+  return (int64_t)(operand >> amount);
+}
+static inline int64_t vm_shr_i64u(int64_t operand, int32_t amount) {
+  amount &= 0x3F;
+  return (int64_t)(((uint64_t)operand) >> amount);
+}
+
+//===------------------------------------------------------------------===//
+// ExtI64: Comparison ops
+//===------------------------------------------------------------------===//
+
+static inline int32_t vm_cmp_eq_i64(int64_t lhs, int64_t rhs) {
+  return (lhs == rhs) ? 1 : 0;
+}
+static inline int32_t vm_cmp_ne_i64(int64_t lhs, int64_t rhs) {
+  return (lhs != rhs) ? 1 : 0;
+}
+static inline int32_t vm_cmp_lt_i64s(int64_t lhs, int64_t rhs) {
+  return (lhs < rhs) ? 1 : 0;
+}
+static inline int32_t vm_cmp_lt_i64u(int64_t lhs, int64_t rhs) {
+  return (((uint64_t)lhs) < ((uint64_t)rhs)) ? 1 : 0;
+}
+static inline int32_t vm_cmp_nz_i64(int64_t operand) {
+  return (operand != 0) ? 1 : 0;
+}
+
+//===------------------------------------------------------------------===//
+// ExtF32: Globals
+//===------------------------------------------------------------------===//
+
// Loads a float32 global located |byte_offset| bytes past |base|.
// NOTE(review): assumes the offset keeps the access 4-byte aligned — confirm
// for targets that trap on unaligned loads.
static inline float vm_global_load_f32(uint8_t* base, uint32_t byte_offset) {
  return *(const float*)(base + byte_offset);
}

// Stores |value| into the float32 global located |byte_offset| bytes past
// |base|.
static inline void vm_global_store_f32(uint8_t* base, uint32_t byte_offset,
                                       float value) {
  *(float*)(base + byte_offset) = value;
}
+
+//===------------------------------------------------------------------===//
+// ExtF32: Conditional assignment
+//===------------------------------------------------------------------===//
+
// Returns |true_value| when |condition| is non-zero, |false_value| otherwise.
static inline float vm_select_f32(int32_t condition, float true_value,
                                  float false_value) {
  if (condition != 0) {
    return true_value;
  }
  return false_value;
}
+
+//===------------------------------------------------------------------===//
+// ExtF32: Native floating-point arithmetic
+//===------------------------------------------------------------------===//
+
// Binary f32 arithmetic; each op mirrors the corresponding VM bytecode op.
static inline float vm_add_f32(float lhs, float rhs) { return lhs + rhs; }
static inline float vm_sub_f32(float lhs, float rhs) { return lhs - rhs; }
static inline float vm_mul_f32(float lhs, float rhs) { return lhs * rhs; }
static inline float vm_div_f32(float lhs, float rhs) { return lhs / rhs; }
// NOTE: remainderf computes the IEEE 754 remainder (quotient rounded to
// nearest, result may be negative), which differs from fmodf for some inputs.
static inline float vm_rem_f32(float lhs, float rhs) {
  return remainderf(lhs, rhs);
}
// Fused multiply-add: a * b + c.
static inline float vm_fma_f32(float a, float b, float c) {
#ifdef FP_FAST_FMAF
  // The C runtime advertises a fast native fmaf; use it (single rounding).
  return fmaf(a, b, c);
#else
  // Plain multiply+add (two roundings) to avoid a slow soft-float fmaf.
  return a * b + c;
#endif  // FP_FAST_FMAF
}
static inline float vm_abs_f32(float operand) { return fabsf(operand); }
static inline float vm_neg_f32(float operand) { return -operand; }
static inline float vm_ceil_f32(float operand) { return ceilf(operand); }
static inline float vm_floor_f32(float operand) { return floorf(operand); }

// Unary transcendental/math ops mapping 1:1 onto their <math.h> equivalents.
static inline float vm_atan_f32(float operand) { return atanf(operand); }
static inline float vm_atan2_f32(float y, float x) { return atan2f(y, x); }
static inline float vm_cos_f32(float operand) { return cosf(operand); }
static inline float vm_sin_f32(float operand) { return sinf(operand); }
static inline float vm_exp_f32(float operand) { return expf(operand); }
static inline float vm_exp2_f32(float operand) { return exp2f(operand); }
static inline float vm_expm1_f32(float operand) { return expm1f(operand); }
static inline float vm_log_f32(float operand) { return logf(operand); }
static inline float vm_log10_f32(float operand) { return log10f(operand); }
static inline float vm_log1p_f32(float operand) { return log1pf(operand); }
static inline float vm_log2_f32(float operand) { return log2f(operand); }
static inline float vm_pow_f32(float b, float e) { return powf(b, e); }
// Reciprocal square root computed exactly; no fast-math approximation here.
static inline float vm_rsqrt_f32(float operand) {
  return 1.0f / sqrtf(operand);
}
static inline float vm_sqrt_f32(float operand) { return sqrtf(operand); }
static inline float vm_tanh_f32(float operand) { return tanhf(operand); }
static inline float vm_erf_f32(float operand) { return erff(operand); }
+
+//===------------------------------------------------------------------===//
+// ExtF32: Casting and type conversion/emulation
+//===------------------------------------------------------------------===//
+
// Signed i32 -> f32 conversion.
static inline float vm_cast_si32f32(int32_t operand) { return (float)operand; }
// Unsigned i32 -> f32 conversion.
static inline float vm_cast_ui32f32(int32_t operand) {
  // Reinterpret as unsigned before widening so e.g. -1 converts to 2^32-1.
  return (float)(uint32_t)operand;
}
// f32 -> signed i32 conversion, rounding to nearest (ties away from zero).
static inline int32_t vm_cast_f32si32(float operand) {
  return (int32_t)lroundf(operand);
}
// f32 -> unsigned i32 conversion, rounding to nearest.
// NOTE(review): lroundf returns `long`; for operands in [2^31, 2^32) the
// rounded value does not fit in a 32-bit long and behavior is undefined on
// ILP32 targets — confirm the expected input range or switch to llroundf.
static inline int32_t vm_cast_f32ui32(float operand) {
  return (uint32_t)lroundf(operand);
}
// Bitwise reinterpretation i32 <-> f32.
// memcpy is the portable, strict-aliasing-safe way to type-pun.
static inline float vm_bitcast_i32f32(int32_t operand) {
  float result;
  memcpy(&result, &operand, sizeof(result));
  return result;
}
static inline int32_t vm_bitcast_f32i32(float operand) {
  int32_t result;
  memcpy(&result, &operand, sizeof(result));
  return result;
}
+
+//===------------------------------------------------------------------===//
+// ExtF32: Comparison ops
+//===------------------------------------------------------------------===//
+
// Floating-point comparisons; the `o` suffix is the ordered form (false when
// either operand is NaN), the `u` suffix the unordered form (true when either
// operand is NaN). All return 1 or 0 as an i32.

static inline int32_t vm_cmp_eq_f32o(float lhs, float rhs) {
  return lhs == rhs;
}
static inline int32_t vm_cmp_eq_f32u(float lhs, float rhs) {
  if (isunordered(lhs, rhs)) return 1;
  return lhs == rhs;
}
// NOTE: the ordered != is still true for NaN operands (NaN != anything).
static inline int32_t vm_cmp_ne_f32o(float lhs, float rhs) {
  return lhs != rhs;
}
static inline int32_t vm_cmp_ne_f32u(float lhs, float rhs) {
  if (isunordered(lhs, rhs)) return 1;
  return lhs != rhs;
}
static inline int32_t vm_cmp_lt_f32o(float lhs, float rhs) {
  return isless(lhs, rhs) != 0;
}
static inline int32_t vm_cmp_lt_f32u(float lhs, float rhs) {
  if (isunordered(lhs, rhs)) return 1;
  return isless(lhs, rhs) != 0;
}
static inline int32_t vm_cmp_lte_f32o(float lhs, float rhs) {
  return islessequal(lhs, rhs) != 0;
}
static inline int32_t vm_cmp_lte_f32u(float lhs, float rhs) {
  if (isunordered(lhs, rhs)) return 1;
  return islessequal(lhs, rhs) != 0;
}
static inline int32_t vm_cmp_nan_f32(float operand) {
  return isnan(operand) != 0;
}
+
+#endif  // IREE_VM_OPS_H_
diff --git a/runtime/src/iree/vm/ops_emitc.h b/runtime/src/iree/vm/ops_emitc.h
new file mode 100644
index 0000000..7ed52a0
--- /dev/null
+++ b/runtime/src/iree/vm/ops_emitc.h
@@ -0,0 +1,64 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
#ifndef IREE_VM_OPS_EMITC_H_
#define IREE_VM_OPS_EMITC_H_

// This file contains utility macros used for things that EmitC can't handle
// directly.
//
// NOTE: several macros take a parameter named `struct`; macro parameters are
// purely lexical, so reusing the keyword here is valid (if unusual).

// Assign a value through a pointer variable
#define EMITC_DEREF_ASSIGN_VALUE(ptr, value) *(ptr) = (value)

// Assign a value pointed to by `ptr` through a pointer variable
#define EMITC_DEREF_ASSIGN_PTR(ptr, value) *(ptr) = *(value)

// Access a member of a struct
#define EMITC_STRUCT_MEMBER(struct, member) (struct).member

// Access the address of a member of a struct
#define EMITC_STRUCT_MEMBER_ADDRESS(struct, member) &(struct).member

// Assign a value to a member of a struct
#define EMITC_STRUCT_MEMBER_ASSIGN(struct, member, value) \
  (struct).member = (value)

// Access a member of a pointer to a struct
#define EMITC_STRUCT_PTR_MEMBER(struct, member) (struct)->member

// Call a function pointer of a pointer to a struct with the given arguments
#define EMITC_STRUCT_PTR_MEMBER_CALL(struct, member, ...) \
  (struct)->member(__VA_ARGS__)

// Access the address of a member of a pointer to a struct
#define EMITC_STRUCT_PTR_MEMBER_ADDRESS(struct, member) &(struct)->member

// Assign a value to a member of a pointer to a struct
#define EMITC_STRUCT_PTR_MEMBER_ASSIGN(struct, member, value) \
  (struct)->member = (value)

// Create a typedef struct
#define EMITC_TYPEDEF_STRUCT(typename, body) \
  typedef struct {                           \
    body                                     \
  } typename;

// Get the address of an array element
#define EMITC_ARRAY_ELEMENT_ADDRESS(array, index) &(array)[index]

// Unary operations
#define EMITC_CAST(arg, type) ((type)(arg))
#define EMITC_NOT(arg) (!(arg))

// Binary operations
#define EMITC_AND(lhs, rhs) ((lhs) && (rhs))
#define EMITC_EQ(lhs, rhs) ((lhs) == (rhs))
#define EMITC_NE(lhs, rhs) ((lhs) != (rhs))
#define EMITC_OR(lhs, rhs) ((lhs) || (rhs))

#define EMITC_ADD(lhs, rhs) ((lhs) + (rhs))

#endif  // IREE_VM_OPS_EMITC_H_
diff --git a/runtime/src/iree/vm/ref.c b/runtime/src/iree/vm/ref.c
new file mode 100644
index 0000000..7c2a966
--- /dev/null
+++ b/runtime/src/iree/vm/ref.c
@@ -0,0 +1,272 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/ref.h"
+
+#include <string.h>
+
+#include "iree/base/internal/atomics.h"
+
+// TODO(benvanik): dynamic, if we care - otherwise keep small.
+// After a dozen or so types the linear scan will likely start to spill the
+// DCACHE and need to be reworked. I suspect at the time we have >=64 types
+// we'll want to rewrite all of this anyway (using externalized type ID storage
+// or something more complex).
+#define IREE_VM_MAX_TYPE_ID 64
+
// Returns the address of the atomic reference counter embedded within the
// object at |ptr|, located |offsetof_counter| bytes past the object base.
static inline volatile iree_atomic_ref_count_t* iree_vm_get_raw_counter_ptr(
    void* ptr, const iree_vm_ref_type_descriptor_t* type_descriptor) {
  return (volatile iree_atomic_ref_count_t*)(((uintptr_t)(ptr)) +
                                             type_descriptor->offsetof_counter);
}

// Same as iree_vm_get_raw_counter_ptr but using the counter offset cached on
// the |ref| itself, avoiding a descriptor table lookup.
static inline volatile iree_atomic_ref_count_t* iree_vm_get_ref_counter_ptr(
    iree_vm_ref_t* ref) {
  return (volatile iree_atomic_ref_count_t*)(((uintptr_t)ref->ptr) +
                                             ref->offsetof_counter);
}
+
// Retains |ptr| by incrementing the reference counter embedded in the object.
// No-op when |ptr| is NULL.
IREE_API_EXPORT void iree_vm_ref_object_retain(
    void* ptr, const iree_vm_ref_type_descriptor_t* type_descriptor) {
  if (!ptr) return;
  volatile iree_atomic_ref_count_t* counter =
      iree_vm_get_raw_counter_ptr(ptr, type_descriptor);
  iree_atomic_ref_count_inc(counter);
}

// Releases |ptr|, invoking the descriptor's destroy callback when the last
// reference drops. A NULL destroy callback skips destruction (used for types
// not owned by the VM). No-op when |ptr| is NULL.
IREE_API_EXPORT void iree_vm_ref_object_release(
    void* ptr, const iree_vm_ref_type_descriptor_t* type_descriptor) {
  if (!ptr) return;
  volatile iree_atomic_ref_count_t* counter =
      iree_vm_get_raw_counter_ptr(ptr, type_descriptor);
  // iree_atomic_ref_count_dec returns the previous value; 1 means this was
  // the last reference.
  if (iree_atomic_ref_count_dec(counter) == 1) {
    if (type_descriptor->destroy) {
      // NOTE: this makes us not re-entrant, but I think that's OK.
      type_descriptor->destroy(ptr);
    }
  }
}
+
// A table of type descriptors registered at startup.
// These provide quick dereferencing of destruction functions and type names for
// debugging. Note that this just points to registered descriptors (or NULL) for
// each type ID in the type range and does not own the descriptors.
//
// Note that [0] is always the NULL type and has a NULL descriptor. We don't
// allow types to be registered there.
static const iree_vm_ref_type_descriptor_t*
    iree_vm_ref_type_descriptors[IREE_VM_MAX_TYPE_ID] = {0};

// Returns the type descriptor (or NULL) for the given type ID.
// Out-of-range IDs (>= IREE_VM_MAX_TYPE_ID) return NULL rather than faulting.
static const iree_vm_ref_type_descriptor_t* iree_vm_ref_get_type_descriptor(
    iree_vm_ref_type_t type) {
  if (type >= IREE_VM_MAX_TYPE_ID) {
    return NULL;
  }
  return iree_vm_ref_type_descriptors[type];
}
+
+IREE_API_EXPORT iree_status_t
+iree_vm_ref_register_type(iree_vm_ref_type_descriptor_t* descriptor) {
+  for (int i = 1; i <= IREE_VM_MAX_TYPE_ID; ++i) {
+    if (!iree_vm_ref_type_descriptors[i]) {
+      iree_vm_ref_type_descriptors[i] = descriptor;
+      descriptor->type = i;
+      return iree_ok_status();
+    }
+  }
+  // Too many user-defined types registered; need to increase
+  // IREE_VM_MAX_TYPE_ID.
+  return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                          "too many user-defined types registered; new type "
+                          "would exceed maximum of %d",
+                          IREE_VM_MAX_TYPE_ID);
+}
+
+IREE_API_EXPORT iree_string_view_t
+iree_vm_ref_type_name(iree_vm_ref_type_t type) {
+  if (type == 0 || type >= IREE_VM_MAX_TYPE_ID) {
+    return iree_string_view_empty();
+  }
+  return iree_vm_ref_type_descriptors[type]->type_name;
+}
+
+IREE_API_EXPORT const iree_vm_ref_type_descriptor_t*
+iree_vm_ref_lookup_registered_type(iree_string_view_t full_name) {
+  for (int i = 1; i <= IREE_VM_MAX_TYPE_ID; ++i) {
+    if (!iree_vm_ref_type_descriptors[i]) break;
+    if (iree_string_view_equal(iree_vm_ref_type_descriptors[i]->type_name,
+                               full_name)) {
+      return iree_vm_ref_type_descriptors[i];
+    }
+  }
+  return NULL;
+}
+
+// Useful debugging tool:
+#if 0
+static void iree_vm_ref_trace(const char* msg, iree_vm_ref_t* ref) {
+  volatile iree_atomic_ref_count_t* counter = iree_vm_get_ref_counter_ptr(ref);
+  iree_string_view_t name = iree_vm_ref_type_name(ref->type);
+  fprintf(stderr, "%s %.*s 0x%p %d\n", msg, (int)name.size, name.data, ref->ptr,
+          counter->__val);
+}
+#else
+#define iree_vm_ref_trace(...)
+#endif  // 0
+
// Wraps |ptr| into |out_ref| without touching the reference count; the target
// object must arrive with a count >= 1. Releases any value |out_ref| already
// held (unless it is the same pointer). Fails if |type| is not registered.
IREE_API_EXPORT iree_status_t iree_vm_ref_wrap_assign(void* ptr,
                                                      iree_vm_ref_type_t type,
                                                      iree_vm_ref_t* out_ref) {
  const iree_vm_ref_type_descriptor_t* type_descriptor =
      iree_vm_ref_get_type_descriptor(type);
  if (!type_descriptor) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "type not registered");
  }

  if (out_ref->ptr != NULL && out_ref->ptr != ptr) {
    // Release existing value.
    iree_vm_ref_release(out_ref);
  }

  // NOTE: we do not manipulate the counter here as we assume it starts at 1
  // or it's already coming in with some references.
  out_ref->ptr = ptr;
  out_ref->offsetof_counter = type_descriptor->offsetof_counter;
  out_ref->type = type;

  iree_vm_ref_trace("WRAP ASSIGN", out_ref);
  return iree_ok_status();
}
+
// Wraps |ptr| into |out_ref| and then increments its reference count.
// Type validation (and release of any prior |out_ref| value) is handled by
// the wrap-assign call.
IREE_API_EXPORT iree_status_t iree_vm_ref_wrap_retain(void* ptr,
                                                      iree_vm_ref_type_t type,
                                                      iree_vm_ref_t* out_ref) {
  IREE_RETURN_IF_ERROR(iree_vm_ref_wrap_assign(ptr, type, out_ref));
  if (out_ref->ptr) {
    volatile iree_atomic_ref_count_t* counter =
        iree_vm_get_ref_counter_ptr(out_ref);
    iree_atomic_ref_count_inc(counter);
    iree_vm_ref_trace("WRAP RETAIN", out_ref);
  }
  return iree_ok_status();
}
+
// Retains |ref| into |out_ref|, releasing whatever |out_ref| previously held.
IREE_API_EXPORT void iree_vm_ref_retain(iree_vm_ref_t* ref,
                                        iree_vm_ref_t* out_ref) {
  // NOTE: ref and out_ref may alias or be nested so we retain before we
  // potentially release.
  iree_vm_ref_t temp_ref = *ref;
  if (ref->ptr) {
    volatile iree_atomic_ref_count_t* counter =
        iree_vm_get_ref_counter_ptr(ref);
    iree_atomic_ref_count_inc(counter);
    iree_vm_ref_trace("RETAIN", ref);
  }
  if (out_ref->ptr) {
    // Release the value previously held by |out_ref|. Because the new value
    // was retained above this is safe even when |ref| and |out_ref| alias
    // (the count is temporarily one higher than needed).
    iree_vm_ref_release(out_ref);
  }
  *out_ref = temp_ref;
}
+
+IREE_API_EXPORT iree_status_t iree_vm_ref_retain_checked(
+    iree_vm_ref_t* ref, iree_vm_ref_type_t type, iree_vm_ref_t* out_ref) {
+  if (ref->type != IREE_VM_REF_TYPE_NULL && ref->type != type) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "source ref type mismatch");
+  }
+  iree_vm_ref_retain(ref, out_ref);
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT void iree_vm_ref_retain_or_move(int is_move, iree_vm_ref_t* ref,
+                                                iree_vm_ref_t* out_ref) {
+  if (is_move) {
+    iree_vm_ref_move(ref, out_ref);
+  } else {
+    iree_vm_ref_retain(ref, out_ref);
+  }
+}
+
+IREE_API_EXPORT iree_status_t iree_vm_ref_retain_or_move_checked(
+    int is_move, iree_vm_ref_t* ref, iree_vm_ref_type_t type,
+    iree_vm_ref_t* out_ref) {
+  if (ref->type != IREE_VM_REF_TYPE_NULL && ref->type != type) {
+    // Make no changes on failure.
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "source ref type mismatch");
+  }
+  iree_vm_ref_retain_or_move(is_move, ref, out_ref);
+  return iree_ok_status();
+}
+
// Releases |ref|, destroying the target object when the last reference drops,
// and resets |ref| to null. No-op for null refs.
IREE_API_EXPORT void iree_vm_ref_release(iree_vm_ref_t* ref) {
  if (ref->type == IREE_VM_REF_TYPE_NULL || ref->ptr == NULL) return;

  iree_vm_ref_trace("RELEASE", ref);
  volatile iree_atomic_ref_count_t* counter = iree_vm_get_ref_counter_ptr(ref);
  // iree_atomic_ref_count_dec returns the previous value; 1 means this was
  // the last reference.
  if (iree_atomic_ref_count_dec(counter) == 1) {
    // NOTE(review): assumes ref->type is still registered; a NULL descriptor
    // here would crash — confirm refs cannot outlive type registration.
    const iree_vm_ref_type_descriptor_t* type_descriptor =
        iree_vm_ref_get_type_descriptor(ref->type);
    if (type_descriptor->destroy) {
      // NOTE: this makes us not re-entrant, but I think that's OK.
      iree_vm_ref_trace("DESTROY", ref);
      type_descriptor->destroy(ref->ptr);
    }
  }

  // Reset ref to point at nothing.
  memset(ref, 0, sizeof(*ref));
}
+
// Copies |ref| into |out_ref| without incrementing the reference count,
// releasing any value |out_ref| previously held. No-op when they alias.
IREE_API_EXPORT void iree_vm_ref_assign(iree_vm_ref_t* ref,
                                        iree_vm_ref_t* out_ref) {
  // NOTE: ref and out_ref may alias.
  // Snapshot first so the release below cannot clobber the source when the
  // two refs overlap in memory.
  iree_vm_ref_t temp_ref = *ref;
  if (ref == out_ref) {
    // Source == target; ignore entirely.
    return;
  } else if (out_ref->ptr != NULL) {
    // Release existing value.
    iree_vm_ref_release(out_ref);
  }

  // Assign ref to out_ref (without incrementing counter).
  *out_ref = temp_ref;
}
+
// Transfers the reference from |ref| to |out_ref| without changing the count;
// |ref| is reset to null and any value |out_ref| previously held is released.
IREE_API_EXPORT void iree_vm_ref_move(iree_vm_ref_t* ref,
                                      iree_vm_ref_t* out_ref) {
  // NOTE: ref and out_ref may alias.
  if (ref == out_ref) {
    // Source == target; ignore entirely.
    return;
  }

  // Reset input ref so it points at nothing.
  iree_vm_ref_t temp_ref = *ref;
  memset(ref, 0, sizeof(*ref));

  if (out_ref->ptr != NULL) {
    // Release existing value.
    iree_vm_ref_release(out_ref);
  }

  // Assign ref to out_ref (without incrementing counter).
  *out_ref = temp_ref;
}
+
+IREE_API_EXPORT bool iree_vm_ref_is_null(iree_vm_ref_t* ref) {
+  return ref->type == IREE_VM_REF_TYPE_NULL;
+}
+
+IREE_API_EXPORT bool iree_vm_ref_equal(iree_vm_ref_t* lhs, iree_vm_ref_t* rhs) {
+  return lhs == rhs || memcmp(lhs, rhs, sizeof(*lhs)) == 0;
+}
diff --git a/runtime/src/iree/vm/ref.h b/runtime/src/iree/vm/ref.h
new file mode 100644
index 0000000..5ee6343
--- /dev/null
+++ b/runtime/src/iree/vm/ref.h
@@ -0,0 +1,310 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_REF_H_
+#define IREE_VM_REF_H_
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
// Defines the type of the reference-counted pointer.
// This is used to verify that operations dealing with the variant ref struct
// are correct at runtime. We don't allow control over the ref types from the
// VM ops and as such we can use the type specified as a safe way to avoid
// reinterpreting memory incorrectly.
enum iree_vm_ref_type_bits_t {
  IREE_VM_REF_TYPE_NULL = 0,

  // NOTE: these type values are assigned dynamically right now. Treat them as
  // opaque and unstable across process invocations.

  // Maximum type ID value. Type IDs are limited to 24-bits.
  IREE_VM_REF_TYPE_MAX_VALUE = 0x00FFFFFEu,

  // Wildcard type that indicates that a value may be a ref type but of an
  // unspecified internal type.
  IREE_VM_REF_TYPE_ANY = 0x00FFFFFFu,
};
// Only the low 24 bits are used; iree_vm_ref_t stores this in a 24-bit
// bitfield alongside the counter offset.
typedef uint32_t iree_vm_ref_type_t;
+
// Base for iree_vm_ref_t object targets.
// The embedded counter is located at runtime via the offsetof_counter value
// recorded in the type descriptor.
//
// Usage (C):
//  typedef struct my_type_t {
//    iree_vm_ref_object_t ref_object;
//    int my_fields;
//  } my_type_t;
//  void my_type_destroy(void* ptr) {
//    free(ptr);
//  }
//  static iree_vm_ref_type_descriptor_t my_type_descriptor;
//  my_type_descriptor.type_name = iree_string_view_t{"my_type", 7};
//  my_type_descriptor.destroy = my_type_destroy;
//  my_type_descriptor.offsetof_counter = offsetof(my_type_t,
//                                                 ref_object.counter);
//  iree_vm_ref_register_type(&my_type_descriptor);
//
// Usage (C++):
//  Prefer using iree::vm::RefObject as a base type.
typedef struct iree_vm_ref_object_t {
  iree_atomic_ref_count_t counter;
} iree_vm_ref_object_t;
+
// A pointer reference to a reference-counted object.
// The counter is stored within the target object itself ala intrusive_ptr.
//
// NOTE: we try to keep this small so that we aren't wasting stack space or
// copying around too much when we pass it to functions by value. This also
// helps make the CPU caches happier as we need no indirections to check the
// type and adjusting the counter happens without needing to query a descriptor.
// Ideally the iree_vm_ref_t is in-cache on the stack and the target ptr is
// either in cache from a previous use or will be used again after manipulating
// its ref count.
typedef struct iree_vm_ref_t {
  // Pointer to the object. Type is resolved based on the |type| field.
  // Will be NULL if the reference points to nothing.
  void* ptr;
  // Offset from ptr, in bytes, to the start of an atomic_int32_t representing
  // the current reference count. We store this here to avoid the need for an
  // indirection in the (extremely common) case of just reference count inc/dec.
  // Limited to 8 bits, so the counter must live within the first 256 bytes of
  // the object.
  uint32_t offsetof_counter : 8;
  // Registered type of the object pointed to by ptr.
  iree_vm_ref_type_t type : 24;
} iree_vm_ref_t;
static_assert(
    sizeof(iree_vm_ref_t) <= sizeof(void*) * 2,
    "iree_vm_ref_t dominates stack space usage and should be kept tiny");
+
// Destruction callback invoked when the last reference to an object drops.
typedef void(IREE_API_PTR* iree_vm_ref_destroy_t)(void* ptr);

// Describes a type for the VM.
typedef struct iree_vm_ref_type_descriptor_t {
  // Function called when references of this type reach 0 and should be
  // destroyed. May be NULL for types not owned by the VM.
  iree_vm_ref_destroy_t destroy;
  // Offset from ptr, in bytes, to the start of an atomic_int32_t representing
  // the current reference count.
  uint32_t offsetof_counter : 8;
  // The type ID assigned to this type from the iree_vm_ref_type_t table (or an
  // external user source). 24 bits, matching iree_vm_ref_t's type field.
  iree_vm_ref_type_t type : 24;
  // Unretained type name that can be used for debugging.
  iree_string_view_t type_name;
} iree_vm_ref_type_descriptor_t;
+
// Directly retains the object with base |ptr| with the given |type_descriptor|.
//
// Note that this avoids any kind of type checking; for untrusted inputs use
// the iree_vm_ref_t-based methods.
IREE_API_EXPORT void iree_vm_ref_object_retain(
    void* ptr, const iree_vm_ref_type_descriptor_t* type_descriptor);

// Directly releases the object with base |ptr| with the given
// |type_descriptor|, possibly destroying it if it is the last reference.
// Assume that |ptr| is invalid after this function returns.
//
// Note that this avoids any kind of type checking; for untrusted inputs use
// the iree_vm_ref_t-based methods.
IREE_API_EXPORT void iree_vm_ref_object_release(
    void* ptr, const iree_vm_ref_type_descriptor_t* type_descriptor);

// Registers a user-defined type with the IREE C ref system.
// The provided destroy function will be used to destroy objects when their
// reference count goes to 0. NULL can be used to no-op the destruction if the
// type is not owned by the VM.
//
// TODO(benvanik): keep names alive for user types?
// NOTE: the name is not retained and must be kept live by the caller. Ideally
// it is stored in static read-only memory in the binary.
//
// WARNING: this function is not thread-safe and should only be used at startup
// to register the types. Do not call this while any refs may be alive.
IREE_API_EXPORT iree_status_t
iree_vm_ref_register_type(iree_vm_ref_type_descriptor_t* descriptor);

// Returns the type name for the given type, if found.
IREE_API_EXPORT iree_string_view_t
iree_vm_ref_type_name(iree_vm_ref_type_t type);

// Returns the registered type descriptor with the given |full_name|, if found.
IREE_API_EXPORT const iree_vm_ref_type_descriptor_t*
iree_vm_ref_lookup_registered_type(iree_string_view_t full_name);

// Wraps a raw pointer in an iree_vm_ref_t reference and assigns it to
// |out_ref|. |out_ref| will be released if it already contains a reference.
// The target object will not be retained and must come in with a count >= 1.
//
// Usage (C):
//  my_type_t* my_type = (my_type_t*)malloc(sizeof(my_type_t));
//  my_type->ref_object.counter = IREE_ATOMIC_VAR_INIT(1);
//  iree_vm_ref_t my_ref;
//  iree_vm_ref_wrap_assign(my_type, IREE_VM_REF_TYPE_MY_TYPE, &my_ref);
//  iree_vm_ref_release(&my_ref);
//
// Usage (C++):
//  iree_vm_ref_t my_ref;
//  iree_vm_ref_wrap_assign(new MyType(), IREE_VM_REF_TYPE_MY_TYPE, &my_ref);
//  iree_vm_ref_release(&my_ref);
IREE_API_EXPORT iree_status_t iree_vm_ref_wrap_assign(void* ptr,
                                                      iree_vm_ref_type_t type,
                                                      iree_vm_ref_t* out_ref);

// Wraps a raw pointer in an iree_vm_ref_t reference and retains it in
// |out_ref|. |out_ref| will be released if it already contains a reference.
IREE_API_EXPORT iree_status_t iree_vm_ref_wrap_retain(void* ptr,
                                                      iree_vm_ref_type_t type,
                                                      iree_vm_ref_t* out_ref);
+
+// Checks that the given reference-counted pointer |ref| is of |type|.
+static inline iree_status_t iree_vm_ref_check(const iree_vm_ref_t ref,
+                                              iree_vm_ref_type_t type) {
+  return IREE_LIKELY(ref.type == type)
+             ? iree_ok_status()
+             : iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                ref.type == IREE_VM_REF_TYPE_NULL
+                                    ? "ref is null"
+                                    : "ref type mismatch");
+}
+
// Retains the reference-counted pointer |ref|.
// |out_ref| will be released if it already contains a reference.
IREE_API_EXPORT void iree_vm_ref_retain(iree_vm_ref_t* ref,
                                        iree_vm_ref_t* out_ref);

// Retains the reference-counted pointer |ref| and checks that it is of |type|.
// |out_ref| will be released if it already contains a reference.
IREE_API_EXPORT iree_status_t iree_vm_ref_retain_checked(
    iree_vm_ref_t* ref, iree_vm_ref_type_t type, iree_vm_ref_t* out_ref);

// Retains or moves |ref| to |out_ref|. When |is_move| is non-zero ownership
// transfers and |ref| is reset to null.
// |out_ref| will be released if it already contains a reference.
IREE_API_EXPORT void iree_vm_ref_retain_or_move(int is_move, iree_vm_ref_t* ref,
                                                iree_vm_ref_t* out_ref);

// Retains or moves |ref| to |out_ref| and checks that |ref| is of |type|.
// |out_ref| will be released if it already contains a reference.
IREE_API_EXPORT iree_status_t iree_vm_ref_retain_or_move_checked(
    int is_move, iree_vm_ref_t* ref, iree_vm_ref_type_t type,
    iree_vm_ref_t* out_ref);

// Releases the reference-counted pointer |ref|, possibly freeing it.
IREE_API_EXPORT void iree_vm_ref_release(iree_vm_ref_t* ref);

// Assigns the reference-counted pointer |ref| without incrementing the count.
// |out_ref| will be released if it already contains a reference.
IREE_API_EXPORT void iree_vm_ref_assign(iree_vm_ref_t* ref,
                                        iree_vm_ref_t* out_ref);

// Moves one reference to another without changing the reference count.
// |out_ref| will be released if it already contains a reference.
IREE_API_EXPORT void iree_vm_ref_move(iree_vm_ref_t* ref,
                                      iree_vm_ref_t* out_ref);

// Returns true if the given |ref| is NULL.
IREE_API_EXPORT bool iree_vm_ref_is_null(iree_vm_ref_t* ref);

// Returns true if the two references point at the same value (or are both
// null).
IREE_API_EXPORT bool iree_vm_ref_equal(iree_vm_ref_t* lhs, iree_vm_ref_t* rhs);

#ifdef __cplusplus
}  // extern "C"
#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Type adapter utilities for interfacing with the VM
+//===----------------------------------------------------------------------===//
+
#ifdef __cplusplus
namespace iree {
namespace vm {
// Maps a C++ type T to its registered iree_vm_ref_type_descriptor_t.
// Specialized per type by IREE_VM_DECLARE_CC_TYPE_LOOKUP below.
template <typename T>
struct ref_type_descriptor {
  static const iree_vm_ref_type_descriptor_t* get();
};
}  // namespace vm
}  // namespace iree

// Specializes ref_type_descriptor<T> to forward to name##_get_descriptor().
#define IREE_VM_DECLARE_CC_TYPE_LOOKUP(name, T)         \
  namespace iree {                                      \
  namespace vm {                                        \
  template <>                                           \
  struct ref_type_descriptor<T> {                       \
    static const iree_vm_ref_type_descriptor_t* get() { \
      return name##_get_descriptor();                   \
    }                                                   \
  };                                                    \
  }                                                     \
  }

// Populates and registers |descriptor| for the C++ |type|. Must be expanded
// inside a function returning iree_status_t (uses IREE_RETURN_IF_ERROR).
#define IREE_VM_REGISTER_CC_TYPE(type, name, descriptor)  \
  descriptor.type_name = iree_make_cstring_view(name);    \
  descriptor.offsetof_counter = type::offsetof_counter(); \
  descriptor.destroy = type::DirectDestroy;               \
  IREE_RETURN_IF_ERROR(iree_vm_ref_register_type(&descriptor));
#else
#define IREE_VM_DECLARE_CC_TYPE_LOOKUP(name, T)
#define IREE_VM_REGISTER_CC_TYPE(type, name, descriptor)
#endif  // __cplusplus
+
// TODO(benvanik): make these macros standard/document them.
// Declares the typed helper API for a ref type |name| wrapping C type |T|:
// name##_retain_ref, name##_move_ref, name##_deref, name##_check_deref,
// name##_isa, name##_type_id, and name##_get_descriptor.
#define IREE_VM_DECLARE_TYPE_ADAPTERS(name, T)                              \
  IREE_API_EXPORT iree_vm_ref_t name##_retain_ref(T* value);                \
  IREE_API_EXPORT iree_vm_ref_t name##_move_ref(T* value);                  \
  IREE_API_EXPORT T* name##_deref(const iree_vm_ref_t ref);                 \
  IREE_API_EXPORT iree_status_t name##_check_deref(const iree_vm_ref_t ref, \
                                                   T** out_ptr);            \
  IREE_API_EXPORT const iree_vm_ref_type_descriptor_t*                      \
      name##_get_descriptor();                                              \
  static inline bool name##_isa(const iree_vm_ref_t ref) {                  \
    return name##_get_descriptor()->type == ref.type;                       \
  }                                                                         \
  IREE_API_EXPORT iree_vm_ref_type_t name##_type_id();                      \
  IREE_VM_DECLARE_CC_TYPE_LOOKUP(name, T)

// TODO(benvanik): make these macros standard/document them.
// Defines the helpers declared above.
// NOTE(review): the expansion references a `name##_descriptor` variable that
// must be defined by the implementing translation unit.
#define IREE_VM_DEFINE_TYPE_ADAPTERS(name, T)                               \
  IREE_API_EXPORT iree_vm_ref_t name##_retain_ref(T* value) {               \
    iree_vm_ref_t ref = {0};                                                \
    iree_vm_ref_wrap_retain(value, name##_descriptor.type, &ref);           \
    return ref;                                                             \
  }                                                                         \
  IREE_API_EXPORT iree_vm_ref_t name##_move_ref(T* value) {                 \
    iree_vm_ref_t ref = {0};                                                \
    iree_vm_ref_wrap_assign(value, name##_descriptor.type, &ref);           \
    return ref;                                                             \
  }                                                                         \
  IREE_API_EXPORT T* name##_deref(const iree_vm_ref_t ref) {                \
    iree_status_t status = iree_vm_ref_check(ref, name##_descriptor.type);  \
    if (IREE_UNLIKELY(!iree_status_is_ok(status))) {                        \
      IREE_IGNORE_ERROR(status);                                            \
      return NULL;                                                          \
    }                                                                       \
    return (T*)ref.ptr;                                                     \
  }                                                                         \
  IREE_API_EXPORT iree_status_t name##_check_deref(const iree_vm_ref_t ref, \
                                                   T** out_ptr) {           \
    IREE_RETURN_IF_ERROR(iree_vm_ref_check(ref, name##_descriptor.type));   \
    *out_ptr = (T*)ref.ptr;                                                 \
    return iree_ok_status();                                                \
  }                                                                         \
  IREE_API_EXPORT const iree_vm_ref_type_descriptor_t*                      \
      name##_get_descriptor() {                                             \
    return &name##_descriptor;                                              \
  }                                                                         \
  IREE_API_EXPORT iree_vm_ref_type_t name##_type_id() {                     \
    return name##_descriptor.type;                                          \
  }
+
+#endif  // IREE_VM_REF_H_
diff --git a/runtime/src/iree/vm/ref_cc.h b/runtime/src/iree/vm/ref_cc.h
new file mode 100644
index 0000000..ac5fe02
--- /dev/null
+++ b/runtime/src/iree/vm/ref_cc.h
@@ -0,0 +1,466 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_REF_CC_H_
+#define IREE_VM_REF_CC_H_
+
+#include <atomic>
+#include <memory>
+#include <utility>
+
+#include "iree/base/api.h"
+#include "iree/base/attributes.h"
+#include "iree/vm/ref.h"
+
+#ifndef __cplusplus
+#error "This header is meant for use with C++ implementations."
+#endif  // __cplusplus
+
+namespace iree {
+namespace vm {
+
+//===----------------------------------------------------------------------===//
+// iree::vm::RefObject C++ base type equivalent of iree_vm_ref_t
+//===----------------------------------------------------------------------===//
+
+// TODO(benvanik): make this automatic for most types, or use type lookup.
+// This could be done with SFINAE to detect iree_vm_ref_object_t or RefObject
+// types. We may still need the iree_vm_ref_type_t exposed but that's relatively
+// simple compared to getting the typed retain/release functions.
+
+// Users may override this with their custom types to allow the packing code to
+// access their registered type ID at runtime.
+template <typename T>
+IREE_ATTRIBUTE_ALWAYS_INLINE void ref_type_retain(T* p) {
+  iree_vm_ref_object_retain(p, ref_type_descriptor<T>::get());
+}
+
+template <typename T>
+IREE_ATTRIBUTE_ALWAYS_INLINE void ref_type_release(T* p) {
+  iree_vm_ref_object_release(p, ref_type_descriptor<T>::get());
+}
+
+// Base class for reference counted objects.
+// Reference counted objects should be used with the iree::vm::ref<T> pointer
+// type. As reference counting can be tricky always prefer to use unique_ptr and
+// avoid this type. Only use this when unique_ptr is not possible, such as
+// when round-tripping objects through marshaling boundaries (v8/Java) or
+// any objects that may have their lifetime tied to a garbage collected
+// object.
+//
+// Subclasses should protect their dtor so that reference counting must
+// be used.
+//
+// This is designed to avoid the need for extra vtable space or for adding
+// methods to the vtable of subclasses. This differs from the boost Pointable
+// version of this object.
+// Inspiration for this comes from Peter Weinert's Dr. Dobb's article:
+// http://www.drdobbs.com/cpp/a-base-class-for-intrusively-reference-c/229218807
+//
+// RefObjects are thread safe and may be used with iree::vm::ref<T>s from
+// multiple threads.
+//
+// Subclasses may implement a custom Delete operator to handle their
+// deallocation. It should be thread safe as it may be called from any thread.
+//
+// Usage:
+//   class MyRefObject : public RefObject<MyRefObject> {
+//    public:
+//     MyRefObject() = default;
+//     // Optional; can be used to return to pool/etc - must be public:
+//     static void Delete(MyRefObject* ptr) {
+//       ::operator delete(ptr);
+//     }
+//   };
+template <class T>
+class RefObject {
+  static_assert(!std::is_array<T>::value, "T must not be an array");
+
+  // value is true if a static Delete(T*) function is present.
+  struct has_custom_deleter {
+    template <typename C>
+    static auto Test(C* p) -> decltype(C::Delete(nullptr), std::true_type());
+    template <typename>
+    static std::false_type Test(...);
+    static constexpr bool value =
+        std::is_same<std::true_type, decltype(Test<T>(nullptr))>::value;
+  };
+
+  template <typename V, bool has_custom_deleter>
+  struct delete_thunk {
+    static void Delete(V* p) {
+      auto ref_obj = static_cast<RefObject<V>*>(p);
+      int previous_count = ref_obj->counter_.fetch_sub(1);
+      if (previous_count == 1) {
+        // We delete type T pointer here to avoid the need for a virtual dtor.
+        V::Delete(p);
+      }
+    }
+    static void Destroy(V* p) { V::Delete(p); }
+  };
+
+  template <typename V>
+  struct delete_thunk<V, false> {
+    static void Delete(V* p) {
+      auto ref_obj = static_cast<RefObject<V>*>(p);
+      int previous_count = ref_obj->counter_.fetch_sub(1);
+      if (previous_count == 1) {
+        // We delete type T pointer here to avoid the need for a virtual dtor.
+        delete p;
+      }
+    }
+    static void Destroy(V* p) { delete p; }
+  };
+
+ public:
+  // Adds a reference; used by ref_ptr.
+  friend void ref_ptr_add_ref(T* p) {
+    auto ref_obj = static_cast<RefObject*>(p);
+    ++ref_obj->counter_;
+  }
+
+  // Releases a reference, potentially deleting the object; used by ref_ptr.
+  friend void ref_ptr_release_ref(T* p) {
+    delete_thunk<T, has_custom_deleter::value>::Delete(p);
+  }
+
+  // Deletes the object (precondition: ref count is zero).
+  friend void ref_ptr_destroy_ref(T* p) {
+    delete_thunk<T, has_custom_deleter::value>::Destroy(p);
+  }
+
+  // Deletes the object (precondition: ref count is zero).
+  static void DirectDestroy(void* p) {
+    ref_ptr_destroy_ref(reinterpret_cast<T*>(p));
+  }
+
+  // Adds a reference.
+  // ref_ptr should be used instead of this in most cases. This is required
+  // for when interoperating with marshaling APIs.
+  void AddReference() { ref_ptr_add_ref(static_cast<T*>(this)); }
+
+  // Releases a reference, potentially deleting the object.
+  // ref_ptr should be used instead of this in most cases. This is required
+  // for when interoperating with marshaling APIs.
+  void ReleaseReference() { ref_ptr_release_ref(static_cast<T*>(this)); }
+
+  // Returns the offset of the reference counter field from the start of the
+  // type T.
+  //
+  // This is generally unsafe to use and is here for support of the
+  // iree_vm_ref_t glue that allows RefObject-derived types to be round-tripped
+  // through the VM.
+  //
+  // For simple POD types or non-virtual classes we expect this to return 0.
+  // If the type has virtual methods (dtors/etc) then it should be 4 or 8
+  // (depending on pointer width). It may be other things, and instead of too
+  // much crazy magic we just rely on offsetof doing the right thing here.
+  static constexpr size_t offsetof_counter() { return offsetof(T, counter_); }
+
+ protected:
+  RefObject() { ref_ptr_add_ref(static_cast<T*>(this)); }
+  RefObject(const RefObject&) = default;
+  RefObject& operator=(const RefObject&) { return *this; }
+
+  // TODO(benvanik): replace this with just iree_vm_ref_object_t.
+  // That would allow us to remove a lot of these methods and reuse the C ones.
+  std::atomic<int32_t> counter_{0};
+};
+
+//===----------------------------------------------------------------------===//
+// iree::vm::ref<T> RAII equivalent of iree_vm_ref_t
+//===----------------------------------------------------------------------===//
+
+// Reference counted pointer container wrapping iree_vm_ref_t.
+// This is modeled on boost::intrusive_ptr in that it requires no
+// extra storage over the pointer type and should compile to almost
+// no additional code. It also allows us to round-trip object pointers
+// through regular pointers, which is critical when having to round-trip
+// them through JNI/etc where we can't use things like unique_ptr/shared_ptr.
+//
+// The ref wrapper calls the iree_vm_ref_* functions and uses the
+// iree_vm_ref_type_descriptor_t registered for the type T to manipulate the
+// reference counter and, when needed, destroy the object using
+// iree_vm_ref_destroy_t. Any iree_vm_ref_t can be used interchangably with
+// ref<T> when RAII is needed.
+//
+// Example:
+//   ref<Foo> p1(new Foo());    // ref count 1
+//   ref<Foo> p2(p1);           // ref count 2
+//   p1.reset();                // ref count 1
+//   p2.reset();                // ref count 0, deleted
+//
+// When round-tripping the pointer through external APIs, use release():
+//   ref<Foo> p1(new Foo());    // ref count 1
+//   Foo* raw_p = p1.release(); // ref count 1
+//   // pass to API
+//   ref<Foo> p2(raw_p);        // ref count 1 (don't add ref)
+//   p2.reset();                // ref count 0, deleted
+//
+// See the boost intrusive_ptr docs for details of behavior:
+// http://www.boost.org/doc/libs/1_55_0/libs/smart_ptr/intrusive_ptr.html
+//
+// The retain_ref and assign_ref helpers can be used to make it easier to
+// declare and use ref types:
+//   ref<Foo> p = assign_ref(new Foo());  // ref count 1
+//   PassRefWithRetain(retain_ref(p));
+//   PassRefWithMove(std::move(p));       // ala unique_ptr/shared_ptr
+//
+// ref manages the target objects in a thread-safe way, though you'll want
+// to take care with objects that may have pinned threads for deallocation. If
+// you release the last reference to an object on a thread other than what it
+// was expecting you're gonna have a bad time.
+//
+// Compatible only with types that implement the following methods:
+//   ref_type_retain(T*)
+//   ref_type_release(T*)
+//   ref_type_descriptor<T>::get()
+//
+// If you get link errors pertaining to ref_type_descriptor then ensure that you
+// have included the header file containing the IREE_VM_DECLARE_TYPE_ADAPTERS
+// for the given type.
+//
+// TODO(benvanik): reconcile RefObject, iree_vm_ref_t, and this.
+template <typename T>
+class ref {
+ private:
+  typedef ref this_type;
+  typedef T* this_type::*unspecified_bool_type;
+
+ public:
+  IREE_ATTRIBUTE_ALWAYS_INLINE iree_vm_ref_type_t type() const noexcept {
+    return ref_type_descriptor<T>::get()->type;
+  }
+
+  IREE_ATTRIBUTE_ALWAYS_INLINE ref() noexcept
+      : ref_({
+            0,
+            ref_type_descriptor<T>::get()->offsetof_counter,
+            ref_type_descriptor<T>::get()->type,
+        }) {}
+  IREE_ATTRIBUTE_ALWAYS_INLINE ref(std::nullptr_t) noexcept  // NOLINT
+      : ref_({
+            0,
+            ref_type_descriptor<T>::get()->offsetof_counter,
+            ref_type_descriptor<T>::get()->type,
+        }) {}
+  IREE_ATTRIBUTE_ALWAYS_INLINE ref(T* p) noexcept  // NOLINT
+      : ref_({
+            p,
+            ref_type_descriptor<T>::get()->offsetof_counter,
+            ref_type_descriptor<T>::get()->type,
+        }) {}
+  IREE_ATTRIBUTE_ALWAYS_INLINE ~ref() noexcept { ref_type_release<T>(get()); }
+
+  // Don't use implicit ref copying; use retain_ref instead to make things more
+  // readable. We can't delete the ctor (or, I couldn't find a way not to)
+  // because the templated parameter packing magic needs it.
+  ref(const ref& rhs) noexcept : ref_(rhs.ref_) { ref_type_retain<T>(get()); }
+  ref& operator=(const ref&) noexcept = delete;
+
+  // Move support to transfer ownership from one ref to another.
+  ref(ref&& rhs) noexcept : ref_(rhs.ref_) { rhs.release(); }
+  ref& operator=(ref&& rhs) noexcept {
+    if (get() != rhs.get()) {
+      ref_type_release<T>(get());
+      ref_ = rhs.ref_;
+      rhs.release();
+    }
+    return *this;
+  }
+
+  // Move support from another compatible type.
+  template <typename U>
+  ref(ref<U>&& rhs) noexcept {  // NOLINT
+    ref_.ptr = static_cast<T*>(rhs.release());
+    ref_.offsetof_counter = rhs.ref_.offsetof_counter;
+    ref_.type = rhs.ref_.type;
+  }
+  template <typename U>
+  ref& operator=(ref<U>&& rhs) noexcept {
+    if (get() != rhs.get()) {
+      ref_type_release<T>(get());
+      ref_.ptr = static_cast<T*>(rhs.release());
+    }
+    return *this;
+  }
+
+  // Resets the object to nullptr and decrements the reference count, possibly
+  // deleting it.
+  void reset() noexcept {
+    ref_type_release<T>(get());
+    ref_.ptr = nullptr;
+  }
+
+  // Releases a pointer.
+  // Returns the current pointer held by this object without having
+  // its reference count decremented and resets the ref to empty.
+  // Returns nullptr if the ref holds no value.
+  // To re-wrap in a ref use either ref<T>(value) or assign().
+  IREE_ATTRIBUTE_ALWAYS_INLINE T* release() noexcept {
+    T* p = get();
+    ref_.ptr = nullptr;
+    return p;
+  }
+
+  // Assigns a pointer.
+  // The pointer will be accepted by the ref and its reference count will
+  // not be incremented.
+  IREE_ATTRIBUTE_ALWAYS_INLINE void assign(T* value) noexcept {
+    reset();
+    ref_.ptr = value;
+  }
+
+  // Gets the pointer referenced by this instance.
+  // operator* and operator-> will assert() if there is no current object.
+  constexpr T* get() const noexcept { return reinterpret_cast<T*>(ref_.ptr); }
+  constexpr T& operator*() const noexcept { return *get(); }
+  constexpr T* operator->() const noexcept { return get(); }
+
+  // Returns a pointer to the inner pointer storage.
+  // This allows passing a pointer to the ref as an output argument to C-style
+  // creation functions.
+  constexpr T** operator&() noexcept {  // NOLINT
+    return reinterpret_cast<T**>(&ref_.ptr);
+  }
+
+  // Support boolean expression evaluation ala unique_ptr/shared_ptr:
+  // https://en.cppreference.com/w/cpp/memory/shared_ptr/operator_bool
+  constexpr operator unspecified_bool_type() const noexcept {  // NOLINT
+    return get() ? reinterpret_cast<unspecified_bool_type>(&this_type::ref_.ptr)
+                 : nullptr;
+  }
+  // Supports unary expression evaluation.
+  constexpr bool operator!() const noexcept { return !get(); }
+
+  // Swap support.
+  void swap(ref& rhs) { std::swap(ref_.ptr, rhs.ref_.ptr); }
+
+  // Allows directly passing the ref to a C-API function for creation.
+  // Example:
+  //    iree::vm::ref<my_type_t> value;
+  //    my_type_create(..., &value);
+  constexpr operator iree_vm_ref_t*() const noexcept {  // NOLINT
+    return &ref_;
+  }
+
+ private:
+  mutable iree_vm_ref_t ref_;
+};
+
+// Adds a reference to the given ref and returns the same ref.
+//
+// Usage:
+//  ref<MyType> a = AcquireRefFromSomewhere();
+//  ref<MyType> b = retain_ref(a);  // ref count + 1
+//  retain_ref(b);  // ref count + 1
+template <typename T>
+inline ref<T> retain_ref(const ref<T>& value) {
+  ref_type_retain<T>(value.get());
+  return ref<T>(value.get());
+}
+
+// Adds a reference to the given raw pointer and returns it wrapped in a ref.
+//
+// Usage:
+//  MyType* raw_ptr = AcquirePointerFromSomewhere();
+//  ref<MyType> p = retain_ref(raw_ptr);  // ref count + 1
+template <typename T>
+inline ref<T> retain_ref(T* value) {
+  ref_type_retain<T>(value);
+  return ref<T>(value);
+}
+
+// Assigns a raw pointer to a ref without adding a reference.
+//
+// Usage:
+//  ref<MyType> p = assign_ref(new MyType());  // ref count untouched
+template <typename T>
+inline ref<T> assign_ref(T* value) {
+  return ref<T>(value);
+}
+
+template <class T, class U>
+inline bool operator==(ref<T> const& a, ref<U> const& b) {
+  return a.get() == b.get();
+}
+
+template <class T, class U>
+inline bool operator!=(ref<T> const& a, ref<U> const& b) {
+  return a.get() != b.get();
+}
+
+template <class T, class U>
+inline bool operator==(ref<T> const& a, U* b) {
+  return a.get() == b;
+}
+
+template <class T, class U>
+inline bool operator!=(ref<T> const& a, U* b) {
+  return a.get() != b;
+}
+
+template <class T, class U>
+inline bool operator==(T* a, ref<U> const& b) {
+  return a == b.get();
+}
+
+template <class T, class U>
+inline bool operator!=(T* a, ref<U> const& b) {
+  return a != b.get();
+}
+
+template <class T>
+inline bool operator<(ref<T> const& a, ref<T> const& b) {
+  return a.get() < b.get();
+}
+
+// Swaps the pointers of two refs.
+template <class T>
+void swap(ref<T>& lhs, ref<T>& rhs) {
+  lhs.swap(rhs);
+}
+
+//===----------------------------------------------------------------------===//
+// iree::opaque_ref utility for type-erased ref values
+//===----------------------------------------------------------------------===//
+
+// An opaque reference that does not make any assertions about the type of the
+// ref contained. This can be used to accept arbitrary ref objects that are then
+// dynamically handled based on type.
+class opaque_ref {
+ public:
+  opaque_ref() = default;
+  opaque_ref(const opaque_ref&) = delete;
+  opaque_ref& operator=(const opaque_ref&) = delete;
+  opaque_ref(opaque_ref&& rhs) noexcept {
+    iree_vm_ref_move(&rhs.value_, &value_);
+  }
+  opaque_ref& operator=(opaque_ref&& rhs) noexcept {
+    iree_vm_ref_move(&rhs.value_, &value_);
+    return *this;
+  }
+  ~opaque_ref() { iree_vm_ref_release(&value_); }
+
+  constexpr iree_vm_ref_t* get() const noexcept { return &value_; }
+  constexpr operator iree_vm_ref_t*() const noexcept {  // NOLINT
+    return &value_;
+  }
+  constexpr bool operator!() const noexcept { return !value_.ptr; }
+
+  // Returns a pointer to the inner pointer storage.
+  // This allows passing a pointer to the ref as an output argument to C-style
+  // creation functions.
+  constexpr iree_vm_ref_t* operator&() noexcept { return &value_; }  // NOLINT
+
+ private:
+  mutable iree_vm_ref_t value_ = {0};
+};
+
+}  // namespace vm
+}  // namespace iree
+
+#endif  // IREE_VM_REF_CC_H_
diff --git a/runtime/src/iree/vm/ref_test.cc b/runtime/src/iree/vm/ref_test.cc
new file mode 100644
index 0000000..d709d18
--- /dev/null
+++ b/runtime/src/iree/vm/ref_test.cc
@@ -0,0 +1,452 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/ref.h"
+
+#include <cstddef>
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+#include "iree/vm/ref_cc.h"
+
+namespace {
+
+class A : public iree::vm::RefObject<A> {
+ public:
+  static iree_vm_ref_type_t kTypeID;
+
+  int data() const { return data_; }
+
+ private:
+  int data_ = 1;
+};
+iree_vm_ref_type_t A::kTypeID = IREE_VM_REF_TYPE_NULL;
+
+class B : public iree::vm::RefObject<B> {
+ public:
+  static iree_vm_ref_type_t kTypeID;
+
+  int data() const { return data_; }
+
+ private:
+  int data_ = 2;
+};
+iree_vm_ref_type_t B::kTypeID = IREE_VM_REF_TYPE_NULL;
+
+struct ref_object_c_t {
+  iree_vm_ref_object_t ref_object = {1};
+  int data = 1;
+};
+
+template <typename T>
+static iree_vm_ref_t MakeRef(const char* type_name) {
+  // Safe to do multiple times, so we do it to ensure the tests don't care what
+  // order they run in/don't need to preregister types.
+  static iree_vm_ref_type_descriptor_t descriptor = {0};
+  if (descriptor.type == IREE_VM_REF_TYPE_NULL) {
+    descriptor.type_name = iree_make_cstring_view(type_name);
+    descriptor.offsetof_counter = T::offsetof_counter();
+    descriptor.destroy = T::DirectDestroy;
+    IREE_CHECK_OK(iree_vm_ref_register_type(&descriptor));
+    T::kTypeID = descriptor.type;
+  }
+
+  iree_vm_ref_t ref = {0};
+  IREE_CHECK_OK(iree_vm_ref_wrap_assign(new T(), T::kTypeID, &ref));
+  return ref;
+}
+
+static int32_t ReadCounter(iree_vm_ref_t* ref) {
+  return iree_atomic_load_int32(
+      (iree_atomic_ref_count_t*)(((uintptr_t)ref->ptr) + ref->offsetof_counter),
+      iree_memory_order_seq_cst);
+}
+
+static iree_vm_ref_type_t kCTypeID = IREE_VM_REF_TYPE_NULL;
+static void RegisterTypeC() {
+  static iree_vm_ref_type_descriptor_t descriptor = {0};
+  if (descriptor.type == IREE_VM_REF_TYPE_NULL) {
+    descriptor.type_name = iree_make_cstring_view("CType");
+    descriptor.offsetof_counter = offsetof(ref_object_c_t, ref_object.counter);
+    descriptor.destroy =
+        +[](void* ptr) { delete reinterpret_cast<ref_object_c_t*>(ptr); };
+    IREE_CHECK_OK(iree_vm_ref_register_type(&descriptor));
+    kCTypeID = descriptor.type;
+  }
+}
+
+// Tests type registration and lookup.
+TEST(VMRefTest, TypeRegistration) {
+  RegisterTypeC();
+  ASSERT_NE(nullptr, iree_vm_ref_lookup_registered_type(
+                         iree_make_cstring_view("CType")));
+  ASSERT_EQ(nullptr, iree_vm_ref_lookup_registered_type(
+                         iree_make_cstring_view("asodjfaoisdjfaoisdfj")));
+}
+
+// Tests wrapping a simple C struct.
+TEST(VMRefTest, WrappingCStruct) {
+  RegisterTypeC();
+  iree_vm_ref_t ref = {0};
+  IREE_EXPECT_OK(iree_vm_ref_wrap_assign(new ref_object_c_t(), kCTypeID, &ref));
+  EXPECT_EQ(1, ReadCounter(&ref));
+  iree_vm_ref_release(&ref);
+}
+
+// Tests wrapping a C++ RefObject with a vtable.
+TEST(VMRefTest, WrappingSubclassedRefObject) {
+  struct BaseType : public iree::vm::RefObject<BaseType> {
+    virtual ~BaseType() = default;
+    virtual int DoSomething() = 0;
+  };
+  static int allocated_derived_types = 0;
+  struct DerivedType : public BaseType {
+    DerivedType() { ++allocated_derived_types; }
+    ~DerivedType() override { --allocated_derived_types; }
+    int DoSomething() override { return 123 + allocated_derived_types; }
+  };
+
+  static iree_vm_ref_type_descriptor_t descriptor;
+  descriptor.type_name = iree_make_cstring_view("BaseType");
+  descriptor.offsetof_counter = BaseType::offsetof_counter();
+  descriptor.destroy = BaseType::DirectDestroy;
+  IREE_ASSERT_OK(iree_vm_ref_register_type(&descriptor));
+
+  allocated_derived_types = 0;
+
+  iree_vm_ref_t ref = {0};
+  IREE_EXPECT_OK(
+      iree_vm_ref_wrap_assign(new DerivedType(), descriptor.type, &ref));
+  EXPECT_EQ(1, ReadCounter(&ref));
+  EXPECT_EQ(1, allocated_derived_types);
+
+  EXPECT_EQ(123 + 1, reinterpret_cast<BaseType*>(ref.ptr)->DoSomething());
+
+  iree_vm_ref_release(&ref);
+  EXPECT_EQ(0, allocated_derived_types);
+}
+
+// Tests that wrapping a type that has not been registered fails.
+TEST(VMRefTest, WrappingRequiresTypeRegistration) {
+  iree_vm_ref_t ref = {0};
+  int dummy = 0;
+  iree_status_t status = iree_vm_ref_wrap_assign(
+      &dummy, static_cast<iree_vm_ref_type_t>(1234), &ref);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_INVALID_ARGUMENT, status);
+  iree_status_free(status);
+}
+
+// Tests that wrapping releases any existing ref in out_ref.
+TEST(VMRefTest, WrappingReleasesExisting) {
+  RegisterTypeC();
+  iree_vm_ref_t ref = {0};
+  iree_vm_ref_wrap_assign(new ref_object_c_t(), kCTypeID, &ref);
+  EXPECT_EQ(1, ReadCounter(&ref));
+  iree_vm_ref_release(&ref);
+}
+
+// Checking null refs is fine.
+TEST(VMRefTest, CheckNull) {
+  iree_vm_ref_t null_ref = {0};
+  IREE_EXPECT_OK(iree_vm_ref_check(null_ref, IREE_VM_REF_TYPE_NULL));
+  iree_status_t status =
+      iree_vm_ref_check(null_ref, static_cast<iree_vm_ref_type_t>(1234));
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_INVALID_ARGUMENT, status);
+  iree_status_free(status);
+}
+
+// Tests type checks.
+TEST(VMRefTest, Check) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  IREE_EXPECT_OK(iree_vm_ref_check(a_ref, A::kTypeID));
+  iree_status_t status = iree_vm_ref_check(a_ref, B::kTypeID);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_INVALID_ARGUMENT, status);
+  iree_status_free(status);
+  iree_vm_ref_release(&a_ref);
+}
+
+// Tests retaining a null ref does nothing.
+TEST(VMRefTest, RetainNull) {
+  iree_vm_ref_t null_ref_0 = {0};
+  iree_vm_ref_t null_ref_1 = {0};
+  iree_vm_ref_retain(&null_ref_0, &null_ref_1);
+}
+
+// Tests that retaining into itself is a no-op.
+TEST(VMRefTest, RetainIntoSelf) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  EXPECT_EQ(1, ReadCounter(&a_ref));
+  iree_vm_ref_retain(&a_ref, &a_ref);
+  EXPECT_EQ(1, ReadCounter(&a_ref));
+  iree_vm_ref_release(&a_ref);
+}
+
+// Tests that retaining into out_ref releases the existing contents.
+TEST(VMRefTest, RetainReleasesExisting) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  iree_vm_ref_t b_ref = MakeRef<B>("BType");
+  iree_vm_ref_retain(&a_ref, &b_ref);
+  EXPECT_EQ(1, iree_vm_ref_equal(&a_ref, &b_ref));
+  EXPECT_EQ(2, ReadCounter(&a_ref));
+  iree_vm_ref_release(&a_ref);
+  iree_vm_ref_release(&b_ref);
+}
+
+// Tests that null refs are always fine.
+TEST(VMRefTest, RetainCheckedNull) {
+  iree_vm_ref_t null_ref_0 = {0};
+  iree_vm_ref_t null_ref_1 = {0};
+  IREE_EXPECT_OK(
+      iree_vm_ref_retain_checked(&null_ref_0, A::kTypeID, &null_ref_1));
+}
+
+// Tests that types are verified and retains fail if types don't match.
+TEST(VMRefTest, RetainChecked) {
+  iree_vm_ref_t a_ref_0 = MakeRef<A>("AType");
+  iree_vm_ref_t a_ref_1 = {0};
+  IREE_EXPECT_OK(iree_vm_ref_retain_checked(&a_ref_0, A::kTypeID, &a_ref_1));
+  iree_vm_ref_release(&a_ref_0);
+  iree_vm_ref_release(&a_ref_1);
+}
+
+// Tests that working with null refs is fine.
+TEST(VMRefTest, RetainOrMoveNull) {
+  iree_vm_ref_t null_ref_0 = {0};
+  iree_vm_ref_t null_ref_1 = {0};
+  iree_vm_ref_retain_or_move(/*is_move=*/0, &null_ref_0, &null_ref_1);
+  iree_vm_ref_retain_or_move(/*is_move=*/1, &null_ref_0, &null_ref_1);
+}
+
+// Tests that is_move=false increments the ref count.
+TEST(VMRefTest, RetainOrMoveRetaining) {
+  iree_vm_ref_t a_ref_0 = MakeRef<A>("AType");
+  iree_vm_ref_t a_ref_1 = {0};
+  iree_vm_ref_retain_or_move(/*is_move=*/0, &a_ref_0, &a_ref_1);
+  EXPECT_EQ(1, iree_vm_ref_equal(&a_ref_0, &a_ref_1));
+  EXPECT_EQ(2, ReadCounter(&a_ref_0));
+  iree_vm_ref_release(&a_ref_0);
+  iree_vm_ref_release(&a_ref_1);
+}
+
+// Tests that is_move=true does not increment the ref count.
+TEST(VMRefTest, RetainOrMoveMoving) {
+  iree_vm_ref_t a_ref_0 = MakeRef<A>("AType");
+  iree_vm_ref_t a_ref_1 = {0};
+  iree_vm_ref_retain_or_move(/*is_move=*/1, &a_ref_0, &a_ref_1);
+  IREE_EXPECT_OK(iree_vm_ref_check(a_ref_0, IREE_VM_REF_TYPE_NULL));
+  iree_vm_ref_release(&a_ref_1);
+}
+
+// Tests that retaining into itself just increments the ref count.
+TEST(VMRefTest, RetainOrMoveRetainingIntoSelf) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  EXPECT_EQ(1, ReadCounter(&a_ref));
+  iree_vm_ref_retain_or_move(/*is_move=*/0, &a_ref, &a_ref);
+  EXPECT_EQ(1, ReadCounter(&a_ref));
+  iree_vm_ref_release(&a_ref);
+}
+
+// Tests that moving into itself is a no-op.
+TEST(VMRefTest, RetainOrMoveMovingIntoSelf) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  iree_vm_ref_retain_or_move(/*is_move=*/1, &a_ref, &a_ref);
+  IREE_EXPECT_OK(iree_vm_ref_check(a_ref, A::kTypeID));
+  iree_vm_ref_release(&a_ref);
+}
+
+// Tests that retaining into out_ref releases the existing contents.
+TEST(VMRefTest, RetainOrMoveRetainingReleasesExisting) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  iree_vm_ref_t b_ref = MakeRef<B>("BType");
+  iree_vm_ref_retain_or_move(/*is_move=*/0, &a_ref, &b_ref);
+  EXPECT_EQ(1, iree_vm_ref_equal(&a_ref, &b_ref));
+  EXPECT_EQ(2, ReadCounter(&a_ref));
+  iree_vm_ref_release(&a_ref);
+  iree_vm_ref_release(&b_ref);
+}
+
+// Tests that moving into out_ref releases the existing contents.
+TEST(VMRefTest, RetainOrMoveMovingReleasesExisting) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  iree_vm_ref_t b_ref = MakeRef<B>("BType");
+  iree_vm_ref_retain_or_move(/*is_move=*/1, &a_ref, &b_ref);
+  EXPECT_EQ(0, iree_vm_ref_equal(&a_ref, &b_ref));
+  EXPECT_EQ(1, ReadCounter(&b_ref));
+  iree_vm_ref_release(&b_ref);
+}
+
+// Tests that null refs are always fine.
+TEST(VMRefTest, RetainOrMoveCheckedNull) {
+  iree_vm_ref_t null_ref_0 = {0};
+  iree_vm_ref_t null_ref_1 = {0};
+  IREE_EXPECT_OK(iree_vm_ref_retain_or_move_checked(
+      /*is_move=*/0, &null_ref_0, A::kTypeID, &null_ref_1));
+  IREE_EXPECT_OK(iree_vm_ref_retain_or_move_checked(
+      /*is_move=*/1, &null_ref_0, A::kTypeID, &null_ref_1));
+}
+
+// Tests that retains/moves work when types match.
+TEST(VMRefTest, RetainOrMoveCheckedMatch) {
+  // Retain.
+  iree_vm_ref_t a_ref_0 = MakeRef<A>("AType");
+  iree_vm_ref_t a_ref_1 = {0};
+  IREE_EXPECT_OK(iree_vm_ref_retain_or_move_checked(
+      /*is_move=*/0, &a_ref_0, A::kTypeID, &a_ref_1));
+  EXPECT_EQ(1, iree_vm_ref_equal(&a_ref_0, &a_ref_1));
+  EXPECT_EQ(2, ReadCounter(&a_ref_0));
+  iree_vm_ref_release(&a_ref_0);
+  iree_vm_ref_release(&a_ref_1);
+
+  // Move.
+  iree_vm_ref_t b_ref_0 = MakeRef<B>("BType");
+  iree_vm_ref_t b_ref_1 = {0};
+  IREE_EXPECT_OK(iree_vm_ref_retain_or_move_checked(
+      /*is_move=*/1, &b_ref_0, B::kTypeID, &b_ref_1));
+  EXPECT_EQ(0, iree_vm_ref_equal(&b_ref_0, &b_ref_1));
+  EXPECT_EQ(1, ReadCounter(&b_ref_1));
+  iree_vm_ref_release(&b_ref_1);
+}
+
+// Tests that types are verified and retains/moves fail if types don't match.
+TEST(VMRefTest, RetainOrMoveCheckedMismatch) {
+  // Retain.
+  iree_vm_ref_t a_ref_0 = MakeRef<A>("AType");
+  iree_vm_ref_t a_ref_1 = {0};
+  iree_status_t status = iree_vm_ref_retain_or_move_checked(
+      /*is_move=*/0, &a_ref_0, B::kTypeID, &a_ref_1);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_INVALID_ARGUMENT, status);
+  iree_status_free(status);
+  EXPECT_EQ(0, iree_vm_ref_equal(&a_ref_0, &a_ref_1));
+  EXPECT_EQ(1, ReadCounter(&a_ref_0));
+  iree_vm_ref_release(&a_ref_0);
+
+  // Move.
+  iree_vm_ref_t b_ref_0 = MakeRef<B>("BType");
+  iree_vm_ref_t b_ref_1 = {0};
+  status = iree_vm_ref_retain_or_move_checked(
+      /*is_move=*/1, &b_ref_0, A::kTypeID, &b_ref_1);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_INVALID_ARGUMENT, status);
+  iree_status_free(status);
+  EXPECT_EQ(1, ReadCounter(&b_ref_0));
+  iree_vm_ref_release(&b_ref_0);
+}
+
+// Tests that existing references are released when being overwritten.
+TEST(VMRefTest, RetainOrMoveCheckedReleasesExistingNull) {
+  iree_vm_ref_t null_ref = {0};
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  IREE_EXPECT_OK(iree_vm_ref_retain_or_move_checked(
+      /*is_move=*/0, &null_ref, A::kTypeID, &a_ref));
+}
+
+// Tests that existing references are released when being overwritten.
+TEST(VMRefTest, RetainOrMoveCheckedReleasesExisting) {
+  iree_vm_ref_t a_ref_0 = MakeRef<A>("AType");
+  iree_vm_ref_t a_ref_1 = MakeRef<A>("AType");
+  IREE_EXPECT_OK(iree_vm_ref_retain_or_move_checked(
+      /*is_move=*/1, &a_ref_0, A::kTypeID, &a_ref_1));
+  iree_vm_ref_release(&a_ref_1);
+}
+
+// Checks that assigning null refs is fine.
+TEST(VMRefTest, AssignNull) {
+  iree_vm_ref_t null_ref_0 = {0};
+  iree_vm_ref_t null_ref_1 = {0};
+  iree_vm_ref_assign(&null_ref_0, &null_ref_1);
+}
+
+// Tests that assigning does not reset the source ref nor inc the ref count.
+TEST(VMRefTest, Assign) {
+  iree_vm_ref_t a_ref_0 = MakeRef<A>("AType");
+  iree_vm_ref_t a_ref_1 = {0};
+  iree_vm_ref_assign(&a_ref_0, &a_ref_1);
+  EXPECT_EQ(1, iree_vm_ref_equal(&a_ref_0, &a_ref_1));
+  EXPECT_EQ(1, ReadCounter(&a_ref_0));
+  iree_vm_ref_release(&a_ref_0);
+}
+
+// Tests that assigning into itself is a no-op.
+TEST(VMRefTest, AssignSelf) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  iree_vm_ref_assign(&a_ref, &a_ref);
+  EXPECT_EQ(1, ReadCounter(&a_ref));
+  iree_vm_ref_release(&a_ref);
+}
+
+// Tests that assigning into out_ref releases the existing contents.
+TEST(VMRefTest, AssignReleasesExisting) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  iree_vm_ref_t b_ref = MakeRef<B>("BType");
+  iree_vm_ref_assign(&a_ref, &b_ref);
+  EXPECT_EQ(1, iree_vm_ref_equal(&a_ref, &b_ref));
+  EXPECT_EQ(1, ReadCounter(&a_ref));
+  iree_vm_ref_release(&a_ref);
+  // NOTE: do not release b - it was just assigned!
+}
+
+// Checks that moving null refs is fine.
+TEST(VMRefTest, MovingNull) {
+  iree_vm_ref_t null_ref_0 = {0};
+  iree_vm_ref_t null_ref_1 = {0};
+  iree_vm_ref_move(&null_ref_0, &null_ref_1);
+}
+
+// Tests that moving resets the source ref.
+TEST(VMRefTest, MovingResetsSource) {
+  iree_vm_ref_t a_ref_0 = MakeRef<A>("AType");
+  iree_vm_ref_t a_ref_1 = {0};
+  iree_vm_ref_move(&a_ref_0, &a_ref_1);
+  IREE_EXPECT_OK(iree_vm_ref_check(a_ref_0, IREE_VM_REF_TYPE_NULL));
+  iree_vm_ref_release(&a_ref_1);
+}
+
+// Tests that moving into itself is a no-op.
+TEST(VMRefTest, MovingIntoSelf) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  iree_vm_ref_move(&a_ref, &a_ref);
+  IREE_EXPECT_OK(iree_vm_ref_check(a_ref, A::kTypeID));
+  iree_vm_ref_release(&a_ref);
+}
+
+// Tests that moving into out_ref releases the existing contents.
+TEST(VMRefTest, MovingReleasesExisting) {
+  iree_vm_ref_t a_ref_0 = MakeRef<A>("AType");
+  iree_vm_ref_t a_ref_1 = MakeRef<A>("AType");
+  iree_vm_ref_move(&a_ref_0, &a_ref_1);
+  iree_vm_ref_release(&a_ref_1);
+}
+
+// Null references should always be equal.
+TEST(VMRefTest, EqualityNull) {
+  iree_vm_ref_t null_ref_0 = {0};
+  iree_vm_ref_t null_ref_1 = {0};
+  EXPECT_EQ(1, iree_vm_ref_equal(&null_ref_0, &null_ref_0));
+  EXPECT_EQ(1, iree_vm_ref_equal(&null_ref_0, &null_ref_1));
+  EXPECT_EQ(1, iree_vm_ref_equal(&null_ref_1, &null_ref_0));
+}
+
+// Tests comparing with self and against null.
+TEST(VMRefTest, EqualitySelfOrNull) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  iree_vm_ref_t null_ref = {0};
+  EXPECT_EQ(1, iree_vm_ref_equal(&a_ref, &a_ref));
+  EXPECT_EQ(0, iree_vm_ref_equal(&a_ref, &null_ref));
+  EXPECT_EQ(0, iree_vm_ref_equal(&null_ref, &a_ref));
+  iree_vm_ref_release(&a_ref);
+}
+
+// Tests comparing between different types.
+TEST(VMRefTest, EqualityDifferentTypes) {
+  iree_vm_ref_t a_ref = MakeRef<A>("AType");
+  iree_vm_ref_t b_ref = MakeRef<B>("BType");
+  EXPECT_EQ(0, iree_vm_ref_equal(&a_ref, &b_ref));
+  EXPECT_EQ(0, iree_vm_ref_equal(&b_ref, &a_ref));
+  iree_vm_ref_release(&b_ref);
+  iree_vm_ref_release(&a_ref);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/vm/shims.c b/runtime/src/iree/vm/shims.c
new file mode 100644
index 0000000..7d79fa1
--- /dev/null
+++ b/runtime/src/iree/vm/shims.c
@@ -0,0 +1,52 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/shims.h"
+
+// Emits one iree_vm_shim_<args>_<rets> marshaling function per signature pair.
+// Each letter encodes an operand type (i=i32, f=f32, r=ref) and a C...D span
+// appears to bracket a variadic segment (see the VLA structs in shims.h).
+// NOTE(review): keep this list in sync with the IREE_VM_ABI_DECLARE_SHIM list
+// in shims.h.
+IREE_VM_ABI_DEFINE_SHIM(irii, v);
+IREE_VM_ABI_DEFINE_SHIM(r, i);
+IREE_VM_ABI_DEFINE_SHIM(r, ii);
+IREE_VM_ABI_DEFINE_SHIM(r, iii);
+IREE_VM_ABI_DEFINE_SHIM(r, iiii);
+IREE_VM_ABI_DEFINE_SHIM(r, r);
+IREE_VM_ABI_DEFINE_SHIM(r, v);
+IREE_VM_ABI_DEFINE_SHIM(rCiD, i);
+IREE_VM_ABI_DEFINE_SHIM(rCrD, v);
+IREE_VM_ABI_DEFINE_SHIM(ri, i);
+IREE_VM_ABI_DEFINE_SHIM(ri, f);
+IREE_VM_ABI_DEFINE_SHIM(ri, r);
+IREE_VM_ABI_DEFINE_SHIM(ri, v);
+IREE_VM_ABI_DEFINE_SHIM(riCiD, r);
+IREE_VM_ABI_DEFINE_SHIM(riiCiD, r);
+IREE_VM_ABI_DEFINE_SHIM(riCiiD, r);
+IREE_VM_ABI_DEFINE_SHIM(riCrD, r);
+IREE_VM_ABI_DEFINE_SHIM(rii, i);
+IREE_VM_ABI_DEFINE_SHIM(rii, r);
+IREE_VM_ABI_DEFINE_SHIM(rii, v);
+IREE_VM_ABI_DEFINE_SHIM(rif, v);
+IREE_VM_ABI_DEFINE_SHIM(riii, r);
+IREE_VM_ABI_DEFINE_SHIM(riii, v);
+IREE_VM_ABI_DEFINE_SHIM(riirii, r);
+IREE_VM_ABI_DEFINE_SHIM(riiirii, r);
+IREE_VM_ABI_DEFINE_SHIM(rrrrCrD, r);
+IREE_VM_ABI_DEFINE_SHIM(ririi, v);
+IREE_VM_ABI_DEFINE_SHIM(rr, i);
+IREE_VM_ABI_DEFINE_SHIM(rr, r);
+IREE_VM_ABI_DEFINE_SHIM(rr, v);
+IREE_VM_ABI_DEFINE_SHIM(rr, ii);
+IREE_VM_ABI_DEFINE_SHIM(rrr, ii);
+IREE_VM_ABI_DEFINE_SHIM(rrCiriiD, r);
+IREE_VM_ABI_DEFINE_SHIM(rriCiD, v);
+IREE_VM_ABI_DEFINE_SHIM(rriiCiD, v);
+IREE_VM_ABI_DEFINE_SHIM(rriCiriiD, v);
+IREE_VM_ABI_DEFINE_SHIM(rriiii, v);
+IREE_VM_ABI_DEFINE_SHIM(rrirCiD, v);
+IREE_VM_ABI_DEFINE_SHIM(rriri, v);
+IREE_VM_ABI_DEFINE_SHIM(rririi, v);
+IREE_VM_ABI_DEFINE_SHIM(rrriii, v);
+IREE_VM_ABI_DEFINE_SHIM(v, i);
+IREE_VM_ABI_DEFINE_SHIM(v, r);
+IREE_VM_ABI_DEFINE_SHIM(v, v);
diff --git a/runtime/src/iree/vm/shims.h b/runtime/src/iree/vm/shims.h
new file mode 100644
index 0000000..0524452
--- /dev/null
+++ b/runtime/src/iree/vm/shims.h
@@ -0,0 +1,453 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_SHIMS_H_
+#define IREE_VM_SHIMS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/attributes.h"
+#include "iree/base/target_platform.h"
+#include "iree/vm/module.h"
+#include "iree/vm/ref.h"
+#include "iree/vm/stack.h"
+#include "iree/vm/value.h"
+
+//===----------------------------------------------------------------------===//
+// Argument/result struct utilities
+//===----------------------------------------------------------------------===//
+
+// Expands to the typedef name of the packed struct for a signature string,
+// e.g. IREE_VM_ABI_TYPE_NAME(ri) -> iree_vm_abi_ri_t.
+#define IREE_VM_ABI_TYPE_NAME(types) iree_vm_abi_##types##_t
+
+// Declares a packed fixed-size args/results struct along with its
+// checked_deref/reset helpers (see IREE_VM_ABI_FIXED_STRUCT_IMPL).
+#define IREE_VM_ABI_FIXED_STRUCT(types, body) \
+  IREE_VM_ABI_FIXED_STRUCT_IMPL(types, IREE_VM_ABI_TYPE_NAME(types), body)
+
+// Declares a packed struct ending in a flexible |vla_field| array whose
+// element count is carried in the |vla_count| member.
+#define IREE_VM_ABI_VLA_STRUCT(types, vla_count, vla_field, body) \
+  IREE_VM_ABI_VLA_STRUCT_IMPL(types, vla_count, vla_field,        \
+                              IREE_VM_ABI_TYPE_NAME(types), body)
+
+// Defines iree_vm_abi_<types>_checked_deref, returning |buffer| reinterpreted
+// as the struct only when the buffer length matches the struct size exactly
+// (NULL otherwise), plus iree_vm_abi_<types>_reset zeroing the struct.
+#define IREE_VM_ABI_FIXED_STRUCT_IMPL(types, struct_type, body)        \
+  typedef struct iree_vm_abi_##types##_t body IREE_ATTRIBUTE_PACKED    \
+      struct_type;                                                     \
+  static inline struct_type* iree_vm_abi_##types##_checked_deref(      \
+      iree_byte_span_t buffer) {                                       \
+    return IREE_LIKELY(buffer.data_length == sizeof(struct_type))      \
+               ? (struct_type*)buffer.data                             \
+               : NULL;                                                 \
+  }                                                                    \
+  static inline void iree_vm_abi_##types##_reset(struct_type* value) { \
+    memset(value, 0, sizeof(struct_type));                             \
+  }
+
+// Size of a struct member without needing a live instance.
+#define IREE_VM_ABI_FIELD_SIZE(type, member) sizeof(((type*)NULL)->member)
+// Defines iree_vm_abi_<types>_checked_deref for a variadic struct: the buffer
+// must be at least the base struct size and exactly base size plus
+// vla_count * element size, otherwise NULL is returned. Note that no reset
+// helper is generated for VLA structs (they are validated, not zeroed).
+#define IREE_VM_ABI_VLA_STRUCT_IMPL(types, vla_count, vla_field, struct_type, \
+                                    body)                                     \
+  typedef struct iree_vm_abi_##types##_t body IREE_ATTRIBUTE_PACKED           \
+      struct_type;                                                            \
+  static inline struct_type* iree_vm_abi_##types##_checked_deref(             \
+      iree_byte_span_t buffer) {                                              \
+    return IREE_LIKELY(buffer.data_length >= sizeof(struct_type)) &&          \
+                   IREE_LIKELY(                                               \
+                       buffer.data_length ==                                  \
+                       sizeof(struct_type) +                                  \
+                           ((const struct_type*)buffer.data)->vla_count *     \
+                               IREE_VM_ABI_FIELD_SIZE(struct_type,            \
+                                                      vla_field[0]))          \
+               ? (struct_type*)buffer.data                                    \
+               : NULL;                                                        \
+  }
+
+//===----------------------------------------------------------------------===//
+// Shim function declaration/definition and accessor utilities
+//===----------------------------------------------------------------------===//
+
+// Function pointer type for native module targets invoked through the typed
+// shims: |args| and |rets| point at the packed iree_vm_abi_*_t structs the
+// shim has already validated; |module|/|module_state| carry the target's
+// self pointer and per-context state.
+typedef iree_status_t(IREE_API_PTR* iree_vm_native_function_target2_t)(
+    iree_vm_stack_t* IREE_RESTRICT stack, void* IREE_RESTRICT module,
+    void* IREE_RESTRICT module_state, const void* IREE_RESTRICT args,
+    void* IREE_RESTRICT rets);
+
+// Declares the marshaling shim for one (arg_types, ret_types) signature pair;
+// the matching definition is produced by IREE_VM_ABI_DEFINE_SHIM.
+#define IREE_VM_ABI_DECLARE_SHIM(arg_types, ret_types)                         \
+  iree_status_t iree_vm_shim_##arg_types##_##ret_types(                        \
+      iree_vm_stack_t* IREE_RESTRICT stack,                                    \
+      const iree_vm_function_call_t* IREE_RESTRICT call,                       \
+      iree_vm_native_function_target2_t target_fn, void* IREE_RESTRICT module, \
+      void* IREE_RESTRICT module_state,                                        \
+      iree_vm_execution_result_t* IREE_RESTRICT out_result);
+
+// Defines the marshaling shim: dereferences the caller's argument/result
+// spans into the typed packed structs, failing with INVALID_ARGUMENT when
+// either buffer does not match the expected size, zeroes the results struct,
+// then tail-calls |target_fn|. NOTE: |out_result| is not written by the shim
+// itself.
+#define IREE_VM_ABI_DEFINE_SHIM(arg_types, ret_types)                          \
+  iree_status_t iree_vm_shim_##arg_types##_##ret_types(                        \
+      iree_vm_stack_t* IREE_RESTRICT stack,                                    \
+      const iree_vm_function_call_t* IREE_RESTRICT call,                       \
+      iree_vm_native_function_target2_t target_fn, void* IREE_RESTRICT module, \
+      void* IREE_RESTRICT module_state,                                        \
+      iree_vm_execution_result_t* IREE_RESTRICT out_result) {                  \
+    const IREE_VM_ABI_TYPE_NAME(arg_types)* args =                             \
+        iree_vm_abi_##arg_types##_checked_deref(call->arguments);              \
+    IREE_VM_ABI_TYPE_NAME(ret_types)* rets =                                   \
+        iree_vm_abi_##ret_types##_checked_deref(call->results);                \
+    if (IREE_UNLIKELY(!args || !rets)) {                                       \
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,                    \
+                              "argument/result signature mismatch");           \
+    }                                                                          \
+    iree_vm_abi_##ret_types##_reset(rets);                                     \
+    return target_fn(stack, module, module_state, args, rets);                 \
+  }
+
+// Declares the signature of a static module export implementation that
+// receives the typed argument/result structs produced by the matching shim.
+#define IREE_VM_ABI_EXPORT(function_name, module_state, arg_types, ret_types) \
+  static iree_status_t function_name(                                         \
+      iree_vm_stack_t* IREE_RESTRICT stack, void* IREE_RESTRICT module,       \
+      module_state* IREE_RESTRICT state,                                      \
+      IREE_VM_ABI_TYPE_NAME(arg_types) * IREE_RESTRICT args,                  \
+      IREE_VM_ABI_TYPE_NAME(ret_types) * IREE_RESTRICT rets)
+
+// TODO(benvanik): special case when source type and target type match.
+// Copies the variadic |vla_field| segment of |args| (element member .i0) into
+// a stack-allocated (iree_alloca) array of target_type, bounds-checking the
+// count against |max_count| first. NOTE: multi-statement macro that returns
+// from the enclosing function on failure; only use inside a function body
+// returning iree_status_t.
+#define IREE_VM_ABI_VLA_STACK_CAST(args, vla_count, vla_field, target_type, \
+                                   max_count, out_count, out_ptrs)          \
+  *(out_count) = (args)->vla_count;                                         \
+  if (IREE_UNLIKELY((args)->vla_count > (max_count))) {                     \
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE, "count %u > %u",      \
+                            (args)->vla_count, (uint32_t)(max_count));      \
+  }                                                                         \
+  *(out_ptrs) =                                                             \
+      (target_type*)iree_alloca((args)->vla_count * sizeof(target_type));   \
+  for (iree_host_size_t i = 0; i < (args)->vla_count; ++i) {                \
+    (*(out_ptrs))[i] = (target_type)((args)->vla_field[i].i0);              \
+  }
+
+// Like IREE_VM_ABI_VLA_STACK_CAST but for ref segments: each element's .r0 is
+// check-dereferenced into a stack-allocated array of ref_type pointers.
+// NOTE: multi-statement macro that returns from the enclosing function on
+// failure (count out of range or a failed deref).
+#define IREE_VM_ABI_VLA_STACK_DEREF(args, vla_count, vla_field, ref_type,     \
+                                    max_count, out_count, out_ptrs)           \
+  *(out_count) = (args)->vla_count;                                           \
+  if (IREE_UNLIKELY((args)->vla_count > (max_count))) {                       \
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,                         \
+                            "count %u of " #ref_type " > %u",                 \
+                            (args)->vla_count, (uint32_t)(max_count));        \
+  }                                                                           \
+  *(out_ptrs) =                                                               \
+      (ref_type##_t**)iree_alloca((args)->vla_count * sizeof(ref_type##_t*)); \
+  for (iree_host_size_t i = 0; i < (args)->vla_count; ++i) {                  \
+    IREE_RETURN_IF_ERROR(                                                     \
+        ref_type##_check_deref((args)->vla_field[i].r0, &(*(out_ptrs))[i]));  \
+  }
+
+// Heap variant of IREE_VM_ABI_VLA_STACK_DEREF: the pointer array is allocated
+// from |host_allocator| instead of the host stack, for counts too large or
+// long-lived to iree_alloca. On success the caller owns *|out_ptrs| and must
+// free it with |host_allocator|.
+// NOTE: multi-statement macro that returns from the enclosing function on
+// failure; only use inside a function body returning iree_status_t.
+// NOTE(review): if a deref fails the allocation escapes with the error path —
+// callers are responsible for any cleanup convention; confirm at call sites.
+#define IREE_VM_ABI_VLA_HEAP_DEREF(args, vla_count, vla_field, ref_type,      \
+                                   host_allocator, out_count, out_ptrs)       \
+  *(out_count) = (args)->vla_count;                                           \
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(                                 \
+      (host_allocator), (args)->vla_count * sizeof(ref_type##_t*),            \
+      (void**)(out_ptrs)));                                                   \
+  for (iree_host_size_t i = 0; i < (args)->vla_count; ++i) {                  \
+    IREE_RETURN_IF_ERROR(                                                     \
+        ref_type##_check_deref((args)->vla_field[i].r0, &(*(out_ptrs))[i]));  \
+  }
+
+//===----------------------------------------------------------------------===//
+// Structures used for arguments and results.
+//===----------------------------------------------------------------------===//
+
+// Structs below are packed (no padding) so their sizes match the serialized
+// argument/result buffers the shims validate against.
+#if defined(IREE_COMPILER_MSVC)
+#pragma pack(push, 1)
+#endif  // IREE_COMPILER_MSVC
+
+// Special case for void (empty args/rets) as C structs can't have a 0 length.
+// checked_deref accepts any buffer as-is and reset is a no-op since there is
+// no meaningful payload.
+typedef struct iree_vm_abi_v_t {
+  int unused;
+} iree_vm_abi_v_t;
+static inline iree_vm_abi_v_t* iree_vm_abi_v_checked_deref(
+    iree_byte_span_t buffer) {
+  return (iree_vm_abi_v_t*)buffer.data;
+}
+static inline void iree_vm_abi_v_reset(iree_vm_abi_v_t* value) {}
+
+// Fixed-size argument/result structs. Field names encode position and type:
+// i<N> = int32_t, f<N> = float, r<N> = iree_vm_ref_t at ordinal N of the
+// signature string that names the struct.
+IREE_VM_ABI_FIXED_STRUCT(i, { int32_t i0; });
+
+IREE_VM_ABI_FIXED_STRUCT(f, { float f0; });
+
+IREE_VM_ABI_FIXED_STRUCT(ii, {
+  int32_t i0;
+  int32_t i1;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(iii, {
+  int32_t i0;
+  int32_t i1;
+  int32_t i2;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(iiii, {
+  int32_t i0;
+  int32_t i1;
+  int32_t i2;
+  int32_t i3;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(irii, {
+  int32_t i0;
+  iree_vm_ref_t r1;
+  int32_t i2;
+  int32_t i3;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(r, { iree_vm_ref_t r0; });
+
+IREE_VM_ABI_FIXED_STRUCT(rr, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(rrr, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  iree_vm_ref_t r2;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(ri, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(ririi, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  iree_vm_ref_t r2;
+  int32_t i3;
+  int32_t i4;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(rii, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  int32_t i2;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(rif, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  float f2;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(riii, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  int32_t i2;
+  int32_t i3;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(riirii, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  int32_t i2;
+  iree_vm_ref_t r3;
+  int32_t i4;
+  int32_t i5;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(riiirii, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  int32_t i2;
+  int32_t i3;
+  iree_vm_ref_t r4;
+  int32_t i5;
+  int32_t i6;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(rriiii, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  int32_t i2;
+  int32_t i3;
+  int32_t i4;
+  int32_t i5;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(rriri, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  int32_t i2;
+  iree_vm_ref_t r3;
+  int32_t i4;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(rririi, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  int32_t i2;
+  iree_vm_ref_t r3;
+  int32_t i4;
+  int32_t i5;
+});
+
+IREE_VM_ABI_FIXED_STRUCT(rrriii, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  iree_vm_ref_t r2;
+  int32_t i3;
+  int32_t i4;
+  int32_t i5;
+});
+
+// Variadic argument structs: a C...D span in the signature becomes a trailing
+// aN_count member plus a zero-length flexible array aN of the element struct.
+// checked_deref (see IREE_VM_ABI_VLA_STRUCT_IMPL) validates that the buffer
+// covers exactly aN_count elements.
+IREE_VM_ABI_VLA_STRUCT(rCiD, a1_count, a1, {
+  iree_vm_ref_t r0;
+  iree_vm_size_t a1_count;
+  iree_vm_abi_i_t a1[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(rCrD, a1_count, a1, {
+  iree_vm_ref_t r0;
+  iree_vm_size_t a1_count;
+  iree_vm_abi_r_t a1[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(riCiD, a2_count, a2, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  iree_vm_size_t a2_count;
+  iree_vm_abi_i_t a2[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(riiCiD, a3_count, a3, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  int32_t i2;
+  iree_vm_size_t a3_count;
+  iree_vm_abi_i_t a3[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(rriiCiD, a4_count, a4, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  int32_t i2;
+  int32_t i3;
+  iree_vm_size_t a4_count;
+  iree_vm_abi_i_t a4[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(riCrD, a2_count, a2, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  iree_vm_size_t a2_count;
+  iree_vm_abi_r_t a2[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(riiCriD, a3_count, a3, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  int32_t i2;
+  iree_vm_size_t a3_count;
+  iree_vm_abi_ri_t a3[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(rirCrD, a3_count, a3, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  iree_vm_ref_t r2;
+  iree_vm_size_t a3_count;
+  iree_vm_abi_r_t a3[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(rrrrCrD, a4_count, a4, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  iree_vm_ref_t r2;
+  iree_vm_ref_t r3;
+  iree_vm_size_t a4_count;
+  iree_vm_abi_r_t a4[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(rriCiD, a3_count, a3, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  int32_t i2;
+  iree_vm_size_t a3_count;
+  iree_vm_abi_i_t a3[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(rrirCiD, a4_count, a4, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  int32_t i2;
+  iree_vm_ref_t r3;
+  iree_vm_size_t a4_count;
+  iree_vm_abi_i_t a4[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(riCiiD, a2_count, a2, {
+  iree_vm_ref_t r0;
+  int32_t i1;
+  iree_vm_size_t a2_count;
+  iree_vm_abi_ii_t a2[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(rrCiriiD, a2_count, a2, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  iree_vm_size_t a2_count;
+  iree_vm_abi_irii_t a2[0];
+});
+
+IREE_VM_ABI_VLA_STRUCT(rriCiriiD, a3_count, a3, {
+  iree_vm_ref_t r0;
+  iree_vm_ref_t r1;
+  int32_t i2;
+  iree_vm_size_t a3_count;
+  iree_vm_abi_irii_t a3[0];
+});
+
+#if defined(IREE_COMPILER_MSVC)
+#pragma pack(pop)
+#endif  // IREE_COMPILER_MSVC
+
+//===----------------------------------------------------------------------===//
+// Shims for marshaling arguments and results
+//===----------------------------------------------------------------------===//
+
+// Forward declarations for the shims defined in shims.c.
+// NOTE(review): keep this list in sync with the IREE_VM_ABI_DEFINE_SHIM list
+// there.
+IREE_VM_ABI_DECLARE_SHIM(irii, v);
+IREE_VM_ABI_DECLARE_SHIM(r, i);
+IREE_VM_ABI_DECLARE_SHIM(r, ii);
+IREE_VM_ABI_DECLARE_SHIM(r, iii);
+IREE_VM_ABI_DECLARE_SHIM(r, iiii);
+IREE_VM_ABI_DECLARE_SHIM(r, r);
+IREE_VM_ABI_DECLARE_SHIM(r, v);
+IREE_VM_ABI_DECLARE_SHIM(rCiD, i);
+IREE_VM_ABI_DECLARE_SHIM(rCrD, v);
+IREE_VM_ABI_DECLARE_SHIM(ri, i);
+IREE_VM_ABI_DECLARE_SHIM(ri, f);
+IREE_VM_ABI_DECLARE_SHIM(ri, r);
+IREE_VM_ABI_DECLARE_SHIM(ri, v);
+IREE_VM_ABI_DECLARE_SHIM(riCiD, r);
+IREE_VM_ABI_DECLARE_SHIM(riiCiD, r);
+IREE_VM_ABI_DECLARE_SHIM(riCiiD, r);
+IREE_VM_ABI_DECLARE_SHIM(riCrD, r);
+IREE_VM_ABI_DECLARE_SHIM(rii, i);
+IREE_VM_ABI_DECLARE_SHIM(rii, r);
+IREE_VM_ABI_DECLARE_SHIM(rii, v);
+IREE_VM_ABI_DECLARE_SHIM(rif, v);
+IREE_VM_ABI_DECLARE_SHIM(riii, r);
+IREE_VM_ABI_DECLARE_SHIM(riii, v);
+IREE_VM_ABI_DECLARE_SHIM(riirii, r);
+IREE_VM_ABI_DECLARE_SHIM(riiirii, r);
+IREE_VM_ABI_DECLARE_SHIM(rrrrCrD, r);
+IREE_VM_ABI_DECLARE_SHIM(ririi, v);
+IREE_VM_ABI_DECLARE_SHIM(rr, i);
+IREE_VM_ABI_DECLARE_SHIM(rr, r);
+IREE_VM_ABI_DECLARE_SHIM(rr, v);
+IREE_VM_ABI_DECLARE_SHIM(rr, ii);
+IREE_VM_ABI_DECLARE_SHIM(rrr, ii);
+IREE_VM_ABI_DECLARE_SHIM(rrCiriiD, r);
+IREE_VM_ABI_DECLARE_SHIM(rriCiD, v);
+IREE_VM_ABI_DECLARE_SHIM(rriiCiD, v);
+IREE_VM_ABI_DECLARE_SHIM(rriCiriiD, v);
+IREE_VM_ABI_DECLARE_SHIM(rriiii, v);
+IREE_VM_ABI_DECLARE_SHIM(rrirCiD, v);
+IREE_VM_ABI_DECLARE_SHIM(rriri, v);
+IREE_VM_ABI_DECLARE_SHIM(rririi, v);
+IREE_VM_ABI_DECLARE_SHIM(rrriii, v);
+IREE_VM_ABI_DECLARE_SHIM(v, i);
+IREE_VM_ABI_DECLARE_SHIM(v, r);
+IREE_VM_ABI_DECLARE_SHIM(v, v);
+
+#endif  // IREE_VM_SHIMS_H_
diff --git a/runtime/src/iree/vm/shims_emitc.h b/runtime/src/iree/vm/shims_emitc.h
new file mode 100644
index 0000000..76d76d2
--- /dev/null
+++ b/runtime/src/iree/vm/shims_emitc.h
@@ -0,0 +1,29 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_SHIMS_EMITC_H_
+#define IREE_VM_SHIMS_EMITC_H_
+
+#include "iree/base/attributes.h"
+#include "iree/vm/module.h"
+#include "iree/vm/stack.h"
+
+// Target signature used by the EmitC path (per the filename, presumably
+// compiler-generated C modules — confirm): unlike
+// iree_vm_native_function_target2_t it receives the raw call (argument/result
+// spans) rather than pre-marshaled typed structs.
+typedef iree_status_t (*iree_vm_native_function_target_emitc)(
+    iree_vm_stack_t* IREE_RESTRICT stack,
+    iree_vm_function_call_t* IREE_RESTRICT call, void* IREE_RESTRICT module,
+    void* IREE_RESTRICT module_state,
+    iree_vm_execution_result_t* IREE_RESTRICT);
+
+// Pass-through shim: no marshaling is performed — the call is forwarded to
+// |target_fn| unchanged.
+static iree_status_t iree_emitc_shim(
+    iree_vm_stack_t* IREE_RESTRICT stack,
+    /*const*/ iree_vm_function_call_t* IREE_RESTRICT call,
+    iree_vm_native_function_target_emitc target_fn, void* IREE_RESTRICT module,
+    void* IREE_RESTRICT module_state,
+    iree_vm_execution_result_t* IREE_RESTRICT out_result) {
+  return target_fn(stack, call, module, module_state, out_result);
+}
+
+#endif  // IREE_VM_SHIMS_EMITC_H_
diff --git a/runtime/src/iree/vm/stack.c b/runtime/src/iree/vm/stack.c
new file mode 100644
index 0000000..5ea5b9c
--- /dev/null
+++ b/runtime/src/iree/vm/stack.c
@@ -0,0 +1,541 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/stack.h"
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "iree/base/alignment.h"
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/vm/module.h"
+
+// Debug-only assertion macro: expands to assert() in debug builds and to
+// nothing when NDEBUG is defined.
+#ifndef NDEBUG
+#define VMCHECK(expr) assert(expr)
+#else
+#define VMCHECK(expr)
+#endif  // NDEBUG
+
+//===----------------------------------------------------------------------===//
+// Stack implementation
+//===----------------------------------------------------------------------===//
+//
+// The stack is (currently) designed to contain enough information to allow us
+// to build some nice debugging tools. This means that we try hard to preserve
+// all information needed for complete and precise stack dumps as well as
+// allowing inspection of both current and previous stack frame registers.
+// In the future we may want to toggle these modes such that registers, for
+// example, are hidden by the module implementations to allow for more
+// optimization opportunity but as a whole we tradeoff minimal memory
+// consumption for flexibility and debugging. Given that a single activation
+// tensor will usually dwarf the entire size of the stack used for an invocation
+// it's generally acceptable :)
+//
+// Stack frames and storage
+// ------------------------
+// Frames are stored as a linked list of iree_vm_stack_frame_header_t's
+// containing the API-visible stack frame information (such as which function
+// the frame is in and it's program counter) and the storage for registers used
+// by the frame. As all operations including stack dumps only ever need to
+// enumerate the frames in storage order there's no need to be able to randomly
+// index into them and the linked list combined with dynamic stack growth gives
+// us (practically) unlimited stack depth.
+//
+// [iree_vm_stack_t]
+//   +- top -------> [frame 3 header] [registers] ---+
+//                                                   |
+//              +--- [frame 2 header] [registers] <--+
+//              |
+//              +--> [frame 1 header] [registers] ---+
+//                                                   |
+//         NULL <--- [frame 0 header] [registers] <--+
+//
+// To allow for static stack allocation and make allocating the VM stack on the
+// host stack or within an existing data structure the entire stack, including
+// all frame storage, can be placed into an existing allocation. This is similar
+// to inlined vectors/etc where some storage is available directly in the object
+// and only when exceeded will it switch to a dynamic allocation.
+//
+// Dynamic stack growth
+// --------------------
+// Though most of the stacks we deal with are rather shallow due to aggressive
+// inlining in the compiler it's still possible to spill any reasonably-sized
+// static storage allocation. This can be especially true in modules compiled
+// with optimizations disabled; for example the debug register allocator may
+// expand the required register count for a function from 30 to 3000.
+//
+// To support these cases the stack can optionally be provided an allocator to
+// enable it to grow the stack when the initial storage is exhausted. As we
+// store pointers to the stack storage within the storage itself (such as the
+// iree_vm_registers_t pointers) this means we need to perform a fixup step
+// during reallocation to ensure they are all updated. This also means that the
+// pointers to the stack frames are possibly invalidated on every function
+// entry and that users of the stack cannot rely on pointer stability during
+// execution.
+//
+// Calling convention
+// ------------------
+// Callers provide an arguments buffer and results buffer sized appropriately
+// for the call and with the arguments buffer populated. Callees will push
+// their new stack frame, copy or move the arguments from the caller buffer into
+// the callee frame, and then begin execution. Upon return the callee function
+// will move return values to the result buffer and pop their stack frame.
+//
+// By making the actual stack frame setup and teardown callee-controlled we can
+// have optimized implementations that treat register storage differently across
+// various frames. For example, native modules that store their registers in
+// host-machine specific registers can marshal the caller registers in/out of
+// the host registers (or stack/etc) without exposing the actual implementation
+// to the caller.
+//
+// Calling into the VM
+// -------------------
+// Calls from external code into the VM such as via iree_vm_invoke reuse the
+// same calling convention as internal-to-internal calls: callees load arguments
+// from the caller frame and store results into the caller frame.
+//
+// Marshaling arguments is easy given that the caller controls these and we can
+// trivially map the ordered set of argument types into the VM calling
+// convention buffers.
+//
+// A side-effect (beyond code reuse) is that ref types are retained by the VM
+// for the entire lifetime they may be accessible by VM routines. This lets us
+// get rich stack traces without needing to hook into external code and lets us
+// timeshift via coroutines where we may otherwise not know when the external
+// caller will resume a yielded call and actually read back the results.
+//
+// The overhead of this marshaling is minimal as external functions can always
+// use move semantics on the ref objects. Since we are reusing the normal VM
+// code paths which are likely still in instruction cache the bulk of the work
+// amounts to some small memcpys.
+
+// Multiplier on the capacity of the stack frame storage when growing.
+// Since we never shrink stacks it's nice to keep this relative low. If we
+// measure a lot of growth happening in normal models we should increase this
+// but otherwise leave as small as we can to avoid overallocation.
+#define IREE_VM_STACK_GROWTH_FACTOR 2
+
+// A private stack frame header that allows us to walk the linked list of
+// frames without exposing their exact structure through the API. This makes it
+// easier for us to add/version additional information or hide implementation
+// details.
+typedef struct iree_vm_stack_frame_header_t {
+  // Size, in bytes, of the frame header and frame payload including registers.
+  // Adding this value to the base header pointer will yield the next available
+  // memory location. Ensure that it does not exceed the total
+  // frame_storage_capacity.
+  iree_host_size_t frame_size;
+
+  // Pointer to the parent stack frame, usually immediately preceding this one
+  // in the frame storage. May be NULL.
+  struct iree_vm_stack_frame_header_t* parent;
+
+  // Stack frame type used to determine which fields are valid.
+  iree_vm_stack_frame_type_t type;
+
+  // Size, in bytes, of the additional stack frame data that follows the frame.
+  iree_host_size_t data_size;
+
+  // Function called when the stack frame is left.
+  iree_vm_stack_frame_cleanup_fn_t frame_cleanup_fn;
+
+  // Actual stack frame as visible through the API.
+  // The registers within the frame will (likely) point to addresses immediately
+  // following this header in memory.
+  // NOTE(review): appears this must remain the last member so register storage
+  // can directly follow the header — confirm before reordering fields.
+  iree_vm_stack_frame_t frame;
+} iree_vm_stack_frame_header_t;
+
+// Core stack storage. This will be mapped either into dynamic memory allocated
+// by the member allocator or static memory allocated externally. Static stacks
+// cannot grow when storage runs out while dynamic ones will resize their stack.
+struct iree_vm_stack_t {
+  // NOTE: to get better cache hit rates we put the most frequently accessed
+  // members first.
+
+  // Pointer to the current top of the stack.
+  // This can be used to walk the stack from top to bottom by following the
+  // |parent| pointers. Note that these pointers are invalidated each time the
+  // stack grows (if dynamic growth is enabled) and all of the frames will need
+  // updating.
+  iree_vm_stack_frame_header_t* top;
+
+  // Base pointer to stack storage.
+  // For statically-allocated stacks this will (likely) point to immediately
+  // after the iree_vm_stack_t in memory. For dynamically-allocated stacks this
+  // will (likely) point to heap memory.
+  iree_host_size_t frame_storage_capacity;  // total bytes available for frames
+  iree_host_size_t frame_storage_size;      // bytes currently used by frames
+  void* frame_storage;
+
+  // Flags controlling the behavior of the invocation owning this stack.
+  iree_vm_invocation_flags_t flags;
+
+  // True if the stack owns the frame_storage and should free it when it is no
+  // longer required. Host stack-allocated stacks don't own their storage but
+  // may transition to owning it on dynamic growth.
+  bool owns_frame_storage;
+
+  // Resolves a module to a module state within a context.
+  // This will be called on function entry whenever module transitions occur.
+  iree_vm_state_resolver_t state_resolver;
+
+  // Allocator used for dynamic stack allocations. May be the null allocator
+  // if growth is prohibited.
+  iree_allocator_t allocator;
+};
+
+//===----------------------------------------------------------------------===//
+// Stack implementation
+//===----------------------------------------------------------------------===//
+
+// Initializes a stack inside the caller-provided |storage|: the
+// iree_vm_stack_t header is placed at the head of the buffer and the
+// remainder (after 16-byte alignment) becomes frame storage. The stack does
+// not own |storage| initially; ownership is only taken if the stack later
+// grows into replacement storage allocated from |allocator|.
+IREE_API_EXPORT iree_status_t iree_vm_stack_initialize(
+    iree_byte_span_t storage, iree_vm_invocation_flags_t flags,
+    iree_vm_state_resolver_t state_resolver, iree_allocator_t allocator,
+    iree_vm_stack_t** out_stack) {
+  IREE_ASSERT_ARGUMENT(out_stack);
+  *out_stack = NULL;
+  // Reject buffers too small to hold the header plus a useful frame region.
+  if (storage.data_length < IREE_VM_STACK_MIN_SIZE) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "stack storage under minimum required amount: %zu < %d",
+        storage.data_length, IREE_VM_STACK_MIN_SIZE);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // The stack header lives at the head of the provided storage.
+  iree_vm_stack_t* stack = (iree_vm_stack_t*)storage.data;
+  memset(stack, 0, sizeof(iree_vm_stack_t));
+  stack->owns_frame_storage = false;
+  stack->flags = flags;
+  stack->state_resolver = state_resolver;
+  stack->allocator = allocator;
+
+  // Carve the frame storage out of the space remaining after the (aligned)
+  // header.
+  iree_host_size_t storage_offset =
+      iree_host_align(sizeof(iree_vm_stack_t), 16);
+  stack->frame_storage_capacity = storage.data_length - storage_offset;
+  stack->frame_storage_size = 0;
+  stack->frame_storage = storage.data + storage_offset;
+
+  stack->top = NULL;
+
+  *out_stack = stack;
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Tears down the stack: leaves any frames still on the stack (ignoring leave
+// errors) and frees the frame storage only when the stack owns it — i.e. only
+// after a dynamic growth replaced the caller-provided buffer.
+IREE_API_EXPORT void iree_vm_stack_deinitialize(iree_vm_stack_t* stack) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Unwind all remaining frames so their cleanup runs.
+  while (stack->top) {
+    iree_status_ignore(iree_vm_stack_function_leave(stack));
+  }
+
+  if (stack->owns_frame_storage) {
+    iree_allocator_free(stack->allocator, stack->frame_storage);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Heap-allocates IREE_VM_STACK_DEFAULT_SIZE bytes of storage from |allocator|
+// and initializes a stack in-place within it. Free with iree_vm_stack_free.
+IREE_API_EXPORT iree_status_t iree_vm_stack_allocate(
+    iree_vm_invocation_flags_t flags, iree_vm_state_resolver_t state_resolver,
+    iree_allocator_t allocator, iree_vm_stack_t** out_stack) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  *out_stack = NULL;
+
+  iree_host_size_t storage_size = IREE_VM_STACK_DEFAULT_SIZE;
+  void* storage = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(allocator, storage_size, &storage);
+  iree_vm_stack_t* stack = NULL;
+  if (iree_status_is_ok(status)) {
+    iree_byte_span_t storage_span = iree_make_byte_span(storage, storage_size);
+    status = iree_vm_stack_initialize(storage_span, flags, state_resolver,
+                                      allocator, &stack);
+  }
+
+  // NOTE(review): if initialization failed |storage| would leak here;
+  // presumably IREE_VM_STACK_DEFAULT_SIZE >= IREE_VM_STACK_MIN_SIZE makes
+  // that path unreachable — confirm.
+  *out_stack = stack;
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees a stack created with iree_vm_stack_allocate. The stack header sits at
+// the head of the original allocation, so the stack pointer itself is the
+// buffer to free; deinitialization runs first to unwind frames and release
+// any dynamically-grown frame storage.
+IREE_API_EXPORT void iree_vm_stack_free(iree_vm_stack_t* stack) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Capture the allocator before deinitialize as the stack memory is about to
+  // be released.
+  iree_allocator_t allocator = stack->allocator;
+  void* storage = (void*)stack;
+  iree_vm_stack_deinitialize(stack);
+  iree_allocator_free(allocator, storage);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the invocation flags provided when the stack was initialized.
+IREE_API_EXPORT iree_vm_invocation_flags_t
+iree_vm_stack_invocation_flags(const iree_vm_stack_t* stack) {
+  return stack->flags;
+}
+
+// Returns the frame at the top of the stack, or NULL when the stack is empty.
+IREE_API_EXPORT iree_vm_stack_frame_t* iree_vm_stack_current_frame(
+    iree_vm_stack_t* stack) {
+  if (!stack->top) return NULL;
+  return &stack->top->frame;
+}
+
+// Returns the frame below the current top of the stack, or NULL when there
+// are fewer than two frames.
+IREE_API_EXPORT iree_vm_stack_frame_t* iree_vm_stack_parent_frame(
+    iree_vm_stack_t* stack) {
+  iree_vm_stack_frame_header_t* top = stack->top;
+  if (!top || !top->parent) return NULL;
+  return &top->parent->frame;
+}
+
+// Resolves the per-context state for |module| via the state resolver captured
+// at stack initialization time.
+IREE_API_EXPORT iree_status_t iree_vm_stack_query_module_state(
+    iree_vm_stack_t* stack, iree_vm_module_t* module,
+    iree_vm_module_state_t** out_module_state) {
+  return stack->state_resolver.query_module_state(stack->state_resolver.self,
+                                                  module, out_module_state);
+}
+
+// Attempts to grow the stack store to hold at least |minimum_capacity|.
+// Pointers to existing stack frames will be invalidated and any pointers
+// embedded in the stack frame data structures will be updated.
+// Fails if dynamic stack growth is disabled or the allocator is OOM.
+static iree_status_t iree_vm_stack_grow(iree_vm_stack_t* stack,
+                                        iree_host_size_t minimum_capacity) {
+  // A null allocator (no ctl function) indicates growth was disallowed at
+  // initialization time (e.g. fixed host-stack storage).
+  if (IREE_UNLIKELY(stack->allocator.ctl == NULL)) {
+    return iree_make_status(
+        IREE_STATUS_RESOURCE_EXHAUSTED,
+        "stack initialized on the host stack and cannot grow");
+  }
+
+  // Ensure we grow at least as much as required.
+  // Growth is geometric to amortize reallocation cost over many frames.
+  iree_host_size_t new_capacity = stack->frame_storage_capacity;
+  do {
+    new_capacity *= IREE_VM_STACK_GROWTH_FACTOR;
+  } while (new_capacity < minimum_capacity);
+  if (new_capacity > IREE_VM_STACK_MAX_SIZE) {
+    return iree_make_status(
+        IREE_STATUS_RESOURCE_EXHAUSTED,
+        "new stack size would exceed maximum size: %zu > %d", new_capacity,
+        IREE_VM_STACK_MAX_SIZE);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Reallocate the frame storage. 99.9999% chance the new storage pointer will
+  // differ and we'll need to fix up pointers so we just always do that.
+  void* old_storage = stack->frame_storage;
+  void* new_storage = stack->frame_storage;
+  iree_status_t status;
+  if (stack->owns_frame_storage) {
+    // We own the storage already likely from a previous growth operation.
+    status =
+        iree_allocator_realloc(stack->allocator, new_capacity, &new_storage);
+  } else {
+    // We don't own the original storage so we are going to switch to our own
+    // newly-allocated storage instead. We need to make sure we copy over the
+    // existing stack contents.
+    status =
+        iree_allocator_malloc(stack->allocator, new_capacity, &new_storage);
+    if (iree_status_is_ok(status)) {
+      memcpy(new_storage, old_storage, stack->frame_storage_capacity);
+    }
+  }
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+  stack->frame_storage = new_storage;
+  stack->frame_storage_capacity = new_capacity;
+  stack->owns_frame_storage = true;
+
+// Rebases |ptr| from the old storage allocation to the new one, preserving
+// its relative offset. No-op for NULL pointers.
+#define REBASE_POINTER(type, ptr, old_base, new_base)           \
+  if (ptr) {                                                    \
+    (ptr) = (type)(((uintptr_t)(ptr) - (uintptr_t)(old_base)) + \
+                   (uintptr_t)(new_base));                      \
+  }
+
+  // Fixup embedded stack frame pointers.
+  // Walks from the top frame down the parent chain; each parent link is
+  // itself a pointer into the (re)allocated storage and must be rebased
+  // before it can be followed.
+  REBASE_POINTER(iree_vm_stack_frame_header_t*, stack->top, old_storage,
+                 new_storage);
+  iree_vm_stack_frame_header_t* frame_header = stack->top;
+  while (frame_header != NULL) {
+    REBASE_POINTER(iree_vm_stack_frame_header_t*, frame_header->parent,
+                   old_storage, new_storage);
+    frame_header = frame_header->parent;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Pushes a new frame for |function| onto the stack, reserving |frame_size|
+// bytes of callee storage immediately after the frame.
+IREE_API_EXPORT iree_status_t iree_vm_stack_function_enter(
+    iree_vm_stack_t* stack, const iree_vm_function_t* function,
+    iree_vm_stack_frame_type_t frame_type, iree_host_size_t frame_size,
+    iree_vm_stack_frame_cleanup_fn_t frame_cleanup_fn,
+    iree_vm_stack_frame_t** out_callee_frame) {
+  if (out_callee_frame) *out_callee_frame = NULL;
+
+  // Allocate stack space and grow stack, if required.
+  // NOTE: growth may move the storage, invalidating all previously returned
+  // frame pointers (see iree_vm_stack_grow).
+  iree_host_size_t header_size = sizeof(iree_vm_stack_frame_header_t);
+  iree_host_size_t new_top =
+      stack->frame_storage_size + header_size + frame_size;
+  if (IREE_UNLIKELY(new_top > stack->frame_storage_capacity)) {
+    IREE_RETURN_IF_ERROR(iree_vm_stack_grow(stack, new_top));
+  }
+
+  // Try to reuse the same module state if the caller and callee are from the
+  // same module. Otherwise, query the state from the registered handler.
+  iree_vm_stack_frame_header_t* caller_frame_header = stack->top;
+  iree_vm_stack_frame_t* caller_frame =
+      caller_frame_header ? &caller_frame_header->frame : NULL;
+  iree_vm_module_state_t* module_state = NULL;
+  if (caller_frame && caller_frame->function.module == function->module) {
+    module_state = caller_frame->module_state;
+  } else if (function->module != NULL) {
+    IREE_RETURN_IF_ERROR(stack->state_resolver.query_module_state(
+        stack->state_resolver.self, function->module, &module_state));
+  }
+
+  // Bump pointer and get real stack pointer offsets.
+  // The header and trailing frame storage are zeroed so callees always see
+  // deterministic initial contents.
+  iree_vm_stack_frame_header_t* frame_header =
+      (iree_vm_stack_frame_header_t*)((uintptr_t)stack->frame_storage +
+                                      stack->frame_storage_size);
+  memset(frame_header, 0, header_size + frame_size);
+
+  frame_header->frame_size = header_size + frame_size;
+  frame_header->parent = stack->top;
+  frame_header->type = frame_type;
+  frame_header->data_size = frame_size;
+  frame_header->frame_cleanup_fn = frame_cleanup_fn;
+
+  iree_vm_stack_frame_t* callee_frame = &frame_header->frame;
+  callee_frame->function = *function;
+  callee_frame->module_state = module_state;
+  callee_frame->pc = 0;
+  callee_frame->depth = caller_frame ? caller_frame->depth + 1 : 0;
+
+  // Commit: only now does the new frame become visible to stack walks.
+  stack->frame_storage_size = new_top;
+  stack->top = frame_header;
+
+  // Open a tracing zone for non-native frames (traced builds only); closed in
+  // iree_vm_stack_function_leave.
+  IREE_TRACE({
+    if (frame_type != IREE_VM_STACK_FRAME_NATIVE) {
+      // TODO(benvanik): cache source location and query from module.
+      iree_string_view_t function_name = iree_vm_function_name(function);
+      IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(z0, function_name.data,
+                                          function_name.size);
+      callee_frame->trace_zone = z0;
+      if (frame_size) {
+        IREE_TRACE_ZONE_APPEND_VALUE(z0, frame_size);
+      }
+    }
+  });
+
+  if (out_callee_frame) *out_callee_frame = callee_frame;
+  return iree_ok_status();
+}
+
+// Pops the top frame, invoking its optional cleanup function and closing any
+// tracing zone opened on entry. Fails if the stack is already empty.
+IREE_API_EXPORT iree_status_t
+iree_vm_stack_function_leave(iree_vm_stack_t* stack) {
+  if (IREE_UNLIKELY(!stack->top)) {
+    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                            "unbalanced stack leave");
+  }
+
+  // Call (optional) frame storage cleanup function.
+  if (stack->top->frame_cleanup_fn) {
+    stack->top->frame_cleanup_fn(&stack->top->frame);
+  }
+
+  IREE_TRACE({
+    if (stack->top->frame.trace_zone) {
+      IREE_TRACE_ZONE_END(stack->top->frame.trace_zone);
+    }
+  });
+
+  // Restore the frame pointer to the caller.
+  // Storage is reclaimed by simply rewinding the bump pointer.
+  stack->frame_storage_size -= stack->top->frame_size;
+  stack->top = stack->top->parent;
+
+  return iree_ok_status();
+}
+
+// Appends one line per live frame (topmost first) to |builder| in the form:
+//   [depth] type module.function:pc source-location
+IREE_API_EXPORT iree_status_t iree_vm_stack_format_backtrace(
+    iree_vm_stack_t* stack, iree_string_builder_t* builder) {
+  for (iree_vm_stack_frame_header_t* frame = stack->top; frame != NULL;
+       frame = frame->parent) {
+    // Stack frame prefix.
+    const char* type_str;
+    switch (frame->type) {
+      default:
+        type_str = "??";
+        break;
+      case IREE_VM_STACK_FRAME_EXTERNAL:
+        type_str = "external";
+        break;
+      case IREE_VM_STACK_FRAME_NATIVE:
+        type_str = "native";
+        break;
+      case IREE_VM_STACK_FRAME_BYTECODE:
+        type_str = "bytecode";
+        break;
+    }
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+        builder, "\n[%*" PRId32 "] %*s ", 2, frame->frame.depth, 8, type_str));
+
+    // Common module/function name and PC.
+    // Falls back to module@ordinal when the function name is unavailable
+    // (e.g. names stripped from the module).
+    iree_string_view_t module_name =
+        iree_vm_module_name(frame->frame.function.module);
+    iree_string_view_t function_name =
+        iree_vm_function_name(&frame->frame.function);
+    if (iree_string_view_is_empty(function_name)) {
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          builder, "%.*s@%d", (int)module_name.size, module_name.data,
+          (int)frame->frame.function.ordinal));
+    } else {
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+          builder, "%.*s.%.*s", (int)module_name.size, module_name.data,
+          (int)function_name.size, function_name.data));
+    }
+    IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
+        builder, ":%" PRIu64 " ", (uint64_t)frame->frame.pc));
+
+    // Source location is optional; modules reporting UNAVAILABLE get a "-"
+    // placeholder while any other failure aborts formatting.
+    iree_vm_module_t* module = frame->frame.function.module;
+    iree_vm_source_location_t source_location;
+    iree_status_t status = iree_vm_module_resolve_source_location(
+        module, &frame->frame, &source_location);
+    if (iree_status_is_ok(status)) {
+      status = iree_vm_source_location_format(
+          &source_location, IREE_VM_SOURCE_LOCATION_FORMAT_FLAG_NONE, builder);
+    }
+    if (iree_status_is_unavailable(status)) {
+      // TODO(benvanik): if this is an import/export we can get that name.
+      IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "-"));
+    } else if (!iree_status_is_ok(status)) {
+      return status;
+    }
+  }
+  return iree_ok_status();
+}
+
+// Builds a backtrace string for |stack| and attaches it as an annotation on
+// |base_status|, returning the annotated status.
+// NOTE(review): when backtrace formatting fails the formatting status is
+// returned instead and |base_status| is dropped without being freed -- confirm
+// this is intended.
+IREE_API_EXPORT iree_status_t iree_vm_stack_annotate_backtrace(
+    iree_vm_stack_t* stack, iree_status_t base_status) {
+  iree_string_builder_t builder;
+  iree_string_builder_initialize(stack->allocator, &builder);
+  iree_status_t status = iree_vm_stack_format_backtrace(stack, &builder);
+  if (iree_status_is_ok(status)) {
+    // TODO(benvanik): don't duplicate the buffer here - we should be attaching
+    // a payload but that requires additional plumbing.
+    status = iree_status_annotate_f(base_status, "%.*s",
+                                    (int)iree_string_builder_size(&builder),
+                                    iree_string_builder_buffer(&builder));
+  }
+  iree_string_builder_deinitialize(&builder);
+  return status;
+}
diff --git a/runtime/src/iree/vm/stack.h b/runtime/src/iree/vm/stack.h
new file mode 100644
index 0000000..abd5b75
--- /dev/null
+++ b/runtime/src/iree/vm/stack.h
@@ -0,0 +1,248 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_STACK_H_
+#define IREE_VM_STACK_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "iree/base/alignment.h"
+#include "iree/base/api.h"
+#include "iree/base/attributes.h"
+#include "iree/base/string_builder.h"
+#include "iree/base/tracing.h"
+#include "iree/vm/module.h"
+#include "iree/vm/ref.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// A reasonable default stack storage size, in bytes.
+// This will allow most (reasonable) programs to run. If running
+// unverified/untested programs then prefer to use a dynamically growable stack
+// until the expectations of the programs are checked; for example, hopefully
+// in a year or two we have much more complex models with much deeper call
+// stacks and we may want to re-evaluate the host-stack allocation size.
+//
+// The value was chosen to fit quite a few i32 registers and a reasonable amount
+// of ref registers (that are 2 * sizeof(void*)). For many invocations this will
+// be more than enough to perform the work without needing an additional dynamic
+// allocation/resize.
+#define IREE_VM_STACK_DEFAULT_SIZE (8 * 1024)
+
+// The minimum size of VM stack storage.
+// NOTE(review): presumably enforced by iree_vm_stack_initialize -- confirm.
+#define IREE_VM_STACK_MIN_SIZE (1 * 1024)
+
+// The maximum size of VM stack storage; anything larger is probably a bug.
+// Growth beyond this fails with RESOURCE_EXHAUSTED (see iree_vm_stack_grow).
+#define IREE_VM_STACK_MAX_SIZE (1 * 1024 * 1024)
+
+// Bitfield controlling invocation behavior; passed to stack initialization.
+enum iree_vm_invocation_flag_bits_t {
+  // Default invocation behavior.
+  IREE_VM_INVOCATION_FLAG_NONE = 0u,
+
+  // Enables tracing of execution to stderr (when available) for the invocation.
+  // See iree/base/config.h for the flags that control whether this
+  // functionality is available; specifically:
+  //   -DIREE_VM_EXECUTION_TRACING_ENABLE=1
+  IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION = 1u << 0,
+};
+typedef uint32_t iree_vm_invocation_flags_t;
+
+// Discriminates what kind of code a stack frame represents; used for
+// backtrace formatting and to decide tracing behavior on entry.
+typedef enum iree_vm_stack_frame_type_e {
+  // Represents an `[external]` frame that needs to marshal args/results.
+  // These frames have no source location and are tracked so that we know when
+  // transitions occur into/out-of external code.
+  IREE_VM_STACK_FRAME_EXTERNAL = 0,
+  // Represents a `[native]` frame that has no persistent register storage.
+  // These frames may have source location information provided by the
+  // implementation.
+  IREE_VM_STACK_FRAME_NATIVE = 1,
+  // Represents a `[bytecode]` VM frame using internal register storage.
+  IREE_VM_STACK_FRAME_BYTECODE = 2,
+} iree_vm_stack_frame_type_t;
+
+// A single stack frame within the VM.
+//
+// NOTE: to (try to) get better cache hit rates we put the most frequently
+// accessed members **LAST**. This is because the custom frame storage data
+// immediately follows this struct in memory and is highly likely to be touched
+// by the callee immediately and repeatedly.
+typedef struct iree_vm_stack_frame_t {
+  // Function that the stack frame is within.
+  iree_vm_function_t function;
+
+  // Cached module state pointer for the module containing |function|.
+  // This removes the need to lookup the module state when control returns to
+  // the function during continuation or from a return instruction.
+  iree_vm_module_state_t* module_state;
+
+  // Current program counter within the function.
+  // Implementations may treat this offset differently, treating it as a byte
+  // offset (such as in the case of VM bytecode), a block identifier (compiled
+  // code), etc.
+  iree_vm_source_offset_t pc;
+
+  // Depth of the frame within the stack.
+  // As stack frame pointers are not stable this can be used instead to detect
+  // stack enter/leave balance issues.
+  int32_t depth;
+
+  // Tracing zone opened when the frame was entered and closed when it is
+  // left; only present in tracing-enabled builds.
+  IREE_TRACE(iree_zone_id_t trace_zone;)
+} iree_vm_stack_frame_t;
+
+// Returns the implementation-defined frame storage associated with |frame|.
+// The pointer will contain at least as many bytes as requested by frame_size.
+// NOTE: relies on the callee storage being laid out immediately after the
+// frame struct in memory (see iree_vm_stack_function_enter in stack.c).
+static inline void* iree_vm_stack_frame_storage(iree_vm_stack_frame_t* frame) {
+  return (void*)((uintptr_t)frame + sizeof(iree_vm_stack_frame_t));
+}
+
+// Callback for cleaning up stack frame storage before a frame is left or the
+// stack is destroyed.
+typedef void(IREE_API_PTR* iree_vm_stack_frame_cleanup_fn_t)(
+    iree_vm_stack_frame_t* frame);
+
+// A state resolver that can allocate or lookup module state.
+typedef struct iree_vm_state_resolver_t {
+  // Opaque user context passed as the first argument to the callback below.
+  void* self;
+  // Resolves the per-context state for |module| into |out_module_state|.
+  iree_status_t(IREE_API_PTR* query_module_state)(
+      void* state_resolver, iree_vm_module_t* module,
+      iree_vm_module_state_t** out_module_state);
+} iree_vm_state_resolver_t;
+
+// A fiber stack used for storing stack frame state during execution.
+// All required state is stored within the stack and no host thread-local state
+// is used allowing us to execute multiple fibers on the same host thread.
+typedef struct iree_vm_stack_t iree_vm_stack_t;
+
+// Defines and initializes an inline VM stack.
+// The stack will be ready for use and must be deinitialized with
+// iree_vm_stack_deinitialize when no longer required.
+//
+// NOTE: the macro declares |stack| plus two hidden locals
+// (__stack_storage/__stack_storage_span) in the enclosing scope and so may be
+// used at most once per scope.
+//
+// Example:
+//  IREE_VM_INLINE_STACK_INITIALIZE(
+//      stack,
+//      IREE_VM_INVOCATION_FLAG_NONE,
+//      iree_vm_context_state_resolver(context),
+//      iree_allocator_system());
+//  ...
+//  iree_vm_stack_deinitialize(stack);
+#define IREE_VM_INLINE_STACK_INITIALIZE(stack, flags, state_resolver, \
+                                        allocator)                    \
+  uint8_t __stack_storage[IREE_VM_STACK_DEFAULT_SIZE];                \
+  iree_byte_span_t __stack_storage_span =                             \
+      iree_make_byte_span(__stack_storage, sizeof(__stack_storage));  \
+  iree_vm_stack_t* stack = NULL;                                      \
+  IREE_IGNORE_ERROR(iree_vm_stack_initialize(                         \
+      __stack_storage_span, (flags), (state_resolver), (allocator), &stack));
+
+// Initializes a statically-allocated stack in |storage|.
+// The contents of the |storage| can be anything upon initialization and the
+// stack must be deinitialized with iree_vm_stack_deinitialize before the
+// storage is freed. The provided |allocator| is only used for stack growth
+// beyond the initial storage capacity and may be iree_allocator_null() to
+// prevent growth. Use IREE_VM_STACK_DEFAULT_SIZE for a reasonable default or
+// use iree_vm_stack_allocate if the input programs may exceed reason.
+//
+// The provided |state_resolver| will be used to resolve a module to a module
+// state within a context. This will be called on function entry whenever module
+// transitions occur.
+//
+// Example:
+//  uint8_t stack_storage[IREE_VM_STACK_DEFAULT_SIZE];
+//  iree_vm_stack_t* stack = NULL;
+//  iree_vm_stack_initialize(stack_storage, ..., &stack);
+//  ...
+//  iree_vm_stack_deinitialize(stack);
+//  // stack_storage can now be reused/freed/etc
+IREE_API_EXPORT iree_status_t iree_vm_stack_initialize(
+    iree_byte_span_t storage, iree_vm_invocation_flags_t flags,
+    iree_vm_state_resolver_t state_resolver, iree_allocator_t allocator,
+    iree_vm_stack_t** out_stack);
+
+// Deinitializes a statically-allocated |stack| previously initialized with
+// iree_vm_stack_initialize.
+IREE_API_EXPORT void iree_vm_stack_deinitialize(iree_vm_stack_t* stack);
+
+// Allocates a dynamically-growable stack.
+//
+// The provided |state_resolver| will be used to resolve a module to a module
+// state within a context. This will be called on function entry whenever module
+// transitions occur.
+//
+// The stack will be allocated from |allocator| and returned in |out_stack|.
+// It must be freed with iree_vm_stack_free.
+//
+// Example:
+//  iree_vm_stack_t* stack = NULL;
+//  iree_vm_stack_allocate(..., iree_allocator_system(), &stack);
+//  ...
+//  iree_vm_stack_free(stack);
+IREE_API_EXPORT iree_status_t iree_vm_stack_allocate(
+    iree_vm_invocation_flags_t flags, iree_vm_state_resolver_t state_resolver,
+    iree_allocator_t allocator, iree_vm_stack_t** out_stack);
+
+// Frees a dynamically-allocated |stack| from iree_vm_stack_allocate.
+IREE_API_EXPORT void iree_vm_stack_free(iree_vm_stack_t* stack);
+
+// Returns the flags controlling the invocation this stack is used with.
+IREE_API_EXPORT iree_vm_invocation_flags_t
+iree_vm_stack_invocation_flags(const iree_vm_stack_t* stack);
+
+// Returns the current stack frame or nullptr if the stack is empty.
+IREE_API_EXPORT iree_vm_stack_frame_t* iree_vm_stack_current_frame(
+    iree_vm_stack_t* stack);
+
+// Returns the parent stack frame or nullptr if the stack is empty.
+IREE_API_EXPORT iree_vm_stack_frame_t* iree_vm_stack_parent_frame(
+    iree_vm_stack_t* stack);
+
+// Queries the context-specific module state for the given module.
+IREE_API_EXPORT iree_status_t iree_vm_stack_query_module_state(
+    iree_vm_stack_t* stack, iree_vm_module_t* module,
+    iree_vm_module_state_t** out_module_state);
+
+// Enters into the given |function| and returns the callee stack frame.
+// May invalidate any pointers to stack frames and the only pointer that can be
+// assumed valid after return is the one in |out_callee_frame|.
+//
+// |frame_size| can optionally be used to allocate storage within the stack for
+// callee data. |frame_cleanup_fn| will be called when the frame is left either
+// normally via an iree_vm_stack_function_leave call or if an error occurs and
+// the stack needs to be torn down.
+IREE_API_EXPORT iree_status_t iree_vm_stack_function_enter(
+    iree_vm_stack_t* stack, const iree_vm_function_t* function,
+    iree_vm_stack_frame_type_t frame_type, iree_host_size_t frame_size,
+    iree_vm_stack_frame_cleanup_fn_t frame_cleanup_fn,
+    iree_vm_stack_frame_t** out_callee_frame);
+
+// Leaves the current stack frame.
+IREE_API_EXPORT iree_status_t
+iree_vm_stack_function_leave(iree_vm_stack_t* stack);
+
+// Formats a backtrace of the current stack to the given string |builder|.
+IREE_API_EXPORT iree_status_t iree_vm_stack_format_backtrace(
+    iree_vm_stack_t* stack, iree_string_builder_t* builder);
+
+// Annotates |status| with the backtrace of |stack| and returns |base_status|.
+IREE_API_EXPORT IREE_MUST_USE_RESULT iree_status_t
+iree_vm_stack_annotate_backtrace(iree_vm_stack_t* stack,
+                                 iree_status_t base_status);
+
+// Annotates |base_status| with a backtrace when both VM backtraces and status
+// annotations are compiled in; otherwise passes |base_status| through
+// unchanged with zero overhead.
+#if IREE_VM_BACKTRACE_ENABLE && \
+    (IREE_STATUS_FEATURES & IREE_STATUS_FEATURE_ANNOTATIONS)
+#define IREE_VM_STACK_ANNOTATE_BACKTRACE_IF_ENABLED(stack, base_status) \
+  iree_vm_stack_annotate_backtrace(stack, base_status)
+#else
+#define IREE_VM_STACK_ANNOTATE_BACKTRACE_IF_ENABLED(stack, base_status) \
+  (base_status)
+#endif  // IREE_VM_BACKTRACE_ENABLE && IREE_STATUS_FEATURE_ANNOTATIONS
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_STACK_H_
diff --git a/runtime/src/iree/vm/stack_test.cc b/runtime/src/iree/vm/stack_test.cc
new file mode 100644
index 0000000..80303df
--- /dev/null
+++ b/runtime/src/iree/vm/stack_test.cc
@@ -0,0 +1,202 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/vm/stack.h"
+
+#include "iree/base/api.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+// Sentinel pointers standing in for real modules/states; they are used only
+// as identity tokens by the code paths exercised in these tests.
+#define MODULE_A_SENTINEL reinterpret_cast<iree_vm_module_t*>(1)
+#define MODULE_B_SENTINEL reinterpret_cast<iree_vm_module_t*>(2)
+#define MODULE_A_STATE_SENTINEL reinterpret_cast<iree_vm_module_state_t*>(101)
+#define MODULE_B_STATE_SENTINEL reinterpret_cast<iree_vm_module_state_t*>(102)
+
+// Resolution counters observed by tests to distinguish fresh state queries
+// from caller-state reuse.
+static int module_a_state_resolve_count = 0;
+static int module_b_state_resolve_count = 0;
+// Test state resolver mapping the module sentinels to their matching state
+// sentinels (counting each resolution); NOT_FOUND for unknown modules.
+static iree_status_t SentinelStateResolver(
+    void* state_resolver, iree_vm_module_t* module,
+    iree_vm_module_state_t** out_module_state) {
+  if (module == MODULE_A_SENTINEL) {
+    ++module_a_state_resolve_count;
+    *out_module_state = MODULE_A_STATE_SENTINEL;
+    return iree_ok_status();
+  } else if (module == MODULE_B_SENTINEL) {
+    ++module_b_state_resolve_count;
+    *out_module_state = MODULE_B_STATE_SENTINEL;
+    return iree_ok_status();
+  }
+  return iree_make_status(IREE_STATUS_NOT_FOUND);
+}
+
+// Tests simple stack usage, mainly just for demonstration.
+// Enters two frames (A then B), verifying current/parent frame queries at
+// each step, then leaves both and verifies the stack is empty again.
+TEST(VMStackTest, Usage) {
+  iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+                                  state_resolver, iree_allocator_system());
+
+  // Empty stack: no current or parent frame.
+  EXPECT_EQ(nullptr, iree_vm_stack_current_frame(stack));
+  EXPECT_EQ(nullptr, iree_vm_stack_parent_frame(stack));
+
+  iree_vm_function_t function_a = {MODULE_A_SENTINEL,
+                                   IREE_VM_FUNCTION_LINKAGE_INTERNAL, 0};
+  iree_vm_stack_frame_t* frame_a = nullptr;
+  IREE_EXPECT_OK(iree_vm_stack_function_enter(
+      stack, &function_a, IREE_VM_STACK_FRAME_NATIVE, 0, NULL, &frame_a));
+  EXPECT_EQ(0, frame_a->function.ordinal);
+  EXPECT_EQ(frame_a, iree_vm_stack_current_frame(stack));
+  EXPECT_EQ(nullptr, iree_vm_stack_parent_frame(stack));
+
+  iree_vm_function_t function_b = {MODULE_B_SENTINEL,
+                                   IREE_VM_FUNCTION_LINKAGE_INTERNAL, 1};
+  iree_vm_stack_frame_t* frame_b = nullptr;
+  IREE_EXPECT_OK(iree_vm_stack_function_enter(
+      stack, &function_b, IREE_VM_STACK_FRAME_NATIVE, 0, NULL, &frame_b));
+  EXPECT_EQ(1, frame_b->function.ordinal);
+  EXPECT_EQ(frame_b, iree_vm_stack_current_frame(stack));
+  EXPECT_EQ(frame_a, iree_vm_stack_parent_frame(stack));
+
+  // Unwind in LIFO order back to an empty stack.
+  IREE_EXPECT_OK(iree_vm_stack_function_leave(stack));
+  EXPECT_EQ(frame_a, iree_vm_stack_current_frame(stack));
+  EXPECT_EQ(nullptr, iree_vm_stack_parent_frame(stack));
+  IREE_EXPECT_OK(iree_vm_stack_function_leave(stack));
+  EXPECT_EQ(nullptr, iree_vm_stack_current_frame(stack));
+  EXPECT_EQ(nullptr, iree_vm_stack_parent_frame(stack));
+
+  iree_vm_stack_deinitialize(stack);
+}
+
+// Tests stack cleanup with unpopped frames (like during failure teardown):
+// deinitialize must tolerate (and clean up) frames still on the stack.
+TEST(VMStackTest, DeinitWithRemainingFrames) {
+  iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+                                  state_resolver, iree_allocator_system());
+
+  iree_vm_function_t function_a = {MODULE_A_SENTINEL,
+                                   IREE_VM_FUNCTION_LINKAGE_INTERNAL, 0};
+  iree_vm_stack_frame_t* frame_a = nullptr;
+  IREE_EXPECT_OK(iree_vm_stack_function_enter(
+      stack, &function_a, IREE_VM_STACK_FRAME_NATIVE, 0, NULL, &frame_a));
+  EXPECT_EQ(0, frame_a->function.ordinal);
+  EXPECT_EQ(frame_a, iree_vm_stack_current_frame(stack));
+  EXPECT_EQ(nullptr, iree_vm_stack_parent_frame(stack));
+
+  // Don't pop the last frame before deinit; it should handle it.
+  iree_vm_stack_deinitialize(stack);
+}
+
+// Tests stack overflow detection: entering frames until growth exceeds
+// IREE_VM_STACK_MAX_SIZE must fail with RESOURCE_EXHAUSTED.
+TEST(VMStackTest, StackOverflow) {
+  iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+                                  state_resolver, iree_allocator_system());
+
+  EXPECT_EQ(nullptr, iree_vm_stack_current_frame(stack));
+  EXPECT_EQ(nullptr, iree_vm_stack_parent_frame(stack));
+
+  // Fill the entire stack up to the max.
+  // The loop bound is a generous upper limit; overflow must hit well before.
+  iree_vm_function_t function_a = {MODULE_A_SENTINEL,
+                                   IREE_VM_FUNCTION_LINKAGE_INTERNAL, 0};
+  bool did_overflow = false;
+  for (int i = 0; i < 99999; ++i) {
+    iree_vm_stack_frame_t* frame_a = nullptr;
+    iree_status_t status = iree_vm_stack_function_enter(
+        stack, &function_a, IREE_VM_STACK_FRAME_NATIVE, 0, NULL, &frame_a);
+    if (iree_status_is_resource_exhausted(status)) {
+      // Hit the stack overflow, as expected.
+      did_overflow = true;
+      IREE_IGNORE_ERROR(status);
+      break;
+    }
+    IREE_EXPECT_OK(status);
+  }
+  ASSERT_TRUE(did_overflow);
+
+  iree_vm_stack_deinitialize(stack);
+}
+
+// Tests unbalanced stack popping: leaving with no frames entered must fail
+// with FAILED_PRECONDITION rather than corrupting the stack.
+TEST(VMStackTest, UnbalancedPop) {
+  iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+                                  state_resolver, iree_allocator_system());
+
+  iree_status_t status = iree_vm_stack_function_leave(stack);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_FAILED_PRECONDITION, status);
+  iree_status_free(status);
+
+  iree_vm_stack_deinitialize(stack);
+}
+
+// Tests module state reuse and querying: entering a frame for a new module
+// resolves its state once; same-module calls reuse the caller's state without
+// hitting the resolver again.
+TEST(VMStackTest, ModuleStateQueries) {
+  iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+                                  state_resolver, iree_allocator_system());
+
+  EXPECT_EQ(nullptr, iree_vm_stack_current_frame(stack));
+  EXPECT_EQ(nullptr, iree_vm_stack_parent_frame(stack));
+
+  // Reset the shared counters so this test is order-independent.
+  module_a_state_resolve_count = 0;
+  module_b_state_resolve_count = 0;
+
+  // [A (queried)]
+  iree_vm_function_t function_a = {MODULE_A_SENTINEL,
+                                   IREE_VM_FUNCTION_LINKAGE_INTERNAL, 0};
+  iree_vm_stack_frame_t* frame_a = nullptr;
+  IREE_EXPECT_OK(iree_vm_stack_function_enter(
+      stack, &function_a, IREE_VM_STACK_FRAME_NATIVE, 0, NULL, &frame_a));
+  EXPECT_EQ(MODULE_A_STATE_SENTINEL, frame_a->module_state);
+  EXPECT_EQ(1, module_a_state_resolve_count);
+
+  // [A, B (queried)]
+  iree_vm_function_t function_b = {MODULE_B_SENTINEL,
+                                   IREE_VM_FUNCTION_LINKAGE_INTERNAL, 1};
+  iree_vm_stack_frame_t* frame_b = nullptr;
+  IREE_EXPECT_OK(iree_vm_stack_function_enter(
+      stack, &function_b, IREE_VM_STACK_FRAME_NATIVE, 0, NULL, &frame_b));
+  EXPECT_EQ(MODULE_B_STATE_SENTINEL, frame_b->module_state);
+  EXPECT_EQ(1, module_b_state_resolve_count);
+
+  // [A, B, B (reuse)] - count must not increase for the same module.
+  IREE_EXPECT_OK(iree_vm_stack_function_enter(
+      stack, &function_b, IREE_VM_STACK_FRAME_NATIVE, 0, NULL, &frame_b));
+  EXPECT_EQ(MODULE_B_STATE_SENTINEL, frame_b->module_state);
+  EXPECT_EQ(1, module_b_state_resolve_count);
+
+  IREE_EXPECT_OK(iree_vm_stack_function_leave(stack));
+  IREE_EXPECT_OK(iree_vm_stack_function_leave(stack));
+  IREE_EXPECT_OK(iree_vm_stack_function_leave(stack));
+
+  iree_vm_stack_deinitialize(stack);
+}
+
+// Tests that module state query failures propagate to callers correctly:
+// a failing resolver must make function_enter fail with the resolver status.
+TEST(VMStackTest, ModuleStateQueryFailure) {
+  iree_vm_state_resolver_t state_resolver = {
+      nullptr,
+      // NOTE: the leading '+' converts the capture-less lambda to a plain
+      // function pointer as required by the C callback field.
+      +[](void* state_resolver, iree_vm_module_t* module,
+          iree_vm_module_state_t** out_module_state) -> iree_status_t {
+        // NOTE: always failing.
+        return iree_make_status(IREE_STATUS_INTERNAL);
+      }};
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+                                  state_resolver, iree_allocator_system());
+
+  // Push should fail if we can't query state, status should propagate.
+  iree_vm_function_t function_a = {MODULE_A_SENTINEL,
+                                   IREE_VM_FUNCTION_LINKAGE_INTERNAL, 0};
+  iree_vm_stack_frame_t* frame_a = nullptr;
+  iree_status_t status = iree_vm_stack_function_enter(
+      stack, &function_a, IREE_VM_STACK_FRAME_NATIVE, 0, NULL, &frame_a);
+  IREE_EXPECT_STATUS_IS(IREE_STATUS_INTERNAL, status);
+  iree_status_free(status);
+  iree_vm_stack_deinitialize(stack);
+}
+
+}  // namespace
diff --git a/runtime/src/iree/vm/test/BUILD b/runtime/src/iree/vm/test/BUILD
new file mode 100644
index 0000000..c0f5ba5
--- /dev/null
+++ b/runtime/src/iree/vm/test/BUILD
@@ -0,0 +1,225 @@
+# Copyright 2020 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Test .mlir modules for the VM: each is compiled to a bytecode module
+# (.vmfb) and all of them are aggregated into a single embedded C data
+# target (:all_bytecode_modules_c) consumed by the runtime VM tests.
+#
+# NOTE(review): the load() and translate_tool paths below still reference the
+# old //iree root; per the repo split these compiler-side targets are expected
+# to move in a follow-up — confirm before relying on these paths.
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content")
+load("//build_tools/bazel:iree_bytecode_module.bzl", "iree_bytecode_module")
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# Compiling the .mlir files requires the compiler; skip the whole directory in
+# runtime-only or no-test CMake builds.
+# NOTE(review): `if (NOT ${VAR})` assumes both variables are always defined;
+# an undefined variable would make the expanded if() malformed — confirm these
+# options are unconditionally declared.
+iree_cmake_extra_content(
+    content = """
+if (NOT ${IREE_BUILD_COMPILER} OR NOT ${IREE_BUILD_TESTS})
+  return()
+endif()
+""",
+    inline = True,
+)
+
+# Aggregates every compiled test module into one generated .c/.h pair so the
+# native tests can link all fixtures without touching the filesystem.
+c_embed_data(
+    name = "all_bytecode_modules_c",
+    srcs = [
+        ":arithmetic_ops.vmfb",
+        ":arithmetic_ops_f32.vmfb",
+        ":arithmetic_ops_i64.vmfb",
+        ":assignment_ops.vmfb",
+        ":assignment_ops_f32.vmfb",
+        ":assignment_ops_i64.vmfb",
+        ":buffer_ops.vmfb",
+        ":call_ops.vmfb",
+        ":comparison_ops.vmfb",
+        ":comparison_ops_f32.vmfb",
+        ":comparison_ops_i64.vmfb",
+        ":control_flow_ops.vmfb",
+        ":conversion_ops.vmfb",
+        ":conversion_ops_f32.vmfb",
+        ":conversion_ops_i64.vmfb",
+        ":global_ops.vmfb",
+        ":global_ops_f32.vmfb",
+        ":global_ops_i64.vmfb",
+        ":list_ops.vmfb",
+        ":list_ops_i64.vmfb",
+        ":list_variant_ops.vmfb",
+        ":ref_ops.vmfb",
+        ":shift_ops.vmfb",
+        ":shift_ops_i64.vmfb",
+    ],
+    c_file_output = "all_bytecode_modules.c",
+    flatten = True,
+    h_file_output = "all_bytecode_modules.h",
+)
+
+# One iree_bytecode_module per test .mlir file; every target listed in the
+# srcs of :all_bytecode_modules_c above must have a matching rule here.
+iree_bytecode_module(
+    name = "arithmetic_ops",
+    src = "arithmetic_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "arithmetic_ops_f32",
+    src = "arithmetic_ops_f32.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "arithmetic_ops_i64",
+    src = "arithmetic_ops_i64.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "assignment_ops",
+    src = "assignment_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "assignment_ops_f32",
+    src = "assignment_ops_f32.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "assignment_ops_i64",
+    src = "assignment_ops_i64.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "buffer_ops",
+    src = "buffer_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "call_ops",
+    src = "call_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "comparison_ops",
+    src = "comparison_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "comparison_ops_f32",
+    src = "comparison_ops_f32.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "comparison_ops_i64",
+    src = "comparison_ops_i64.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "control_flow_ops",
+    src = "control_flow_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "conversion_ops",
+    src = "conversion_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "conversion_ops_f32",
+    src = "conversion_ops_f32.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "conversion_ops_i64",
+    src = "conversion_ops_i64.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "global_ops",
+    src = "global_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "global_ops_f32",
+    src = "global_ops_f32.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "global_ops_i64",
+    src = "global_ops_i64.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "list_ops",
+    src = "list_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "list_ops_i64",
+    src = "list_ops_i64.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "list_variant_ops",
+    src = "list_variant_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "ref_ops",
+    src = "ref_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "shift_ops",
+    src = "shift_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
+
+iree_bytecode_module(
+    name = "shift_ops_i64",
+    src = "shift_ops_i64.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+    translate_tool = "//iree/tools:iree-translate",
+)
diff --git a/runtime/src/iree/vm/test/CMakeLists.txt b/runtime/src/iree/vm/test/CMakeLists.txt
new file mode 100644
index 0000000..80e96c0
--- /dev/null
+++ b/runtime/src/iree/vm/test/CMakeLists.txt
@@ -0,0 +1,341 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/vm/test/BUILD                                               #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+if (NOT ${IREE_BUILD_COMPILER} OR NOT ${IREE_BUILD_TESTS})
+  return()
+endif()
+
+iree_c_embed_data(
+  NAME
+    all_bytecode_modules_c
+  GENERATED_SRCS
+    "arithmetic_ops.vmfb"
+    "arithmetic_ops_f32.vmfb"
+    "arithmetic_ops_i64.vmfb"
+    "assignment_ops.vmfb"
+    "assignment_ops_f32.vmfb"
+    "assignment_ops_i64.vmfb"
+    "buffer_ops.vmfb"
+    "call_ops.vmfb"
+    "comparison_ops.vmfb"
+    "comparison_ops_f32.vmfb"
+    "comparison_ops_i64.vmfb"
+    "control_flow_ops.vmfb"
+    "conversion_ops.vmfb"
+    "conversion_ops_f32.vmfb"
+    "conversion_ops_i64.vmfb"
+    "global_ops.vmfb"
+    "global_ops_f32.vmfb"
+    "global_ops_i64.vmfb"
+    "list_ops.vmfb"
+    "list_ops_i64.vmfb"
+    "list_variant_ops.vmfb"
+    "ref_ops.vmfb"
+    "shift_ops.vmfb"
+    "shift_ops_i64.vmfb"
+  C_FILE_OUTPUT
+    "all_bytecode_modules.c"
+  H_FILE_OUTPUT
+    "all_bytecode_modules.h"
+  FLATTEN
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    arithmetic_ops
+  SRC
+    "arithmetic_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    arithmetic_ops_f32
+  SRC
+    "arithmetic_ops_f32.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    arithmetic_ops_i64
+  SRC
+    "arithmetic_ops_i64.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    assignment_ops
+  SRC
+    "assignment_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    assignment_ops_f32
+  SRC
+    "assignment_ops_f32.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    assignment_ops_i64
+  SRC
+    "assignment_ops_i64.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    buffer_ops
+  SRC
+    "buffer_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    call_ops
+  SRC
+    "call_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    comparison_ops
+  SRC
+    "comparison_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    comparison_ops_f32
+  SRC
+    "comparison_ops_f32.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    comparison_ops_i64
+  SRC
+    "comparison_ops_i64.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    control_flow_ops
+  SRC
+    "control_flow_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    conversion_ops
+  SRC
+    "conversion_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    conversion_ops_f32
+  SRC
+    "conversion_ops_f32.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    conversion_ops_i64
+  SRC
+    "conversion_ops_i64.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    global_ops
+  SRC
+    "global_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    global_ops_f32
+  SRC
+    "global_ops_f32.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    global_ops_i64
+  SRC
+    "global_ops_i64.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    list_ops
+  SRC
+    "list_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    list_ops_i64
+  SRC
+    "list_ops_i64.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    list_variant_ops
+  SRC
+    "list_variant_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    ref_ops
+  SRC
+    "ref_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    shift_ops
+  SRC
+    "shift_ops.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    shift_ops_i64
+  SRC
+    "shift_ops_i64.mlir"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+# NOTE: everything above this marker is regenerated from BUILD by
+# build_tools/bazel_to_cmake — make rule changes there, not here.
diff --git a/runtime/src/iree/vm/test/arithmetic_ops.mlir b/runtime/src/iree/vm/test/arithmetic_ops.mlir
new file mode 100644
index 0000000..65b82dc
--- /dev/null
+++ b/runtime/src/iree/vm/test/arithmetic_ops.mlir
@@ -0,0 +1,146 @@
+vm.module @arithmetic_ops {
+
+  // Runtime conformance tests for the core 32-bit integer arithmetic and
+  // bitwise ops. Each exported @test_* function computes a value and verifies
+  // it with vm.check.eq. Inputs are routed through util.do_not_optimize
+  // ("dno" suffix) so the compiler cannot fold away the op under test.
+
+  //===--------------------------------------------------------------------===//
+  // Native integer arithmetic
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_add_i32
+  vm.func @test_add_i32() {
+    %c1 = vm.const.i32 1
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v = vm.add.i32 %c1dno, %c1dno : i32
+    %c2 = vm.const.i32 2
+    vm.check.eq %v, %c2, "1+1=2" : i32
+    vm.return
+  }
+
+  vm.export @test_sub_i32
+  vm.func @test_sub_i32() {
+    %c1 = vm.const.i32 3
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i32 2
+    %c2dno = util.do_not_optimize(%c2) : i32
+    %v = vm.sub.i32 %c1dno, %c2dno : i32
+    %c3 = vm.const.i32 1
+    vm.check.eq %v, %c3, "3-2=1" : i32
+    vm.return
+  }
+
+  vm.export @test_mul_i32
+  vm.func @test_mul_i32() {
+    %c1 = vm.const.i32 2
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v = vm.mul.i32 %c1dno, %c1dno : i32
+    %c2 = vm.const.i32 4
+    vm.check.eq %v, %c2, "2*2=4" : i32
+    vm.return
+  }
+
+  // Signed division: checks round-toward-zero with a negative divisor.
+  vm.export @test_div_i32s
+  vm.func @test_div_i32s() {
+    %c1 = vm.const.i32 4
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i32 -2
+    %c2dno = util.do_not_optimize(%c2) : i32
+    %v = vm.div.i32.s %c1dno, %c2dno : i32
+    %c3 = vm.const.i32 -2
+    vm.check.eq %v, %c3, "4/-2=-2" : i32
+    vm.return
+  }
+
+  vm.export @test_div_i32u
+  vm.func @test_div_i32u() {
+    %c1 = vm.const.i32 4
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i32 2
+    %c2dno = util.do_not_optimize(%c2) : i32
+    %v = vm.div.i32.u %c1dno, %c2dno : i32
+    %c3 = vm.const.i32 2
+    vm.check.eq %v, %c3, "4/2=2" : i32
+    vm.return
+  }
+
+  // Signed remainder: result takes the sign of the dividend (-3 % -2 = -1).
+  vm.export @test_rem_i32s
+  vm.func @test_rem_i32s() {
+    %c1 = vm.const.i32 -3
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i32 -2
+    %c2dno = util.do_not_optimize(%c2) : i32
+    %v = vm.rem.i32.s %c1dno, %c2dno : i32
+    %c3 = vm.const.i32 -1
+    vm.check.eq %v, %c3, "-3%-2=-1" : i32
+    vm.return
+  }
+
+  vm.export @test_rem_i32u
+  vm.func @test_rem_i32u() {
+    %c1 = vm.const.i32 3
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i32 2
+    %c2dno = util.do_not_optimize(%c2) : i32
+    %v = vm.rem.i32.u %c1dno, %c2dno : i32
+    %c3 = vm.const.i32 1
+    vm.check.eq %v, %c3, "3%2=1" : i32
+    vm.return
+  }
+
+  // Fused multiply-add: v = a*b + c.
+  vm.export @test_fma_i32
+  vm.func @test_fma_i32() {
+    %c2 = vm.const.i32 2
+    %c2dno = util.do_not_optimize(%c2) : i32
+    %c3 = vm.const.i32 3
+    %c3dno = util.do_not_optimize(%c3) : i32
+    %c5 = vm.const.i32 5
+    %c5dno = util.do_not_optimize(%c5) : i32
+    %v = vm.fma.i32 %c2dno, %c3dno, %c5dno : i32
+    %c11 = vm.const.i32 11
+    vm.check.eq %v, %c11, "2*3+5=11" : i32
+    vm.return
+  }
+
+  // Bitwise complement: ~0 is all-ones, i.e. -1 in two's complement.
+  vm.export @test_not_i32
+  vm.func @test_not_i32() {
+    %c1 = vm.const.i32 0
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v = vm.not.i32 %c1dno : i32
+    %c2 = vm.const.i32 -1
+    vm.check.eq %v, %c2, "~0=-1" : i32
+    vm.return
+  }
+
+  vm.export @test_and_i32
+  vm.func @test_and_i32() {
+    %c1 = vm.const.i32 5
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i32 3
+    %c2dno = util.do_not_optimize(%c2) : i32
+    %v = vm.and.i32 %c1dno, %c2dno : i32
+    %c3 = vm.const.i32 1
+    vm.check.eq %v, %c3, "5&3=1" : i32
+    vm.return
+  }
+
+  vm.export @test_or_i32
+  vm.func @test_or_i32() {
+    %c1 = vm.const.i32 5
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i32 3
+    %c2dno = util.do_not_optimize(%c2) : i32
+    %v = vm.or.i32 %c1dno, %c2dno : i32
+    %c3 = vm.const.i32 7
+    vm.check.eq %v, %c3, "5|3=7" : i32
+    vm.return
+  }
+
+  vm.export @test_xor_i32
+  vm.func @test_xor_i32() {
+    %c1 = vm.const.i32 5
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i32 3
+    %c2dno = util.do_not_optimize(%c2) : i32
+    %v = vm.xor.i32 %c1dno, %c2dno : i32
+    %c3 = vm.const.i32 6
+    vm.check.eq %v, %c3, "5^3=6" : i32
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/arithmetic_ops_f32.mlir b/runtime/src/iree/vm/test/arithmetic_ops_f32.mlir
new file mode 100644
index 0000000..f23cf94
--- /dev/null
+++ b/runtime/src/iree/vm/test/arithmetic_ops_f32.mlir
@@ -0,0 +1,281 @@
+vm.module @arithmetic_ops_f32 {
+
+  // Runtime conformance tests for the f32 extension's arithmetic and
+  // transcendental ops. Each exported @test_* function computes a value and
+  // verifies it with vm.check.eq against a precomputed constant. Inputs are
+  // routed through util.do_not_optimize ("dno" suffix) so the compiler cannot
+  // fold away the op under test.
+
+  //===--------------------------------------------------------------------===//
+  // ExtF32: Native floating-point arithmetic
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_add_f32
+  vm.func @test_add_f32() {
+    %c1 = vm.const.f32 1.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.add.f32 %c1dno, %c1dno : f32
+    %c2 = vm.const.f32 3.0
+    vm.check.eq %v, %c2, "1.5+1.5=3" : f32
+    vm.return
+  }
+
+  vm.export @test_sub_f32
+  vm.func @test_sub_f32() {
+    %c1 = vm.const.f32 3.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %c2 = vm.const.f32 2.5
+    %c2dno = util.do_not_optimize(%c2) : f32
+    %v = vm.sub.f32 %c1dno, %c2dno : f32
+    %c3 = vm.const.f32 0.5
+    vm.check.eq %v, %c3, "3.0-2.5=0.5" : f32
+    vm.return
+  }
+
+  vm.export @test_mul_f32
+  vm.func @test_mul_f32() {
+    %c1 = vm.const.f32 2.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.mul.f32 %c1dno, %c1dno : f32
+    %c2 = vm.const.f32 6.25
+    vm.check.eq %v, %c2, "2.5*2.5=6.25" : f32
+    vm.return
+  }
+
+  vm.export @test_div_f32
+  vm.func @test_div_f32() {
+    %c1 = vm.const.f32 4.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %c2 = vm.const.f32 -2.0
+    %c2dno = util.do_not_optimize(%c2) : f32
+    %v = vm.div.f32 %c1dno, %c2dno : f32
+    %c3 = vm.const.f32 -2.0
+    vm.check.eq %v, %c3, "4.0/-2.0=-2.0" : f32
+    vm.return
+  }
+
+  vm.export @test_rem_f32
+  vm.func @test_rem_f32() {
+    %c1 = vm.const.f32 -3.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %c2 = vm.const.f32 -2.0
+    %c2dno = util.do_not_optimize(%c2) : f32
+    %v = vm.rem.f32 %c1dno, %c2dno : f32
+    %c3 = vm.const.f32 1.0
+    vm.check.eq %v, %c3, "-3.0%-2.0=1.0" : f32
+    vm.return
+  }
+
+  // Fused multiply-add: v = a*b + c.
+  vm.export @test_fma_f32
+  vm.func @test_fma_f32() {
+    %c2 = vm.const.f32 2.0
+    %c2dno = util.do_not_optimize(%c2) : f32
+    %c3 = vm.const.f32 3.0
+    %c3dno = util.do_not_optimize(%c3) : f32
+    %c5 = vm.const.f32 5.0
+    %c5dno = util.do_not_optimize(%c5) : f32
+    %v = vm.fma.f32 %c2dno, %c3dno, %c5dno : f32
+    %c11 = vm.const.f32 11.0
+    vm.check.eq %v, %c11, "2.0*3.0+5.0=11.0" : f32
+    vm.return
+  }
+
+  vm.export @test_abs_f32
+  vm.func @test_abs_f32() {
+    %c1 = vm.const.f32 -1.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.abs.f32 %c1dno : f32
+    %c2 = vm.const.f32 1.0
+    vm.check.eq %v, %c2, "abs(-1.0)=1.0" : f32
+    vm.return
+  }
+
+  vm.export @test_neg_f32
+  vm.func @test_neg_f32() {
+    %c1 = vm.const.f32 -1.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.neg.f32 %c1dno : f32
+    %c2 = vm.const.f32 1.0
+    vm.check.eq %v, %c2, "neg(-1.0)=1.0" : f32
+    vm.return
+  }
+
+  vm.export @test_ceil_f32
+  vm.func @test_ceil_f32() {
+    %c1 = vm.const.f32 1.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.ceil.f32 %c1dno : f32
+    %c2 = vm.const.f32 2.0
+    vm.check.eq %v, %c2, "ceil(1.5)=2.0" : f32
+    vm.return
+  }
+
+  vm.export @test_floor_f32
+  vm.func @test_floor_f32() {
+    %c1 = vm.const.f32 1.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.floor.f32 %c1dno : f32
+    %c2 = vm.const.f32 1.0
+    vm.check.eq %v, %c2, "floor(1.5)=1.0" : f32
+    vm.return
+  }
+
+  // atan(1.0) = pi/4.
+  vm.export @test_atan_f32
+  vm.func @test_atan_f32() {
+    %c1 = vm.const.f32 1.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.atan.f32 %c1dno : f32
+    %c2 = vm.const.f32 0.7853981633974483
+    vm.check.eq %v, %c2, "atan(1.0)=0.7853981633974483" : f32
+    vm.return
+  }
+
+  // atan2(1.0, 0.0) = pi/2.
+  vm.export @test_atan2_f32
+  vm.func @test_atan2_f32() {
+    %c1 = vm.const.f32 1.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %c2 = vm.const.f32 0.0
+    %c2dno = util.do_not_optimize(%c2) : f32
+    %v = vm.atan2.f32 %c1dno, %c2dno : f32
+    %c3 = vm.const.f32 1.5707963267948966
+    vm.check.eq %v, %c3, "atan2(1.0,0.0)=1.5707963267948966" : f32
+    vm.return
+  }
+
+  vm.export @test_cos_f32
+  vm.func @test_cos_f32() {
+    %c1 = vm.const.f32 0.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.cos.f32 %c1dno : f32
+    %c2 = vm.const.f32 0.8775825618903728
+    vm.check.eq %v, %c2, "cos(0.5)=0.8775825618903728" : f32
+    vm.return
+  }
+
+  vm.export @test_sin_f32
+  vm.func @test_sin_f32() {
+    %c1 = vm.const.f32 0.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.sin.f32 %c1dno : f32
+    %c2 = vm.const.f32 0.479425538604203
+    vm.check.eq %v, %c2, "sin(0.5)=0.479425538604203" : f32
+    vm.return
+  }
+
+  // exp(1.0) = e.
+  vm.export @test_exp_f32
+  vm.func @test_exp_f32() {
+    %c1 = vm.const.f32 1.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.exp.f32 %c1dno : f32
+    %c2 = vm.const.f32 2.718281828459045
+    vm.check.eq %v, %c2, "exp(1.0)=2.718281828459045" : f32
+    vm.return
+  }
+
+  // exp2(x) = 2^x.
+  vm.export @test_exp2_f32
+  vm.func @test_exp2_f32() {
+    %c1 = vm.const.f32 2.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.exp2.f32 %c1dno : f32
+    %c2 = vm.const.f32 4.0
+    vm.check.eq %v, %c2, "exp2(2.0)=4.0" : f32
+    vm.return
+  }
+
+  // expm1(x) = exp(x) - 1.
+  vm.export @test_expm1_f32
+  vm.func @test_expm1_f32() {
+    %c1 = vm.const.f32 2.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.expm1.f32 %c1dno : f32
+    %c2 = vm.const.f32 6.38905609893065
+    vm.check.eq %v, %c2, "expm1(2.0)=6.38905609893065" : f32
+    vm.return
+  }
+
+  vm.export @test_log_f32
+  vm.func @test_log_f32() {
+    %c1 = vm.const.f32 10.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.log.f32 %c1dno : f32
+    %c2 = vm.const.f32 2.302585092994046
+    vm.check.eq %v, %c2, "log(10.0)=2.302585092994046" : f32
+    vm.return
+  }
+
+  vm.export @test_log10_f32
+  vm.func @test_log10_f32() {
+    %c1 = vm.const.f32 10.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.log10.f32 %c1dno : f32
+    %c2 = vm.const.f32 1.0
+    vm.check.eq %v, %c2, "log10(10.0)=1.0" : f32
+    vm.return
+  }
+
+  // log1p(x) = log(1 + x).
+  vm.export @test_log1p_f32
+  vm.func @test_log1p_f32() {
+    %c1 = vm.const.f32 10.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.log1p.f32 %c1dno : f32
+    %c2 = vm.const.f32 2.3978952727983707
+    vm.check.eq %v, %c2, "log1p(10.0)=2.3978952727983707" : f32
+    vm.return
+  }
+
+  vm.export @test_log2_f32
+  vm.func @test_log2_f32() {
+    %c1 = vm.const.f32 10.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.log2.f32 %c1dno : f32
+    %c2 = vm.const.f32 3.321928094887362
+    vm.check.eq %v, %c2, "log2(10.0)=3.321928094887362" : f32
+    vm.return
+  }
+
+  vm.export @test_pow_f32
+  vm.func @test_pow_f32() {
+    %c1 = vm.const.f32 3.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %c2 = vm.const.f32 2.0
+    %c2dno = util.do_not_optimize(%c2) : f32
+    %v = vm.pow.f32 %c1dno, %c2dno : f32
+    %c3 = vm.const.f32 9.0
+    vm.check.eq %v, %c3, "pow(3.0,2.0)=9.0" : f32
+    vm.return
+  }
+
+  // rsqrt(x) = 1/sqrt(x).
+  vm.export @test_rsqrt_f32
+  vm.func @test_rsqrt_f32() {
+    %c1 = vm.const.f32 4.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.rsqrt.f32 %c1dno : f32
+    %c2 = vm.const.f32 0.5
+    vm.check.eq %v, %c2, "rsqrt(4.0)=0.5" : f32
+    vm.return
+  }
+
+  vm.export @test_sqrt_f32
+  vm.func @test_sqrt_f32() {
+    %c1 = vm.const.f32 4.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.sqrt.f32 %c1dno : f32
+    %c2 = vm.const.f32 2.0
+    vm.check.eq %v, %c2, "sqrt(4.0)=2.0" : f32
+    vm.return
+  }
+
+  vm.export @test_tanh_f32
+  vm.func @test_tanh_f32() {
+    %c1 = vm.const.f32 0.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.tanh.f32 %c1dno : f32
+    %c2 = vm.const.f32 0.46211715726000974
+    vm.check.eq %v, %c2, "tanh(0.5)=0.46211715726000974" : f32
+    vm.return
+  }
+
+  // TODO(#5854): vm.check.nearly_eq; this can differ across libm impls.
+  // vm.export @test_erf_f32
+  // vm.func @test_erf_f32() {
+  //   %c1 = vm.const.f32 0.5
+  //   %c1dno = util.do_not_optimize(%c1) : f32
+  //   %v = vm.erf.f32 %c1dno : f32
+  //   %c2 = vm.const.f32 0.520499945
+  //   vm.check.eq %v, %c2, "erf(0.5)=0.520499945" : f32
+  //   vm.return
+  // }
+}
diff --git a/runtime/src/iree/vm/test/arithmetic_ops_i64.mlir b/runtime/src/iree/vm/test/arithmetic_ops_i64.mlir
new file mode 100644
index 0000000..65f2c7d
--- /dev/null
+++ b/runtime/src/iree/vm/test/arithmetic_ops_i64.mlir
@@ -0,0 +1,146 @@
+vm.module @arithmetic_ops_i64 {
+
+  // Runtime conformance tests for the i64 extension's arithmetic and bitwise
+  // ops; mirrors the i32 suite in arithmetic_ops.mlir. Inputs are routed
+  // through util.do_not_optimize ("dno" suffix) so the compiler cannot fold
+  // away the op under test.
+
+  //===--------------------------------------------------------------------===//
+  // ExtI64: Native integer arithmetic
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_add_i64
+  vm.func @test_add_i64() {
+    %c1 = vm.const.i64 1
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %v = vm.add.i64 %c1dno, %c1dno : i64
+    %c2 = vm.const.i64 2
+    vm.check.eq %v, %c2, "1+1=2" : i64
+    vm.return
+  }
+
+  vm.export @test_sub_i64
+  vm.func @test_sub_i64() {
+    %c1 = vm.const.i64 3
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %c2 = vm.const.i64 2
+    %c2dno = util.do_not_optimize(%c2) : i64
+    %v = vm.sub.i64 %c1dno, %c2dno : i64
+    %c3 = vm.const.i64 1
+    vm.check.eq %v, %c3, "3-2=1" : i64
+    vm.return
+  }
+
+  vm.export @test_mul_i64
+  vm.func @test_mul_i64() {
+    %c1 = vm.const.i64 2
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %v = vm.mul.i64 %c1dno, %c1dno : i64
+    %c2 = vm.const.i64 4
+    vm.check.eq %v, %c2, "2*2=4" : i64
+    vm.return
+  }
+
+  // Signed division: checks round-toward-zero with a negative divisor.
+  vm.export @test_div_i64s
+  vm.func @test_div_i64s() {
+    %c1 = vm.const.i64 4
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %c2 = vm.const.i64 -2
+    %c2dno = util.do_not_optimize(%c2) : i64
+    %v = vm.div.i64.s %c1dno, %c2dno : i64
+    %c3 = vm.const.i64 -2
+    vm.check.eq %v, %c3, "4/-2=-2" : i64
+    vm.return
+  }
+
+  vm.export @test_div_i64u
+  vm.func @test_div_i64u() {
+    %c1 = vm.const.i64 4
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %c2 = vm.const.i64 2
+    %c2dno = util.do_not_optimize(%c2) : i64
+    %v = vm.div.i64.u %c1dno, %c2dno : i64
+    %c3 = vm.const.i64 2
+    vm.check.eq %v, %c3, "4/2=2" : i64
+    vm.return
+  }
+
+  // Signed remainder: result takes the sign of the dividend (-3 % -2 = -1).
+  vm.export @test_rem_i64s
+  vm.func @test_rem_i64s() {
+    %c1 = vm.const.i64 -3
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %c2 = vm.const.i64 -2
+    %c2dno = util.do_not_optimize(%c2) : i64
+    %v = vm.rem.i64.s %c1dno, %c2dno : i64
+    %c3 = vm.const.i64 -1
+    vm.check.eq %v, %c3, "-3%-2=-1" : i64
+    vm.return
+  }
+
+  vm.export @test_rem_i64u
+  vm.func @test_rem_i64u() {
+    %c1 = vm.const.i64 3
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %c2 = vm.const.i64 2
+    %c2dno = util.do_not_optimize(%c2) : i64
+    %v = vm.rem.i64.u %c1dno, %c2dno : i64
+    %c3 = vm.const.i64 1
+    vm.check.eq %v, %c3, "3%2=1" : i64
+    vm.return
+  }
+
+  // Fused multiply-add: v = a*b + c.
+  vm.export @test_fma_i64
+  vm.func @test_fma_i64() {
+    %c2 = vm.const.i64 2
+    %c2dno = util.do_not_optimize(%c2) : i64
+    %c3 = vm.const.i64 3
+    %c3dno = util.do_not_optimize(%c3) : i64
+    %c5 = vm.const.i64 5
+    %c5dno = util.do_not_optimize(%c5) : i64
+    %v = vm.fma.i64 %c2dno, %c3dno, %c5dno : i64
+    %c11 = vm.const.i64 11
+    vm.check.eq %v, %c11, "2*3+5=11" : i64
+    vm.return
+  }
+
+  // Bitwise complement: ~0 is all-ones, i.e. -1 in two's complement.
+  vm.export @test_not_i64
+  vm.func @test_not_i64() {
+    %c1 = vm.const.i64 0
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %v = vm.not.i64 %c1dno : i64
+    %c2 = vm.const.i64 -1
+    vm.check.eq %v, %c2, "~0=-1" : i64
+    vm.return
+  }
+
+  vm.export @test_and_i64
+  vm.func @test_and_i64() {
+    %c1 = vm.const.i64 5
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %c2 = vm.const.i64 3
+    %c2dno = util.do_not_optimize(%c2) : i64
+    %v = vm.and.i64 %c1dno, %c2dno : i64
+    %c3 = vm.const.i64 1
+    vm.check.eq %v, %c3, "5&3=1" : i64
+    vm.return
+  }
+
+  vm.export @test_or_i64
+  vm.func @test_or_i64() {
+    %c1 = vm.const.i64 5
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %c2 = vm.const.i64 3
+    %c2dno = util.do_not_optimize(%c2) : i64
+    %v = vm.or.i64 %c1dno, %c2dno : i64
+    %c3 = vm.const.i64 7
+    vm.check.eq %v, %c3, "5|3=7" : i64
+    vm.return
+  }
+
+  vm.export @test_xor_i64
+  vm.func @test_xor_i64() {
+    %c1 = vm.const.i64 5
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %c2 = vm.const.i64 3
+    %c2dno = util.do_not_optimize(%c2) : i64
+    %v = vm.xor.i64 %c1dno, %c2dno : i64
+    %c3 = vm.const.i64 6
+    vm.check.eq %v, %c3, "5^3=6" : i64
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/assignment_ops.mlir b/runtime/src/iree/vm/test/assignment_ops.mlir
new file mode 100644
index 0000000..a5b77c7
--- /dev/null
+++ b/runtime/src/iree/vm/test/assignment_ops.mlir
@@ -0,0 +1,32 @@
+vm.module @assignment_ops {
+
+  // Runtime tests for conditional-select ops: a zero condition selects the
+  // false (second) operand, non-zero selects the true (first) operand.
+
+  //===--------------------------------------------------------------------===//
+  // Conditional assignment
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_select_i32
+  vm.func @test_select_i32() {
+    %c0 = vm.const.i32 0
+    %c0dno = util.do_not_optimize(%c0) : i32
+    %c1 = vm.const.i32 1
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v1 = vm.select.i32 %c0dno, %c0dno, %c1dno : i32
+    vm.check.eq %v1, %c1, "0 ? 0 : 1 = 1" : i32
+    %v2 = vm.select.i32 %c1dno, %c0dno, %c1dno : i32
+    vm.check.eq %v2, %c0, "1 ? 0 : 1 = 0" : i32
+    vm.return
+  }
+
+  // Selects between two ref-typed lists; checks identity (ref equality).
+  // NOTE(review): marked emitc.exclude — presumably ref select is unsupported
+  // on the EmitC path; confirm against the EmitC conversion.
+  vm.export @test_select_ref attributes {emitc.exclude}
+  vm.func private @test_select_ref() {
+    %c0 = vm.const.i32 0
+    %list0 = vm.list.alloc %c0 : (i32) -> !vm.list<i8>
+    %c1 = vm.const.i32 1
+    %list1 = vm.list.alloc %c1 : (i32) -> !vm.list<i8>
+    %cond = vm.const.i32 0
+    %cond_dno = util.do_not_optimize(%cond) : i32
+    %list = vm.select.ref %cond_dno, %list0, %list1 : !vm.list<i8>
+    vm.check.eq %list, %list1, "0 ? list0 : list1 = list1" : !vm.list<i8>
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/assignment_ops_f32.mlir b/runtime/src/iree/vm/test/assignment_ops_f32.mlir
new file mode 100644
index 0000000..1a88bd0
--- /dev/null
+++ b/runtime/src/iree/vm/test/assignment_ops_f32.mlir
@@ -0,0 +1,21 @@
+vm.module @assignment_ops_f32 {
+
+  //===--------------------------------------------------------------------===//
+  // ExtF32: Conditional assignment
+  //===--------------------------------------------------------------------===//
+
+  // vm.select.f32 computes cond ? a : b with an i32 condition and f32 arms;
+  // both cond=0 and cond=1 are checked. util.do_not_optimize keeps the
+  // condition opaque so the select is not constant-folded.
+  vm.export @test_select_f32
+  vm.func @test_select_f32() {
+    %c0 = vm.const.i32 0
+    %c0dno = util.do_not_optimize(%c0) : i32
+    %c1 = vm.const.i32 1
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.f32 0.0
+    %c3 = vm.const.f32 1.0
+    %v1 = vm.select.f32 %c0dno, %c2, %c3 : f32
+    vm.check.eq %v1, %c3, "0 ? 0.0 : 1.0 = 1.0" : f32
+    %v2 = vm.select.f32 %c1dno, %c2, %c3 : f32
+    vm.check.eq %v2, %c2, "1 ? 0.0 : 1.0 = 0.0" : f32
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/assignment_ops_i64.mlir b/runtime/src/iree/vm/test/assignment_ops_i64.mlir
new file mode 100644
index 0000000..72429f3
--- /dev/null
+++ b/runtime/src/iree/vm/test/assignment_ops_i64.mlir
@@ -0,0 +1,21 @@
+vm.module @assignment_ops_i64 {
+
+  //===--------------------------------------------------------------------===//
+  // ExtI64: Conditional assignment
+  //===--------------------------------------------------------------------===//
+
+  // vm.select.i64 computes cond ? a : b with an i32 condition and i64 arms;
+  // both cond=0 and cond=1 are checked. util.do_not_optimize keeps the
+  // condition opaque so the select is not constant-folded.
+  vm.export @test_select_i64
+  vm.func @test_select_i64() {
+    %c0 = vm.const.i32 0
+    %c0dno = util.do_not_optimize(%c0) : i32
+    %c1 = vm.const.i32 1
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i64 0
+    %c3 = vm.const.i64 1
+    %v1 = vm.select.i64 %c0dno, %c2, %c3 : i64
+    vm.check.eq %v1, %c3, "0 ? 0 : 1 = 1" : i64
+    %v2 = vm.select.i64 %c1dno, %c2, %c3 : i64
+    vm.check.eq %v2, %c2, "1 ? 0 : 1 = 0" : i64
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/buffer_ops.mlir b/runtime/src/iree/vm/test/buffer_ops.mlir
new file mode 100644
index 0000000..754b97b
--- /dev/null
+++ b/runtime/src/iree/vm/test/buffer_ops.mlir
@@ -0,0 +1,635 @@
+vm.module @buffer_ops {
+
+  vm.rodata private @rodata_3xi32 dense<[1, 2, 3]> : tensor<3xi32>
+
+  //===--------------------------------------------------------------------===//
+  // Compare
+  //===--------------------------------------------------------------------===//
+  // NOTE: we test this first because all of the other tests rely on it and we
+  // can do it with rodata.
+
+  vm.rodata private @rodata_cmp_3xi32_a dense<[100, 200, 300]> : tensor<3xi32>
+  vm.rodata private @rodata_cmp_3xi32_b dense<[100, 201, 300]> : tensor<3xi32>
+
+  // Compares some multi-element buffers. Note that comparisons are bytewise.
+  // vm.buffer.compare returns non-zero when the ranges are equal.
+  vm.export @test_compare attributes {emitc.exclude}
+  vm.func private @test_compare() {
+    %rodata_a = vm.const.ref.rodata @rodata_cmp_3xi32_a : !vm.buffer
+    %rodata_b = vm.const.ref.rodata @rodata_cmp_3xi32_b : !vm.buffer
+    %rodata_a_dno = util.do_not_optimize(%rodata_a) : !vm.buffer
+    %rodata_b_dno = util.do_not_optimize(%rodata_b) : !vm.buffer
+
+    %c0 = vm.const.i32 0
+    %length = vm.buffer.length %rodata_a_dno : !vm.buffer -> i32
+
+    %cmp0 = vm.buffer.compare %rodata_a_dno, %c0, %rodata_a_dno, %c0, %length : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp0, "buffer a == a" : i32
+
+    %cmp1 = vm.buffer.compare %rodata_a_dno, %c0, %rodata_b_dno, %c0, %length : !vm.buffer, !vm.buffer
+    vm.check.eq %cmp1, %c0, "buffer a != b" : i32
+
+    vm.return
+  }
+
+  // Tests comparing an empty range, which should always be equal.
+  vm.export @test_compare_empty attributes {emitc.exclude}
+  vm.func private @test_compare_empty() {
+    %rodata_a = vm.const.ref.rodata @rodata_cmp_3xi32_a : !vm.buffer
+    %rodata_b = vm.const.ref.rodata @rodata_cmp_3xi32_b : !vm.buffer
+    %rodata_a_dno = util.do_not_optimize(%rodata_a) : !vm.buffer
+    %rodata_b_dno = util.do_not_optimize(%rodata_b) : !vm.buffer
+
+    %c0 = vm.const.i32 0
+    %c2 = vm.const.i32 2
+
+    %cmp = vm.buffer.compare %rodata_a_dno, %c2, %rodata_a_dno, %c2, %c0 : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "empty buffer ranges are always equal" : i32
+
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Allocation
+  //===--------------------------------------------------------------------===//
+
+  // Tests allocating a buffer.
+  vm.export @test_alloc attributes {emitc.exclude}
+  vm.func private @test_alloc() {
+    %c128 = vm.const.i32 128
+    %buf = vm.buffer.alloc %c128 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    %buf_length = vm.buffer.length %buf_dno : !vm.buffer -> i32
+    vm.check.eq %c128, %buf_length, "buffer length == 128" : i32
+
+    vm.return
+  }
+
+  // Tests that zero-length buffers can be allocated.
+  vm.export @test_alloc_empty attributes {emitc.exclude}
+  vm.func private @test_alloc_empty() {
+    %c0 = vm.const.i32 0
+    %buf = vm.buffer.alloc %c0 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    %buf_length = vm.buffer.length %buf_dno : !vm.buffer -> i32
+    vm.check.eq %c0, %buf_length, "buffer length == 0" : i32
+
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Cloning
+  //===--------------------------------------------------------------------===//
+
+  // Tests cloning a subrange of a buffer.
+  vm.export @test_clone attributes {emitc.exclude}
+  vm.func private @test_clone() {
+    // Fetch source .rodata blob.
+    %rodata = vm.const.ref.rodata @rodata_3xi32 : !vm.buffer
+
+    // Clone the last two 32-bit elements.
+    %c4 = vm.const.i32 4
+    %c8 = vm.const.i32 8
+    %buf = vm.buffer.clone %rodata, %c4, %c8 : !vm.buffer -> !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    // Compare the cloned range to the original.
+    %c0 = vm.const.i32 0
+    %cmp = vm.buffer.compare %rodata, %c4, %buf_dno, %c0, %c8 : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "buffer subspans are equal" : i32
+
+    vm.return
+  }
+
+  // Tests cloning a zero-length buffer.
+  vm.export @test_clone_empty attributes {emitc.exclude}
+  vm.func private @test_clone_empty() {
+    // Allocate source zero-length buffer.
+    %c0 = vm.const.i32 0
+    %buf0 = vm.buffer.alloc %c0 : !vm.buffer
+    %buf0_dno = util.do_not_optimize(%buf0) : !vm.buffer
+    vm.check.nz %buf0_dno, "!null" : !vm.buffer
+    %buf0_length = vm.buffer.length %buf0_dno : !vm.buffer -> i32
+    vm.check.eq %c0, %buf0_length, "buffer length == 0" : i32
+
+    // Clone it all (or, clone nothing?).
+    %buf1 = vm.buffer.clone %buf0_dno, %c0, %c0 : !vm.buffer -> !vm.buffer
+    %buf1_dno = util.do_not_optimize(%buf1) : !vm.buffer
+    vm.check.nz %buf1_dno, "!null" : !vm.buffer
+    %buf1_length = vm.buffer.length %buf1_dno : !vm.buffer -> i32
+    vm.check.eq %c0, %buf1_length, "buffer length == 0" : i32
+
+    vm.return
+  }
+
+  // Tests an out-of-bounds cloning subrange (expected to fail; fail_ prefix).
+  vm.export @fail_clone_out_of_range attributes {emitc.exclude}
+  vm.func private @fail_clone_out_of_range() {
+    // Fetch source .rodata blob.
+    %rodata = vm.const.ref.rodata @rodata_3xi32 : !vm.buffer
+    %rodata_dno = util.do_not_optimize(%rodata) : !vm.buffer
+    vm.check.nz %rodata_dno, "!null" : !vm.buffer
+
+    // Try to clone off the end of the buffer (offset 8 + length 8 > 12 bytes).
+    %c8 = vm.const.i32 8
+    %buf = vm.buffer.clone %rodata, %c8, %c8 : !vm.buffer -> !vm.buffer
+
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Copy
+  //===--------------------------------------------------------------------===//
+
+  // Tests copying an entire buffer from one buffer to another.
+  vm.export @test_copy_full attributes {emitc.exclude}
+  vm.func private @test_copy_full() {
+    // Fetch source .rodata blob.
+    %rodata = vm.const.ref.rodata @rodata_3xi32 : !vm.buffer
+    %rodata_length = vm.buffer.length %rodata : !vm.buffer -> i32
+    vm.check.nz %rodata, "!null" : !vm.buffer
+
+    // Allocate target buffer.
+    %buf = vm.buffer.alloc %rodata_length : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    // Copy the entire contents.
+    %c0 = vm.const.i32 0
+    vm.buffer.copy %rodata, %c0, %buf_dno, %c0, %rodata_length : !vm.buffer -> !vm.buffer
+
+    // Compare to source.
+    %cmp = vm.buffer.compare %rodata, %c0, %buf_dno, %c0, %rodata_length : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "source and target match" : i32
+
+    vm.return
+  }
+
+  vm.rodata private @test_copy_partial_ref dense<[2]> : tensor<1xi32>
+
+  // Tests copying a range of bytes from one buffer to another.
+  vm.export @test_copy_partial attributes {emitc.exclude}
+  vm.func private @test_copy_partial() {
+    // Allocate target buffer.
+    %c4 = vm.const.i32 4
+    %buf = vm.buffer.alloc %c4 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    // Copy the middle 4-byte element.
+    %rodata = vm.const.ref.rodata @rodata_3xi32 : !vm.buffer
+    %c0 = vm.const.i32 0
+    vm.buffer.copy %rodata, %c4, %buf, %c0, %c4 : !vm.buffer -> !vm.buffer
+
+    // Compare to reference.
+    %ref = vm.const.ref.rodata @test_copy_partial_ref : !vm.buffer
+    %cmp = vm.buffer.compare %ref, %c0, %buf, %c0, %c4 : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "source and target match" : i32
+
+    vm.return
+  }
+
+  // Tests an out-of-bounds copy source (expected to fail; fail_ prefix).
+  vm.export @fail_copy_out_of_range_source_offset attributes {emitc.exclude}
+  vm.func private @fail_copy_out_of_range_source_offset() {
+    %rodata = vm.const.ref.rodata @rodata_3xi32 : !vm.buffer
+    %c128 = vm.const.i32 128
+    %buf = vm.buffer.alloc %c128 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    // Try to copy off the end of the source buffer.
+    %c0 = vm.const.i32 0
+    vm.buffer.copy %rodata, %c0, %buf_dno, %c0, %c128 : !vm.buffer -> !vm.buffer
+
+    vm.return
+  }
+
+  // Tests an out-of-bounds copy source (expected to fail; fail_ prefix).
+  vm.export @fail_copy_out_of_range_source_length attributes {emitc.exclude}
+  vm.func private @fail_copy_out_of_range_source_length() {
+    %rodata = vm.const.ref.rodata @rodata_3xi32 : !vm.buffer
+    %c128 = vm.const.i32 128
+    %buf = vm.buffer.alloc %c128 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    // Try to copy off the end of the source buffer.
+    %c0 = vm.const.i32 0
+    %c8 = vm.const.i32 8
+    vm.buffer.copy %rodata, %c8, %buf_dno, %c0, %c8 : !vm.buffer -> !vm.buffer
+
+    vm.return
+  }
+
+  // Tests an out-of-bounds copy target (expected to fail; fail_ prefix).
+  vm.export @fail_copy_out_of_range_target_offset attributes {emitc.exclude}
+  vm.func private @fail_copy_out_of_range_target_offset() {
+    %rodata = vm.const.ref.rodata @rodata_3xi32 : !vm.buffer
+    %rodata_length = vm.buffer.length %rodata : !vm.buffer -> i32
+    %c8 = vm.const.i32 8
+    %buf = vm.buffer.alloc %c8 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    // Try to copy off the end of the target buffer.
+    %c0 = vm.const.i32 0
+    vm.buffer.copy %rodata, %c0, %buf_dno, %c0, %rodata_length : !vm.buffer -> !vm.buffer
+
+    vm.return
+  }
+
+  // Tests an out-of-bounds copy target (expected to fail; fail_ prefix).
+  vm.export @fail_copy_out_of_range_target_length attributes {emitc.exclude}
+  vm.func private @fail_copy_out_of_range_target_length() {
+    %rodata = vm.const.ref.rodata @rodata_3xi32 : !vm.buffer
+    %c8 = vm.const.i32 8
+    %buf = vm.buffer.alloc %c8 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    // Try to copy off the end of the target buffer.
+    %c0 = vm.const.i32 0
+    vm.buffer.copy %rodata, %c0, %buf_dno, %c8, %c8 : !vm.buffer -> !vm.buffer
+
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Fill
+  //===--------------------------------------------------------------------===//
+
+  // Reference: 51966 == 0xCAFE in the two middle elements.
+  vm.rodata private @test_fill_i16_ref dense<[0, 51966, 51966, 0]> : tensor<4xi16>
+
+  // Tests filling a buffer with 16-bit values.
+  vm.export @test_fill_i16 attributes {emitc.exclude}
+  vm.func private @test_fill_i16() {
+    // Allocate zeroed buffer.
+    %c8 = vm.const.i32 8
+    %buf = vm.buffer.alloc %c8 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+    vm.check.nz %buf_dno, "!null" : !vm.buffer
+
+    // Fill the middle two elements.
+    %c2 = vm.const.i32 2
+    %c4 = vm.const.i32 4
+    %cafe = vm.const.i32 0xCAFE
+    vm.buffer.fill.i16 %buf_dno, %c2, %c4, %cafe : i32 -> !vm.buffer
+
+    // Compare to reference.
+    %c0 = vm.const.i32 0
+    %rodata_ref = vm.const.ref.rodata @test_fill_i16_ref : !vm.buffer
+    %cmp = vm.buffer.compare %rodata_ref, %c0, %buf_dno, %c0, %c8 : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "buffer should match reference" : i32
+
+    vm.return
+  }
+
+  vm.rodata private @test_fill_i16_misaligned_offset_ref dense<[0xCAFE, 0xCAFE, 0, 0]> : tensor<4xi16>
+
+  // Tests that misaligned fill offsets will succeed but round down.
+  vm.export @test_fill_i16_misaligned_offset attributes {emitc.exclude}
+  vm.func private @test_fill_i16_misaligned_offset() {
+    // Allocate zeroed buffer.
+    %c8 = vm.const.i32 8
+    %buf = vm.buffer.alloc %c8 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+
+    // Try filling from offset 1, which is not i16-aligned.
+    %c1 = vm.const.i32 1
+    %c4 = vm.const.i32 4
+    %cafe = vm.const.i32 0xCAFE
+    vm.buffer.fill.i16 %buf_dno, %c1, %c4, %cafe : i32 -> !vm.buffer
+
+    // Compare to reference - should have written at offset 0.
+    %c0 = vm.const.i32 0
+    %rodata_ref = vm.const.ref.rodata @test_fill_i16_misaligned_offset_ref : !vm.buffer
+    %cmp = vm.buffer.compare %rodata_ref, %c0, %buf_dno, %c0, %c8 : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "buffer should match reference" : i32
+
+
+    vm.return
+  }
+
+  vm.rodata private @test_fill_i16_misaligned_length_ref dense<[0, 0, 0, 0]> : tensor<4xi16>
+
+  // Tests that misaligned fill lengths will succeed but round down.
+  vm.export @test_fill_i16_misaligned_length attributes {emitc.exclude}
+  vm.func private @test_fill_i16_misaligned_length() {
+    // Allocate zeroed buffer.
+    %c8 = vm.const.i32 8
+    %buf = vm.buffer.alloc %c8 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+
+    // Try filling for length 1, which is not i16-aligned.
+    %c0 = vm.const.i32 0
+    %c1 = vm.const.i32 1
+    %cafe = vm.const.i32 0xCAFE
+    vm.buffer.fill.i16 %buf_dno, %c0, %c1, %cafe : i32 -> !vm.buffer
+
+    // Compare to reference - should have written 0 bytes.
+    %rodata_ref = vm.const.ref.rodata @test_fill_i16_misaligned_length_ref : !vm.buffer
+    %cmp = vm.buffer.compare %rodata_ref, %c0, %buf_dno, %c0, %c8 : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "buffer should match reference" : i32
+
+    vm.return
+  }
+
+  // Tests that trying to fill .rodata will fail (expected to fail; fail_ prefix).
+  vm.export @fail_fill_i16_rodata attributes {emitc.exclude}
+  vm.func private @fail_fill_i16_rodata() {
+    %rodata = vm.const.ref.rodata @rodata_3xi32 : !vm.buffer
+
+    // Permission denied:
+    %c0 = vm.const.i32 0
+    %c2 = vm.const.i32 2
+    %cafe = vm.const.i32 0xCAFE
+    vm.buffer.fill.i16 %rodata, %c0, %c2, %cafe : i32 -> !vm.buffer
+
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Load
+  //===--------------------------------------------------------------------===//
+
+  vm.rodata private @test_load_i8_data dense<[0x00, 0x01, 0x7F, 0x80, 0xFF]> : tensor<5xui8>
+
+  // Unsigned i8 loads: 0x80/0xFF must zero-extend to 0x80/0xFF as i32.
+  vm.export @test_load_i8u attributes {emitc.exclude}
+  vm.func private @test_load_i8u() {
+    %c0 = vm.const.i32 0
+    %c1 = vm.const.i32 1
+    %c2 = vm.const.i32 2
+    %c3 = vm.const.i32 3
+    %c4 = vm.const.i32 4
+    %rodata = vm.const.ref.rodata @test_load_i8_data : !vm.buffer
+    %v0 = vm.buffer.load.i8.u %rodata[%c0] : !vm.buffer -> i32
+    %e0 = vm.const.i32 0
+    vm.check.eq %v0, %e0, "0" : i32
+    %v1 = vm.buffer.load.i8.u %rodata[%c1] : !vm.buffer -> i32
+    %e1 = vm.const.i32 1
+    vm.check.eq %v1, %e1, "1" : i32
+    %v2 = vm.buffer.load.i8.u %rodata[%c2] : !vm.buffer -> i32
+    %e2 = vm.const.i32 0x7F
+    vm.check.eq %v2, %e2, "0x7F" : i32
+    %v3 = vm.buffer.load.i8.u %rodata[%c3] : !vm.buffer -> i32
+    %e3 = vm.const.i32 0x80
+    vm.check.eq %v3, %e3, "0x80" : i32
+    %v4 = vm.buffer.load.i8.u %rodata[%c4] : !vm.buffer -> i32
+    %e4 = vm.const.i32 0xFF
+    vm.check.eq %v4, %e4, "0xFF" : i32
+    vm.return
+  }
+
+  // Signed i8 loads: 0x80/0xFF must sign-extend to -128/-1 as i32.
+  vm.export @test_load_i8s attributes {emitc.exclude}
+  vm.func private @test_load_i8s() {
+    %c0 = vm.const.i32 0
+    %c1 = vm.const.i32 1
+    %c2 = vm.const.i32 2
+    %c3 = vm.const.i32 3
+    %c4 = vm.const.i32 4
+    %rodata = vm.const.ref.rodata @test_load_i8_data : !vm.buffer
+    %v0 = vm.buffer.load.i8.s %rodata[%c0] : !vm.buffer -> i32
+    %e0 = vm.const.i32 0
+    vm.check.eq %v0, %e0, "0" : i32
+    %v1 = vm.buffer.load.i8.s %rodata[%c1] : !vm.buffer -> i32
+    %e1 = vm.const.i32 1
+    vm.check.eq %v1, %e1, "1" : i32
+    %v2 = vm.buffer.load.i8.s %rodata[%c2] : !vm.buffer -> i32
+    %e2 = vm.const.i32 0x7F
+    vm.check.eq %v2, %e2, "0x7F" : i32
+    %v3 = vm.buffer.load.i8.s %rodata[%c3] : !vm.buffer -> i32
+    %e3 = vm.const.i32 -128
+    vm.check.eq %v3, %e3, "-128" : i32
+    %v4 = vm.buffer.load.i8.s %rodata[%c4] : !vm.buffer -> i32
+    %e4 = vm.const.i32 -1
+    vm.check.eq %v4, %e4, "-1" : i32
+    vm.return
+  }
+
+  vm.rodata private @test_load_i16_data dense<[0x0000, 0x0001, 0x7FFF, 0x8000, 0xFFFF]> : tensor<5xui16>
+
+  // Unsigned i16 loads: 0x8000/0xFFFF must zero-extend as i32.
+  vm.export @test_load_i16u attributes {emitc.exclude}
+  vm.func private @test_load_i16u() {
+    %c0 = vm.const.i32 0
+    %c2 = vm.const.i32 2
+    %c4 = vm.const.i32 4
+    %c6 = vm.const.i32 6
+    %c8 = vm.const.i32 8
+    %rodata = vm.const.ref.rodata @test_load_i16_data : !vm.buffer
+    %v0 = vm.buffer.load.i16.u %rodata[%c0] : !vm.buffer -> i32
+    %e0 = vm.const.i32 0
+    vm.check.eq %v0, %e0, "0" : i32
+    %v1 = vm.buffer.load.i16.u %rodata[%c2] : !vm.buffer -> i32
+    %e1 = vm.const.i32 1
+    vm.check.eq %v1, %e1, "1" : i32
+    %v2 = vm.buffer.load.i16.u %rodata[%c4] : !vm.buffer -> i32
+    %e2 = vm.const.i32 0x7FFF
+    vm.check.eq %v2, %e2, "0x7FFF" : i32
+    %v3 = vm.buffer.load.i16.u %rodata[%c6] : !vm.buffer -> i32
+    %e3 = vm.const.i32 0x8000
+    vm.check.eq %v3, %e3, "0x8000" : i32
+    %v4 = vm.buffer.load.i16.u %rodata[%c8] : !vm.buffer -> i32
+    %e4 = vm.const.i32 0xFFFF
+    vm.check.eq %v4, %e4, "0xFFFF" : i32
+    vm.return
+  }
+
+  // Signed i16 loads: 0x8000/0xFFFF must sign-extend to -32768/-1 as i32.
+  vm.export @test_load_i16s attributes {emitc.exclude}
+  vm.func private @test_load_i16s() {
+    %c0 = vm.const.i32 0
+    %c2 = vm.const.i32 2
+    %c4 = vm.const.i32 4
+    %c6 = vm.const.i32 6
+    %c8 = vm.const.i32 8
+    %rodata = vm.const.ref.rodata @test_load_i16_data : !vm.buffer
+    %v0 = vm.buffer.load.i16.s %rodata[%c0] : !vm.buffer -> i32
+    %e0 = vm.const.i32 0
+    vm.check.eq %v0, %e0, "0" : i32
+    %v1 = vm.buffer.load.i16.s %rodata[%c2] : !vm.buffer -> i32
+    %e1 = vm.const.i32 1
+    vm.check.eq %v1, %e1, "1" : i32
+    %v2 = vm.buffer.load.i16.s %rodata[%c4] : !vm.buffer -> i32
+    %e2 = vm.const.i32 0x7FFF
+    vm.check.eq %v2, %e2, "0x7FFF" : i32
+    %v3 = vm.buffer.load.i16.s %rodata[%c6] : !vm.buffer -> i32
+    %e3 = vm.const.i32 -32768
+    vm.check.eq %v3, %e3, "-32768" : i32
+    %v4 = vm.buffer.load.i16.s %rodata[%c8] : !vm.buffer -> i32
+    %e4 = vm.const.i32 -1
+    vm.check.eq %v4, %e4, "-1" : i32
+    vm.return
+  }
+
+  vm.rodata private @test_load_i32_data dense<[0x00000000, 0x00000001, 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF]> : tensor<5xui32>
+
+  // Aligned 32-bit loads at 4-byte offsets.
+  vm.export @test_load_i32 attributes {emitc.exclude}
+  vm.func private @test_load_i32() {
+    %c0 = vm.const.i32 0
+    %c4 = vm.const.i32 4
+    %c8 = vm.const.i32 8
+    %c12 = vm.const.i32 12
+    %c16 = vm.const.i32 16
+    %rodata = vm.const.ref.rodata @test_load_i32_data : !vm.buffer
+    %v0 = vm.buffer.load.i32 %rodata[%c0] : !vm.buffer -> i32
+    %e0 = vm.const.i32 0
+    vm.check.eq %v0, %e0, "0" : i32
+    %v1 = vm.buffer.load.i32 %rodata[%c4] : !vm.buffer -> i32
+    %e1 = vm.const.i32 1
+    vm.check.eq %v1, %e1, "1" : i32
+    %v2 = vm.buffer.load.i32 %rodata[%c8] : !vm.buffer -> i32
+    %e2 = vm.const.i32 0x7FFFFFFF
+    vm.check.eq %v2, %e2, "0x7FFFFFFF" : i32
+    %v3 = vm.buffer.load.i32 %rodata[%c12] : !vm.buffer -> i32
+    %e3 = vm.const.i32 0x80000000
+    vm.check.eq %v3, %e3, "0x80000000" : i32
+    %v4 = vm.buffer.load.i32 %rodata[%c16] : !vm.buffer -> i32
+    %e4 = vm.const.i32 0xFFFFFFFF
+    vm.check.eq %v4, %e4, "0xFFFFFFFF" : i32
+    vm.return
+  }
+
+  vm.rodata private @test_load_i32_unaligned_data dense<[0x00112233, 0x44556677, 0x8899AABB, 0xCCDDEEFF]> : tensor<4xui32>
+
+  // Unaligned loads are not supported and offsets will be rounded down.
+  vm.export @test_load_i32_unaligned attributes {emitc.exclude}
+  vm.func private @test_load_i32_unaligned() {
+    %rodata = vm.const.ref.rodata @test_load_i32_unaligned_data : !vm.buffer
+
+    // Byte offset 5 rounded to byte offset 4 (element 1).
+    %c5 = vm.const.i32 5
+    %v1 = vm.buffer.load.i32 %rodata[%c5] : !vm.buffer -> i32
+    %e1 = vm.const.i32 0x44556677
+    vm.check.eq %v1, %e1, "0x44556677" : i32
+
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Store
+  //===--------------------------------------------------------------------===//
+
+  vm.rodata private @test_store_i8_ref dense<[0x00, 0x01, 0x7F, 0x80, 0xFF]> : tensor<5xui8>
+
+  // Stores 8-bit values then bytewise-compares against the reference blob.
+  vm.export @test_store_i8 attributes {emitc.exclude}
+  vm.func private @test_store_i8() {
+    %ref = vm.const.ref.rodata @test_store_i8_ref : !vm.buffer
+    %ref_dno = util.do_not_optimize(%ref) : !vm.buffer
+    %ref_length = vm.buffer.length %ref_dno : !vm.buffer -> i32
+
+    %buf = vm.buffer.alloc %ref_length : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+
+    %c0 = vm.const.i32 0
+    %e0 = vm.const.i32 0
+    vm.buffer.store.i8 %e0, %buf_dno[%c0] : i32 -> !vm.buffer
+    %c1 = vm.const.i32 1
+    %e1 = vm.const.i32 1
+    vm.buffer.store.i8 %e1, %buf_dno[%c1] : i32 -> !vm.buffer
+    %c2 = vm.const.i32 2
+    %e2 = vm.const.i32 0x7F
+    vm.buffer.store.i8 %e2, %buf_dno[%c2] : i32 -> !vm.buffer
+    %c3 = vm.const.i32 3
+    %e3 = vm.const.i32 0x80
+    vm.buffer.store.i8 %e3, %buf_dno[%c3] : i32 -> !vm.buffer
+    %c4 = vm.const.i32 4
+    %e4 = vm.const.i32 0xFF
+    vm.buffer.store.i8 %e4, %buf_dno[%c4] : i32 -> !vm.buffer
+
+    %cmp = vm.buffer.compare %ref_dno, %c0, %buf_dno, %c0, %ref_length : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "source and target match" : i32
+
+    vm.return
+  }
+
+  vm.rodata private @test_store_i16_ref dense<[0x0000, 0x0001, 0x7FFF, 0x8000, 0xFFFF]> : tensor<5xui16>
+
+  // Stores 16-bit values then bytewise-compares against the reference blob.
+  vm.export @test_store_i16 attributes {emitc.exclude}
+  vm.func private @test_store_i16() {
+    %ref = vm.const.ref.rodata @test_store_i16_ref : !vm.buffer
+    %ref_dno = util.do_not_optimize(%ref) : !vm.buffer
+    %ref_length = vm.buffer.length %ref_dno : !vm.buffer -> i32
+
+    %buf = vm.buffer.alloc %ref_length : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+
+    %c0 = vm.const.i32 0
+    %e0 = vm.const.i32 0
+    vm.buffer.store.i16 %e0, %buf_dno[%c0] : i32 -> !vm.buffer
+    %c2 = vm.const.i32 2
+    %e1 = vm.const.i32 1
+    vm.buffer.store.i16 %e1, %buf_dno[%c2] : i32 -> !vm.buffer
+    %c4 = vm.const.i32 4
+    %e2 = vm.const.i32 0x7FFF
+    vm.buffer.store.i16 %e2, %buf_dno[%c4] : i32 -> !vm.buffer
+    %c6 = vm.const.i32 6
+    %e3 = vm.const.i32 0x8000
+    vm.buffer.store.i16 %e3, %buf_dno[%c6] : i32 -> !vm.buffer
+    %c8 = vm.const.i32 8
+    %e4 = vm.const.i32 0xFFFF
+    vm.buffer.store.i16 %e4, %buf_dno[%c8] : i32 -> !vm.buffer
+
+    %cmp = vm.buffer.compare %ref_dno, %c0, %buf_dno, %c0, %ref_length : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "source and target match" : i32
+
+    vm.return
+  }
+
+  vm.rodata private @test_store_i32_ref dense<[0x00000000, 0x00000001, 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF]> : tensor<5xui32>
+
+  // Stores 32-bit values then bytewise-compares against the reference blob.
+  vm.export @test_store_i32 attributes {emitc.exclude}
+  vm.func private @test_store_i32() {
+    %ref = vm.const.ref.rodata @test_store_i32_ref : !vm.buffer
+    %ref_dno = util.do_not_optimize(%ref) : !vm.buffer
+    %ref_length = vm.buffer.length %ref_dno : !vm.buffer -> i32
+
+    %buf = vm.buffer.alloc %ref_length : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+
+    %c0 = vm.const.i32 0
+    %e0 = vm.const.i32 0
+    vm.buffer.store.i32 %e0, %buf_dno[%c0] : i32 -> !vm.buffer
+    %c4 = vm.const.i32 4
+    %e1 = vm.const.i32 1
+    vm.buffer.store.i32 %e1, %buf_dno[%c4] : i32 -> !vm.buffer
+    %c8 = vm.const.i32 8
+    %e2 = vm.const.i32 0x7FFFFFFF
+    vm.buffer.store.i32 %e2, %buf_dno[%c8] : i32 -> !vm.buffer
+    %c12 = vm.const.i32 12
+    %e3 = vm.const.i32 0x80000000
+    vm.buffer.store.i32 %e3, %buf_dno[%c12] : i32 -> !vm.buffer
+    %c16 = vm.const.i32 16
+    %e4 = vm.const.i32 0xFFFFFFFF
+    vm.buffer.store.i32 %e4, %buf_dno[%c16] : i32 -> !vm.buffer
+
+    %cmp = vm.buffer.compare %ref_dno, %c0, %buf_dno, %c0, %ref_length : !vm.buffer, !vm.buffer
+    vm.check.nz %cmp, "source and target match" : i32
+
+    vm.return
+  }
+
+  // Unaligned stores are not supported and offsets will be rounded down.
+  vm.export @test_store_i32_unaligned attributes {emitc.exclude}
+  vm.func private @test_store_i32_unaligned() {
+    %c12 = vm.const.i32 12
+    %buf = vm.buffer.alloc %c12 : !vm.buffer
+    %buf_dno = util.do_not_optimize(%buf) : !vm.buffer
+
+    // Byte offset 5 rounded to byte offset 4 (element 1).
+    %c5 = vm.const.i32 5
+    %e1 = vm.const.i32 0x44556677
+    vm.buffer.store.i32 %e1, %buf_dno[%c5] : i32 -> !vm.buffer
+
+    // Read back at offset 4 (where the data should be).
+    %c4 = vm.const.i32 4
+    %a1 = vm.buffer.load.i32 %buf_dno[%c4] : !vm.buffer -> i32
+    vm.check.eq %a1, %e1, "0x44556677" : i32
+
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/call_ops.mlir b/runtime/src/iree/vm/test/call_ops.mlir
new file mode 100644
index 0000000..6f5072d
--- /dev/null
+++ b/runtime/src/iree/vm/test/call_ops.mlir
@@ -0,0 +1,139 @@
+vm.module @call_ops {
+
+  vm.rodata private @buffer dense<[1, 2, 3]> : tensor<3xi8>
+
+  // Calls a callee that executes vm.fail; expected to fail (fail_ prefix).
+  vm.export @fail_call_v_v
+  vm.func @fail_call_v_v() {
+    vm.call @_v_v_fail() : () -> ()
+    vm.return
+  }
+
+  // Call with a single i32 argument; callee checks the value it receives.
+  vm.export @test_call_i_v
+  vm.func @test_call_i_v() {
+    %c1 = vm.const.i32 1
+    vm.call @_i_v(%c1) : (i32) -> ()
+    vm.return
+  }
+
+  // Call with a single (null) ref argument; callee checks it is NULL.
+  vm.export @test_call_r_v
+  vm.func @test_call_r_v() {
+    %ref = vm.const.ref.zero : !vm.ref<?>
+    vm.call @_r_v(%ref) : (!vm.ref<?>) -> ()
+    vm.return
+  }
+
+  // Check that reused ref argument slots are handled properly
+  vm.export @test_call_r_v_reuse_reg
+  vm.func @test_call_r_v_reuse_reg() {
+    %ref = vm.const.ref.zero : !vm.buffer
+    %unused = vm.const.ref.zero : !vm.buffer
+    vm.call @_r_v_reuse_reg(%ref, %unused) : (!vm.buffer, !vm.buffer) -> ()
+    vm.return
+  }
+
+  // Check passing refs as arguments doesn't alter values on the call site
+  // TODO(simon-camp): In the C target we run the DropCompilerHintsPass after
+  // ordinal allocation and vm to EmitC conversion to prevent constant folding
+  // of the tests during the latter. This means we would need to add a pattern
+  // that inserts calls to `iree_vm_ref_retain` for operand/result pairs of the
+  // do_not_optimize op.
+  // TODO(simon-camp): Enable the test for emitc.
+  vm.export @test_call_r_v_preserve_ref attributes {emitc.exclude}
+  vm.func private @test_call_r_v_preserve_ref() {
+    %ref = vm.const.ref.zero : !vm.buffer
+    %unused = vm.const.ref.rodata @buffer : !vm.buffer
+    %unusued_dno_1 = util.do_not_optimize(%unused) : !vm.buffer
+    vm.check.nz %unused : !vm.buffer
+    vm.call @_r_v_preserve_reg(%ref, %unused) : (!vm.buffer, !vm.buffer) -> ()
+    %unusued_dno_2 = util.do_not_optimize(%unused) : !vm.buffer
+    vm.check.nz %unusued_dno_2 : !vm.buffer
+    vm.return
+  }
+
+  // Call returning a single i32 result.
+  vm.export @test_call_v_i
+  vm.func @test_call_v_i() {
+    %c1 = vm.const.i32 1
+    %0 = vm.call @_v_i() : () -> (i32)
+    vm.check.eq %0, %c1, "_v_i()=1" : i32
+    vm.return
+  }
+
+  // Call returning a single (null) ref result.
+  vm.export @test_call_v_r
+  vm.func @test_call_v_r() {
+    %ref = vm.const.ref.zero : !vm.ref<?>
+    %ref_dno = util.do_not_optimize(%ref) : !vm.ref<?>
+    %res = vm.call @_v_r() : () -> (!vm.ref<?>)
+    vm.check.eq %ref_dno, %res, "_v_r()=NULL" : !vm.ref<?>
+    vm.return
+  }
+
+  // Call returning multiple i32 results.
+  vm.export @test_call_v_ii
+  vm.func @test_call_v_ii() {
+    %c1 = vm.const.i32 1
+    %c2 = vm.const.i32 2
+    %0:2 = vm.call @_v_ii() : () -> (i32, i32)
+    vm.check.eq %0#0, %c1, "_v_ii()#0=1" : i32
+    vm.check.eq %0#1, %c2, "_v_ii()#1=2" : i32
+    vm.return
+  }
+
+  // Call with no arguments and no results.
+  vm.export @test_call_v_v
+  vm.func @test_call_v_v() {
+    vm.call @_v_v() : () -> ()
+    vm.return
+  }
+
+  // Callees below are marked {noinline} so the calls above actually execute
+  // the calling convention paths being tested.
+
+  vm.func @_i_v(%arg : i32) attributes {noinline} {
+    %c1 = vm.const.i32 1
+    vm.check.eq %arg, %c1, "Expected %arg to be 1" : i32
+    vm.return
+  }
+
+  vm.func @_r_v(%arg : !vm.ref<?>) attributes {noinline} {
+    %ref = vm.const.ref.zero : !vm.ref<?>
+    %ref_dno = util.do_not_optimize(%ref) : !vm.ref<?>
+    vm.check.eq %arg, %ref_dno, "Expected %arg to be NULL" : !vm.ref<?>
+    vm.return
+  }
+
+  vm.func @_r_v_reuse_reg(%arg : !vm.ref<?>, %unused : !vm.ref<?>) attributes {noinline} {
+    %ref = vm.const.ref.zero : !vm.ref<?>
+    %ref_dno = util.do_not_optimize(%ref) : !vm.ref<?>
+    vm.check.eq %arg, %ref_dno, "Expected %arg to be NULL" : !vm.ref<?>
+    vm.return
+  }
+
+  vm.func @_r_v_preserve_reg(%arg1 : !vm.ref<?>, %arg2 : !vm.ref<?>) attributes {noinline} {
+    %ref = vm.const.ref.zero : !vm.ref<?>
+    %ref_dno = util.do_not_optimize(%ref) : !vm.ref<?>
+    vm.check.eq %arg1, %ref_dno, "Expected %arg1 to be NULL" : !vm.ref<?>
+    vm.check.nz %arg2, "Expected %arg2 to be not NULL" : !vm.ref<?>
+    vm.return
+  }
+
+  vm.func @_v_i() -> i32 attributes {noinline} {
+    %c1 = vm.const.i32 1
+    vm.return %c1 : i32
+  }
+
+  vm.func @_v_r() -> !vm.ref<?> attributes {noinline} {
+    %ref = vm.const.ref.zero : !vm.ref<?>
+    vm.return %ref : !vm.ref<?>
+  }
+
+  vm.func @_v_ii() -> (i32, i32) attributes {noinline} {
+    %c1 = vm.const.i32 1
+    %c2 = vm.const.i32 2
+    vm.return %c1, %c2 : i32, i32
+  }
+
+  vm.func @_v_v() attributes {noinline} {
+    vm.return
+  }
+
+  // Always fails with status code 2; used by @fail_call_v_v.
+  vm.func @_v_v_fail() attributes {noinline} {
+    %c2 = vm.const.i32 2
+    vm.fail %c2
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/comparison_ops.mlir b/runtime/src/iree/vm/test/comparison_ops.mlir
new file mode 100644
index 0000000..56067b8
--- /dev/null
+++ b/runtime/src/iree/vm/test/comparison_ops.mlir
@@ -0,0 +1,172 @@
+vm.module @comparison_ops {
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.lt.i32.s
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_cmp_lt_s_0
+  vm.func @test_cmp_lt_s_0() {
+    %lhs = vm.const.i32 2
+    %lhs_dno = util.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 -2
+    %rhs_dno = util.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.s %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 0
+    vm.check.eq %actual, %expected, "2 < -2" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_s_1
+  vm.func @test_cmp_lt_s_1() {
+    %lhs = vm.const.i32 -2
+    %lhs_dno = util.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 2
+    %rhs_dno = util.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.s %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 1
+    vm.check.eq %actual, %expected, "-2 < 2" : i32
+    vm.return
+  }
+
+  // Expect UINT_MAX to be interpreted as -1 when doing a signed compare.
+  vm.export @test_cmp_lt_s_2
+  vm.func @test_cmp_lt_s_2() {
+    %lhs = vm.const.i32 4294967295
+    %lhs_dno = util.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 2
+    %rhs_dno = util.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.s %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 1
+    vm.check.eq %actual, %expected, "4294967295 (UINT_MAX) < 2" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.lt.i32.u
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_cmp_lt_u_0
+  vm.func @test_cmp_lt_u_0() {
+    %lhs = vm.const.i32 2
+    %lhs_dno = util.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 -2
+    %rhs_dno = util.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.u %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 1
+    vm.check.eq %actual, %expected, "2 < -2 (as unsigned)" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_u_1
+  vm.func @test_cmp_lt_u_1() {
+    %lhs = vm.const.i32 -2
+    %lhs_dno = util.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 2
+    %rhs_dno = util.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.u %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 0
+    vm.check.eq %actual, %expected, "-2 < 2 (as unsigned)" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_u_2
+  vm.func @test_cmp_lt_u_2() {
+    %lhs = vm.const.i32 4294967295
+    %lhs_dno = util.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 2
+    %rhs_dno = util.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.u %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 0
+    vm.check.eq %actual, %expected, "4294967295 (UINT_MAX) < 2 (as unsigned)" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.*.i32.* pseudo-ops
+  //===--------------------------------------------------------------------===//
+  // NOTE: all of these are turned into some variants of vm.cmp.lt by the
+  // compiler and are here as a way to test the runtime behavior of the
+  // pseudo-op expansions.
+
+  vm.export @test_cmp_lte
+  vm.func @test_cmp_lte() {
+    %true = vm.const.i32 1
+    %false = vm.const.i32 0
+
+    %cn2 = vm.const.i32 -2
+    %cn2_dno = util.do_not_optimize(%cn2) : i32
+    %c2 = vm.const.i32 2
+    %c2_dno = util.do_not_optimize(%c2) : i32
+
+    %cmp_0 = vm.cmp.lte.i32.s %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_0, %true, "-2 <= 2" : i32
+    %cmp_1 = vm.cmp.lte.i32.s %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_1, %false, "2 <= -2" : i32
+    %cmp_2 = vm.cmp.lte.i32.s %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_2, %true, "2 <= 2" : i32
+
+    %cmp_3 = vm.cmp.lte.i32.u %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_3, %false, "-2 <= 2 (unsigned)" : i32
+    %cmp_4 = vm.cmp.lte.i32.u %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_4, %true, "2 <= -2 (unsigned)" : i32
+    %cmp_5 = vm.cmp.lte.i32.u %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_5, %true, "2 <= 2 (unsigned)" : i32
+
+    vm.return
+  }
+
+  vm.export @test_cmp_gt
+  vm.func @test_cmp_gt() {
+    %true = vm.const.i32 1
+    %false = vm.const.i32 0
+
+    %cn2 = vm.const.i32 -2
+    %cn2_dno = util.do_not_optimize(%cn2) : i32
+    %c2 = vm.const.i32 2
+    %c2_dno = util.do_not_optimize(%c2) : i32
+
+    %cmp_0 = vm.cmp.gt.i32.s %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_0, %false, "-2 > 2" : i32
+    %cmp_1 = vm.cmp.gt.i32.s %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_1, %true, "2 > -2" : i32
+    %cmp_2 = vm.cmp.gt.i32.s %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_2, %false, "2 > 2" : i32
+
+    %cmp_3 = vm.cmp.gt.i32.u %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_3, %true, "-2 > 2 (unsigned)" : i32
+    %cmp_4 = vm.cmp.gt.i32.u %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_4, %false, "2 > -2 (unsigned)" : i32
+    %cmp_5 = vm.cmp.gt.i32.u %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_5, %false, "2 > 2 (unsigned)" : i32
+
+    vm.return
+  }
+
+  vm.export @test_cmp_gte
+  vm.func @test_cmp_gte() {
+    %true = vm.const.i32 1
+    %false = vm.const.i32 0
+
+    %cn2 = vm.const.i32 -2
+    %cn2_dno = util.do_not_optimize(%cn2) : i32
+    %c2 = vm.const.i32 2
+    %c2_dno = util.do_not_optimize(%c2) : i32
+
+    %cmp_0 = vm.cmp.gte.i32.s %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_0, %false, "-2 >= 2" : i32
+    %cmp_1 = vm.cmp.gte.i32.s %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_1, %true, "2 >= -2" : i32
+    %cmp_2 = vm.cmp.gte.i32.s %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_2, %true, "2 >= 2" : i32
+
+    %cmp_3 = vm.cmp.gte.i32.u %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_3, %true, "-2 >= 2 (unsigned)" : i32
+    %cmp_4 = vm.cmp.gte.i32.u %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_4, %false, "2 >= -2 (unsigned)" : i32
+    %cmp_5 = vm.cmp.gte.i32.u %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_5, %true, "2 >= 2 (unsigned)" : i32
+
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/comparison_ops_f32.mlir b/runtime/src/iree/vm/test/comparison_ops_f32.mlir
new file mode 100644
index 0000000..fe66f4a
--- /dev/null
+++ b/runtime/src/iree/vm/test/comparison_ops_f32.mlir
@@ -0,0 +1,97 @@
+vm.module @comparison_ops_f32 {
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.lt.f32
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_cmp_lt_0_f32
+  vm.func @test_cmp_lt_0_f32() {
+    %lhs = vm.const.f32 4.0
+    %lhs_dno = util.do_not_optimize(%lhs) : f32
+    %rhs = vm.const.f32 -4.0
+    %rhs_dno = util.do_not_optimize(%rhs) : f32
+    %actual = vm.cmp.lt.f32.o %lhs_dno, %rhs_dno : f32
+    %expected = vm.const.i32 0
+    vm.check.eq %actual, %expected, "4.0 < -4.0" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_1_f32
+  vm.func @test_cmp_lt_1_f32() {
+    %lhs = vm.const.f32 -4.0
+    %lhs_dno = util.do_not_optimize(%lhs) : f32
+    %rhs = vm.const.f32 4.0
+    %rhs_dno = util.do_not_optimize(%rhs) : f32
+    %actual = vm.cmp.lt.f32.o %lhs_dno, %rhs_dno : f32
+    %expected = vm.const.i32 1
+    vm.check.eq %actual, %expected, "-4.0 < 4.0" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.*.f32 pseudo-ops
+  //===--------------------------------------------------------------------===//
+  // NOTE: all of these are turned into some variants of vm.cmp.lt by the
+  // compiler and are here as a way to test the runtime behavior of the
+  // pseudo-op expansions.
+
+  vm.export @test_cmp_lte_f32
+  vm.func @test_cmp_lte_f32() {
+    %true = vm.const.i32 1
+    %false = vm.const.i32 0
+
+    %cn2 = vm.const.f32 -2.0
+    %cn2_dno = util.do_not_optimize(%cn2) : f32
+    %c2 = vm.const.f32 2.0
+    %c2_dno = util.do_not_optimize(%c2) : f32
+
+    %cmp_0 = vm.cmp.lte.f32.o %cn2_dno, %c2_dno : f32
+    vm.check.eq %cmp_0, %true, "-2 <= 2" : i32
+    %cmp_1 = vm.cmp.lte.f32.o %c2_dno, %cn2_dno : f32
+    vm.check.eq %cmp_1, %false, "2 <= -2" : i32
+    %cmp_2 = vm.cmp.lte.f32.o %c2_dno, %c2_dno : f32
+    vm.check.eq %cmp_2, %true, "2 <= 2" : i32
+
+    vm.return
+  }
+
+  vm.export @test_cmp_gt_f32
+  vm.func @test_cmp_gt_f32() {
+    %true = vm.const.i32 1
+    %false = vm.const.i32 0
+
+    %cn2 = vm.const.f32 -2.0
+    %cn2_dno = util.do_not_optimize(%cn2) : f32
+    %c2 = vm.const.f32 2.0
+    %c2_dno = util.do_not_optimize(%c2) : f32
+
+    %cmp_0 = vm.cmp.gt.f32.o %cn2_dno, %c2_dno : f32
+    vm.check.eq %cmp_0, %false, "-2 > 2" : i32
+    %cmp_1 = vm.cmp.gt.f32.o %c2_dno, %cn2_dno : f32
+    vm.check.eq %cmp_1, %true, "2 > -2" : i32
+    %cmp_2 = vm.cmp.gt.f32.o %c2_dno, %c2_dno : f32
+    vm.check.eq %cmp_2, %false, "2 > 2" : i32
+
+    vm.return
+  }
+
+  vm.export @test_cmp_gte_f32
+  vm.func @test_cmp_gte_f32() {
+    %true = vm.const.i32 1
+    %false = vm.const.i32 0
+
+    %cn2 = vm.const.f32 -2.0
+    %cn2_dno = util.do_not_optimize(%cn2) : f32
+    %c2 = vm.const.f32 2.0
+    %c2_dno = util.do_not_optimize(%c2) : f32
+
+    %cmp_0 = vm.cmp.gte.f32.o %cn2_dno, %c2_dno : f32
+    vm.check.eq %cmp_0, %false, "-2 >= 2" : i32
+    %cmp_1 = vm.cmp.gte.f32.o %c2_dno, %cn2_dno : f32
+    vm.check.eq %cmp_1, %true, "2 >= -2" : i32
+    %cmp_2 = vm.cmp.gte.f32.o %c2_dno, %c2_dno : f32
+    vm.check.eq %cmp_2, %true, "2 >= 2" : i32
+
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/comparison_ops_i64.mlir b/runtime/src/iree/vm/test/comparison_ops_i64.mlir
new file mode 100644
index 0000000..2e1bd76
--- /dev/null
+++ b/runtime/src/iree/vm/test/comparison_ops_i64.mlir
@@ -0,0 +1,171 @@
+vm.module @comparison_ops_i64 {
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.lt.i64.s
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_cmp_lt_s_0_i64
+  vm.func @test_cmp_lt_s_0_i64() {
+    %lhs = vm.const.i64 4294967295
+    %lhs_dno = util.do_not_optimize(%lhs) : i64
+    %rhs = vm.const.i64 -4294967295
+    %rhs_dno = util.do_not_optimize(%rhs) : i64
+    %actual = vm.cmp.lt.i64.s %lhs_dno, %rhs_dno : i64
+    %expected = vm.const.i32 0
+    vm.check.eq %actual, %expected, "4294967295 (UINT_MAX) < -4294967295 (UINT_MAX)" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_s_1_i64
+  vm.func @test_cmp_lt_s_1_i64() {
+    %lhs = vm.const.i64 -4294967295
+    %lhs_dno = util.do_not_optimize(%lhs) : i64
+    %rhs = vm.const.i64 4294967295
+    %rhs_dno = util.do_not_optimize(%rhs) : i64
+    %actual = vm.cmp.lt.i64.s %lhs_dno, %rhs_dno : i64
+    %expected = vm.const.i32 1
+    vm.check.eq %actual, %expected, "-4294967295 (UINT_MAX) < 4294967295 (UINT_MAX)" : i32
+    vm.return
+  }
+
+  // Expect ULONG_MAX to be interpreted as -1 when doing a signed compare.
+  vm.export @test_cmp_lt_s_2_i64
+  vm.func @test_cmp_lt_s_2_i64() {
+    %lhs = vm.const.i64 18446744073709551615
+    %lhs_dno = util.do_not_optimize(%lhs) : i64
+    %rhs = vm.const.i64 2
+    %rhs_dno = util.do_not_optimize(%rhs) : i64
+    %actual = vm.cmp.lt.i64.s %lhs_dno, %rhs_dno : i64
+    %expected = vm.const.i32 1
+    vm.check.eq %actual, %expected, "18446744073709551615 (ULONG_MAX) < 2" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.lt.i64.u
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_cmp_lt_u_0_i64
+  vm.func @test_cmp_lt_u_0_i64() {
+    %lhs = vm.const.i64 2
+    %lhs_dno = util.do_not_optimize(%lhs) : i64
+    %rhs = vm.const.i64 -2
+    %rhs_dno = util.do_not_optimize(%rhs) : i64
+    %actual = vm.cmp.lt.i64.u %lhs_dno, %rhs_dno : i64
+    %expected = vm.const.i32 1
+    vm.check.eq %actual, %expected, "2 < -2 (as unsigned)" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_u_1_i64
+  vm.func @test_cmp_lt_u_1_i64() {
+    %lhs = vm.const.i64 -2
+    %lhs_dno = util.do_not_optimize(%lhs) : i64
+    %rhs = vm.const.i64 2
+    %rhs_dno = util.do_not_optimize(%rhs) : i64
+    %actual = vm.cmp.lt.i64.u %lhs_dno, %rhs_dno : i64
+    %expected = vm.const.i32 0
+    vm.check.eq %actual, %expected, "-2 < 2 (as unsigned)" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_u_2_i64
+  vm.func @test_cmp_lt_u_2_i64() {
+    %lhs = vm.const.i64 18446744073709551615
+    %lhs_dno = util.do_not_optimize(%lhs) : i64
+    %rhs = vm.const.i64 2
+    %rhs_dno = util.do_not_optimize(%rhs) : i64
+    %actual = vm.cmp.lt.i64.u %lhs_dno, %rhs_dno : i64
+    %expected = vm.const.i32 0
+    vm.check.eq %actual, %expected, "18446744073709551615 (ULONG_MAX) < 2 (as unsigned)" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.*.i64.* pseudo-ops
+  //===--------------------------------------------------------------------===//
+  // NOTE: all of these are turned into some variants of vm.cmp.lt by the
+  // compiler and are here as a way to test the runtime behavior of the
+  // pseudo-op expansions.
+
+  vm.export @test_cmp_lte_i64
+  vm.func @test_cmp_lte_i64() {
+    %true = vm.const.i32 1
+    %false = vm.const.i32 0
+
+    %cn2 = vm.const.i64 -2
+    %cn2_dno = util.do_not_optimize(%cn2) : i64
+    %c2 = vm.const.i64 2
+    %c2_dno = util.do_not_optimize(%c2) : i64
+
+    %cmp_0 = vm.cmp.lte.i64.s %cn2_dno, %c2_dno : i64
+    vm.check.eq %cmp_0, %true, "-2 <= 2" : i32
+    %cmp_1 = vm.cmp.lte.i64.s %c2_dno, %cn2_dno : i64
+    vm.check.eq %cmp_1, %false, "2 <= -2" : i32
+    %cmp_2 = vm.cmp.lte.i64.s %c2_dno, %c2_dno : i64
+    vm.check.eq %cmp_2, %true, "2 <= 2" : i32
+
+    %cmp_3 = vm.cmp.lte.i64.u %cn2_dno, %c2_dno : i64
+    vm.check.eq %cmp_3, %false, "-2 <= 2 (unsigned)" : i32
+    %cmp_4 = vm.cmp.lte.i64.u %c2_dno, %cn2_dno : i64
+    vm.check.eq %cmp_4, %true, "2 <= -2 (unsigned)" : i32
+    %cmp_5 = vm.cmp.lte.i64.u %c2_dno, %c2_dno : i64
+    vm.check.eq %cmp_5, %true, "2 <= 2 (unsigned)" : i32
+
+    vm.return
+  }
+
+  vm.export @test_cmp_gt_i64
+  vm.func @test_cmp_gt_i64() {
+    %true = vm.const.i32 1
+    %false = vm.const.i32 0
+
+    %cn2 = vm.const.i64 -2
+    %cn2_dno = util.do_not_optimize(%cn2) : i64
+    %c2 = vm.const.i64 2
+    %c2_dno = util.do_not_optimize(%c2) : i64
+
+    %cmp_0 = vm.cmp.gt.i64.s %cn2_dno, %c2_dno : i64
+    vm.check.eq %cmp_0, %false, "-2 > 2" : i32
+    %cmp_1 = vm.cmp.gt.i64.s %c2_dno, %cn2_dno : i64
+    vm.check.eq %cmp_1, %true, "2 > -2" : i32
+    %cmp_2 = vm.cmp.gt.i64.s %c2_dno, %c2_dno : i64
+    vm.check.eq %cmp_2, %false, "2 > 2" : i32
+
+    %cmp_3 = vm.cmp.gt.i64.u %cn2_dno, %c2_dno : i64
+    vm.check.eq %cmp_3, %true, "-2 > 2 (unsigned)" : i32
+    %cmp_4 = vm.cmp.gt.i64.u %c2_dno, %cn2_dno : i64
+    vm.check.eq %cmp_4, %false, "2 > -2 (unsigned)" : i32
+    %cmp_5 = vm.cmp.gt.i64.u %c2_dno, %c2_dno : i64
+    vm.check.eq %cmp_5, %false, "2 > 2 (unsigned)" : i32
+
+    vm.return
+  }
+
+  vm.export @test_cmp_gte_i64
+  vm.func @test_cmp_gte_i64() {
+    %true = vm.const.i32 1
+    %false = vm.const.i32 0
+
+    %cn2 = vm.const.i64 -2
+    %cn2_dno = util.do_not_optimize(%cn2) : i64
+    %c2 = vm.const.i64 2
+    %c2_dno = util.do_not_optimize(%c2) : i64
+
+    %cmp_0 = vm.cmp.gte.i64.s %cn2_dno, %c2_dno : i64
+    vm.check.eq %cmp_0, %false, "-2 >= 2" : i32
+    %cmp_1 = vm.cmp.gte.i64.s %c2_dno, %cn2_dno : i64
+    vm.check.eq %cmp_1, %true, "2 >= -2" : i32
+    %cmp_2 = vm.cmp.gte.i64.s %c2_dno, %c2_dno : i64
+    vm.check.eq %cmp_2, %true, "2 >= 2" : i32
+
+    %cmp_3 = vm.cmp.gte.i64.u %cn2_dno, %c2_dno : i64
+    vm.check.eq %cmp_3, %true, "-2 >= 2 (unsigned)" : i32
+    %cmp_4 = vm.cmp.gte.i64.u %c2_dno, %cn2_dno : i64
+    vm.check.eq %cmp_4, %false, "2 >= -2 (unsigned)" : i32
+    %cmp_5 = vm.cmp.gte.i64.u %c2_dno, %c2_dno : i64
+    vm.check.eq %cmp_5, %true, "2 >= 2 (unsigned)" : i32
+
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/control_flow_ops.mlir b/runtime/src/iree/vm/test/control_flow_ops.mlir
new file mode 100644
index 0000000..c4015e6
--- /dev/null
+++ b/runtime/src/iree/vm/test/control_flow_ops.mlir
@@ -0,0 +1,112 @@
+vm.module @control_flow_ops {
+
+  //===--------------------------------------------------------------------===//
+  // vm.return
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_return_empty
+  vm.func @test_return_empty() {
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.fail
+  //===--------------------------------------------------------------------===//
+
+  vm.export @fail_always
+  vm.func @fail_always() {
+    %code = vm.const.i32 4
+    vm.fail %code, "error!"
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.check.*
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_check_eq_always
+  vm.func @test_check_eq_always() {
+    %c1 = vm.const.i32 1
+    %c1dno = util.do_not_optimize(%c1) : i32
+    vm.check.eq %c1, %c1dno, "error!" : i32
+    vm.return
+  }
+
+  vm.export @fail_check_eq_never
+  vm.func @fail_check_eq_never() {
+    %c1 = vm.const.i32 1
+    %c2 = vm.const.i32 2
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2dno = util.do_not_optimize(%c2) : i32
+    vm.check.eq %c1dno, %c2dno, "error!" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.import.resolved
+  //===--------------------------------------------------------------------===//
+
+  vm.import optional @reserved.optional(%arg0: i32) -> i32
+
+  // The optional import should not be found.
+  vm.export @test_optional_import_resolved
+  vm.func @test_optional_import_resolved() {
+    %c1 = vm.const.i32 1
+    %has_reserved_optional = vm.import.resolved @reserved.optional : i32
+    vm.check.ne %has_reserved_optional, %c1, "missing optional import found" : i32
+    vm.return
+  }
+
+  // The call should fail at runtime because the optional import is not resolved.
+  vm.export @fail_optional_import_call
+  vm.func @fail_optional_import_call() {
+    %c1 = vm.const.i32 1
+    %0 = vm.call @reserved.optional(%c1) : (i32) -> i32
+    %code = vm.const.i32 4
+    vm.fail %code, "unreachable!"
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.cond_br
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_cond_br
+  vm.func @test_cond_br() {
+    %c1 = vm.const.i32 1
+    %c1dno = util.do_not_optimize(%c1) : i32
+    vm.cond_br %c1dno, ^bb1, ^bb2
+  ^bb1:
+    vm.check.eq %c1dno, %c1dno, "error!" : i32
+    vm.return
+  ^bb2:
+    %code = vm.const.i32 4
+    vm.fail %code, "unreachable!"
+  }
+
+  vm.export @test_cond_br_int_arg
+  vm.func @test_cond_br_int_arg() {
+    %c1 = vm.const.i32 1
+    %c1dno = util.do_not_optimize(%c1) : i32
+    vm.cond_br %c1dno, ^bb1(%c1dno : i32), ^bb2(%c1dno : i32)
+  ^bb1(%arg1 : i32):
+    vm.check.eq %arg1, %c1dno, "error!" : i32
+    vm.return
+  ^bb2(%arg2 : i32):
+    %code = vm.const.i32 4
+    vm.fail %code, "unreachable!"
+  }
+
+  vm.export @test_cond_br_ref_arg
+  vm.func @test_cond_br_ref_arg() {
+    %c1 = vm.const.i32 1
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %ref = vm.const.ref.zero : !vm.ref<?>
+    vm.cond_br %c1dno, ^bb1(%ref : !vm.ref<?>), ^bb2(%ref : !vm.ref<?>)
+  ^bb1(%arg1 : !vm.ref<?>):
+    vm.check.eq %arg1, %ref, "error!" : !vm.ref<?>
+    vm.return
+  ^bb2(%arg2 : !vm.ref<?>):
+    %code = vm.const.i32 4
+    vm.fail %code, "unreachable!"
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/conversion_ops.mlir b/runtime/src/iree/vm/test/conversion_ops.mlir
new file mode 100644
index 0000000..799376e
--- /dev/null
+++ b/runtime/src/iree/vm/test/conversion_ops.mlir
@@ -0,0 +1,27 @@
+vm.module @conversion_ops {
+
+  //===----------------------------------------------------------------------===//
+  // Casting and type conversion/emulation
+  //===----------------------------------------------------------------------===//
+
+  vm.export @test_trunc_i32_i8
+  vm.func @test_trunc_i32_i8() {
+    %c1 = vm.const.i32 2147483647
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v = vm.trunc.i32.i8 %c1dno : i32 -> i32
+    %c2 = vm.const.i32 255
+    vm.check.eq %v, %c2, "truncate unsigned i32 to unsigned i8" : i32
+    vm.return
+  }
+
+  vm.export @test_trunc_i32_i16
+  vm.func @test_trunc_i32_i16() {
+    %c1 = vm.const.i32 2147483647
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v = vm.trunc.i32.i16 %c1dno : i32 -> i32
+    %c2 = vm.const.i32 65535
+    vm.check.eq %v, %c2, "truncate unsigned i32 to unsigned i16" : i32
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/conversion_ops_f32.mlir b/runtime/src/iree/vm/test/conversion_ops_f32.mlir
new file mode 100644
index 0000000..a68bccc
--- /dev/null
+++ b/runtime/src/iree/vm/test/conversion_ops_f32.mlir
@@ -0,0 +1,119 @@
+vm.module @conversion_ops_f32 {
+
+  //===----------------------------------------------------------------------===//
+  // Casting and type conversion/emulation
+  //===----------------------------------------------------------------------===//
+
+  // 5.5 f32 (0x40b00000 hex) -> 1085276160 int32
+  vm.export @test_bitcast_i32_f32
+  vm.func @test_bitcast_i32_f32() {
+    %c1 = vm.const.i32 1085276160
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v = vm.bitcast.i32.f32 %c1dno : i32 -> f32
+    %c2 = vm.const.f32 5.5
+    vm.check.eq %v, %c2, "bitcast i32 to f32" : f32
+    vm.return
+  }
+
+  // 1085276160 int32 (0x40b00000 hex) -> 5.5 f32
+  vm.export @test_bitcast_f32_i32
+  vm.func @test_bitcast_f32_i32() {
+    %c1 = vm.const.f32 5.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.bitcast.f32.i32 %c1dno : f32 -> i32
+    %c2 = vm.const.i32 1085276160
+    vm.check.eq %v, %c2, "bitcast f32 to i32" : i32
+    vm.return
+  }
+
+  vm.export @test_cast_si32_f32_int_max
+  vm.func @test_cast_si32_f32_int_max() {
+    %c1 = vm.const.i32 2147483647
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v = vm.cast.si32.f32 %c1dno : i32 -> f32
+    %c2 = vm.const.f32 2147483647.0
+    vm.check.eq %v, %c2, "cast signed integer to a floating-point value" : f32
+    vm.return
+  }
+
+  vm.export @test_cast_si32_f32_int_min
+  vm.func @test_cast_si32_f32_int_min() {
+    %c1 = vm.const.i32 -2147483648
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v = vm.cast.si32.f32 %c1dno : i32 -> f32
+    %c2 = vm.const.f32 -2147483648.0
+    vm.check.eq %v, %c2, "cast signed integer to a floating-point value" : f32
+    vm.return
+  }
+
+  vm.export @test_cast_ui32_f32_int_max
+  vm.func @test_cast_ui32_f32_int_max() {
+    %c1 = vm.const.i32 4294967295
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %v = vm.cast.ui32.f32 %c1dno : i32 -> f32
+    %c2 = vm.const.f32 4294967295.0
+    vm.check.eq %v, %c2, "cast unsigned integer to a floating-point value" : f32
+    vm.return
+  }
+
+  vm.export @test_cast_f32_si32_int_max
+  vm.func @test_cast_f32_si32_int_max() {
+    %c1 = vm.const.f32 2147483647.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.cast.f32.si32 %c1dno : f32 -> i32
+    %c2 = vm.const.i32 -2147483648
+    vm.check.eq %v, %c2, "cast floating-point value to a signed integer" : i32
+    vm.return
+  }
+
+  vm.export @test_cast_f32_si32_int_min
+  vm.func @test_cast_f32_si32_int_min() {
+    %c1 = vm.const.f32 -2147483648.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.cast.f32.si32 %c1dno : f32 -> i32
+    %c2 = vm.const.i32 -2147483648
+    vm.check.eq %v, %c2, "cast floating-point value to a signed integer" : i32
+    vm.return
+  }
+
+  vm.export @test_cast_f32_si32_away_from_zero_pos
+  vm.func @test_cast_f32_si32_away_from_zero_pos() {
+    %c1 = vm.const.f32 2.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.cast.f32.si32 %c1dno : f32 -> i32
+    %c2 = vm.const.i32 3
+    vm.check.eq %v, %c2, "cast floating-point value to a signed integer" : i32
+    vm.return
+  }
+
+  vm.export @test_cast_f32_si32_away_from_zero_neg
+  vm.func @test_cast_f32_si32_away_from_zero_neg() {
+    %c1 = vm.const.f32 -2.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.cast.f32.si32 %c1dno : f32 -> i32
+    %c2 = vm.const.i32 -3
+    vm.check.eq %v, %c2, "cast floating-point value to a signed integer" : i32
+    vm.return
+  }
+
+  vm.export @test_cast_f32_ui32_int_max
+  vm.func @test_cast_f32_ui32_int_max() {
+    %c1 = vm.const.f32 4294967295.0
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.cast.f32.ui32 %c1dno : f32 -> i32
+    %c2 = vm.const.i32 0
+    vm.check.eq %v, %c2, "cast floating-point value to an unsigned integer" : i32
+    vm.return
+  }
+
+  vm.export @test_cast_f32_ui32_away_from_zero
+  vm.func @test_cast_f32_ui32_away_from_zero() {
+    %c1 = vm.const.f32 2.5
+    %c1dno = util.do_not_optimize(%c1) : f32
+    %v = vm.cast.f32.ui32 %c1dno : f32 -> i32
+    %c2 = vm.const.i32 3
+    vm.check.eq %v, %c2, "cast floating-point value to a signed integer" : i32
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/conversion_ops_i64.mlir b/runtime/src/iree/vm/test/conversion_ops_i64.mlir
new file mode 100644
index 0000000..f790e5a
--- /dev/null
+++ b/runtime/src/iree/vm/test/conversion_ops_i64.mlir
@@ -0,0 +1,17 @@
+vm.module @conversion_ops_i64 {
+
+  //===----------------------------------------------------------------------===//
+  // ExtI64: Casting and type conversion/emulation
+  //===----------------------------------------------------------------------===//
+
+  vm.export @test_trunc_i64_i32
+  vm.func @test_trunc_i64_i32() {
+    %c1 = vm.const.i64 9223372036854775807
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %v = vm.trunc.i64.i32 %c1dno : i64 -> i32
+    %c2 = vm.const.i32 4294967295
+    vm.check.eq %v, %c2, "truncate unsigned i64 to unsigned i32" : i32
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/emitc/CMakeLists.txt b/runtime/src/iree/vm/test/emitc/CMakeLists.txt
new file mode 100644
index 0000000..68df57d
--- /dev/null
+++ b/runtime/src/iree/vm/test/emitc/CMakeLists.txt
@@ -0,0 +1,346 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_add_all_subdirs()
+
+if(${IREE_ENABLE_EMITC})
+
+iree_cc_test(
+  NAME
+    module_test
+  SRCS
+    "module_test.cc"
+  DEPS
+    iree::base::cc
+    iree::base::logging
+    iree::testing::gtest
+    iree::testing::gtest_main
+    iree::vm
+    ::arithmetic_ops
+    ::arithmetic_ops_f32
+    ::arithmetic_ops_i64
+    ::assignment_ops
+    ::assignment_ops_f32
+    ::assignment_ops_i64
+    ::buffer_ops
+    ::call_ops
+    ::comparison_ops
+    ::comparison_ops_f32
+    ::comparison_ops_i64
+    ::control_flow_ops
+    ::conversion_ops
+    ::conversion_ops_f32
+    ::conversion_ops_i64
+    ::global_ops
+    ::global_ops_f32
+    ::global_ops_i64
+    ::list_ops
+    ::list_variant_ops
+    ::ref_ops
+    ::shift_ops
+    ::shift_ops_i64
+)
+
+iree_c_module(
+  NAME
+    arithmetic_ops
+  SRC
+    "../arithmetic_ops.mlir"
+  H_FILE_OUTPUT
+    "arithmetic_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    arithmetic_ops_f32
+  SRC
+    "../arithmetic_ops_f32.mlir"
+  H_FILE_OUTPUT
+    "arithmetic_ops_f32.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    arithmetic_ops_i64
+  SRC
+    "../arithmetic_ops_i64.mlir"
+  H_FILE_OUTPUT
+    "arithmetic_ops_i64.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    assignment_ops
+  SRC
+    "../assignment_ops.mlir"
+  H_FILE_OUTPUT
+    "assignment_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    assignment_ops_f32
+  SRC
+    "../assignment_ops_f32.mlir"
+  H_FILE_OUTPUT
+    "assignment_ops_f32.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    assignment_ops_i64
+  SRC
+    "../assignment_ops_i64.mlir"
+  H_FILE_OUTPUT
+    "assignment_ops_i64.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    buffer_ops
+  SRC
+    "../buffer_ops.mlir"
+  H_FILE_OUTPUT
+    "buffer_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    call_ops
+  SRC
+    "../call_ops.mlir"
+  H_FILE_OUTPUT
+    "call_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    comparison_ops
+  SRC
+    "../comparison_ops.mlir"
+  H_FILE_OUTPUT
+    "comparison_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    comparison_ops_f32
+  SRC
+    "../comparison_ops_f32.mlir"
+  H_FILE_OUTPUT
+    "comparison_ops_f32.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    comparison_ops_i64
+  SRC
+    "../comparison_ops_i64.mlir"
+  H_FILE_OUTPUT
+    "comparison_ops_i64.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    control_flow_ops
+  SRC
+    "../control_flow_ops.mlir"
+  H_FILE_OUTPUT
+    "control_flow_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    conversion_ops
+  SRC
+    "../conversion_ops.mlir"
+  H_FILE_OUTPUT
+    "conversion_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    conversion_ops_f32
+  SRC
+    "../conversion_ops_f32.mlir"
+  H_FILE_OUTPUT
+    "conversion_ops_f32.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    conversion_ops_i64
+  SRC
+    "../conversion_ops_i64.mlir"
+  H_FILE_OUTPUT
+    "conversion_ops_i64.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    global_ops
+  SRC
+    "../global_ops.mlir"
+  H_FILE_OUTPUT
+    "global_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    global_ops_f32
+  SRC
+    "../global_ops_f32.mlir"
+  H_FILE_OUTPUT
+    "global_ops_f32.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    global_ops_i64
+  SRC
+    "../global_ops_i64.mlir"
+  H_FILE_OUTPUT
+    "global_ops_i64.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    list_ops
+  SRC
+    "../list_ops.mlir"
+  H_FILE_OUTPUT
+    "list_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    list_variant_ops
+  SRC
+    "../list_variant_ops.mlir"
+  H_FILE_OUTPUT
+    "list_variant_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    ref_ops
+  SRC
+    "../ref_ops.mlir"
+  H_FILE_OUTPUT
+    "ref_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    shift_ops
+  SRC
+    "../shift_ops.mlir"
+  H_FILE_OUTPUT
+    "shift_ops.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+iree_c_module(
+  NAME
+    shift_ops_i64
+  SRC
+    "../shift_ops_i64.mlir"
+  H_FILE_OUTPUT
+    "shift_ops_i64.h"
+  FLAGS
+    "-iree-vm-ir-to-c-module"
+  TRANSLATE_TOOL
+    iree_tools_iree-translate
+)
+
+endif()
diff --git a/runtime/src/iree/vm/test/emitc/module_test.cc b/runtime/src/iree/vm/test/emitc/module_test.cc
new file mode 100644
index 0000000..eee8d61
--- /dev/null
+++ b/runtime/src/iree/vm/test/emitc/module_test.cc
@@ -0,0 +1,184 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// TODO: We should not be including C implementation-only headers in a C++
+// module like this. In order to make this work for the moment across
+// runtime libraries that are strict, do a global using of the std namespace.
+// See #7605
+#include <cmath>
+using namespace std;
+
+#include "iree/base/logging.h"
+#include "iree/base/status_cc.h"
+#include "iree/testing/gtest.h"
+#include "iree/vm/api.h"
+#define EMITC_IMPLEMENTATION
+#include "iree/vm/test/emitc/arithmetic_ops.h"
+#include "iree/vm/test/emitc/arithmetic_ops_f32.h"
+#include "iree/vm/test/emitc/arithmetic_ops_i64.h"
+#include "iree/vm/test/emitc/assignment_ops.h"
+#include "iree/vm/test/emitc/assignment_ops_f32.h"
+#include "iree/vm/test/emitc/assignment_ops_i64.h"
+#include "iree/vm/test/emitc/buffer_ops.h"
+#include "iree/vm/test/emitc/call_ops.h"
+#include "iree/vm/test/emitc/comparison_ops.h"
+#include "iree/vm/test/emitc/comparison_ops_f32.h"
+#include "iree/vm/test/emitc/comparison_ops_i64.h"
+#include "iree/vm/test/emitc/control_flow_ops.h"
+#include "iree/vm/test/emitc/conversion_ops.h"
+#include "iree/vm/test/emitc/conversion_ops_f32.h"
+#include "iree/vm/test/emitc/conversion_ops_i64.h"
+#include "iree/vm/test/emitc/global_ops.h"
+#include "iree/vm/test/emitc/global_ops_f32.h"
+#include "iree/vm/test/emitc/global_ops_i64.h"
+#include "iree/vm/test/emitc/list_ops.h"
+#include "iree/vm/test/emitc/list_variant_ops.h"
+#include "iree/vm/test/emitc/ref_ops.h"
+#include "iree/vm/test/emitc/shift_ops.h"
+#include "iree/vm/test/emitc/shift_ops_i64.h"
+
+namespace {
+
+typedef iree_status_t (*create_function_t)(iree_allocator_t,
+                                           iree_vm_module_t**);
+
+struct TestParams {
+  std::string module_name;
+  std::string local_name;
+  create_function_t create_function;
+};
+
+struct ModuleDescription {
+  iree_vm_native_module_descriptor_t descriptor;
+  create_function_t create_function;
+};
+
+std::ostream& operator<<(std::ostream& os, const TestParams& params) {
+  std::string qualified_name = params.module_name + "." + params.local_name;
+  auto name_sv =
+      iree_make_string_view(qualified_name.data(), qualified_name.size());
+  iree_string_view_replace_char(name_sv, ':', '_');
+  iree_string_view_replace_char(name_sv, '.', '_');
+  return os << qualified_name;
+}
+
+std::vector<TestParams> GetModuleTestParams() {
+  std::vector<TestParams> test_params;
+
+  // TODO(simon-camp): get these automatically
+  std::vector<ModuleDescription> modules = {
+      {arithmetic_ops_descriptor_, arithmetic_ops_create},
+      {arithmetic_ops_f32_descriptor_, arithmetic_ops_f32_create},
+      {arithmetic_ops_i64_descriptor_, arithmetic_ops_i64_create},
+      {assignment_ops_descriptor_, assignment_ops_create},
+      {assignment_ops_f32_descriptor_, assignment_ops_f32_create},
+      {assignment_ops_i64_descriptor_, assignment_ops_i64_create},
+      {buffer_ops_descriptor_, buffer_ops_create},
+      {call_ops_descriptor_, call_ops_create},
+      {comparison_ops_descriptor_, comparison_ops_create},
+      {comparison_ops_f32_descriptor_, comparison_ops_f32_create},
+      {comparison_ops_i64_descriptor_, comparison_ops_i64_create},
+      {control_flow_ops_descriptor_, control_flow_ops_create},
+      {conversion_ops_descriptor_, conversion_ops_create},
+      {conversion_ops_f32_descriptor_, conversion_ops_f32_create},
+      {conversion_ops_i64_descriptor_, conversion_ops_i64_create},
+      {global_ops_descriptor_, global_ops_create},
+      {global_ops_f32_descriptor_, global_ops_f32_create},
+      {global_ops_i64_descriptor_, global_ops_i64_create},
+      {list_ops_descriptor_, list_ops_create},
+      {list_variant_ops_descriptor_, list_variant_ops_create},
+      {ref_ops_descriptor_, ref_ops_create},
+      {shift_ops_descriptor_, shift_ops_create},
+      {shift_ops_i64_descriptor_, shift_ops_i64_create}};
+
+  for (size_t i = 0; i < modules.size(); i++) {
+    iree_vm_native_module_descriptor_t descriptor = modules[i].descriptor;
+    create_function_t function = modules[i].create_function;
+
+    std::string module_name =
+        std::string(descriptor.module_name.data, descriptor.module_name.size);
+
+    for (iree_host_size_t i = 0; i < descriptor.export_count; i++) {
+      iree_vm_native_export_descriptor_t export_descriptor =
+          descriptor.exports[i];
+      std::string local_name = std::string(export_descriptor.local_name.data,
+                                           export_descriptor.local_name.size);
+      test_params.push_back({module_name, local_name, function});
+    }
+  }
+
+  return test_params;
+}
+
+class VMCModuleTest : public ::testing::Test,
+                      public ::testing::WithParamInterface<TestParams> {
+ protected:
+  virtual void SetUp() {
+    const auto& test_params = GetParam();
+
+    IREE_CHECK_OK(iree_vm_instance_create(iree_allocator_system(), &instance_));
+
+    iree_vm_module_t* module_ = nullptr;
+    IREE_CHECK_OK(
+        test_params.create_function(iree_allocator_system(), &module_));
+
+    std::vector<iree_vm_module_t*> modules = {module_};
+    IREE_CHECK_OK(iree_vm_context_create_with_modules(
+        instance_, IREE_VM_CONTEXT_FLAG_NONE, modules.data(), modules.size(),
+        iree_allocator_system(), &context_));
+
+    iree_vm_module_release(module_);
+  }
+
+  virtual void TearDown() {
+    iree_vm_context_release(context_);
+    iree_vm_instance_release(instance_);
+  }
+
+  iree_status_t RunFunction(std::string module_name, std::string local_name) {
+    std::string qualified_name = module_name + "." + local_name;
+    iree_vm_function_t function;
+    IREE_CHECK_OK(iree_vm_context_resolve_function(
+        context_,
+        iree_string_view_t{qualified_name.data(), qualified_name.size()},
+        &function));
+
+    return iree_vm_invoke(context_, function, IREE_VM_INVOCATION_FLAG_NONE,
+                          /*policy=*/nullptr, /*inputs=*/nullptr,
+                          /*outputs=*/nullptr, iree_allocator_system());
+  }
+
+  iree_vm_instance_t* instance_ = nullptr;
+  iree_vm_context_t* context_ = nullptr;
+};
+
+TEST_P(VMCModuleTest, Check) {
+  const auto& test_params = GetParam();
+  bool expect_failure = test_params.local_name.find("fail_") == 0;
+
+  iree::Status result =
+      RunFunction(test_params.module_name, test_params.local_name);
+  if (result.ok()) {
+    if (expect_failure) {
+      GTEST_FAIL() << "Function expected failure but succeeded";
+    } else {
+      GTEST_SUCCEED();
+    }
+  } else {
+    if (expect_failure) {
+      GTEST_SUCCEED();
+    } else {
+      GTEST_FAIL() << "Function expected success but failed with error: "
+                   << result.ToString();
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(VMIRFunctions, VMCModuleTest,
+                         ::testing::ValuesIn(GetModuleTestParams()),
+                         ::testing::PrintToStringParamName());
+
+}  // namespace
diff --git a/runtime/src/iree/vm/test/global_ops.mlir b/runtime/src/iree/vm/test/global_ops.mlir
new file mode 100644
index 0000000..a7c718b
--- /dev/null
+++ b/runtime/src/iree/vm/test/global_ops.mlir
@@ -0,0 +1,50 @@
+vm.module @global_ops {
+
+  //===--------------------------------------------------------------------===//
+  // global.i32
+  //===--------------------------------------------------------------------===//
+
+  vm.global.i32 private @c42 = 42 : i32
+  vm.global.i32 private mutable @c107_mut = 107 : i32
+  vm.global.ref mutable @g0 : !vm.buffer
+  // TODO(simon-camp): Add test for initializer
+
+  vm.rodata private @buffer dense<[1, 2, 3]> : tensor<3xi8>
+
+  // TODO(simon-camp): This test gets constant folded
+  vm.export @test_global_load_i32
+  vm.func @test_global_load_i32() {
+    %actual = vm.global.load.i32 @c42 : i32
+    %expected = vm.const.i32 42
+    vm.check.eq %actual, %expected, "@c42 != 42" : i32
+    vm.return
+  }
+
+  vm.export @test_global_load_ref
+  vm.func @test_global_load_ref() {
+    %actual = vm.global.load.ref @g0 : !vm.buffer
+    %expected = vm.const.ref.zero : !vm.buffer
+    %expecteddno = util.do_not_optimize(%expected) : !vm.buffer
+    vm.check.eq %actual, %expecteddno : !vm.buffer
+    vm.return
+  }
+
+  vm.export @test_global_store_i32
+  vm.func @test_global_store_i32() {
+    %c17 = vm.const.i32 17
+    vm.global.store.i32 %c17, @c107_mut : i32
+    %actual = vm.global.load.i32 @c107_mut : i32
+    vm.check.eq %actual, %c17, "@c107_mut != 17" : i32
+    vm.return
+  }
+
+  vm.export @test_global_store_ref
+  vm.func @test_global_store_ref() {
+    %ref_buffer = vm.const.ref.rodata @buffer : !vm.buffer
+    vm.global.store.ref %ref_buffer, @g0 : !vm.buffer
+    %actual = vm.global.load.ref @g0 : !vm.buffer
+    vm.check.eq %actual, %ref_buffer, "@g0 != buffer" : !vm.buffer
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/global_ops_f32.mlir b/runtime/src/iree/vm/test/global_ops_f32.mlir
new file mode 100644
index 0000000..865f711
--- /dev/null
+++ b/runtime/src/iree/vm/test/global_ops_f32.mlir
@@ -0,0 +1,28 @@
+vm.module @global_ops_f32 {
+
+  //===--------------------------------------------------------------------===//
+  // global.f32
+  //===--------------------------------------------------------------------===//
+
+  vm.global.f32 private @c42 = 42.5 : f32
+  vm.global.f32 private mutable @c107_mut = 107.5 : f32
+  // TODO(simon-camp): Add test for initializer
+
+  vm.export @test_global_load_f32
+  vm.func @test_global_load_f32() {
+    %actual = vm.global.load.f32 @c42 : f32
+    %expected = vm.const.f32 42.5
+    vm.check.eq %actual, %expected, "@c42 != 42.5" : f32
+    vm.return
+  }
+
+  vm.export @test_global_store_f32
+  vm.func @test_global_store_f32() {
+    %c17 = vm.const.f32 17.5
+    vm.global.store.f32 %c17, @c107_mut : f32
+    %actual = vm.global.load.f32 @c107_mut : f32
+    vm.check.eq %actual, %c17, "@c107_mut != 17.5" : f32
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/global_ops_i64.mlir b/runtime/src/iree/vm/test/global_ops_i64.mlir
new file mode 100644
index 0000000..b567d71
--- /dev/null
+++ b/runtime/src/iree/vm/test/global_ops_i64.mlir
@@ -0,0 +1,28 @@
+vm.module @global_ops_i64 {
+
+  //===--------------------------------------------------------------------===//
+  // global.i64
+  //===--------------------------------------------------------------------===//
+
+  vm.global.i64 private @c42 = 42 : i64
+  vm.global.i64 private mutable @c107_mut = 107 : i64
+  // TODO(simon-camp): Add test for initializer
+
+  vm.export @test_global_load_i64
+  vm.func @test_global_load_i64() {
+    %actual = vm.global.load.i64 @c42 : i64
+    %expected = vm.const.i64 42
+    vm.check.eq %actual, %expected, "@c42 != 42" : i64
+    vm.return
+  }
+
+  vm.export @test_global_store_i64
+  vm.func @test_global_store_i64() {
+    %c17 = vm.const.i64 17
+    vm.global.store.i64 %c17, @c107_mut : i64
+    %actual = vm.global.load.i64 @c107_mut : i64
+    vm.check.eq %actual, %c17, "@c107_mut != 17" : i64
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/list_ops.mlir b/runtime/src/iree/vm/test/list_ops.mlir
new file mode 100644
index 0000000..81e6b95
--- /dev/null
+++ b/runtime/src/iree/vm/test/list_ops.mlir
@@ -0,0 +1,124 @@
+vm.module @list_ops {
+
+  //===--------------------------------------------------------------------===//
+  // vm.list.* with I8 types
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_i8
+  vm.func @test_i8() {
+    %c42 = vm.const.i32 42
+    %c100 = vm.const.i32 100
+    %c0 = vm.const.i32 0
+    %list = vm.list.alloc %c42 : (i32) -> !vm.list<i8>
+    vm.list.reserve %list, %c100 : (!vm.list<i8>, i32)
+    %sz = vm.list.size %list : (!vm.list<i8>) -> i32
+    %sz_dno = util.do_not_optimize(%sz) : i32
+    vm.check.eq %sz_dno, %c0, "list<i8>.empty.size()=0" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.list.* with I16 types
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_i16
+  vm.func @test_i16() {
+    %c0 = vm.const.i32 0
+    %c1 = vm.const.i32 1
+    %c27 = vm.const.i32 27
+    %list = vm.list.alloc %c1 : (i32) -> !vm.list<i16>
+    vm.list.resize %list, %c1 : (!vm.list<i16>, i32)
+    vm.list.set.i32 %list, %c0, %c27 : (!vm.list<i16>, i32, i32)
+    %v = vm.list.get.i32 %list, %c0 : (!vm.list<i16>, i32) -> i32
+    vm.check.eq %v, %c27, "list<i16>.empty.set(0, 27).get(0)=27" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.list.* with I32 types
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_i32
+  vm.func @test_i32() {
+    %c42 = vm.const.i32 42
+    %list = vm.list.alloc %c42 : (i32) -> !vm.list<i32>
+    %sz = vm.list.size %list : (!vm.list<i32>) -> i32
+    %c100 = vm.const.i32 100
+    %c101 = vm.const.i32 101
+    vm.list.resize %list, %c101 : (!vm.list<i32>, i32)
+    vm.list.set.i32 %list, %c100, %c42 : (!vm.list<i32>, i32, i32)
+    %v = vm.list.get.i32 %list, %c100 : (!vm.list<i32>, i32) -> i32
+    vm.check.eq %v, %c42, "list<i32>.empty.set(100, 42).get(100)=42" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.list.* with ref types
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_ref
+  vm.func @test_ref() {
+    // TODO(benvanik): test vm.list with ref types.
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Multiple lists within the same block
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_multiple_lists
+  vm.func @test_multiple_lists() {
+    %c0 = vm.const.i32 0
+    %c1 = vm.const.i32 1
+    %c27 = vm.const.i32 27
+    %c42 = vm.const.i32 42
+
+    // These allocs shouldn't be CSE'd.
+    %list0 = vm.list.alloc %c1 : (i32) -> !vm.list<i8>
+    %list1 = vm.list.alloc %c1 : (i32) -> !vm.list<i8>
+    vm.list.resize %list0, %c1 : (!vm.list<i8>, i32)
+    vm.list.resize %list1, %c1 : (!vm.list<i8>, i32)
+    vm.list.set.i32 %list0, %c0, %c27 : (!vm.list<i8>, i32, i32)
+    vm.list.set.i32 %list1, %c0, %c42 : (!vm.list<i8>, i32, i32)
+    %res0 = vm.list.get.i32 %list0, %c0 : (!vm.list<i8>, i32) -> i32
+    %res1 = vm.list.get.i32 %list1, %c0 : (!vm.list<i8>, i32) -> i32
+    vm.check.eq %res0, %c27, "list0.get(0)=27" : i32
+    vm.check.eq %res1, %c42, "list1.get(0)=42" : i32
+
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Failure tests
+  //===--------------------------------------------------------------------===//
+
+  vm.export @fail_uninitialized_access
+  vm.func @fail_uninitialized_access() {
+    %c0 = vm.const.i32 0
+    %c1 = vm.const.i32 1
+    %list = vm.list.alloc %c1 : (i32) -> !vm.list<i32>
+    vm.list.set.i32 %list, %c0, %c1 : (!vm.list<i32>, i32, i32)
+    vm.return
+  }
+
+  vm.export @fail_out_of_bounds_read
+  vm.func @fail_out_of_bounds_read() {
+    %c1 = vm.const.i32 1
+    %list = vm.list.alloc %c1 : (i32) -> !vm.list<i32>
+    vm.list.resize %list, %c1 : (!vm.list<i32>, i32)
+    %v = vm.list.get.i32 %list, %c1 : (!vm.list<i32>, i32) -> i32
+    %v_dno = util.do_not_optimize(%v) : i32
+    // Add a dummy use of %v_dno to please recent versions of clang for the C target
+    vm.list.set.i32 %list, %c1, %v_dno : (!vm.list<i32>, i32, i32)
+    vm.return
+  }
+
+  vm.export @fail_out_of_bounds_write
+  vm.func @fail_out_of_bounds_write() {
+    %c1 = vm.const.i32 1
+    %list = vm.list.alloc %c1 : (i32) -> !vm.list<i32>
+    vm.list.resize %list, %c1 : (!vm.list<i32>, i32)
+    vm.list.set.i32 %list, %c1, %c1 : (!vm.list<i32>, i32, i32)
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/list_ops_i64.mlir b/runtime/src/iree/vm/test/list_ops_i64.mlir
new file mode 100644
index 0000000..97f8681
--- /dev/null
+++ b/runtime/src/iree/vm/test/list_ops_i64.mlir
@@ -0,0 +1,21 @@
+vm.module @list_ops_i64 {
+
+  //===--------------------------------------------------------------------===//
+  // vm.list.* with I64 types
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_i64
+  vm.func @test_i64() {
+    %capacity = vm.const.i32 42
+    %index = vm.const.i32 41
+    %max_int_plus_1 = vm.const.i64 2147483648
+    %list = vm.list.alloc %capacity : (i32) -> !vm.list<i64>
+    %sz = vm.list.size %list : (!vm.list<i64>) -> i32
+    vm.list.resize %list, %capacity : (!vm.list<i64>, i32)
+    vm.list.set.i64 %list, %index, %max_int_plus_1 : (!vm.list<i64>, i32, i64)
+    %v = vm.list.get.i64 %list, %index : (!vm.list<i64>, i32) -> i64
+    vm.check.eq %v, %max_int_plus_1, "list<i64>.empty.set(41, MAX_INT_PLUS_1).get(41)=MAX_INT_PLUS_1" : i64
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/list_variant_ops.mlir b/runtime/src/iree/vm/test/list_variant_ops.mlir
new file mode 100644
index 0000000..5a8d23c
--- /dev/null
+++ b/runtime/src/iree/vm/test/list_variant_ops.mlir
@@ -0,0 +1,162 @@
+vm.module @list_variant_ops {
+
+  //===--------------------------------------------------------------------===//
+  // vm.list.* with list types (nesting)
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_listception
+  vm.func @test_listception() {
+    %c0 = vm.const.i32 0
+    %c1 = vm.const.i32 1
+    %c2 = vm.const.i32 2
+    %c3 = vm.const.i32 3
+    %c100 = vm.const.i32 100
+    %c101 = vm.const.i32 101
+    %c102 = vm.const.i32 102
+
+    // [100, 101, 102]
+    %inner0 = vm.list.alloc %c3 : (i32) -> !vm.list<i32>
+    vm.list.resize %inner0, %c3 : (!vm.list<i32>, i32)
+    vm.list.set.i32 %inner0, %c0, %c100 : (!vm.list<i32>, i32, i32)
+    vm.list.set.i32 %inner0, %c1, %c101 : (!vm.list<i32>, i32, i32)
+    vm.list.set.i32 %inner0, %c2, %c102 : (!vm.list<i32>, i32, i32)
+
+    // [102, 101, 100]
+    %inner1 = vm.list.alloc %c3 : (i32) -> !vm.list<i32>
+    vm.list.resize %inner1, %c3 : (!vm.list<i32>, i32)
+    vm.list.set.i32 %inner1, %c0, %c102 : (!vm.list<i32>, i32, i32)
+    vm.list.set.i32 %inner1, %c1, %c101 : (!vm.list<i32>, i32, i32)
+    vm.list.set.i32 %inner1, %c2, %c100 : (!vm.list<i32>, i32, i32)
+
+    // [ [100, 101, 102], [102, 101, 100] ]
+    %capacity = vm.const.i32 8
+    %outer = vm.list.alloc %capacity : (i32) -> !vm.list<!vm.list<i32>>
+    vm.list.resize %outer, %c2 : (!vm.list<!vm.list<i32>>, i32)
+    vm.list.set.ref %outer, %c0, %inner0 : (!vm.list<!vm.list<i32>>, i32, !vm.list<i32>)
+    vm.list.set.ref %outer, %c1, %inner1 : (!vm.list<!vm.list<i32>>, i32, !vm.list<i32>)
+
+    %inner0_ret = vm.list.get.ref %outer, %c0 : (!vm.list<!vm.list<i32>>, i32) -> !vm.list<i32>
+    vm.check.eq %inner0_ret, %inner0 : !vm.list<i32>
+    %inner0_e2 = vm.list.get.i32 %inner0_ret, %c2 : (!vm.list<i32>, i32) -> i32
+    vm.check.eq %inner0_e2, %c102 : i32
+
+    %inner1_ret = vm.list.get.ref %outer, %c1 : (!vm.list<!vm.list<i32>>, i32) -> !vm.list<i32>
+    vm.check.eq %inner1_ret, %inner1 : !vm.list<i32>
+    %inner1_e2 = vm.list.get.i32 %inner1_ret, %c2 : (!vm.list<i32>, i32) -> i32
+    vm.check.eq %inner1_e2, %c100 : i32
+
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.list.* with variant types
+  //===--------------------------------------------------------------------===//
+
+  vm.rodata private @byte_buffer dense<[1, 2, 3]> : tensor<3xi32>
+
+  vm.export @test_variant
+  vm.func @test_variant() {
+    %capacity = vm.const.i32 42
+    %list = vm.list.alloc %capacity : (i32) -> !vm.list<?>
+    vm.list.resize %list, %capacity : (!vm.list<?>, i32)
+
+    // Access element 10 as an i32.
+    %c10 = vm.const.i32 10
+    %v10_i32 = vm.const.i32 1234
+    vm.list.set.i32 %list, %c10, %v10_i32 : (!vm.list<?>, i32, i32)
+    %e10_i32 = vm.list.get.i32 %list, %c10 : (!vm.list<?>, i32) -> i32
+    vm.check.eq %e10_i32, %v10_i32 : i32
+
+    // Access element 10 as an i64.
+    %v10_i64 = vm.const.i64 1234
+    vm.list.set.i64 %list, %c10, %v10_i64 : (!vm.list<?>, i32, i64)
+    %e10_i64 = vm.list.get.i64 %list, %c10 : (!vm.list<?>, i32) -> i64
+    vm.check.eq %e10_i64, %v10_i64 : i64
+
+    // Access element 11 as a ref object.
+    %c11 = vm.const.i32 11
+    %v11_buf = vm.const.ref.rodata @byte_buffer : !vm.buffer
+    vm.list.set.ref %list, %c11, %v11_buf : (!vm.list<?>, i32, !vm.buffer)
+    %e11_buf = vm.list.get.ref %list, %c11 : (!vm.list<?>, i32) -> !vm.buffer
+    vm.check.eq %e11_buf, %v11_buf : !vm.buffer
+
+    // Access element 11 as a different kind of ref object (incompatible).
+    // Should return null.
+    %e11_bad = vm.list.get.ref %list, %c11 : (!vm.list<?>, i32) -> !vm.list<i8>
+    %null = vm.const.ref.zero : !vm.list<i8>
+    vm.check.eq %e11_bad, %null : !vm.list<i8>
+
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Failure tests
+  //===--------------------------------------------------------------------===//
+
+  vm.export @fail_uninitialized_access
+  vm.func @fail_uninitialized_access() {
+    %c0 = vm.const.i32 0
+    %c1 = vm.const.i32 1
+
+    %ref = vm.const.ref.rodata @byte_buffer : !vm.buffer
+    %list = vm.list.alloc %c1 : (i32) -> !vm.list<?>
+
+    vm.list.set.ref %list, %c0, %ref : (!vm.list<?>, i32, !vm.buffer)
+    vm.return
+  }
+
+  vm.export @fail_out_of_bounds_read
+  vm.func @fail_out_of_bounds_read() {
+    %c1 = vm.const.i32 1
+
+    %list = vm.list.alloc %c1 : (i32) -> !vm.list<?>
+    vm.list.resize %list, %c1 : (!vm.list<?>, i32)
+
+    %ref = vm.list.get.ref %list, %c1 : (!vm.list<?>, i32) -> !vm.buffer
+    %ref_dno = util.do_not_optimize(%ref) : !vm.buffer
+    vm.return
+  }
+
+  vm.export @fail_out_of_bounds_write
+  vm.func @fail_out_of_bounds_write() {
+    %c0 = vm.const.i32 0
+    %c1 = vm.const.i32 1
+
+    %ref = vm.const.ref.rodata @byte_buffer : !vm.buffer
+    %list = vm.list.alloc %c1 : (i32) -> !vm.list<?>
+    vm.list.resize %list, %c1 : (!vm.list<?>, i32)
+
+    vm.list.set.ref %list, %c1, %ref : (!vm.list<?>, i32, !vm.buffer)
+    vm.return
+  }
+
+  vm.export @fail_variant_slot_change
+  vm.func @fail_variant_slot_change() {
+    %capacity = vm.const.i32 42
+    %list = vm.list.alloc %capacity : (i32) -> !vm.list<?>
+    vm.list.resize %list, %capacity : (!vm.list<?>, i32)
+
+    %c10 = vm.const.i32 10
+
+    // Access element 10 as an i32.
+    %v10_i32 = vm.const.i32 1234
+    vm.list.set.i32 %list, %c10, %v10_i32 : (!vm.list<?>, i32, i32)
+    %e10_i32 = vm.list.get.i32 %list, %c10 : (!vm.list<?>, i32) -> i32
+    vm.check.eq %e10_i32, %v10_i32 : i32
+
+    // Access element 10 as a ref object.
+    %v10_buf = vm.const.ref.rodata @byte_buffer : !vm.buffer
+    vm.list.set.ref %list, %c10, %v10_buf : (!vm.list<?>, i32, !vm.buffer)
+    %e10_buf = vm.list.get.ref %list, %c10 : (!vm.list<?>, i32) -> !vm.buffer
+    vm.check.eq %e10_buf, %v10_buf : !vm.buffer
+
+    // Accessing it as an i32 now that it stores the ref should fail at runtime.
+    // TODO(benvanik): support type queries and/or make this silently return 0.
+    %e10_any = vm.list.get.i32 %list, %c10 : (!vm.list<?>, i32) -> i32
+    // -- FAILURE HERE --
+    %zero = vm.const.i32.zero
+    vm.check.eq %e10_any, %zero : i32
+
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/ref_ops.mlir b/runtime/src/iree/vm/test/ref_ops.mlir
new file mode 100644
index 0000000..862a75e
--- /dev/null
+++ b/runtime/src/iree/vm/test/ref_ops.mlir
@@ -0,0 +1,47 @@
+vm.module @ref_ops {
+
+  vm.rodata private @buffer_i8 dense<[1, 2, 3]> : tensor<3xi8>
+  vm.rodata private @buffer_i32 dense<[1, 2, 3]> : tensor<3xi32>
+
+  vm.export @test_zero_ref_eq
+  vm.func @test_zero_ref_eq() {
+    %ref = vm.const.ref.zero : !vm.ref<?>
+    %ref_dno = util.do_not_optimize(%ref) : !vm.ref<?>
+    vm.check.eq %ref_dno, %ref_dno : !vm.ref<?>
+    vm.return
+  }
+
+  // TODO(simon-camp): In the C target we run the DropCompilerHintsPass after
+  // ordinal allocation and vm to EmitC conversion to prevent constant folding
+  // of the tests during the latter. This means we would need to add a pattern
+  // that inserts calls to `iree_vm_ref_retain` for operand/result pairs of the
+  // do_not_optimize op.
+  vm.export @test_ref_eq attributes {emitc.exclude}
+  vm.func @test_ref_eq() {
+    %ref_1 = vm.const.ref.rodata @buffer_i8 : !vm.buffer
+    %ref_1_dno = util.do_not_optimize(%ref_1) : !vm.buffer
+    %ref_2 = vm.const.ref.rodata @buffer_i8 : !vm.buffer
+    %ref_2_dno = util.do_not_optimize(%ref_2) : !vm.buffer
+    vm.check.eq %ref_1_dno, %ref_2_dno : !vm.buffer
+    vm.return
+  }
+
+  vm.export @test_ref_ne
+  vm.func @test_ref_ne() {
+    %ref_i8 = vm.const.ref.rodata @buffer_i8 : !vm.buffer
+    %ref_i8_dno = util.do_not_optimize(%ref_i8) : !vm.buffer
+    %ref_i32 = vm.const.ref.rodata @buffer_i32 : !vm.buffer
+    %ref_i32_dno = util.do_not_optimize(%ref_i32) : !vm.buffer
+    vm.check.ne %ref_i8_dno, %ref_i32_dno : !vm.buffer
+    vm.return
+  }
+
+  vm.export @test_ref_nz
+  vm.func @test_ref_nz() {
+    %ref = vm.const.ref.rodata @buffer_i8 : !vm.buffer
+    %ref_dno = util.do_not_optimize(%ref) : !vm.buffer
+    vm.check.nz %ref_dno : !vm.buffer
+    vm.return
+  }
+
+}
diff --git a/runtime/src/iree/vm/test/shift_ops.mlir b/runtime/src/iree/vm/test/shift_ops.mlir
new file mode 100644
index 0000000..4905ea9
--- /dev/null
+++ b/runtime/src/iree/vm/test/shift_ops.mlir
@@ -0,0 +1,38 @@
+vm.module @shift_ops {
+
+  //===--------------------------------------------------------------------===//
+  // Native bitwise shifts and rotates
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_shl_i32
+  vm.func @test_shl_i32() {
+    %c1 = vm.const.i32 1
+    %c1dno = util.do_not_optimize(%c1) : i32
+    %c2 = vm.const.i32 2
+    %v = vm.shl.i32 %c1dno, %c2 : i32
+    %c4 = vm.const.i32 4
+    vm.check.eq %v, %c4, "1<<2=4" : i32
+    vm.return
+  }
+
+  vm.export @test_shr_i32s
+  vm.func @test_shr_i32s() {
+    %cn1 = vm.const.i32 -1
+    %cn1dno = util.do_not_optimize(%cn1) : i32
+    %c2 = vm.const.i32 2
+    %v = vm.shr.i32.s %cn1dno, %c2 : i32
+    vm.check.eq %v, %cn1dno, "-1>>2=-1" : i32
+    vm.return
+  }
+
+  vm.export @test_shr_i32u
+  vm.func @test_shr_i32u() {
+    %c4 = vm.const.i32 4
+    %c4dno = util.do_not_optimize(%c4) : i32
+    %c2 = vm.const.i32 2
+    %v = vm.shr.i32.u %c4dno, %c2 : i32
+    %c1 = vm.const.i32 1
+    vm.check.eq %v, %c1, "4>>2=1" : i32
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/test/shift_ops_i64.mlir b/runtime/src/iree/vm/test/shift_ops_i64.mlir
new file mode 100644
index 0000000..6632b2e
--- /dev/null
+++ b/runtime/src/iree/vm/test/shift_ops_i64.mlir
@@ -0,0 +1,39 @@
+vm.module @shift_ops_i64 {
+
+  //===--------------------------------------------------------------------===//
+  // ExtI64: Native bitwise shifts and rotates
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_shl_i64
+  vm.func @test_shl_i64() {
+    %c1 = vm.const.i64 1
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %shamt = vm.const.i32 2
+    %v = vm.shl.i64 %c1dno, %shamt : i64
+    %c4 = vm.const.i64 4
+    vm.check.eq %v, %c4, "1<<2=4" : i64
+    vm.return
+  }
+
+  vm.export @test_shr_i64s
+  vm.func @test_shr_i64s() {
+    %c1 = vm.const.i64 -1
+    %c1dno = util.do_not_optimize(%c1) : i64
+    %shamt = vm.const.i32 2
+    %v = vm.shr.i64.s %c1dno, %shamt : i64
+    %cn1 = vm.const.i64 -1
+    vm.check.eq %v, %cn1, "-1>>2=-1" : i64
+    vm.return
+  }
+
+  vm.export @test_shr_i64u
+  vm.func @test_shr_i64u() {
+    %c4 = vm.const.i64 4
+    %c4dno = util.do_not_optimize(%c4) : i64
+    %shamt = vm.const.i32 2
+    %v = vm.shr.i64.u %c4dno, %shamt : i64
+    %c1 = vm.const.i64 1
+    vm.check.eq %v, %c1, "4>>2=1" : i64
+    vm.return
+  }
+}
diff --git a/runtime/src/iree/vm/type_def.h b/runtime/src/iree/vm/type_def.h
new file mode 100644
index 0000000..d8cc8b6
--- /dev/null
+++ b/runtime/src/iree/vm/type_def.h
@@ -0,0 +1,91 @@
+// Copyright 2020 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_TYPE_DEF_H_
+#define IREE_VM_TYPE_DEF_H_
+
+#include <stdint.h>
+
+#include "iree/vm/ref.h"
+#include "iree/vm/value.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Describes a type in the type table, mapping from a local module type ID to
+// either a primitive value type or registered ref type.
+//
+// * ?: variant (value_type/ref_type == 0)
+// * i8: primitive value (value_type != 0)
+// * !vm.ref<?>: any ref value (ref_type == IREE_VM_REF_TYPE_ANY)
+// * !vm.ref<!foo>: ref value of type !foo (ref_type > 0)
+typedef struct iree_vm_type_def_t {
+  iree_vm_value_type_t value_type : 8;
+  iree_vm_ref_type_t ref_type : 24;
+} iree_vm_type_def_t;
+
+static inline iree_vm_type_def_t iree_vm_type_def_make_variant_type(void) {
+  iree_vm_type_def_t result;
+  result.value_type = IREE_VM_VALUE_TYPE_NONE;
+  result.ref_type = IREE_VM_REF_TYPE_NULL;
+  return result;
+}
+
+static inline iree_vm_type_def_t iree_vm_type_def_make_value_type(
+    iree_vm_value_type_t value_type) {
+  iree_vm_type_def_t result;
+  result.value_type = value_type;
+  result.ref_type = IREE_VM_REF_TYPE_NULL;
+  return result;
+}
+
+static inline iree_vm_type_def_t iree_vm_type_def_make_ref_type(
+    iree_vm_ref_type_t ref_type) {
+  iree_vm_type_def_t result;
+  result.value_type = IREE_VM_VALUE_TYPE_NONE;
+  result.ref_type = ref_type;
+  return result;
+}
+
+#define iree_vm_type_def_is_value(v) \
+  ((v)->value_type != IREE_VM_VALUE_TYPE_NONE)
+#define iree_vm_type_def_is_ref(v) ((v)->ref_type != IREE_VM_REF_TYPE_NULL)
+#define iree_vm_type_def_is_variant(v)           \
+  ((v)->value_type == IREE_VM_VALUE_TYPE_NONE && \
+   (v)->ref_type == IREE_VM_REF_TYPE_NULL)
+
+// A variant value that can be either a primitive value type or a ref type.
+// Each variant value stores its type but users are required to check the type
+// prior to accessing any of the data.
+typedef struct iree_vm_variant_t {
+  iree_vm_type_def_t type;
+  union {
+    // TODO(benvanik): replace with iree_vm_value_t.
+    int8_t i8;
+    int16_t i16;
+    int32_t i32;
+    int64_t i64;
+    float f32;
+    double f64;
+    iree_vm_ref_t ref;
+
+    uint8_t value_storage[IREE_VM_VALUE_STORAGE_SIZE];  // max size of all value
+                                                        // types
+  };
+} iree_vm_variant_t;
+
+#define iree_vm_variant_empty() \
+  { {IREE_VM_VALUE_TYPE_NONE, IREE_VM_REF_TYPE_NULL}, {0}, }
+#define iree_vm_variant_is_value(v) iree_vm_type_def_is_value(&(v).type)
+#define iree_vm_variant_is_ref(v) iree_vm_type_def_is_ref(&(v).type)
+#define iree_vm_variant_is_empty(v) iree_vm_type_def_is_variant(&(v).type)
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_TYPE_DEF_H_
diff --git a/runtime/src/iree/vm/value.h b/runtime/src/iree/vm/value.h
new file mode 100644
index 0000000..445d80f
--- /dev/null
+++ b/runtime/src/iree/vm/value.h
@@ -0,0 +1,133 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_VM_VALUE_H_
+#define IREE_VM_VALUE_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// TODO(benvanik): support variable size in modules. vm.imports would need index
+// type and we'd have to make sure all native modules used this size type. It
+// would be a compiler runtime flag and runtime compile flag.
+// Currently fixed at 32 bits; see TODO above about making it configurable.
+typedef int32_t iree_vm_size_t;
+
+// Defines the type of a primitive value.
+// NOTE(review): ordinals are dense and IREE_VM_VALUE_TYPE_COUNT is "used for
+// lookup" (per the comment below) — presumably table indices; do not renumber
+// without confirming no serialized/tabulated uses.
+typedef enum iree_vm_value_type_e {
+  // Not a value type.
+  IREE_VM_VALUE_TYPE_NONE = 0,
+  // int8_t.
+  IREE_VM_VALUE_TYPE_I8 = 1,
+  // int16_t.
+  IREE_VM_VALUE_TYPE_I16 = 2,
+  // int32_t.
+  IREE_VM_VALUE_TYPE_I32 = 3,
+  // int64_t.
+  IREE_VM_VALUE_TYPE_I64 = 4,
+  // float.
+  IREE_VM_VALUE_TYPE_F32 = 5,
+  // double.
+  IREE_VM_VALUE_TYPE_F64 = 6,
+
+  IREE_VM_VALUE_TYPE_MAX = IREE_VM_VALUE_TYPE_F64,
+  IREE_VM_VALUE_TYPE_COUNT = IREE_VM_VALUE_TYPE_MAX + 1,  // used for lookup
+} iree_vm_value_type_t;
+
+// Maximum size, in bytes, of any value type we can represent.
+// Sized for the largest union members below (int64_t/double = 8 bytes).
+#define IREE_VM_VALUE_STORAGE_SIZE 8
+
+// A variant value type.
+// Callers must check |type| before reading the matching union member.
+typedef struct iree_vm_value_t {
+  iree_vm_value_type_t type;
+  // Anonymous union: all members alias the same storage; value_storage
+  // exposes the full value area as raw bytes.
+  union {
+    int8_t i8;
+    int16_t i16;
+    int32_t i32;
+    int64_t i64;
+    float f32;
+    double f64;
+
+    uint8_t value_storage[IREE_VM_VALUE_STORAGE_SIZE];  // max size of all value
+                                                        // types
+  };
+} iree_vm_value_t;
+
+// Returns a value with no type assigned (IREE_VM_VALUE_TYPE_NONE).
+// The union storage is left uninitialized and must not be read.
+// NOTE: (void) is required in C for a true no-argument prototype; a bare ()
+// declares an unspecified parameter list (pre-C23) and defeats type checking.
+static inline iree_vm_value_t iree_vm_value_make_none(void) {
+  iree_vm_value_t result;
+  result.type = IREE_VM_VALUE_TYPE_NONE;
+  return result;
+}
+
+// Wraps |value| as an i8-typed iree_vm_value_t.
+static inline iree_vm_value_t iree_vm_value_make_i8(int8_t value) {
+  iree_vm_value_t v;
+  v.i8 = value;
+  v.type = IREE_VM_VALUE_TYPE_I8;
+  return v;
+}
+
+// Wraps |value| as an i16-typed iree_vm_value_t.
+static inline iree_vm_value_t iree_vm_value_make_i16(int16_t value) {
+  iree_vm_value_t v;
+  v.i16 = value;
+  v.type = IREE_VM_VALUE_TYPE_I16;
+  return v;
+}
+
+// Wraps |value| as an i32-typed iree_vm_value_t.
+static inline iree_vm_value_t iree_vm_value_make_i32(int32_t value) {
+  iree_vm_value_t v;
+  v.i32 = value;
+  v.type = IREE_VM_VALUE_TYPE_I32;
+  return v;
+}
+
+// TODO(#5542): check the value type before accessing the union.
+// Reads |value| as i32 without validating that value->type is
+// IREE_VM_VALUE_TYPE_I32 (see TODO above).
+static inline int32_t iree_vm_value_get_i32(iree_vm_value_t *value) {
+  return value->i32;
+}
+
+// Wraps |value| as an i64-typed iree_vm_value_t.
+static inline iree_vm_value_t iree_vm_value_make_i64(int64_t value) {
+  iree_vm_value_t v;
+  v.i64 = value;
+  v.type = IREE_VM_VALUE_TYPE_I64;
+  return v;
+}
+
+// TODO(#5542): check the value type before accessing the union.
+// Reads |value| as i64 without validating that value->type is
+// IREE_VM_VALUE_TYPE_I64 (see TODO above).
+static inline int64_t iree_vm_value_get_i64(iree_vm_value_t *value) {
+  return value->i64;
+}
+
+// Wraps |value| as an f32-typed iree_vm_value_t.
+static inline iree_vm_value_t iree_vm_value_make_f32(float value) {
+  iree_vm_value_t v;
+  v.f32 = value;
+  v.type = IREE_VM_VALUE_TYPE_F32;
+  return v;
+}
+
+// TODO(#5542): check the value type before accessing the union.
+// Reads |value| as f32 without validating that value->type is
+// IREE_VM_VALUE_TYPE_F32 (see TODO above).
+static inline float iree_vm_value_get_f32(iree_vm_value_t *value) {
+  return value->f32;
+}
+
+// Wraps |value| as an f64-typed iree_vm_value_t.
+static inline iree_vm_value_t iree_vm_value_make_f64(double value) {
+  iree_vm_value_t v;
+  v.f64 = value;
+  v.type = IREE_VM_VALUE_TYPE_F64;
+  return v;
+}
+
+// TODO(#5542): check the value type before accessing the union.
+// Reads |value| as f64 without validating that value->type is
+// IREE_VM_VALUE_TYPE_F64 (see TODO above).
+static inline double iree_vm_value_get_f64(iree_vm_value_t *value) {
+  return value->f64;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_VM_VALUE_H_