Initial WebGPU HAL implementation.
For now, this code will live in the `experimental/` folder, while the code matures and reaches feature parity with the supported HAL drivers. While some work has been started to add unit and integration tests for the web platform, that work is not complete yet (and it will likely be substantially different from our existing "native" tests). See also the discussion on [RFC: Promoting IREE web platform / Emscripten builds to "stable"](https://groups.google.com/g/iree-discuss/c/2K5VJ9P8K8I/m/elidhfFMCAAJ).
This uses [webgpu-headers](https://github.com/webgpu-native/webgpu-headers) to connect to WebGPU API implementations, either via [Emscripten's library_webgpu.js](https://github.com/emscripten-core/emscripten/blob/main/src/library_webgpu.js) or via native implementations like [Dawn](https://dawn.googlesource.com/dawn/) / [wgpu-native](https://github.com/gfx-rs/wgpu-native). (We originally had a platform switch for these, but dropped that for now.)
diff --git a/experimental/webgpu/BUILD.bazel b/experimental/webgpu/BUILD.bazel
new file mode 100644
index 0000000..a615c5d
--- /dev/null
+++ b/experimental/webgpu/BUILD.bazel
@@ -0,0 +1,62 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_runtime_cc_library(
+    name = "webgpu",
+    srcs = [
+        "api.h",
+        "bind_group_cache.c",
+        "bind_group_cache.h",
+        "buffer.c",
+        "buffer.h",
+        "builtins.c",
+        "builtins.h",
+        "command_buffer.c",
+        "command_buffer.h",
+        "executable.c",
+        "executable.h",
+        "nop_event.c",
+        "nop_event.h",
+        "nop_executable_cache.c",
+        "nop_executable_cache.h",
+        "nop_semaphore.c",
+        "nop_semaphore.h",
+        "pipeline_layout.c",
+        "pipeline_layout.h",
+        "simple_allocator.c",
+        "simple_allocator.h",
+        "staging_buffer.c",
+        "staging_buffer.h",
+        "webgpu_device.c",
+        "webgpu_device.h",
+    ],
+    hdrs = [
+        "api.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        # NOTE: platform/shaders live under experimental/webgpu/ (matching the
+        # CMake deps and the `experimental/webgpu/...` include paths), not the
+        # old runtime/src/iree/hal/drivers/webgpu/ location.
+        "//experimental/webgpu/platform",
+        "//experimental/webgpu/shaders",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/base:tracing",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:arena",
+        "//runtime/src/iree/base/internal/flatcc:parsing",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/utils:buffer_transfer",
+        "//runtime/src/iree/schemas:wgsl_executable_def_c_fbs",
+        "@webgpu_headers",
+    ],
+)
diff --git a/experimental/webgpu/CMakeLists.txt b/experimental/webgpu/CMakeLists.txt
new file mode 100644
index 0000000..add4557
--- /dev/null
+++ b/experimental/webgpu/CMakeLists.txt
@@ -0,0 +1,64 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# TODO(scotttodd): remove after moving out of experimental/, use bazel_to_cmake
+set(IREE_PACKAGE_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
+# Canonicalize path.
+cmake_path(ABSOLUTE_PATH IREE_PACKAGE_ROOT_DIR
+           BASE_DIRECTORY ${IREE_PACKAGE_ROOT_DIR}
+           NORMALIZE
+           OUTPUT_VARIABLE IREE_PACKAGE_ROOT_DIR)
+set(IREE_PACKAGE_ROOT_PREFIX iree)
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+  NAME
+    webgpu
+  HDRS
+    "api.h"
+  SRCS
+    "api.h"
+    "bind_group_cache.c"
+    "bind_group_cache.h"
+    "buffer.c"
+    "buffer.h"
+    "builtins.c"
+    "builtins.h"
+    "command_buffer.c"
+    "command_buffer.h"
+    "executable.c"
+    "executable.h"
+    "nop_event.c"
+    "nop_event.h"
+    "nop_executable_cache.c"
+    "nop_executable_cache.h"
+    "nop_semaphore.c"
+    "nop_semaphore.h"
+    "pipeline_layout.c"
+    "pipeline_layout.h"
+    "simple_allocator.c"
+    "simple_allocator.h"
+    "staging_buffer.c"
+    "staging_buffer.h"
+    "webgpu_device.c"
+    "webgpu_device.h"
+  DEPS
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::arena
+    iree::base::internal::flatcc::parsing
+    iree::base::tracing
+    iree::hal
+    iree::experimental::webgpu::platform
+    iree::experimental::webgpu::shaders
+    iree::hal::utils::buffer_transfer
+    iree::schemas::wgsl_executable_def_c_fbs
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/experimental/webgpu/api.h b/experimental/webgpu/api.h
new file mode 100644
index 0000000..31c5602
--- /dev/null
+++ b/experimental/webgpu/api.h
@@ -0,0 +1,100 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// See iree/base/api.h for documentation on the API conventions used.
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_API_H_
+#define IREE_HAL_DRIVERS_WEBGPU_API_H_
+
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_hal_webgpu_device_t
+//===----------------------------------------------------------------------===//
+
+// TODO(benvanik): replace with flag list (easier to version).
+enum iree_hal_webgpu_device_flag_bits_t {
+  IREE_HAL_WEBGPU_DEVICE_RESERVED = 0u,
+};
+typedef uint32_t iree_hal_webgpu_device_flags_t;
+
+typedef struct iree_hal_webgpu_device_options_t {
+  // Flags controlling device behavior.
+  iree_hal_webgpu_device_flags_t flags;
+
+  // Size of the per-queue uniform staging buffer.
+  // Larger buffer sizes will result in fewer flushes in large command buffers.
+  iree_device_size_t queue_uniform_buffer_size;
+} iree_hal_webgpu_device_options_t;
+
+// Initializes |out_options| to default values.
+IREE_API_EXPORT void iree_hal_webgpu_device_options_initialize(
+    iree_hal_webgpu_device_options_t* out_options);
+
+// Wraps an existing WGPUDevice |handle| in an IREE HAL device.
+// NOTE(review): ownership of |handle| is not documented here - confirm whether
+// the caller must keep |handle| valid until the HAL device is released.
+IREE_API_EXPORT iree_status_t iree_hal_webgpu_wrap_device(
+    iree_string_view_t identifier,
+    const iree_hal_webgpu_device_options_t* options, WGPUDevice handle,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_webgpu_driver_t
+//===----------------------------------------------------------------------===//
+
+typedef enum iree_hal_webgpu_driver_backend_e {
+  IREE_HAL_WEBGPU_DRIVER_BACKEND_ANY = 0u,
+  IREE_HAL_WEBGPU_DRIVER_BACKEND_D3D12 = 1u,
+  IREE_HAL_WEBGPU_DRIVER_BACKEND_METAL = 2u,
+  IREE_HAL_WEBGPU_DRIVER_BACKEND_VULKAN = 3u,
+} iree_hal_webgpu_driver_backend_t;
+
+typedef enum iree_hal_webgpu_driver_log_level_e {
+  IREE_HAL_WEBGPU_DRIVER_LOG_LEVEL_OFF = 0u,
+  IREE_HAL_WEBGPU_DRIVER_LOG_LEVEL_ERROR = 1u,
+  IREE_HAL_WEBGPU_DRIVER_LOG_LEVEL_WARNING = 2u,
+  IREE_HAL_WEBGPU_DRIVER_LOG_LEVEL_INFO = 3u,
+  IREE_HAL_WEBGPU_DRIVER_LOG_LEVEL_DEBUG = 4u,
+  IREE_HAL_WEBGPU_DRIVER_LOG_LEVEL_TRACE = 5u,
+} iree_hal_webgpu_driver_log_level_t;
+
+// WebGPU native driver creation options.
+typedef struct iree_hal_webgpu_driver_options_t {
+  // Logging level for messages logged to stderr. Disabled by default.
+  iree_hal_webgpu_driver_log_level_t log_level;
+
+  // Preferred backend - ignored if backend is not available.
+  iree_hal_webgpu_driver_backend_t backend_preference;
+
+  // TODO(benvanik): remove this single setting - it would be nice instead to
+  // pass a list to force device enumeration/matrix expansion or omit entirely
+  // to have auto-discovered options based on capabilities. Right now this
+  // forces all devices - even if from different vendors - to have the same
+  // options.
+  // Options to use for all devices created by the driver.
+  iree_hal_webgpu_device_options_t device_options;
+
+  // Controls adapter selection when multiple exist in the system having
+  // different power characteristics (such as integrated vs discrete GPUs).
+  WGPUPowerPreference power_preference;
+} iree_hal_webgpu_driver_options_t;
+
+// Initializes |out_options| to default values.
+IREE_API_EXPORT void iree_hal_webgpu_driver_options_initialize(
+    iree_hal_webgpu_driver_options_t* out_options);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVERS_WEBGPU_API_H_
diff --git a/experimental/webgpu/bind_group_cache.c b/experimental/webgpu/bind_group_cache.c
new file mode 100644
index 0000000..db9e2d0
--- /dev/null
+++ b/experimental/webgpu/bind_group_cache.c
@@ -0,0 +1,142 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/bind_group_cache.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcmp/memcpy/memset
+
+#include "experimental/webgpu/buffer.h"
+#include "iree/base/api.h"
+#include "iree/base/internal/math.h"
+#include "iree/base/tracing.h"
+
+void iree_hal_webgpu_bind_group_cache_initialize(
+    WGPUDevice device, iree_hal_webgpu_bind_group_cache_t* out_cache) {
+  IREE_ASSERT_ARGUMENT(out_cache);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  out_cache->device = device;
+  out_cache->entry_count = IREE_HAL_WEBGPU_BIND_GROUP_CACHE_CAPACITY;
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_hal_webgpu_bind_group_cache_deinitialize(
+    iree_hal_webgpu_bind_group_cache_t* cache) {
+  IREE_ASSERT_ARGUMENT(cache);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Trim is the same as deinit today.
+  iree_hal_webgpu_bind_group_cache_trim(cache);
+  cache->entry_count = 0;
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_hal_webgpu_bind_group_cache_trim(
+    iree_hal_webgpu_bind_group_cache_t* cache) {
+  IREE_ASSERT_ARGUMENT(cache);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < cache->entry_count; ++i) {
+    iree_hal_webgpu_bind_group_cache_entry_t* entry = &cache->entries[i];
+    if (entry->handle) iree_wgpuBindGroupDrop(entry->handle);
+  }
+  memset(cache->entries, 0, sizeof(cache->entries));
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+WGPUBindGroup iree_hal_webgpu_bind_group_cache_acquire(
+    iree_hal_webgpu_bind_group_cache_t* cache, WGPUBindGroupLayout group_layout,
+    const iree_hal_webgpu_bind_group_binding_t* bindings,
+    iree_hal_webgpu_binding_mask_t binding_mask) {
+  IREE_ASSERT_ARGUMENT(cache);
+  IREE_ASSERT_ARGUMENT(bindings);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // NOTE: |bindings| must contain IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT
+  // entries: the full array is memcmp'd/memcpy'd below for cache equality.
+
+  // This is not a good algorithm :)
+  // We should probably have a split index and a mechanism to partition it such
+  // that lookups don't need to perform a full scan. This is cheaper than the
+  // cost of creating a new bind group per dispatch (no need to call out to
+  // WebGPU, allocate new objects, track those new objects lifetimes, etc) but
+  // not cheap. Ideally we'd be relying on this for 4-5 bind groups per command
+  // buffer at which point it doesn't matter but the compiler still needs to
+  // improve a bit there.
+
+  // Scan the cache for entries with a matching group layout and binding mask.
+  // These should be the same today but in the future we may want to allow for
+  // subsetting as defined by bind group compatibility.
+  iree_host_size_t insertion_slot = cache->entry_count - 1;
+  for (iree_host_size_t i = 0; i < cache->entry_count; ++i) {
+    iree_hal_webgpu_bind_group_cache_entry_t* entry = &cache->entries[i];
+    if (!entry->handle) {
+      insertion_slot = iree_min(insertion_slot, i);
+      continue;
+    }
+    if (entry->group_layout != group_layout) continue;
+    if (entry->binding_mask != binding_mask) continue;
+
+    // Found a potential match. Do a full comparison of the bindings.
+    // TODO(benvanik): we only really need to compare the bindings that are
+    // set in the mask, however memcmp over a few hundred bytes is usually
+    // faster than what we'd have to do for that comparison.
+    if (memcmp(bindings, entry->bindings, sizeof(entry->bindings)) == 0) {
+      // Same exact bindings - cache hit!
+      // TODO(benvanik): a real LRU that rearranges this to the front/back.
+      IREE_TRACE_ZONE_END(z0);
+      return entry->handle;
+    }
+  }
+
+  // Evict an existing entry to store this new one or use the last unused slot.
+  iree_hal_webgpu_bind_group_cache_entry_t* entry =
+      &cache->entries[insertion_slot];
+  if (entry->handle) {
+    IREE_TRACE_ZONE_APPEND_TEXT(z0, "evict");
+    iree_wgpuBindGroupDrop(entry->handle);
+  } else {
+    IREE_TRACE_ZONE_APPEND_TEXT(z0, "miss");
+  }
+  entry->group_layout = group_layout;
+  entry->binding_mask = binding_mask;
+  memcpy(entry->bindings, bindings, sizeof(entry->bindings));
+
+  // NOTE: we could change this to do bit scans over the binding_mask but I
+  // haven't checked to see how expensive those are in WebAssembly. For now we
+  // do a few more loop iterations with the assumption that doing a bit scan
+  // may require hundreds of more instructions.
+  uint32_t binding_count = 0;
+  WGPUBindGroupEntry entries[IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT];
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(entries); ++i) {
+    if (!(binding_mask & (1u << i))) continue;
+    entries[binding_count] = (WGPUBindGroupEntry){
+        .nextInChain = NULL,
+        .binding = binding_count,
+        .buffer = bindings[i].buffer,
+        .offset = (uint64_t)bindings[i].offset,
+        .size = bindings[i].length,
+    };
+    ++binding_count;
+  }
+
+  const WGPUBindGroupDescriptor descriptor = {
+      .nextInChain = NULL,
+      .label = NULL,
+      .layout = group_layout,
+      .entryCount = binding_count,
+      .entries = entries,
+  };
+  entry->handle = wgpuDeviceCreateBindGroup(cache->device, &descriptor);
+
+  IREE_TRACE_ZONE_END(z0);
+  return entry->handle;
+}
diff --git a/experimental/webgpu/bind_group_cache.h b/experimental/webgpu/bind_group_cache.h
new file mode 100644
index 0000000..68874da
--- /dev/null
+++ b/experimental/webgpu/bind_group_cache.h
@@ -0,0 +1,86 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_BIND_GROUP_CACHE_H_
+#define IREE_HAL_DRIVERS_WEBGPU_BIND_GROUP_CACHE_H_
+
+#include "experimental/webgpu/pipeline_layout.h"
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// NOTE: this is probably too small, but this is all a hack anyway.
+// TODO(benvanik): build a real cache - today this is assuming the compiler does
+// a much better job than it currently does at reducing the number of push
+// descriptor sets.
+#define IREE_HAL_WEBGPU_BIND_GROUP_CACHE_CAPACITY 32
+
+// A subset of WGPUBindGroupEntry containing only what we need.
+// WGPUBindGroupEntry is quite large (has sampler and texture information).
+typedef struct iree_hal_webgpu_bind_group_binding_t {
+  // TODO(benvanik): also track whether dynamic.
+  WGPUBufferBindingType type;
+  WGPUBuffer buffer;
+  iree_device_size_t offset;
+  iree_device_size_t length;
+} iree_hal_webgpu_bind_group_binding_t;
+
+typedef struct iree_hal_webgpu_bind_group_cache_entry_t {
+  // Group layout this bind group conforms to.
+  // It's possible to share bind groups with different compatible layouts but
+  // we don't do that yet and require an exact match.
+  WGPUBindGroupLayout group_layout;
+  // Cached WebGPU bind group containing the bindings.
+  WGPUBindGroup handle;
+  // Each bit indicates a populated binding at the respective ordinal.
+  iree_hal_webgpu_binding_mask_t binding_mask;
+  // Each source binding to use for cache equality comparison.
+  iree_hal_webgpu_bind_group_binding_t
+      bindings[IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT];
+} iree_hal_webgpu_bind_group_cache_entry_t;
+
+// Simple cache of WGPUBindGroups.
+// Bind groups in WebGPU are immutable and we need to create new ones for each
+// unique set of bindings.
+typedef struct iree_hal_webgpu_bind_group_cache_t {
+  WGPUDevice device;
+  iree_host_size_t entry_count;
+  iree_hal_webgpu_bind_group_cache_entry_t
+      entries[IREE_HAL_WEBGPU_BIND_GROUP_CACHE_CAPACITY];
+} iree_hal_webgpu_bind_group_cache_t;
+
+// Initializes an empty bind group cache.
+void iree_hal_webgpu_bind_group_cache_initialize(
+    WGPUDevice device, iree_hal_webgpu_bind_group_cache_t* out_cache);
+
+// Deinitializes the cache and drops all bind group handles.
+void iree_hal_webgpu_bind_group_cache_deinitialize(
+    iree_hal_webgpu_bind_group_cache_t* cache);
+
+// Trims the cache down to its minimum size by dropping all bind groups.
+// All WGPUBindGroup handles will be dropped.
+void iree_hal_webgpu_bind_group_cache_trim(
+    iree_hal_webgpu_bind_group_cache_t* cache);
+
+// Acquires a bind group from the cache with the given |bindings|.
+// Each bit of |binding_mask| indicates a binding that is used by the caller,
+// allowing cached bind groups to match when only the used bindings match.
+// Callers may use the returned bind group handle until the cache is trimmed.
+WGPUBindGroup iree_hal_webgpu_bind_group_cache_acquire(
+    iree_hal_webgpu_bind_group_cache_t* cache, WGPUBindGroupLayout group_layout,
+    const iree_hal_webgpu_bind_group_binding_t* bindings,
+    iree_hal_webgpu_binding_mask_t binding_mask);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVERS_WEBGPU_BIND_GROUP_CACHE_H_
diff --git a/experimental/webgpu/buffer.c b/experimental/webgpu/buffer.c
new file mode 100644
index 0000000..7005a8f
--- /dev/null
+++ b/experimental/webgpu/buffer.c
@@ -0,0 +1,141 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/buffer.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "experimental/webgpu/webgpu_device.h"
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/utils/buffer_transfer.h"
+
+// TODO(benvanik): decouple via injection.
+#include "experimental/webgpu/simple_allocator.h"
+
+typedef struct iree_hal_webgpu_buffer_t {
+  iree_hal_buffer_t base;
+  iree_hal_device_t* device;  // unowned
+  WGPUBuffer handle;
+  bool is_mapped;
+} iree_hal_webgpu_buffer_t;
+
+extern const iree_hal_buffer_vtable_t iree_hal_webgpu_buffer_vtable;
+
+static iree_hal_webgpu_buffer_t* iree_hal_webgpu_buffer_cast(
+    iree_hal_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_buffer_vtable);
+  return (iree_hal_webgpu_buffer_t*)base_value;
+}
+
+// NOTE: takes ownership of |handle|: wgpuBufferDestroy is called on it when
+// the last reference to the wrapping buffer is released.
+iree_status_t iree_hal_webgpu_buffer_wrap(
+    iree_hal_device_t* device, iree_hal_allocator_t* device_allocator,
+    iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    WGPUBuffer handle, iree_allocator_t host_allocator,
+    iree_hal_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(device_allocator);
+  IREE_ASSERT_ARGUMENT(handle);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  *out_buffer = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_webgpu_buffer_t* buffer = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_buffer_initialize(host_allocator, device_allocator, &buffer->base,
+                               allocation_size, byte_offset, byte_length,
+                               memory_type, allowed_access, allowed_usage,
+                               &iree_hal_webgpu_buffer_vtable, &buffer->base);
+    buffer->device = device;
+    buffer->handle = handle;
+    buffer->is_mapped = false;  // explicit, though malloc zero-initializes
+    *out_buffer = &buffer->base;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_webgpu_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  iree_hal_webgpu_buffer_t* buffer = iree_hal_webgpu_buffer_cast(base_buffer);
+  iree_allocator_t host_allocator = base_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // NOTE(review): is_mapped is never set to true in this file (mapping goes
+  // through the emulated path below), so this unmap is currently unreachable.
+  if (buffer->is_mapped) {
+    wgpuBufferUnmap(buffer->handle);
+  }
+
+  // NOTE: this immediately destroys the buffer (in theory) and it must not be
+  // in use. That's ok because we also have that requirement in the HAL.
+  wgpuBufferDestroy(buffer->handle);
+
+  iree_allocator_free(host_allocator, buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+WGPUBuffer iree_hal_webgpu_buffer_handle(const iree_hal_buffer_t* base_buffer) {
+  iree_hal_webgpu_buffer_t* buffer =
+      iree_hal_webgpu_buffer_cast((iree_hal_buffer_t*)base_buffer);
+  IREE_ASSERT_ARGUMENT(buffer);
+  return buffer->handle;
+}
+
+static iree_status_t iree_hal_webgpu_buffer_map_range(
+    iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    iree_hal_buffer_mapping_t* mapping) {
+  // WebGPU does not allow for synchronous buffer mapping.
+  // Use wgpuBufferMapAsync directly to avoid this emulation.
+  iree_hal_webgpu_buffer_t* buffer = iree_hal_webgpu_buffer_cast(base_buffer);
+  return iree_hal_buffer_emulated_map_range(
+      buffer->device, base_buffer, mapping_mode, memory_access,
+      local_byte_offset, local_byte_length, mapping);
+}
+
+static iree_status_t iree_hal_webgpu_buffer_unmap_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) {
+  // WebGPU does not allow for synchronous buffer mapping.
+  // Use wgpuBufferMapAsync directly to avoid this emulation.
+  iree_hal_webgpu_buffer_t* buffer = iree_hal_webgpu_buffer_cast(base_buffer);
+  return iree_hal_buffer_emulated_unmap_range(buffer->device, base_buffer,
+                                              local_byte_offset,
+                                              local_byte_length, mapping);
+}
+
+static iree_status_t iree_hal_webgpu_buffer_invalidate_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  // Nothing to do.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_webgpu_buffer_flush_range(
+    iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset,
+    iree_device_size_t local_byte_length) {
+  // Nothing to do.
+  return iree_ok_status();
+}
+
+const iree_hal_buffer_vtable_t iree_hal_webgpu_buffer_vtable = {
+    .destroy = iree_hal_webgpu_buffer_destroy,
+    .map_range = iree_hal_webgpu_buffer_map_range,
+    .unmap_range = iree_hal_webgpu_buffer_unmap_range,
+    .invalidate_range = iree_hal_webgpu_buffer_invalidate_range,
+    .flush_range = iree_hal_webgpu_buffer_flush_range,
+};
diff --git a/experimental/webgpu/buffer.h b/experimental/webgpu/buffer.h
new file mode 100644
index 0000000..056185d
--- /dev/null
+++ b/experimental/webgpu/buffer.h
@@ -0,0 +1,39 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_BUFFER_H_
+#define IREE_HAL_DRIVERS_WEBGPU_BUFFER_H_
+
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// TODO(benvanik): callback for releasing the buffer back to the allocator if
+// we start to support pooling.
+
+// Wraps |handle| in an iree_hal_buffer_t. Ownership of |handle| transfers to
+// the returned buffer and it is destroyed when the buffer is released.
+iree_status_t iree_hal_webgpu_buffer_wrap(
+    iree_hal_device_t* device, iree_hal_allocator_t* device_allocator,
+    iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    WGPUBuffer handle, iree_allocator_t host_allocator,
+    iree_hal_buffer_t** out_buffer);
+
+// Returns the backing WGPUBuffer of |buffer|. The handle remains owned by
+// |buffer| and is only valid for its lifetime.
+WGPUBuffer iree_hal_webgpu_buffer_handle(const iree_hal_buffer_t* buffer);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVERS_WEBGPU_BUFFER_H_
diff --git a/experimental/webgpu/builtins.c b/experimental/webgpu/builtins.c
new file mode 100644
index 0000000..935bc9e
--- /dev/null
+++ b/experimental/webgpu/builtins.c
@@ -0,0 +1,166 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/builtins.h"
+
+#include <string.h>  // memset/strcmp
+
+#include "experimental/webgpu/shaders/builtin_shaders.h"
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+// Returns the WGSL source text of the builtin shader named |file_name|.
+static const char* iree_hal_webgpu_builtins_find_code(const char* file_name) {
+  const iree_file_toc_t* files = iree_hal_wgsl_builtin_shaders_create();
+  for (size_t i = 0; i < iree_hal_wgsl_builtin_shaders_size(); ++i) {
+    if (strcmp(file_name, files[i].name) == 0) {
+      return files[i].data;
+    }
+  }
+  IREE_ASSERT_TRUE(false, "builtin wgsl file not found");
+  return NULL;
+}
+
+// Creates the fill_buffer builtin pipeline.
+// On success |out_fill_buffer| owns both the pipeline and the buffer bind
+// group layout; they are released in iree_hal_webgpu_builtins_deinitialize.
+static iree_status_t iree_hal_webgpu_builtins_initialize_fill_buffer(
+    WGPUDevice device, iree_hal_webgpu_staging_buffer_t* staging_buffer,
+    iree_hal_webgpu_builtin_fill_buffer_t* out_fill_buffer) {
+  // binding[0]: the storage buffer that will be filled.
+  const WGPUBindGroupLayoutEntry buffer_binding = {
+      .nextInChain = NULL,
+      .binding = 0,
+      .visibility = WGPUShaderStage_Compute,
+      .buffer =
+          {
+              .nextInChain = NULL,
+              .type = WGPUBufferBindingType_Storage,
+              .hasDynamicOffset = false,
+              .minBindingSize = 0,  // variable
+          },
+  };
+
+  const WGPUBindGroupLayoutDescriptor buffer_group_layout_descriptor = {
+      .nextInChain = NULL,
+      .label = WGPU_DEBUG_LABEL("_builtin_fill_buffer_buffer"),
+      .entryCount = 1,
+      .entries = &buffer_binding,
+  };
+  WGPUBindGroupLayout buffer_group_layout =
+      wgpuDeviceCreateBindGroupLayout(device, &buffer_group_layout_descriptor);
+  if (!buffer_group_layout) {
+    return iree_make_status(
+        IREE_STATUS_INTERNAL,
+        "failed to create fill_buffer builtin bind group layout");
+  }
+
+  const WGPUBindGroupLayout group_layouts[] = {
+      staging_buffer->bind_group_layout,
+      buffer_group_layout,
+  };
+  const WGPUPipelineLayoutDescriptor pipeline_layout_descriptor = {
+      .nextInChain = NULL,
+      .label = WGPU_DEBUG_LABEL("_builtin_fill_buffer_layout"),
+      .bindGroupLayoutCount = (uint32_t)IREE_ARRAYSIZE(group_layouts),
+      .bindGroupLayouts = group_layouts,
+  };
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pipeline_layout_descriptor);
+  if (!pipeline_layout) {
+    // NOTE: the group layout is retained below on success; only drop on error.
+    iree_wgpuBindGroupLayoutDrop(buffer_group_layout);
+    return iree_make_status(
+        IREE_STATUS_INTERNAL,
+        "failed to create fill_buffer builtin pipeline layout");
+  }
+
+  const char* code = iree_hal_webgpu_builtins_find_code("fill_buffer.wgsl");
+  const WGPUShaderModuleWGSLDescriptor wgsl_descriptor = {
+      .chain =
+          {
+              .next = NULL,
+              .sType = WGPUSType_ShaderModuleWGSLDescriptor,
+          },
+#if defined(IREE_PLATFORM_EMSCRIPTEN)
+      // Emscripten uses this older name.
+      .source = code,
+#else
+      // Spec uses this name: https://www.w3.org/TR/webgpu/#shader-module-creation
+      .code = code,
+#endif
+  };
+  const WGPUShaderModuleDescriptor module_descriptor = {
+      .nextInChain = &wgsl_descriptor.chain,
+      .label = WGPU_DEBUG_LABEL("_builtin_fill_buffer_wgsl"),
+  };
+  WGPUShaderModule module =
+      wgpuDeviceCreateShaderModule(device, &module_descriptor);
+  if (!module) {
+    iree_wgpuPipelineLayoutDrop(pipeline_layout);
+    iree_wgpuBindGroupLayoutDrop(buffer_group_layout);
+    return iree_make_status(
+        IREE_STATUS_INTERNAL,
+        "failed to create fill_buffer builtin shader module");
+  }
+
+  const WGPUComputePipelineDescriptor pipeline_descriptor = {
+      .nextInChain = NULL,
+      .label = WGPU_DEBUG_LABEL("_builtin_fill_buffer"),
+      .layout = pipeline_layout,
+      .compute =
+          {
+              .nextInChain = NULL,
+              .module = module,
+              .entryPoint = "main",
+          },
+  };
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_descriptor);
+  // The pipeline keeps any references it needs; our module and pipeline layout
+  // handles are no longer required regardless of success.
+  iree_wgpuShaderModuleDrop(module);
+  iree_wgpuPipelineLayoutDrop(pipeline_layout);
+  if (!pipeline) {
+    iree_wgpuBindGroupLayoutDrop(buffer_group_layout);
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "failed to create fill_buffer builtin pipeline");
+  }
+
+  // Ownership of |buffer_group_layout| transfers to |out_fill_buffer|: it is
+  // needed for per-dispatch bind group creation and dropped in deinitialize.
+  out_fill_buffer->pipeline = pipeline;
+  out_fill_buffer->buffer_group_layout = buffer_group_layout;
+  return iree_ok_status();
+}
+
+iree_status_t iree_hal_webgpu_builtins_initialize(
+    WGPUDevice device, iree_hal_webgpu_staging_buffer_t* staging_buffer,
+    iree_hal_webgpu_builtins_t* out_builtins) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(staging_buffer);
+  IREE_ASSERT_ARGUMENT(out_builtins);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  memset(out_builtins, 0, sizeof(*out_builtins));
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_webgpu_builtins_initialize_fill_buffer(
+              device, staging_buffer, &out_builtins->fill_buffer));
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+void iree_hal_webgpu_builtins_deinitialize(
+    iree_hal_webgpu_builtins_t* builtins) {
+  IREE_ASSERT_ARGUMENT(builtins);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_wgpuBindGroupLayoutDrop(builtins->fill_buffer.buffer_group_layout);
+  iree_wgpuComputePipelineDrop(builtins->fill_buffer.pipeline);
+
+  memset(builtins, 0, sizeof(*builtins));
+  IREE_TRACE_ZONE_END(z0);
+}
diff --git a/experimental/webgpu/builtins.h b/experimental/webgpu/builtins.h
new file mode 100644
index 0000000..c32c271
--- /dev/null
+++ b/experimental/webgpu/builtins.h
@@ -0,0 +1,46 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_BUILTINS_H_
+#define IREE_HAL_DRIVERS_WEBGPU_BUILTINS_H_
+
+#include "experimental/webgpu/platform/webgpu.h"
+#include "experimental/webgpu/staging_buffer.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Pipeline and bind group layout for the builtin fill_buffer compute shader.
+typedef struct iree_hal_webgpu_builtin_fill_buffer_t {
+  // groupIndex[1]
+  //   binding[0]: target
+  WGPUBindGroupLayout buffer_group_layout;
+  WGPUComputePipeline pipeline;
+} iree_hal_webgpu_builtin_fill_buffer_t;
+
+// Builtin compute pipelines used to emulate HAL operations.
+typedef struct iree_hal_webgpu_builtins_t {
+  iree_hal_webgpu_builtin_fill_buffer_t fill_buffer;
+} iree_hal_webgpu_builtins_t;
+
+// Initializes |out_builtins| by creating the builtin pipelines on |device|.
+// |staging_buffer|'s bind group layout is referenced during pipeline creation.
+iree_status_t iree_hal_webgpu_builtins_initialize(
+    WGPUDevice device, iree_hal_webgpu_staging_buffer_t* staging_buffer,
+    iree_hal_webgpu_builtins_t* out_builtins);
+
+// Drops all WebGPU resources created by iree_hal_webgpu_builtins_initialize.
+void iree_hal_webgpu_builtins_deinitialize(
+    iree_hal_webgpu_builtins_t* builtins);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVERS_WEBGPU_BUILTINS_H_
diff --git a/experimental/webgpu/command_buffer.c b/experimental/webgpu/command_buffer.c
new file mode 100644
index 0000000..b8efd48
--- /dev/null
+++ b/experimental/webgpu/command_buffer.c
@@ -0,0 +1,946 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/command_buffer.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "experimental/webgpu/buffer.h"
+#include "experimental/webgpu/executable.h"
+#include "experimental/webgpu/pipeline_layout.h"
+#include "iree/base/api.h"
+#include "iree/base/internal/arena.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// Segmented submission management
+//===----------------------------------------------------------------------===//
+// WebGPU - like Metal - has a rather obtuse multi-level recording model with
+// the most obtuse design point being that DMA operations happen on the queue
+// directly. In trying to model a single command buffer we may need to make
+// multiple ordered submissions to the device queue, which is unfortunate as
+// the queue submission routine only takes command buffers and we need to
+// interleave the command buffer submissions with other queue operations.
+
+// Kind of queue operation a recorded segment performs at submission time.
+typedef enum iree_hal_webgpu_command_segment_action_e {
+  // wgpuQueueSubmit of a command buffer.
+  IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_EXECUTE,
+  // wgpuQueueWriteBuffer for a host->device transfer.
+  IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER,
+} iree_hal_webgpu_command_segment_action_t;
+
+struct iree_hal_webgpu_command_segment_t;
+// One queue operation recorded into a command buffer. Segments are allocated
+// from the command buffer's arena and chained in recording order.
+typedef struct iree_hal_webgpu_command_segment_t {
+  struct iree_hal_webgpu_command_segment_t* next_segment;
+  iree_hal_webgpu_command_segment_action_t action;
+  union {
+    // IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_EXECUTE
+    struct {
+      WGPUCommandBuffer command_buffer;
+    } execute;
+    // IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER
+    struct {
+      // Host memory to upload; points at arena storage placed immediately
+      // after the segment struct by the recording code.
+      const void* source_buffer;
+      iree_host_size_t source_offset;
+      WGPUBuffer target_buffer;
+      iree_device_size_t target_offset;
+      iree_host_size_t length;
+    } write_buffer;
+  };
+} iree_hal_webgpu_command_segment_t;
+
+// Singly-linked FIFO list of segments issued in order at submission time.
+typedef struct iree_hal_webgpu_command_segment_list_t {
+  iree_hal_webgpu_command_segment_t* head;
+  iree_hal_webgpu_command_segment_t* tail;
+} iree_hal_webgpu_command_segment_list_t;
+
+// Releases per-segment resources and clears |list| back to empty.
+// EXECUTE segments own their WGPUCommandBuffer and must drop it here;
+// WRITE_BUFFER segments only reference arena-allocated host memory that is
+// reclaimed when the arena is reset and need no per-segment cleanup.
+static void iree_hal_webgpu_command_segment_list_reset(
+    iree_hal_webgpu_command_segment_list_t* list) {
+  for (iree_hal_webgpu_command_segment_t* segment = list->head; segment;
+       segment = segment->next_segment) {
+    switch (segment->action) {
+      case IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_EXECUTE:
+        // Previously this drop lived on the WRITE_BUFFER case, which both
+        // leaked every EXECUTE command buffer and read the wrong union member.
+        iree_wgpuCommandBufferDrop(segment->execute.command_buffer);
+        break;
+      default:
+      case IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER:
+        // Nothing to do.
+        break;
+    }
+  }
+  memset(list, 0, sizeof(*list));
+}
+
+// Links |segment| in as the new head of |list|.
+static void iree_hal_webgpu_command_segment_list_push_front(
+    iree_hal_webgpu_command_segment_list_t* list,
+    iree_hal_webgpu_command_segment_t* segment) {
+  iree_hal_webgpu_command_segment_t* old_head = list->head;
+  segment->next_segment = old_head;
+  list->head = segment;
+  // First element of an empty list is both head and tail.
+  if (list->tail == NULL) {
+    list->tail = segment;
+  }
+}
+
+// Appends |segment| to the end of |list|, preserving recording order.
+static void iree_hal_webgpu_command_segment_list_push_back(
+    iree_hal_webgpu_command_segment_list_t* list,
+    iree_hal_webgpu_command_segment_t* segment) {
+  segment->next_segment = NULL;
+  iree_hal_webgpu_command_segment_t* old_tail = list->tail;
+  if (old_tail == NULL) {
+    // Empty list: segment becomes both head and tail.
+    list->head = segment;
+  } else {
+    old_tail->next_segment = segment;
+  }
+  list->tail = segment;
+}
+
+// Submits an EXECUTE segment's recorded command buffer to |queue|.
+static void iree_hal_webgpu_command_segment_issue_execute(
+    iree_hal_webgpu_command_segment_t* segment, WGPUQueue queue) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  wgpuQueueSubmit(queue, 1, &segment->execute.command_buffer);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Issues a queue-ordered host->device upload for a WRITE_BUFFER segment.
+// The source memory lives in the command buffer arena and remains valid for
+// the lifetime of the recorded segments.
+static void iree_hal_webgpu_command_segment_issue_write_buffer(
+    iree_hal_webgpu_command_segment_t* segment, WGPUQueue queue) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  wgpuQueueWriteBuffer(queue, segment->write_buffer.target_buffer,
+                       segment->write_buffer.target_offset,
+                       ((const uint8_t*)segment->write_buffer.source_buffer) +
+                           segment->write_buffer.source_offset,
+                       segment->write_buffer.length);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_webgpu_command_buffer_t
+//===----------------------------------------------------------------------===//
+
+// Recorded command buffer: commands are captured into WebGPU encoders and a
+// list of queue-ordered segments that are replayed on submission.
+typedef struct iree_hal_webgpu_command_buffer_t {
+  iree_hal_command_buffer_t base;
+  // Allocator this struct was allocated from; used again on destroy.
+  iree_allocator_t host_allocator;
+  // Parent device handle used to create encoders. Unowned; presumably kept
+  // alive by the owning HAL device — confirm against device.c.
+  WGPUDevice device;
+
+  // Shared staging uniform buffer with queue-ordered data. We use this
+  // for push constant emulation by recording all of the push constants per
+  // dispatch and then updating the buffer prior to issuing the commands using
+  // it. This works because there's no out-of-order or overlapping execution in
+  // WebGPU (unfortunately) and we know that if we write in queue-order the
+  // updates will be visible to the subsequently issued commands.
+  iree_hal_webgpu_staging_buffer_t* staging_buffer;
+
+  // Device-shared WGPUBindGroup cache.
+  iree_hal_webgpu_bind_group_cache_t* bind_group_cache;
+
+  // Shaders emulating functionality not present in WebGPU.
+  // Owned by the parent device.
+  iree_hal_webgpu_builtins_t* builtins;
+
+  // Arena used for all allocations; references the shared device block pool.
+  iree_arena_allocator_t arena;
+
+  // Linked list of queue submission actions.
+  iree_hal_webgpu_command_segment_list_t segments;
+
+  struct {
+    // Valid only when recording.
+    WGPUCommandEncoder encoder;
+    // Currently open pass - NULL if no open pass.
+    WGPUComputePassEncoder compute_pass;
+
+    // All available push constants updated each time push_constants is called.
+    // Reset only with the command buffer and otherwise will maintain its values
+    // during recording to allow for partial push_constants updates.
+    uint32_t push_constants[IREE_HAL_WEBGPU_MAX_PUSH_CONSTANT_COUNT];
+
+    // TODO(benvanik): add a push_constants dirty bit so we know if we need to
+    // upload more. Today we'll stage the same values for each dispatch.
+
+    // Snapshot of descriptor sets as populated by push_descriptor_set.
+    // Each push_descriptor_set will invalidate the bind group handle and
+    // subsequent dispatches will acquire new bind groups from the cache. If
+    // future updates are no-ops the same bind group handle can be used.
+    struct {
+      WGPUBindGroup handle;
+      iree_hal_webgpu_bind_group_binding_t
+          bindings[IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT];
+    } bind_groups[IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_COUNT];
+
+    // Bitfield tracking which bind groups are set to an empty group.
+    uint64_t bind_groups_empty;
+  } state;
+} iree_hal_webgpu_command_buffer_t;
+
+extern const iree_hal_command_buffer_vtable_t
+ iree_hal_webgpu_command_buffer_vtable;
+
+// Downcasts |base_value| after asserting it is a webgpu command buffer.
+static iree_hal_webgpu_command_buffer_t* iree_hal_webgpu_command_buffer_cast(
+    iree_hal_command_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_command_buffer_vtable);
+  return (iree_hal_webgpu_command_buffer_t*)base_value;
+}
+
+// Creates a deferred-style command buffer that records WebGPU work into
+// segments replayed on iree_hal_webgpu_command_buffer_issue. All referenced
+// objects (block pool, staging buffer, cache, builtins) are unowned and are
+// presumably kept alive by the parent device — confirm with device.c.
+iree_status_t iree_hal_webgpu_command_buffer_create(
+    iree_hal_device_t* device, WGPUDevice device_handle,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
+    iree_arena_block_pool_t* block_pool,
+    iree_hal_webgpu_staging_buffer_t* staging_buffer,
+    iree_hal_webgpu_bind_group_cache_t* bind_group_cache,
+    iree_hal_webgpu_builtins_t* builtins, iree_allocator_t host_allocator,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(block_pool);
+  IREE_ASSERT_ARGUMENT(staging_buffer);
+  IREE_ASSERT_ARGUMENT(bind_group_cache);
+  IREE_ASSERT_ARGUMENT(builtins);
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  *out_command_buffer = NULL;
+
+  if (binding_capacity > 0) {
+    // TODO(#10144): support indirect command buffers with binding tables.
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "indirect command buffers not yet implemented");
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_webgpu_command_buffer_t* command_buffer = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(*command_buffer), (void**)&command_buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_command_buffer_initialize(
+        device, mode, command_categories, queue_affinity, binding_capacity,
+        &iree_hal_webgpu_command_buffer_vtable, &command_buffer->base);
+    command_buffer->host_allocator = host_allocator;
+    command_buffer->device = device_handle;
+    command_buffer->staging_buffer = staging_buffer;
+    command_buffer->bind_group_cache = bind_group_cache;
+    command_buffer->builtins = builtins;
+
+    // Recording state is arena-backed; the arena draws blocks from the shared
+    // device pool and is reset whenever the command buffer is reset.
+    iree_arena_initialize(block_pool, &command_buffer->arena);
+    iree_hal_webgpu_command_segment_list_reset(&command_buffer->segments);
+
+    *out_command_buffer = &command_buffer->base;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Returns true if |command_buffer| is an iree_hal_webgpu_command_buffer_t.
+bool iree_hal_webgpu_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer) {
+  return iree_hal_resource_is(&command_buffer->resource,
+                              &iree_hal_webgpu_command_buffer_vtable);
+}
+
+// vtable dyn_cast hook: returns |command_buffer| when |vtable| matches the
+// webgpu implementation and NULL otherwise.
+static void* iree_hal_webgpu_command_buffer_dyn_cast(
+    iree_hal_command_buffer_t* command_buffer, const void* vtable) {
+  if (vtable != &iree_hal_webgpu_command_buffer_vtable) return NULL;
+  IREE_HAL_ASSERT_TYPE(command_buffer, vtable);
+  return command_buffer;
+}
+
+// Discards all recorded state and returns the command buffer to its initial
+// (empty) recording state.
+static void iree_hal_webgpu_command_buffer_reset(
+    iree_hal_webgpu_command_buffer_t* command_buffer) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  if (command_buffer->state.compute_pass) {
+    wgpuComputePassEncoderEnd(command_buffer->state.compute_pass);
+  }
+  if (command_buffer->state.encoder) {
+    // WebGPU has no way to abandon an open encoder: finish it and immediately
+    // drop the resulting command buffer to dispose of the recorded work.
+    const WGPUCommandBufferDescriptor descriptor = {
+        .nextInChain = NULL,
+        .label = NULL,
+    };
+    iree_wgpuCommandBufferDrop(
+        wgpuCommandEncoderFinish(command_buffer->state.encoder, &descriptor));
+    command_buffer->state.encoder = NULL;
+  }
+
+  command_buffer->state.bind_groups_empty = 0;
+
+  // Segments are reset before the arena that backs their storage.
+  iree_hal_webgpu_staging_buffer_reset(command_buffer->staging_buffer);
+  iree_hal_webgpu_command_segment_list_reset(&command_buffer->segments);
+  iree_arena_reset(&command_buffer->arena);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Releases all recorded state and frees the command buffer itself using the
+// allocator it was created with.
+static void iree_hal_webgpu_command_buffer_destroy(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+  // Capture the allocator before the struct memory is freed below.
+  iree_allocator_t host_allocator = command_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_webgpu_command_buffer_reset(command_buffer);
+  iree_arena_deinitialize(&command_buffer->arena);
+  iree_allocator_free(host_allocator, command_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Replays all recorded segments against |queue| in recording order,
+// interleaving command buffer submissions with queue-ordered buffer writes.
+iree_status_t iree_hal_webgpu_command_buffer_issue(
+    iree_hal_command_buffer_t* base_command_buffer, WGPUQueue queue) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+  IREE_ASSERT(command_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_hal_webgpu_command_segment_t* segment =
+           command_buffer->segments.head;
+       segment; segment = segment->next_segment) {
+    switch (segment->action) {
+      case IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_EXECUTE:
+        iree_hal_webgpu_command_segment_issue_execute(segment, queue);
+        break;
+      case IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER:
+        iree_hal_webgpu_command_segment_issue_write_buffer(segment, queue);
+        break;
+      default:
+        // Unknown actions are silently skipped.
+        break;
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Finalizes the open command encoder (if any) into a WGPUCommandBuffer and
+// records it as an EXECUTE segment. No-op when nothing is being encoded.
+static iree_status_t iree_hal_webgpu_command_buffer_flush_encoder(
+    iree_hal_webgpu_command_buffer_t* command_buffer) {
+  if (!command_buffer->state.encoder) return iree_ok_status();
+
+  // End any open compute pass.
+  if (command_buffer->state.compute_pass) {
+    wgpuComputePassEncoderEnd(command_buffer->state.compute_pass);
+    command_buffer->state.compute_pass = NULL;
+  }
+
+  // Finalize encoder and produce a command buffer.
+  const WGPUCommandBufferDescriptor descriptor = {
+      .nextInChain = NULL,
+      .label = NULL,
+  };
+  WGPUCommandBuffer handle =
+      wgpuCommandEncoderFinish(command_buffer->state.encoder, &descriptor);
+  command_buffer->state.encoder = NULL;
+
+  iree_hal_webgpu_command_segment_t* segment = NULL;
+  iree_status_t status = iree_arena_allocate(
+      &command_buffer->arena, sizeof(*segment), (void**)&segment);
+  if (iree_status_is_ok(status)) {
+    // Attach the command buffer segment.
+    segment->action = IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_EXECUTE;
+    segment->execute.command_buffer = handle;
+    iree_hal_webgpu_command_segment_list_push_back(&command_buffer->segments,
+                                                   segment);
+  } else {
+    // Failed to record the segment; drop the handle to avoid leaking it.
+    iree_wgpuCommandBufferDrop(handle);
+  }
+  return status;
+}
+
+// Returns an open command encoder, creating one if needed. Any open compute
+// pass is ended first as WebGPU disallows encoding outside a pass while one
+// is active.
+static iree_status_t iree_hal_webgpu_command_buffer_acquire_command_encoder(
+    iree_hal_webgpu_command_buffer_t* command_buffer,
+    WGPUCommandEncoder* out_command_encoder) {
+  // Close active compute pass, if any.
+  if (command_buffer->state.compute_pass) {
+    wgpuComputePassEncoderEnd(command_buffer->state.compute_pass);
+    command_buffer->state.compute_pass = NULL;
+  }
+
+  // Reuse an open encoder, if any.
+  if (command_buffer->state.encoder) {
+    *out_command_encoder = command_buffer->state.encoder;
+    return iree_ok_status();
+  }
+
+  // Open a new encoder.
+  // NOTE(review): the created encoder is not checked for NULL/device errors;
+  // failures surface via the WebGPU error callback instead — confirm intended.
+  const WGPUCommandEncoderDescriptor descriptor = {
+      .nextInChain = NULL,
+      .label = NULL,
+  };
+  command_buffer->state.encoder =
+      wgpuDeviceCreateCommandEncoder(command_buffer->device, &descriptor);
+  *out_command_encoder = command_buffer->state.encoder;
+
+  return iree_ok_status();
+}
+
+// Returns an open compute pass encoder, beginning a new pass (and possibly a
+// new command encoder) if needed. Beginning a fresh pass invalidates all
+// cached bind group state.
+static iree_status_t iree_hal_webgpu_command_buffer_acquire_compute_pass(
+    iree_hal_webgpu_command_buffer_t* command_buffer,
+    WGPUComputePassEncoder* out_compute_pass) {
+  // Reuse an open compute pass, if any.
+  if (command_buffer->state.compute_pass) {
+    *out_compute_pass = command_buffer->state.compute_pass;
+    return iree_ok_status();
+  }
+
+  // Open/reuse an encoder.
+  WGPUCommandEncoder command_encoder = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_acquire_command_encoder(
+      command_buffer, &command_encoder));
+
+  // Open a new compute pass.
+  const WGPUComputePassDescriptor descriptor = {
+      .nextInChain = NULL,
+      .label = NULL,
+  };
+  command_buffer->state.compute_pass =
+      wgpuCommandEncoderBeginComputePass(command_encoder, &descriptor);
+  *out_compute_pass = command_buffer->state.compute_pass;
+
+  // Reset all device-side state for the compute pass - nothing carries over
+  // across passes and we will need to rebind things.
+  for (iree_host_size_t i = 0;
+       i < IREE_ARRAYSIZE(command_buffer->state.bind_groups); ++i) {
+    command_buffer->state.bind_groups[i].handle = NULL;
+  }
+  command_buffer->state.bind_groups_empty = 0;
+
+  return iree_ok_status();
+}
+
+// Ends the current encoding segment: finalizes any open encoder and snapshots
+// the staging buffer contents into a WRITE_BUFFER segment so the staging
+// memory can be reused for subsequent parameters.
+static iree_status_t iree_hal_webgpu_command_buffer_flush(
+    iree_hal_webgpu_command_buffer_t* command_buffer) {
+  // Flush any active encoder as we are beginning a new segment.
+  IREE_RETURN_IF_ERROR(
+      iree_hal_webgpu_command_buffer_flush_encoder(command_buffer));
+
+  // Flush the staging buffer to get the upload parameters.
+  void* source_buffer = NULL;
+  WGPUBuffer target_buffer = NULL;
+  iree_host_size_t upload_length = 0;
+  iree_hal_webgpu_staging_buffer_flush(command_buffer->staging_buffer,
+                                       &source_buffer, &target_buffer,
+                                       &upload_length);
+
+  // NOTE(review): a zero-length flush still records a WRITE_BUFFER segment
+  // with length 0 — confirm wgpuQueueWriteBuffer tolerates that, or consider
+  // early-returning when upload_length == 0.
+
+  // Enqueue new segment.
+  uint8_t* storage_base = NULL;
+  iree_hal_webgpu_command_segment_t* segment = NULL;
+  IREE_RETURN_IF_ERROR(iree_arena_allocate(&command_buffer->arena,
+                                           sizeof(*segment) + upload_length,
+                                           (void**)&storage_base));
+
+  // Copy the staging upload data into the command buffer so the host staging
+  // buffer can be reused immediately. This results in an extra copy but this
+  // is mostly small. We could - if executing inline - submit this to the
+  // queue immediately without the segment overhead.
+  uint8_t* storage_buffer = storage_base + sizeof(*segment);
+  memcpy(storage_buffer, source_buffer, upload_length);
+
+  // Attach the write_buffer segment.
+  segment = (iree_hal_webgpu_command_segment_t*)storage_base;
+  segment->action = IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER;
+  segment->write_buffer.source_buffer = storage_buffer;
+  segment->write_buffer.source_offset = 0;
+  segment->write_buffer.target_buffer = target_buffer;
+  segment->write_buffer.target_offset = 0;
+  segment->write_buffer.length = upload_length;
+  iree_hal_webgpu_command_segment_list_push_back(&command_buffer->segments,
+                                                 segment);
+
+  return iree_ok_status();
+}
+
+// Stages |source| bytes into the shared staging buffer, returning the offset
+// of the data in |out_offset|. On staging-buffer exhaustion the current
+// segment is flushed and the append retried once.
+static iree_status_t iree_hal_webgpu_command_buffer_append_parameters(
+    iree_hal_webgpu_command_buffer_t* command_buffer,
+    iree_const_byte_span_t source, uint32_t* out_offset) {
+  // Try to append the parameters - this may fail if the staging buffer is
+  // exhausted and needs to be flushed. If so we flush and then try again.
+  iree_status_t try_status = iree_hal_webgpu_staging_buffer_append(
+      command_buffer->staging_buffer, source, out_offset);
+  if (iree_status_is_ok(try_status) ||
+      !iree_status_is_resource_exhausted(try_status)) {
+    return try_status;  // NOTE: may be a failure.
+  }
+
+  // Flush any pending commands and the current staging buffer state.
+  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_flush(command_buffer));
+
+  // Try to stage the parameters again. If this fails it's not because it needed
+  // a flush.
+  return iree_hal_webgpu_staging_buffer_append(command_buffer->staging_buffer,
+                                               source, out_offset);
+}
+
+// Begins recording by discarding any previously recorded state.
+static iree_status_t iree_hal_webgpu_command_buffer_begin(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+  iree_hal_webgpu_command_buffer_reset(command_buffer);
+  return iree_ok_status();
+}
+
+// Ends recording, flushing any open encoder and pending staging data into
+// their final segments so the command buffer is ready to issue.
+static iree_status_t iree_hal_webgpu_command_buffer_end(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+  return iree_hal_webgpu_command_buffer_flush(command_buffer);
+}
+
+// Pushes a debug group label onto the command encoder. |label_color| and
+// |location| are ignored as WebGPU debug groups carry only a string.
+static void iree_hal_webgpu_command_buffer_begin_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
+    iree_hal_label_color_t label_color,
+    const iree_hal_label_location_t* location) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+
+  WGPUCommandEncoder command_encoder = NULL;
+  iree_status_t status = iree_hal_webgpu_command_buffer_acquire_command_encoder(
+      command_buffer, &command_encoder);
+  if (!iree_status_is_ok(status)) {
+    // TODO(benvanik): mark recording as failed.
+    iree_status_ignore(status);
+    return;
+  }
+
+  // TODO(benvanik): ensure this works right when in a compute pass.
+  // The label view is not NUL-terminated; copy into a zeroed stack buffer,
+  // truncating to 127 chars so the terminator is always preserved.
+  char label_str[128] = {0};
+  memcpy(label_str, label.data, iree_min(sizeof(label_str) - 1, label.size));
+  wgpuCommandEncoderPushDebugGroup(command_encoder, label_str);
+}
+
+// Pops the most recent debug group label from the command encoder.
+static void iree_hal_webgpu_command_buffer_end_debug_group(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+
+  WGPUCommandEncoder command_encoder = NULL;
+  iree_status_t status = iree_hal_webgpu_command_buffer_acquire_command_encoder(
+      command_buffer, &command_encoder);
+  if (!iree_status_is_ok(status)) {
+    // TODO(benvanik): mark recording as failed.
+    iree_status_ignore(status);
+    return;
+  }
+
+  wgpuCommandEncoderPopDebugGroup(command_encoder);
+}
+
+// HAL barrier hook; WebGPU synchronizes all access implicitly so there is
+// nothing to record.
+static iree_status_t iree_hal_webgpu_command_buffer_execution_barrier(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_hal_execution_barrier_flags_t flags,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  // No-op: barriers are automatic in WebGPU.
+  return iree_ok_status();
+}
+
+// HAL event hooks; WebGPU has no intra-queue event primitive so signal, reset,
+// and wait are all recorded as no-ops (execution is already strictly ordered).
+static iree_status_t iree_hal_webgpu_command_buffer_signal_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // No-op: no events in WebGPU.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_webgpu_command_buffer_reset_event(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
+    iree_hal_execution_stage_t source_stage_mask) {
+  // No-op: no events in WebGPU.
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_webgpu_command_buffer_wait_events(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_host_size_t event_count, const iree_hal_event_t** events,
+    iree_hal_execution_stage_t source_stage_mask,
+    iree_hal_execution_stage_t target_stage_mask,
+    iree_host_size_t memory_barrier_count,
+    const iree_hal_memory_barrier_t* memory_barriers,
+    iree_host_size_t buffer_barrier_count,
+    const iree_hal_buffer_barrier_t* buffer_barriers) {
+  // No-op: no events in WebGPU.
+  return iree_ok_status();
+}
+
+// HAL discard hint; WebGPU offers no equivalent so the hint is dropped.
+static iree_status_t iree_hal_webgpu_command_buffer_discard_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
+  // No-op: though maybe it'd be a useful addition to the spec as otherwise
+  // false dependencies can creep in.
+  return iree_ok_status();
+}
+
+// Replicates a 1-, 2-, or 4-byte fill pattern across a full 32-bit word.
+// Any other pattern length yields 0 (callers have already validated length).
+static uint32_t iree_hal_webgpu_splat_pattern(const void* pattern,
+                                              size_t pattern_length) {
+  switch (pattern_length) {
+    case 1:
+      // Broadcast the byte into all four lanes: 0xAB -> 0xABABABAB.
+      return (uint32_t)(*(const uint8_t*)pattern) * 0x01010101u;
+    case 2:
+      // Broadcast the halfword into both lanes: 0x1234 -> 0x12341234.
+      return (uint32_t)(*(const uint16_t*)pattern) * 0x00010001u;
+    case 4:
+      // Already a full word; pass through unchanged.
+      return *(const uint32_t*)pattern;
+    default:
+      return 0;  // Already verified that this should not be possible.
+  }
+}
+
+// Fills |length| bytes of |target_buffer| at |target_offset| with |pattern|.
+// Zero patterns at 4-byte alignment use the native clear; everything else is
+// emulated with the fill_buffer builtin compute shader.
+static iree_status_t iree_hal_webgpu_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+
+  iree_hal_webgpu_builtin_fill_buffer_t* builtin =
+      &command_buffer->builtins->fill_buffer;
+  // Adjust to an absolute offset within the underlying allocation.
+  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+
+  // TODO(scotttodd): change to using what the vulkan emulation does
+  uint32_t dword_pattern =
+      iree_hal_webgpu_splat_pattern(pattern, pattern_length);
+
+  // If the pattern is zero and both the offset and length are multiples of 4,
+  // we can use the native wgpuCommandEncoderClearBuffer function. Otherwise,
+  // we dispatch our own fill emulation shader.
+  uint32_t zero_pattern = 0;
+  if (memcmp(&dword_pattern, &zero_pattern, pattern_length) == 0 &&
+      target_offset % 4 == 0 && length % 4 == 0) {
+    WGPUCommandEncoder command_encoder = NULL;
+    IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_acquire_command_encoder(
+        command_buffer, &command_encoder));
+
+    wgpuCommandEncoderClearBuffer(
+        command_encoder,
+        iree_hal_webgpu_buffer_handle(
+            iree_hal_buffer_allocated_buffer(target_buffer)),
+        target_offset, length);
+    return iree_ok_status();
+  }
+
+  // need to handle %4!=0 offset and pattern length as with vulkan
+
+  // Upload push constant data - this may incur a segment flush if the staging
+  // buffer is exhausted.
+  // NOTE(review): offset/length are iree_device_size_t narrowed to uint32_t
+  // here; fills beyond 4GB would truncate — confirm acceptable for WebGPU.
+  const uint32_t params_data[] = {
+      /*offset=*/target_offset,
+      /*length=*/length,
+      /*pattern=*/dword_pattern,
+  };
+  uint32_t params_offset = 0;
+  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_append_parameters(
+      command_buffer,
+      iree_make_const_byte_span(params_data, sizeof(params_data)),
+      &params_offset));
+
+  // Acquire the compute pass we'll encode the dispatch into - this may be
+  // fresh or reused from prior commands.
+  WGPUComputePassEncoder compute_pass = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_acquire_compute_pass(
+      command_buffer, &compute_pass));
+  wgpuComputePassEncoderSetPipeline(compute_pass, builtin->pipeline);
+
+  // Bind the push constant emulation bind group at the staging buffer relative
+  // offset for this dispatch.
+  wgpuComputePassEncoderSetBindGroup(compute_pass, /*groupIndex=*/0,
+                                     command_buffer->staging_buffer->bind_group,
+                                     1, &params_offset);
+  // Invalidate the cached handle so the next dispatch rebinds group 0.
+  command_buffer->state.bind_groups[0].handle = NULL;
+
+  // Grab a (probably uncached) bind group for the target buffer binding.
+  const iree_hal_webgpu_bind_group_binding_t buffer_binding = {
+      .type = WGPUBufferBindingType_Storage,
+      .buffer = iree_hal_webgpu_buffer_handle(
+          iree_hal_buffer_allocated_buffer(target_buffer)),
+      .offset = 0,
+      .length = length,
+  };
+  WGPUBindGroup buffer_group = iree_hal_webgpu_bind_group_cache_acquire(
+      command_buffer->bind_group_cache, builtin->buffer_group_layout,
+      &buffer_binding, /*binding_mask=*/1);
+  wgpuComputePassEncoderSetBindGroup(compute_pass, /*groupIndex=*/1,
+                                     buffer_group, 0, NULL);
+  command_buffer->state.bind_groups[1].handle = NULL;
+
+  // NOTE: this is not the right way to do this - we need to be tiling inside
+  // the fill.
+  wgpuComputePassEncoderDispatchWorkgroups(compute_pass, length, 1, 1);
+
+  return iree_ok_status();
+}
+
+// Records a host->device upload of |length| bytes from |source_buffer| into
+// |target_buffer| as a queue-ordered WRITE_BUFFER segment.
+// NOTE(review): unlike fill_buffer above, |target_offset| is not adjusted by
+// iree_hal_buffer_byte_offset — confirm subspan buffers are handled correctly.
+static iree_status_t iree_hal_webgpu_command_buffer_update_buffer(
+    iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
+    iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
+    iree_device_size_t target_offset, iree_device_size_t length) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+
+  // Flush any active encoder as we are beginning a new segment.
+  IREE_RETURN_IF_ERROR(
+      iree_hal_webgpu_command_buffer_flush_encoder(command_buffer));
+
+  // Enqueue new segment.
+  uint8_t* storage_base = NULL;
+  iree_hal_webgpu_command_segment_t* segment = NULL;
+  iree_status_t status = iree_arena_allocate(
+      &command_buffer->arena, sizeof(*segment) + length, (void**)&storage_base);
+  if (iree_status_is_ok(status)) {
+    // Copy the update data into the command buffer so the user can change
+    // it immediately after this call returns. This results in a double copy
+    // because we need to put it in our command buffer and then when issuing
+    // copy again into the WebGPU queue. Thankfully these updates are restricted
+    // to a handful of KB so that's not really our biggest inefficiency.
+    uint8_t* storage_buffer = storage_base + sizeof(*segment);
+    memcpy(storage_buffer, (const uint8_t*)source_buffer + source_offset,
+           length);
+
+    // Attach the write_buffer segment.
+    segment = (iree_hal_webgpu_command_segment_t*)storage_base;
+    segment->action = IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER;
+    segment->write_buffer.source_buffer = storage_buffer;
+    segment->write_buffer.source_offset = 0;
+    segment->write_buffer.target_buffer =
+        iree_hal_webgpu_buffer_handle(target_buffer);
+    segment->write_buffer.target_offset = target_offset;
+    segment->write_buffer.length = length;
+    iree_hal_webgpu_command_segment_list_push_back(&command_buffer->segments,
+                                                   segment);
+  }
+  return status;
+}
+
+// Records a device->device copy via the command encoder.
+// NOTE(review): offsets are used as-is without adding each buffer's
+// iree_hal_buffer_byte_offset (fill_buffer does add it) — confirm subspan
+// buffers are handled correctly by iree_hal_webgpu_buffer_handle.
+static iree_status_t iree_hal_webgpu_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+
+  WGPUCommandEncoder command_encoder = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_acquire_command_encoder(
+      command_buffer, &command_encoder));
+
+  wgpuCommandEncoderCopyBufferToBuffer(
+      command_encoder, iree_hal_webgpu_buffer_handle(source_buffer),
+      source_offset, iree_hal_webgpu_buffer_handle(target_buffer),
+      target_offset, length);
+
+  return iree_ok_status();
+}
+
+// Stages push constant |values| into command buffer state at byte |offset|;
+// the values are uploaded via the staging buffer when dispatches are recorded.
+static iree_status_t iree_hal_webgpu_command_buffer_push_constants(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_pipeline_layout_t* pipeline_layout, iree_host_size_t offset,
+    const void* values, iree_host_size_t values_length) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+
+  // NOTE: must be > (not >=): a write ending exactly at the end of the
+  // push constant array is valid and previously was rejected.
+  if (IREE_UNLIKELY(offset + values_length >
+                    sizeof(command_buffer->state.push_constants))) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "push constant range %zu (length=%zu) out of range",
+                            offset, values_length);
+  }
+
+  // NOTE: command buffer state change only; enqueues no tasks.
+  memcpy((uint8_t*)&command_buffer->state.push_constants + offset, values,
+         values_length);
+
+  return iree_ok_status();
+}
+
+// Snapshots |bindings| into the command buffer state for descriptor set
+// |set| and invalidates its cached bind group; the next dispatch acquires a
+// fresh WGPUBindGroup from the device cache.
+static iree_status_t iree_hal_webgpu_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t* base_command_buffer,
+    iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t* bindings) {
+  iree_hal_webgpu_command_buffer_t* command_buffer =
+      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+
+  // NOTE: we don't check for redundant sets here as the compiler should have
+  // done that for us.
+  command_buffer->state.bind_groups[set].handle = NULL;
+  iree_hal_webgpu_bind_group_binding_t* group_bindings =
+      command_buffer->state.bind_groups[set].bindings;
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    uint32_t ordinal = bindings[i].binding;
+    if (ordinal >= IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "binding ordinal %d is out of range, must be 0-%d", ordinal,
+          IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT);
+    }
+    iree_hal_webgpu_bind_group_binding_t* group_binding =
+        &group_bindings[bindings[i].binding];
+
+    // TODO(benvanik): lookup binding type from layout. We should also be
+    // tagging whether it's dynamic here.
+    group_binding->type = WGPUBufferBindingType_Storage;
+
+    // NULL buffers are recorded as-is (sparse binding sets are allowed).
+    group_binding->buffer =
+        bindings[i].buffer ? iree_hal_webgpu_buffer_handle(bindings[i].buffer)
+                           : NULL;
+    group_binding->offset = bindings[i].offset;
+    group_binding->length = bindings[i].length;
+  }
+
+  return iree_ok_status();
+}
+
+// Prepares a compute pass for dispatching entry point |ordinal| of
+// |executable|: uploads push constant data, binds the compute pipeline, and
+// sets all required bind groups (padding with empty bind groups up to the
+// params group when push constants are used). On success |out_compute_pass|
+// receives the pass the caller must encode the dispatch into.
+static iree_status_t iree_hal_webgpu_command_buffer_prepare_dispatch(
+ iree_hal_webgpu_command_buffer_t* command_buffer,
+ iree_hal_executable_t* executable, uint32_t ordinal,
+ WGPUComputePassEncoder* out_compute_pass) {
+ const iree_hal_webgpu_entry_point_t* entry_point =
+ iree_hal_webgpu_executable_lookup_entry_point(executable, ordinal);
+
+ // Upload push constant data - this may incur a segment flush if the staging
+ // buffer is exhausted.
+ iree_host_size_t push_constant_count =
+ iree_hal_webgpu_pipeline_layout_push_constant_count(entry_point->layout);
+ iree_const_byte_span_t push_constant_data = iree_make_const_byte_span(
+ command_buffer->state.push_constants,
+ push_constant_count * sizeof(command_buffer->state.push_constants[0]));
+ uint32_t params_offset = 0;
+ IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_append_parameters(
+ command_buffer, push_constant_data, &params_offset));
+
+ // Acquire the compute pass we'll encode the dispatch into - this may be
+ // fresh or reused from prior commands.
+ WGPUComputePassEncoder compute_pass = NULL;
+ IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_acquire_compute_pass(
+ command_buffer, &compute_pass));
+ wgpuComputePassEncoderSetPipeline(compute_pass, entry_point->pipeline);
+
+ if (push_constant_count > 0) {
+ // Bind the push constant emulation bind group at the staging buffer
+ // relative offset for this dispatch (one dynamic offset).
+ wgpuComputePassEncoderSetBindGroup(
+ compute_pass, IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX,
+ command_buffer->staging_buffer->bind_group, 1, &params_offset);
+ }
+
+ // Set all bindings.
+ const iree_hal_webgpu_set_binding_info_t* binding_info =
+ iree_hal_webgpu_pipeline_layout_set_binding_info(entry_point->layout);
+ for (iree_host_size_t i = 0; i < binding_info->set_count; ++i) {
+ // If there are no bindings in this set we can skip it.
+ if (binding_info->set_masks[i] == 0) continue;
+
+ // If there is a bind group handle then it means we've done the lookup and
+ // set the bind group on the device already - we can skip.
+ if (command_buffer->state.bind_groups[i].handle) continue;
+
+ // Acquire the bind group to use for the current descriptor set.
+ WGPUBindGroup handle = iree_hal_webgpu_bind_group_cache_acquire(
+ command_buffer->bind_group_cache, binding_info->set_layouts[i],
+ command_buffer->state.bind_groups[i].bindings,
+ binding_info->set_masks[i]);
+
+ // NOTE: today we don't support dynamic offsets for push descriptor sets.
+ // This will be a larger change we'll need to handle in the compiler. If we
+ // wanted to improve caching we could make all the bindings dynamic and then
+ // always cache the base offsets, however
+ // maxDynamicStorageBuffersPerPipelineLayout is minimally 4 and that's not
+ // a lot of bindings.
+ wgpuComputePassEncoderSetBindGroup(compute_pass, (uint32_t)i, handle, 0,
+ NULL);
+ command_buffer->state.bind_groups[i].handle = handle;
+ // Clear the "empty" marker: this slot now holds a real bind group.
+ command_buffer->state.bind_groups_empty &= ~(1ull << i);
+ }
+
+ if (push_constant_count > 0) {
+ // Pad up to IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX with empty bind groups.
+ WGPUBindGroup empty_handle =
+ command_buffer->staging_buffer->empty_bind_group;
+ for (iree_host_size_t i = binding_info->set_count;
+ i < IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX; ++i) {
+ // Skip if an empty group is already set at this index.
+ if ((command_buffer->state.bind_groups_empty >> i) & 1ull) continue;
+
+ wgpuComputePassEncoderSetBindGroup(compute_pass, (uint32_t)i,
+ empty_handle, 0, NULL);
+ command_buffer->state.bind_groups[i].handle = empty_handle;
+ command_buffer->state.bind_groups_empty |= 1ull << i;
+ }
+ }
+
+ *out_compute_pass = compute_pass;
+ return iree_ok_status();
+}
+
+// Records a direct dispatch of |executable| entry point |entry_point| with a
+// workgroup_x/y/z grid into the command buffer's current compute pass.
+static iree_status_t iree_hal_webgpu_command_buffer_dispatch(
+ iree_hal_command_buffer_t* base_command_buffer,
+ iree_hal_executable_t* executable, int32_t entry_point,
+ uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+ iree_hal_webgpu_command_buffer_t* command_buffer =
+ iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+
+ // Shared setup (push constants, pipeline, bind groups) for the dispatch.
+ WGPUComputePassEncoder compute_pass = NULL;
+ IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_prepare_dispatch(
+ command_buffer, executable, entry_point, &compute_pass));
+ wgpuComputePassEncoderDispatchWorkgroups(compute_pass, workgroup_x,
+ workgroup_y, workgroup_z);
+
+ return iree_ok_status();
+}
+
+// Records an indirect dispatch whose workgroup counts are read by the GPU
+// from |workgroups_buffer| at |workgroups_offset| (byte offset into the
+// buffer - presumably 3 consecutive uint32 x/y/z values; confirm against the
+// WebGPU indirect dispatch layout).
+static iree_status_t iree_hal_webgpu_command_buffer_dispatch_indirect(
+ iree_hal_command_buffer_t* base_command_buffer,
+ iree_hal_executable_t* executable, int32_t entry_point,
+ iree_hal_buffer_t* workgroups_buffer,
+ iree_device_size_t workgroups_offset) {
+ iree_hal_webgpu_command_buffer_t* command_buffer =
+ iree_hal_webgpu_command_buffer_cast(base_command_buffer);
+
+ // Shared setup (push constants, pipeline, bind groups) for the dispatch.
+ WGPUComputePassEncoder compute_pass = NULL;
+ IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_prepare_dispatch(
+ command_buffer, executable, entry_point, &compute_pass));
+ wgpuComputePassEncoderDispatchWorkgroupsIndirect(
+ compute_pass, iree_hal_webgpu_buffer_handle(workgroups_buffer),
+ workgroups_offset);
+
+ return iree_ok_status();
+}
+
+// Unimplemented: WebGPU has no reusable/secondary command encoder concept so
+// nested command buffer execution always fails today.
+static iree_status_t iree_hal_webgpu_command_buffer_execute_commands(
+ iree_hal_command_buffer_t* base_command_buffer,
+ iree_hal_command_buffer_t* base_commands,
+ iree_hal_buffer_binding_table_t binding_table) {
+ // TODO(#10144): support indirect command buffers via deferred command buffers
+ // as WebGPU has no concept of reusable dispatch command encoders. One day
+ // hopefully there's an equivalent of GPURenderBundle but given WebGPU's other
+ // limitations it may not be useful.
+ return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+ "indirect command buffers not yet implemented");
+}
+
+// Vtable wiring the WebGPU command buffer implementation into the HAL
+// command buffer interface.
+const iree_hal_command_buffer_vtable_t iree_hal_webgpu_command_buffer_vtable = {
+ .destroy = iree_hal_webgpu_command_buffer_destroy,
+ .begin = iree_hal_webgpu_command_buffer_begin,
+ .end = iree_hal_webgpu_command_buffer_end,
+ .begin_debug_group = iree_hal_webgpu_command_buffer_begin_debug_group,
+ .end_debug_group = iree_hal_webgpu_command_buffer_end_debug_group,
+ .execution_barrier = iree_hal_webgpu_command_buffer_execution_barrier,
+ .signal_event = iree_hal_webgpu_command_buffer_signal_event,
+ .reset_event = iree_hal_webgpu_command_buffer_reset_event,
+ .wait_events = iree_hal_webgpu_command_buffer_wait_events,
+ .discard_buffer = iree_hal_webgpu_command_buffer_discard_buffer,
+ .fill_buffer = iree_hal_webgpu_command_buffer_fill_buffer,
+ .update_buffer = iree_hal_webgpu_command_buffer_update_buffer,
+ .copy_buffer = iree_hal_webgpu_command_buffer_copy_buffer,
+ .push_constants = iree_hal_webgpu_command_buffer_push_constants,
+ .push_descriptor_set = iree_hal_webgpu_command_buffer_push_descriptor_set,
+ .dispatch = iree_hal_webgpu_command_buffer_dispatch,
+ .dispatch_indirect = iree_hal_webgpu_command_buffer_dispatch_indirect,
+ .execute_commands = iree_hal_webgpu_command_buffer_execute_commands,
+};
diff --git a/experimental/webgpu/command_buffer.h b/experimental/webgpu/command_buffer.h
new file mode 100644
index 0000000..2cc5780
--- /dev/null
+++ b/experimental/webgpu/command_buffer.h
@@ -0,0 +1,40 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_COMMAND_BUFFER_H_
+#define IREE_HAL_DRIVERS_WEBGPU_COMMAND_BUFFER_H_
+
+#include "experimental/webgpu/bind_group_cache.h"
+#include "experimental/webgpu/builtins.h"
+#include "experimental/webgpu/platform/webgpu.h"
+#include "experimental/webgpu/staging_buffer.h"
+#include "iree/base/api.h"
+#include "iree/base/internal/arena.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Creates a WebGPU HAL command buffer recording against |device_handle|.
+// |block_pool|, |staging_buffer|, |bind_group_cache|, and |builtins| are
+// shared device-owned resources and must outlive the command buffer.
+iree_status_t iree_hal_webgpu_command_buffer_create(
+ iree_hal_device_t* device, WGPUDevice device_handle,
+ iree_hal_command_buffer_mode_t mode,
+ iree_hal_command_category_t command_categories,
+ iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
+ iree_arena_block_pool_t* block_pool,
+ iree_hal_webgpu_staging_buffer_t* staging_buffer,
+ iree_hal_webgpu_bind_group_cache_t* bind_group_cache,
+ iree_hal_webgpu_builtins_t* builtins, iree_allocator_t host_allocator,
+ iree_hal_command_buffer_t** out_command_buffer);
+
+// Submits the recorded commands in |command_buffer| to |queue|.
+iree_status_t iree_hal_webgpu_command_buffer_issue(
+ iree_hal_command_buffer_t* command_buffer, WGPUQueue queue);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_COMMAND_BUFFER_H_
diff --git a/experimental/webgpu/cts/CMakeLists.txt b/experimental/webgpu/cts/CMakeLists.txt
new file mode 100644
index 0000000..f9744b7
--- /dev/null
+++ b/experimental/webgpu/cts/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_hal_cts_test_suite(
+ DRIVER_NAME
+ webgpu
+ DRIVER_REGISTRATION_HDR
+ "experimental/webgpu/registration/driver_module.h"
+ DRIVER_REGISTRATION_FN
+ "iree_hal_webgpu_driver_module_register"
+ COMPILER_TARGET_BACKEND
+ "webgpu"
+ EXECUTABLE_FORMAT
+ "\"webgpu-wgsl-fb\""
+ DEPS
+ iree::experimental::webgpu::registration
+)
diff --git a/experimental/webgpu/executable.c b/experimental/webgpu/executable.c
new file mode 100644
index 0000000..b0c872e
--- /dev/null
+++ b/experimental/webgpu/executable.c
@@ -0,0 +1,331 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/executable.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/inline_array.h"
+#include "iree/base/tracing.h"
+
+// flatcc schemas:
+#include "iree/base/internal/flatcc/parsing.h"
+#include "iree/schemas/wgsl_executable_def_reader.h"
+#include "iree/schemas/wgsl_executable_def_verifier.h"
+
+// A HAL executable holding one compiled compute pipeline per entry point.
+typedef struct iree_hal_webgpu_executable_t {
+ iree_hal_resource_t resource;
+ // Allocator the executable itself was allocated from; used on destroy.
+ iree_allocator_t host_allocator;
+ // Number of valid elements in the trailing entry_points array.
+ iree_host_size_t entry_point_count;
+ // Flexible array of per-entry-point pipelines, allocated inline.
+ iree_hal_webgpu_entry_point_t entry_points[];
+} iree_hal_webgpu_executable_t;
+
+extern const iree_hal_executable_vtable_t iree_hal_webgpu_executable_vtable;
+
+// Downcasts |base_value| after asserting it is a WebGPU executable.
+static iree_hal_webgpu_executable_t* iree_hal_webgpu_executable_cast(
+ iree_hal_executable_t* base_value) {
+ IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_executable_vtable);
+ return (iree_hal_webgpu_executable_t*)base_value;
+}
+
+// Verifies the structure of the flatbuffer.
+// Verifies the structure of the flatbuffer.
+// Checks that every shader module has non-empty WGSL code, that the entry
+// point count matches |expected_entry_point_count|, and that every entry
+// point references a valid shader module ordinal.
+static iree_status_t iree_hal_webgpu_executable_flatbuffer_verify(
+ iree_const_byte_span_t flatbuffer_data,
+ iree_host_size_t expected_entry_point_count) {
+ // Cheap sanity check: reject missing or obviously-truncated buffers before
+ // running the full flatcc verifier.
+ if (!flatbuffer_data.data || flatbuffer_data.data_length < 16) {
+ return iree_make_status(
+ IREE_STATUS_INVALID_ARGUMENT,
+ "flatbuffer data is not present or less than 16 bytes (%zu total)",
+ flatbuffer_data.data_length);
+ }
+
+ // Run flatcc generated verification. This ensures all pointers are in-bounds
+ // and that we can safely walk the file, but not that the actual contents of
+ // the flatbuffer meet our expectations.
+ int verify_ret = iree_hal_wgsl_ExecutableDef_verify_as_root(
+ flatbuffer_data.data, flatbuffer_data.data_length);
+ if (verify_ret != flatcc_verify_ok) {
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+ "flatbuffer verification failed: %s",
+ flatcc_verify_error_string(verify_ret));
+ }
+
+ iree_hal_wgsl_ExecutableDef_table_t executable_def =
+ iree_hal_wgsl_ExecutableDef_as_root(flatbuffer_data.data);
+
+ // Every shader module must carry WGSL source to compile.
+ iree_hal_wgsl_ShaderModuleDef_vec_t shader_modules_vec =
+ iree_hal_wgsl_ExecutableDef_shader_modules_get(executable_def);
+ size_t shader_module_count =
+ iree_hal_wgsl_ShaderModuleDef_vec_len(shader_modules_vec);
+ for (size_t i = 0; i < shader_module_count; ++i) {
+ iree_hal_wgsl_ShaderModuleDef_table_t shader_module_def =
+ iree_hal_wgsl_ShaderModuleDef_vec_at(shader_modules_vec, i);
+ if (flatbuffers_string_len(
+ iree_hal_wgsl_ShaderModuleDef_code_get(shader_module_def)) == 0) {
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+ "shader module %zu WGSL code is missing/empty",
+ i);
+ }
+ }
+
+ // Entry point count must match what the caller provided layouts for.
+ flatbuffers_uint32_vec_t entry_points_vec =
+ iree_hal_wgsl_ExecutableDef_entry_points_get(executable_def);
+ size_t entry_point_count = flatbuffers_uint32_vec_len(entry_points_vec);
+ if (entry_point_count != expected_entry_point_count) {
+ return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+ "executable provides %zu entry points but caller "
+ "provided %zu; must match",
+ entry_point_count, expected_entry_point_count);
+ }
+
+ // Each entry point indexes into the shader module list; bounds-check now so
+ // pipeline creation can index without re-validating.
+ for (size_t i = 0; i < entry_point_count; ++i) {
+ uint32_t module_ordinal = flatbuffers_uint32_vec_at(entry_points_vec, i);
+ if (module_ordinal >= shader_module_count) {
+ return iree_make_status(
+ IREE_STATUS_INVALID_ARGUMENT,
+ "executable entry point %zu references an invalid shader module %d",
+ i, module_ordinal);
+ }
+ }
+
+ return iree_ok_status();
+}
+
+// Creates a WGPUShaderModule from the WGSL source in |shader_module_def|.
+// Returns IREE_STATUS_INTERNAL if the underlying implementation rejects the
+// module (no more detailed error info is surfaced today).
+static iree_status_t iree_hal_webgpu_create_wgsl_shader_module(
+ WGPUDevice device, iree_hal_wgsl_ShaderModuleDef_table_t shader_module_def,
+ WGPUShaderModule* out_shader_module) {
+ IREE_ASSERT_ARGUMENT(shader_module_def);
+ IREE_ASSERT_ARGUMENT(out_shader_module);
+ *out_shader_module = NULL;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // NUL-terminated WGSL source; lifetime is the flatbuffer's, which outlives
+ // this call.
+ const char* code = iree_hal_wgsl_ShaderModuleDef_code_get(shader_module_def);
+
+ const WGPUShaderModuleWGSLDescriptor descriptor = {
+ .chain =
+ {
+ .next = NULL,
+ .sType = WGPUSType_ShaderModuleWGSLDescriptor,
+ },
+#if defined(IREE_PLATFORM_EMSCRIPTEN)
+ // Emscripten uses this older name.
+ .source = code,
+#else
+ // Spec uses this name: https://www.w3.org/TR/webgpu/#shader-module-creation
+ .code = code,
+#endif
+ };
+ const WGPUShaderModuleDescriptor module_descriptor = {
+ .nextInChain = &descriptor.chain,
+ .label = NULL,
+ };
+ *out_shader_module = wgpuDeviceCreateShaderModule(device, &module_descriptor);
+ iree_status_t status = iree_ok_status();
+ if (!*out_shader_module) {
+ // TODO(benvanik): see if we can get more detailed error info.
+ status = iree_make_status(IREE_STATUS_INTERNAL,
+ "wgpuDeviceCreateShaderModule failed");
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+// Enough room for `d` + max uint32 characters + NUL.
+#define IREE_HAL_WEBGPU_MAX_ENTRY_NAME_LENGTH (1 + /*uint32*/ 10 + /*NUL*/ 1)
+
+// Makes a canonical entry point name based on its entry ordinal.
+// |buffer| must have at least IREE_HAL_WEBGPU_MAX_ENTRY_NAME_LENGTH bytes
+// available; the result is NUL-terminated.
+// Example: ordinal 3 => 'd3'
+static void iree_hal_webgpu_make_entry_name(uint32_t entry_ordinal,
+ char* buffer) {
+ // Inlined base 10 unsigned itoa-like.
+ // Generates the string in reverse and then flips it around.
+ // It's not worth pulling in snprintf for this.
+ buffer[0] = 'd';
+ ++buffer;
+ uint32_t n = entry_ordinal;
+ int length = 0;
+ // Emit least-significant digit first; do/while so ordinal 0 yields "0".
+ do {
+ buffer[length++] = '0' + (n % 10);
+ } while ((n /= 10) > 0);
+ buffer[length] = '\0';
+ // Reverse the digits in place to most-significant-first order.
+ for (int i = 0, j = length - 1; i < j; ++i, --j) {
+ char c = buffer[i];
+ buffer[i] = buffer[j];
+ buffer[j] = c;
+ }
+}
+
+// TODO(benvanik): switch to async compilation using
+// wgpuDeviceCreateComputePipelineAsync. We pack all pipelines into a single
+// executable (usually) and can batch compilation of all of them and only
+// join at the end. Technically we could extend the join point until first use
+// but it's harder to reason about lifetime that way. Today we just compile
+// them all synchronously.
+// TODO(benvanik): switch to async compilation using
+// wgpuDeviceCreateComputePipelineAsync. We pack all pipelines into a single
+// executable (usually) and can batch compilation of all of them and only
+// join at the end. Technically we could extend the join point until first use
+// but it's harder to reason about lifetime that way. Today we just compile
+// them all synchronously.
+//
+// Synchronously compiles the compute pipeline for |entry_ordinal| of
+// |shader_module| and on success populates |out_entry_point|, retaining
+// |pipeline_layout| (released in executable destroy).
+static iree_status_t iree_hal_webgpu_create_pipeline(
+ WGPUDevice device, WGPUShaderModule shader_module, uint32_t entry_ordinal,
+ iree_hal_pipeline_layout_t* pipeline_layout,
+ iree_hal_webgpu_entry_point_t* out_entry_point) {
+ IREE_ASSERT_ARGUMENT(shader_module);
+ IREE_ASSERT_ARGUMENT(pipeline_layout);
+ IREE_ASSERT_ARGUMENT(out_entry_point);
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Canonical 'd<ordinal>' name matching the compiler-generated entry point.
+ char entry_name[IREE_HAL_WEBGPU_MAX_ENTRY_NAME_LENGTH] = {0};
+ iree_hal_webgpu_make_entry_name(entry_ordinal, entry_name);
+
+ const WGPUComputePipelineDescriptor pipeline_descriptor = {
+ .nextInChain = NULL,
+ .label = WGPU_DEBUG_LABEL(entry_name),
+ .layout = iree_hal_webgpu_pipeline_layout_handle(pipeline_layout),
+ .compute =
+ {
+ .nextInChain = NULL,
+ .module = shader_module,
+ .entryPoint = entry_name,
+ },
+ };
+
+ WGPUComputePipeline pipeline =
+ wgpuDeviceCreateComputePipeline(device, &pipeline_descriptor);
+ iree_status_t status = iree_ok_status();
+ if (!pipeline) {
+ status = iree_make_status(IREE_STATUS_INTERNAL,
+ "wgpuDeviceCreateComputePipeline "
+ "failed for entry point '%s'",
+ entry_name);
+ }
+
+ if (iree_status_is_ok(status)) {
+ out_entry_point->pipeline = pipeline;
+ out_entry_point->layout = pipeline_layout;
+ // Keep the layout alive as long as the pipeline references it.
+ iree_hal_pipeline_layout_retain(pipeline_layout);
+ }
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+// Creates a WebGPU executable from a `webgpu-wgsl-fb` flatbuffer: compiles
+// all shader modules, then one compute pipeline per entry point (one pipeline
+// layout per entry point is expected in |executable_params|).
+iree_status_t iree_hal_webgpu_executable_create(
+ WGPUDevice device, const iree_hal_executable_params_t* executable_params,
+ iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) {
+ IREE_ASSERT_ARGUMENT(device);
+ IREE_ASSERT_ARGUMENT(executable_params);
+ IREE_ASSERT_ARGUMENT(out_executable);
+ *out_executable = NULL;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Verify support up-front - the code below assumes the data is a WGSL
+ // flatbuffer and only handles that format.
+ if (!iree_string_view_equal(executable_params->executable_format,
+ iree_make_cstring_view("webgpu-wgsl-fb"))) {
+ IREE_TRACE_ZONE_END(z0);
+ return iree_make_status(
+ IREE_STATUS_UNIMPLEMENTED,
+ "executable format '%.*s' not available in this build",
+ (int)executable_params->executable_format.size,
+ executable_params->executable_format.data);
+ }
+
+ // Verify and fetch the executable flatbuffer wrapper.
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_hal_webgpu_executable_flatbuffer_verify(
+ executable_params->executable_data,
+ executable_params->pipeline_layout_count));
+ iree_hal_wgsl_ExecutableDef_table_t executable_def =
+ iree_hal_wgsl_ExecutableDef_as_root(
+ executable_params->executable_data.data);
+
+ // Create shader modules. This will be cheap on some implementations like
+ // Metal that need pipeline information in order to be JIT'ed from WGSL while
+ // on others it can be more expensive.
+ iree_hal_wgsl_ShaderModuleDef_vec_t shader_modules_vec =
+ iree_hal_wgsl_ExecutableDef_shader_modules_get(executable_def);
+ size_t shader_module_count =
+ iree_hal_wgsl_ShaderModuleDef_vec_len(shader_modules_vec);
+ iree_inline_array(WGPUShaderModule, shader_modules, shader_module_count,
+ host_allocator);
+ // Zero so that modules not yet created can be detected on the error path.
+ memset(iree_inline_array_data(shader_modules), 0,
+ sizeof(WGPUShaderModule) * shader_module_count);
+ iree_status_t status = iree_ok_status();
+ for (size_t i = 0; i < shader_module_count; ++i) {
+ status = iree_hal_webgpu_create_wgsl_shader_module(
+ device, iree_hal_wgsl_ShaderModuleDef_vec_at(shader_modules_vec, i),
+ iree_inline_array_at(shader_modules, i));
+ if (!iree_status_is_ok(status)) break;
+ }
+
+ // Allocate the executable with storage for the pipeline handles.
+ iree_hal_webgpu_executable_t* executable = NULL;
+ if (iree_status_is_ok(status)) {
+ iree_host_size_t total_size =
+ sizeof(*executable) + executable_params->pipeline_layout_count *
+ sizeof(iree_hal_webgpu_entry_point_t);
+ status =
+ iree_allocator_malloc(host_allocator, total_size, (void**)&executable);
+ }
+
+ if (iree_status_is_ok(status)) {
+ iree_hal_resource_initialize(&iree_hal_webgpu_executable_vtable,
+ &executable->resource);
+ executable->host_allocator = host_allocator;
+ executable->entry_point_count = executable_params->pipeline_layout_count;
+
+ // Create one pipeline per entry point.
+ flatbuffers_uint32_vec_t entry_points_vec =
+ iree_hal_wgsl_ExecutableDef_entry_points_get(executable_def);
+ for (iree_host_size_t i = 0; i < executable->entry_point_count; i++) {
+ uint32_t module_ordinal = flatbuffers_uint32_vec_at(entry_points_vec, i);
+ status = iree_hal_webgpu_create_pipeline(
+ device, *iree_inline_array_at(shader_modules, module_ordinal), i,
+ executable_params->pipeline_layouts[i], &executable->entry_points[i]);
+ if (!iree_status_is_ok(status)) break;
+ }
+ }
+
+ // Shader modules are no longer needed once pipelines exist (or creation
+ // failed). NOTE(review): on an early failure some entries are still NULL
+ // here - assumes iree_wgpuShaderModuleDrop tolerates NULL handles; confirm.
+ for (size_t i = 0; i < shader_module_count; ++i) {
+ iree_wgpuShaderModuleDrop(*iree_inline_array_at(shader_modules, i));
+ }
+ iree_inline_array_deinitialize(shader_modules);
+
+ if (iree_status_is_ok(status)) {
+ *out_executable = (iree_hal_executable_t*)executable;
+ } else {
+ // NOTE(review): |executable| may be NULL if allocation failed - assumes
+ // iree_hal_executable_destroy tolerates NULL; confirm.
+ iree_hal_executable_destroy((iree_hal_executable_t*)executable);
+ }
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+// Releases all per-entry-point pipelines/layouts and frees the executable.
+static void iree_hal_webgpu_executable_destroy(
+ iree_hal_executable_t* base_executable) {
+ iree_hal_webgpu_executable_t* executable =
+ iree_hal_webgpu_executable_cast(base_executable);
+ // Capture before freeing the struct that stores it.
+ iree_allocator_t host_allocator = executable->host_allocator;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ for (iree_host_size_t i = 0; i < executable->entry_point_count; i++) {
+ iree_hal_webgpu_entry_point_t* entry_point = &executable->entry_points[i];
+ iree_hal_pipeline_layout_release(entry_point->layout);
+ iree_wgpuComputePipelineDrop(entry_point->pipeline);
+ }
+ iree_allocator_free(host_allocator, executable);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the entry point at |ordinal|; asserts (debug-only) that the ordinal
+// is in range - callers are expected to pass validated ordinals.
+const iree_hal_webgpu_entry_point_t*
+iree_hal_webgpu_executable_lookup_entry_point(
+ iree_hal_executable_t* base_executable, uint32_t ordinal) {
+ iree_hal_webgpu_executable_t* executable =
+ iree_hal_webgpu_executable_cast(base_executable);
+ IREE_ASSERT_LT(ordinal, executable->entry_point_count);
+ return &executable->entry_points[ordinal];
+}
+
+const iree_hal_executable_vtable_t iree_hal_webgpu_executable_vtable = {
+ .destroy = iree_hal_webgpu_executable_destroy,
+};
diff --git a/experimental/webgpu/executable.h b/experimental/webgpu/executable.h
new file mode 100644
index 0000000..2c6cacd
--- /dev/null
+++ b/experimental/webgpu/executable.h
@@ -0,0 +1,42 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_EXECUTABLE_H_
+#define IREE_HAL_DRIVERS_WEBGPU_EXECUTABLE_H_
+
+#include <stdint.h>
+
+#include "experimental/webgpu/pipeline_layout.h"
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Compiled pipeline state for a single executable entry point.
+typedef struct iree_hal_webgpu_entry_point_t {
+ WGPUComputePipeline pipeline;
+ // TODO(benvanik): inline what's needed here (WGPUPipelineLayout, binding
+ // info, etc) instead so that we avoid needing to query it per dispatch from
+ // the layout. The extra ~32B per entry point feels like it may be worth it to
+ // avoid a guaranteed cache miss.
+ iree_hal_pipeline_layout_t* layout;
+} iree_hal_webgpu_entry_point_t;
+
+// Creates an executable from a `webgpu-wgsl-fb` flatbuffer in
+// |executable_params|, compiling one pipeline per entry point.
+iree_status_t iree_hal_webgpu_executable_create(
+ WGPUDevice device, const iree_hal_executable_params_t* executable_params,
+ iree_allocator_t host_allocator, iree_hal_executable_t** out_executable);
+
+// Returns the compiled entry point at |ordinal|; |ordinal| must be in range.
+const iree_hal_webgpu_entry_point_t*
+iree_hal_webgpu_executable_lookup_entry_point(iree_hal_executable_t* executable,
+ uint32_t ordinal);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_EXECUTABLE_H_
diff --git a/experimental/webgpu/nop_event.c b/experimental/webgpu/nop_event.c
new file mode 100644
index 0000000..7873968
--- /dev/null
+++ b/experimental/webgpu/nop_event.c
@@ -0,0 +1,61 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/nop_event.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+// Dummy events for now, don't do anything.
+typedef struct iree_hal_webgpu_nop_event_t {
+ iree_hal_resource_t resource;
+ iree_allocator_t host_allocator;
+} iree_hal_webgpu_nop_event_t;
+
+extern const iree_hal_event_vtable_t iree_hal_webgpu_nop_event_vtable;
+
+static iree_hal_webgpu_nop_event_t* iree_hal_webgpu_nop_event_cast(
+ iree_hal_event_t* base_value) {
+ IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_nop_event_vtable);
+ return (iree_hal_webgpu_nop_event_t*)base_value;
+}
+
+// Allocates a no-op event; it carries no state beyond its own allocation.
+iree_status_t iree_hal_webgpu_nop_event_create(iree_allocator_t host_allocator,
+ iree_hal_event_t** out_event) {
+ IREE_ASSERT_ARGUMENT(out_event);
+ *out_event = NULL;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_hal_webgpu_nop_event_t* event = NULL;
+ iree_status_t status =
+ iree_allocator_malloc(host_allocator, sizeof(*event), (void**)&event);
+ if (iree_status_is_ok(status)) {
+ iree_hal_resource_initialize(&iree_hal_webgpu_nop_event_vtable,
+ &event->resource);
+ event->host_allocator = host_allocator;
+ *out_event = (iree_hal_event_t*)event;
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+// Frees the no-op event; nothing else to release.
+static void iree_hal_webgpu_nop_event_destroy(iree_hal_event_t* base_event) {
+ iree_hal_webgpu_nop_event_t* event =
+ iree_hal_webgpu_nop_event_cast(base_event);
+ // Capture before freeing the struct that stores it.
+ iree_allocator_t host_allocator = event->host_allocator;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_allocator_free(host_allocator, event);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+const iree_hal_event_vtable_t iree_hal_webgpu_nop_event_vtable = {
+ .destroy = iree_hal_webgpu_nop_event_destroy,
+};
diff --git a/experimental/webgpu/nop_event.h b/experimental/webgpu/nop_event.h
new file mode 100644
index 0000000..ad212f1
--- /dev/null
+++ b/experimental/webgpu/nop_event.h
@@ -0,0 +1,24 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_NOP_EVENT_H_
+#define IREE_HAL_DRIVERS_WEBGPU_NOP_EVENT_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Creates a placeholder event that performs no synchronization.
+iree_status_t iree_hal_webgpu_nop_event_create(iree_allocator_t host_allocator,
+ iree_hal_event_t** out_event);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_NOP_EVENT_H_
diff --git a/experimental/webgpu/nop_executable_cache.c b/experimental/webgpu/nop_executable_cache.c
new file mode 100644
index 0000000..acdfa48
--- /dev/null
+++ b/experimental/webgpu/nop_executable_cache.c
@@ -0,0 +1,95 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/nop_executable_cache.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "experimental/webgpu/executable.h"
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+// Executable "cache" that performs no caching: each prepare call compiles a
+// fresh executable on |device|.
+typedef struct iree_hal_webgpu_nop_executable_cache_t {
+ iree_hal_resource_t resource;
+ iree_allocator_t host_allocator;
+ WGPUDevice device;
+} iree_hal_webgpu_nop_executable_cache_t;
+
+extern const iree_hal_executable_cache_vtable_t
+ iree_hal_webgpu_nop_executable_cache_vtable;
+
+// Downcasts |base_value| after asserting it is a no-op executable cache.
+static iree_hal_webgpu_nop_executable_cache_t*
+iree_hal_webgpu_nop_executable_cache_cast(
+ iree_hal_executable_cache_t* base_value) {
+ IREE_HAL_ASSERT_TYPE(base_value,
+ &iree_hal_webgpu_nop_executable_cache_vtable);
+ return (iree_hal_webgpu_nop_executable_cache_t*)base_value;
+}
+
+// Allocates the no-op cache. |identifier| and |loop| are accepted for API
+// symmetry with real caches but are unused here.
+iree_status_t iree_hal_webgpu_nop_executable_cache_create(
+ WGPUDevice device, iree_string_view_t identifier, iree_loop_t loop,
+ iree_allocator_t host_allocator,
+ iree_hal_executable_cache_t** out_executable_cache) {
+ IREE_ASSERT_ARGUMENT(device);
+ IREE_ASSERT_ARGUMENT(out_executable_cache);
+ *out_executable_cache = NULL;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_hal_webgpu_nop_executable_cache_t* executable_cache = NULL;
+ iree_status_t status = iree_allocator_malloc(
+ host_allocator, sizeof(*executable_cache), (void**)&executable_cache);
+ if (iree_status_is_ok(status)) {
+ iree_hal_resource_initialize(&iree_hal_webgpu_nop_executable_cache_vtable,
+ &executable_cache->resource);
+ executable_cache->host_allocator = host_allocator;
+ executable_cache->device = device;
+ *out_executable_cache = (iree_hal_executable_cache_t*)executable_cache;
+ }
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+// Frees the cache; it holds no cached executables to release.
+static void iree_hal_webgpu_nop_executable_cache_destroy(
+ iree_hal_executable_cache_t* base_executable_cache) {
+ iree_hal_webgpu_nop_executable_cache_t* executable_cache =
+ iree_hal_webgpu_nop_executable_cache_cast(base_executable_cache);
+ // Capture before freeing the struct that stores it.
+ iree_allocator_t host_allocator = executable_cache->host_allocator;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_allocator_free(host_allocator, executable_cache);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+// Only the `webgpu-wgsl-fb` flatbuffer format is supported today.
+static bool iree_hal_webgpu_nop_executable_cache_can_prepare_format(
+ iree_hal_executable_cache_t* base_executable_cache,
+ iree_hal_executable_caching_mode_t caching_mode,
+ iree_string_view_t executable_format) {
+ // TODO(benvanik): allow SPIR-V `webgpu-spirv-fb` etc based on device support.
+ return iree_string_view_equal(executable_format,
+ iree_make_cstring_view("webgpu-wgsl-fb"));
+}
+
+// Compiles a new executable every call - no lookup or memoization occurs.
+static iree_status_t iree_hal_webgpu_nop_executable_cache_prepare_executable(
+ iree_hal_executable_cache_t* base_executable_cache,
+ const iree_hal_executable_params_t* executable_params,
+ iree_hal_executable_t** out_executable) {
+ iree_hal_webgpu_nop_executable_cache_t* executable_cache =
+ iree_hal_webgpu_nop_executable_cache_cast(base_executable_cache);
+ return iree_hal_webgpu_executable_create(
+ executable_cache->device, executable_params,
+ executable_cache->host_allocator, out_executable);
+}
+
+// Vtable wiring the no-op cache into the HAL executable cache interface.
+const iree_hal_executable_cache_vtable_t
+ iree_hal_webgpu_nop_executable_cache_vtable = {
+ .destroy = iree_hal_webgpu_nop_executable_cache_destroy,
+ .can_prepare_format =
+ iree_hal_webgpu_nop_executable_cache_can_prepare_format,
+ .prepare_executable =
+ iree_hal_webgpu_nop_executable_cache_prepare_executable,
+};
diff --git a/experimental/webgpu/nop_executable_cache.h b/experimental/webgpu/nop_executable_cache.h
new file mode 100644
index 0000000..5ac6de1
--- /dev/null
+++ b/experimental/webgpu/nop_executable_cache.h
@@ -0,0 +1,30 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_NOP_EXECUTABLE_CACHE_H_
+#define IREE_HAL_DRIVERS_WEBGPU_NOP_EXECUTABLE_CACHE_H_
+
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Creates a no-op executable cache that does not cache at all.
+// This is useful to isolate pipeline caching behavior and verify compilation
+// behavior.
+iree_status_t iree_hal_webgpu_nop_executable_cache_create(
+ WGPUDevice device, iree_string_view_t identifier, iree_loop_t loop,
+ iree_allocator_t host_allocator,
+ iree_hal_executable_cache_t** out_executable_cache);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_NOP_EXECUTABLE_CACHE_H_
diff --git a/experimental/webgpu/nop_semaphore.c b/experimental/webgpu/nop_semaphore.c
new file mode 100644
index 0000000..a99b6f1
--- /dev/null
+++ b/experimental/webgpu/nop_semaphore.c
@@ -0,0 +1,107 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/nop_semaphore.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+// Host-only timeline semaphore: the payload lives in an atomic and no device
+// signaling/waiting occurs.
+typedef struct iree_hal_webgpu_nop_semaphore_t {
+ iree_hal_resource_t resource;
+ iree_allocator_t host_allocator;
+ // Current timeline payload value.
+ iree_atomic_int64_t value;
+} iree_hal_webgpu_nop_semaphore_t;
+
+extern const iree_hal_semaphore_vtable_t iree_hal_webgpu_nop_semaphore_vtable;
+
+// Downcasts |base_value| after asserting it is a no-op semaphore.
+static iree_hal_webgpu_nop_semaphore_t* iree_hal_webgpu_nop_semaphore_cast(
+ iree_hal_semaphore_t* base_value) {
+ IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_nop_semaphore_vtable);
+ return (iree_hal_webgpu_nop_semaphore_t*)base_value;
+}
+
+// Allocates a no-op semaphore with its timeline set to |initial_value|.
+iree_status_t iree_hal_webgpu_nop_semaphore_create(
+ uint64_t initial_value, iree_allocator_t host_allocator,
+ iree_hal_semaphore_t** out_semaphore) {
+ IREE_ASSERT_ARGUMENT(out_semaphore);
+ *out_semaphore = NULL;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_hal_webgpu_nop_semaphore_t* semaphore = NULL;
+ iree_status_t status = iree_allocator_malloc(
+ host_allocator, sizeof(*semaphore), (void**)&semaphore);
+ if (iree_status_is_ok(status)) {
+ iree_hal_resource_initialize(&iree_hal_webgpu_nop_semaphore_vtable,
+ &semaphore->resource);
+ semaphore->host_allocator = host_allocator;
+ iree_atomic_store_int64(&semaphore->value, initial_value,
+ iree_memory_order_seq_cst);
+ *out_semaphore = (iree_hal_semaphore_t*)semaphore;
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+// Frees the semaphore; there is no device state to release.
+static void iree_hal_webgpu_nop_semaphore_destroy(
+ iree_hal_semaphore_t* base_semaphore) {
+ iree_hal_webgpu_nop_semaphore_t* semaphore =
+ iree_hal_webgpu_nop_semaphore_cast(base_semaphore);
+ // Capture before freeing the struct that stores it.
+ iree_allocator_t host_allocator = semaphore->host_allocator;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_allocator_free(host_allocator, semaphore);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the current timeline payload value.
+static iree_status_t iree_hal_webgpu_nop_semaphore_query(
+ iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) {
+ iree_hal_webgpu_nop_semaphore_t* semaphore =
+ iree_hal_webgpu_nop_semaphore_cast(base_semaphore);
+ *out_value =
+ iree_atomic_load_int64(&semaphore->value, iree_memory_order_seq_cst);
+ return iree_ok_status();
+}
+
+// Advances the timeline to |new_value|. NOTE(review): no check that
+// |new_value| is monotonically increasing - assumed enforced by callers.
+static iree_status_t iree_hal_webgpu_nop_semaphore_signal(
+ iree_hal_semaphore_t* base_semaphore, uint64_t new_value) {
+ iree_hal_webgpu_nop_semaphore_t* semaphore =
+ iree_hal_webgpu_nop_semaphore_cast(base_semaphore);
+ iree_atomic_store_int64(&semaphore->value, new_value,
+ iree_memory_order_seq_cst);
+ return iree_ok_status();
+}
+
+// Failure is intentionally swallowed: the no-op semaphore has no failure
+// state to record, so the status is released to avoid leaking it.
+static void iree_hal_webgpu_nop_semaphore_fail(
+ iree_hal_semaphore_t* base_semaphore, iree_status_t status) {
+ iree_status_ignore(status);
+}
+
+// Non-blocking "wait": succeeds only if the timeline has already reached
+// |value|; otherwise fails immediately (|timeout| is ignored) since there is
+// no signaling mechanism to wake a real wait.
+static iree_status_t iree_hal_webgpu_nop_semaphore_wait(
+ iree_hal_semaphore_t* base_semaphore, uint64_t value,
+ iree_timeout_t timeout) {
+ iree_hal_webgpu_nop_semaphore_t* semaphore =
+ iree_hal_webgpu_nop_semaphore_cast(base_semaphore);
+ uint64_t current_value =
+ iree_atomic_load_int64(&semaphore->value, iree_memory_order_seq_cst);
+ if (current_value < value) {
+ return iree_make_status(
+ IREE_STATUS_FAILED_PRECONDITION,
+ "expected no-op semaphore to be signaled before wait");
+ }
+ return iree_ok_status();
+}
+
+// Vtable wiring the no-op semaphore into the HAL semaphore interface.
+const iree_hal_semaphore_vtable_t iree_hal_webgpu_nop_semaphore_vtable = {
+ .destroy = iree_hal_webgpu_nop_semaphore_destroy,
+ .query = iree_hal_webgpu_nop_semaphore_query,
+ .signal = iree_hal_webgpu_nop_semaphore_signal,
+ .fail = iree_hal_webgpu_nop_semaphore_fail,
+ .wait = iree_hal_webgpu_nop_semaphore_wait,
+};
diff --git a/experimental/webgpu/nop_semaphore.h b/experimental/webgpu/nop_semaphore.h
new file mode 100644
index 0000000..15409c4
--- /dev/null
+++ b/experimental/webgpu/nop_semaphore.h
@@ -0,0 +1,27 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_NOP_SEMAPHORE_H_
+#define IREE_HAL_DRIVERS_WEBGPU_NOP_SEMAPHORE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+iree_status_t iree_hal_webgpu_nop_semaphore_create(
+ uint64_t initial_value, iree_allocator_t host_allocator,
+ iree_hal_semaphore_t** out_semaphore);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_NOP_SEMAPHORE_H_
diff --git a/experimental/webgpu/pipeline_layout.c b/experimental/webgpu/pipeline_layout.c
new file mode 100644
index 0000000..5d63e6b
--- /dev/null
+++ b/experimental/webgpu/pipeline_layout.c
@@ -0,0 +1,314 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/pipeline_layout.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/inline_array.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_webgpu_descriptor_set_layout_t
+//===----------------------------------------------------------------------===//
+
+// HAL wrapper around a WGPUBindGroupLayout.
+typedef struct iree_hal_webgpu_descriptor_set_layout_t {
+  iree_hal_resource_t resource;
+  // Allocator the struct itself was allocated from; used on destroy.
+  iree_allocator_t host_allocator;
+  WGPUBindGroupLayout handle;
+  // Bit i is set when binding ordinal i is declared in this layout.
+  iree_hal_webgpu_binding_mask_t binding_mask;
+} iree_hal_webgpu_descriptor_set_layout_t;
+
+extern const iree_hal_descriptor_set_layout_vtable_t
+ iree_hal_webgpu_descriptor_set_layout_vtable;
+
+// Casts |base_value| to the WebGPU implementation type, asserting that the
+// resource's vtable matches.
+static iree_hal_webgpu_descriptor_set_layout_t*
+iree_hal_webgpu_descriptor_set_layout_cast(
+    iree_hal_descriptor_set_layout_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value,
+                       &iree_hal_webgpu_descriptor_set_layout_vtable);
+  return (iree_hal_webgpu_descriptor_set_layout_t*)base_value;
+}
+
+// Creates a WGPUBindGroupLayout for |bindings| and wraps it in a HAL
+// descriptor set layout. All bindings are made visible to the compute stage
+// only. A bitmask of the binding ordinals in use is recorded so bind groups
+// can later be matched/cached against the layout.
+//
+// Fails with OUT_OF_RANGE if any binding ordinal exceeds
+// IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT-1 and INTERNAL if the
+// WebGPU implementation rejects the layout.
+iree_status_t iree_hal_webgpu_descriptor_set_layout_create(
+    WGPUDevice device, iree_hal_descriptor_set_layout_flags_t flags,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_allocator_t host_allocator,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(!binding_count || bindings);
+  IREE_ASSERT_ARGUMENT(out_descriptor_set_layout);
+  *out_descriptor_set_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // One WGPU entry per HAL binding; inline storage avoids a heap allocation
+  // for small counts.
+  iree_inline_array(WGPUBindGroupLayoutEntry, entries, binding_count,
+                    host_allocator);
+  iree_hal_webgpu_binding_mask_t binding_mask = 0;
+  for (iree_host_size_t i = 0; i < binding_count; ++i) {
+    if (bindings[i].binding >=
+        IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT) {
+      iree_inline_array_deinitialize(entries);
+      IREE_TRACE_ZONE_END(z0);
+      // NOTE: valid ordinals are [0, max); report max-1 as the upper bound.
+      return iree_make_status(
+          IREE_STATUS_OUT_OF_RANGE,
+          "bindings must be in the range of 0-%d; binding "
+          "%zu has ordinal %d",
+          IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT - 1, i,
+          bindings[i].binding);
+    }
+    binding_mask |= 1u << bindings[i].binding;
+
+    // TODO(benvanik): make all dynamic? this would let us reuse bind groups.
+    WGPUBufferBindingType binding_type = WGPUBufferBindingType_Undefined;
+    bool has_dynamic_offset = false;
+    switch (bindings[i].type) {
+      case IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+        binding_type = WGPUBufferBindingType_Storage;
+        break;
+      case IREE_HAL_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+        binding_type = WGPUBufferBindingType_Uniform;
+        break;
+      default:
+        // Left as Undefined; the WebGPU implementation will reject the layout
+        // during wgpuDeviceCreateBindGroupLayout below.
+        break;
+    }
+    *iree_inline_array_at(entries, i) = (WGPUBindGroupLayoutEntry){
+        .nextInChain = NULL,
+        .binding = bindings[i].binding,
+        .visibility = WGPUShaderStage_Compute,
+        .buffer =
+            {
+                .nextInChain = NULL,
+                .type = binding_type,
+                .hasDynamicOffset = has_dynamic_offset,
+                .minBindingSize = 0,
+            },
+    };
+  }
+  const WGPUBindGroupLayoutDescriptor descriptor = {
+      .nextInChain = NULL,
+      .label = NULL,
+      .entryCount = (uint32_t)binding_count,
+      .entries = iree_inline_array_at(entries, 0),
+  };
+  WGPUBindGroupLayout handle =
+      wgpuDeviceCreateBindGroupLayout(device, &descriptor);
+  iree_inline_array_deinitialize(entries);
+  if (!handle) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "wgpuDeviceCreateBindGroupLayout failed");
+  }
+
+  iree_hal_webgpu_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, sizeof(*descriptor_set_layout),
+                            (void**)&descriptor_set_layout);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_webgpu_descriptor_set_layout_vtable,
+                                 &descriptor_set_layout->resource);
+    descriptor_set_layout->host_allocator = host_allocator;
+    descriptor_set_layout->handle = handle;
+    descriptor_set_layout->binding_mask = binding_mask;
+    *out_descriptor_set_layout =
+        (iree_hal_descriptor_set_layout_t*)descriptor_set_layout;
+  } else {
+    // Don't leak the WebGPU object if wrapping failed.
+    iree_wgpuBindGroupLayoutDrop(handle);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Drops the WebGPU bind group layout and frees the wrapper struct. The
+// allocator is captured first because it lives inside the allocation that is
+// about to be freed.
+static void iree_hal_webgpu_descriptor_set_layout_destroy(
+    iree_hal_descriptor_set_layout_t* base_descriptor_set_layout) {
+  iree_hal_webgpu_descriptor_set_layout_t* descriptor_set_layout =
+      iree_hal_webgpu_descriptor_set_layout_cast(base_descriptor_set_layout);
+  iree_allocator_t host_allocator = descriptor_set_layout->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_wgpuBindGroupLayoutDrop(descriptor_set_layout->handle);
+  iree_allocator_free(host_allocator, descriptor_set_layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the backing WGPUBindGroupLayout for |layout| (no ownership
+// transferred).
+WGPUBindGroupLayout iree_hal_webgpu_descriptor_set_layout_handle(
+    iree_hal_descriptor_set_layout_t* layout) {
+  IREE_ASSERT_ARGUMENT(layout);
+  iree_hal_webgpu_descriptor_set_layout_t* descriptor_set_layout =
+      iree_hal_webgpu_descriptor_set_layout_cast(layout);
+  return descriptor_set_layout->handle;
+}
+
+// Returns the mask of binding ordinals declared by |layout|.
+iree_hal_webgpu_binding_mask_t
+iree_hal_webgpu_descriptor_set_layout_binding_mask(
+    iree_hal_descriptor_set_layout_t* layout) {
+  IREE_ASSERT_ARGUMENT(layout);
+  iree_hal_webgpu_descriptor_set_layout_t* descriptor_set_layout =
+      iree_hal_webgpu_descriptor_set_layout_cast(layout);
+  return descriptor_set_layout->binding_mask;
+}
+
+// Only destruction is routed through the vtable; all other queries use the
+// typed accessors above.
+const iree_hal_descriptor_set_layout_vtable_t
+    iree_hal_webgpu_descriptor_set_layout_vtable = {
+        .destroy = iree_hal_webgpu_descriptor_set_layout_destroy,
+};
+
+//===----------------------------------------------------------------------===//
+// iree_hal_webgpu_pipeline_layout_t
+//===----------------------------------------------------------------------===//
+
+// HAL wrapper around a WGPUPipelineLayout plus the retained set layouts and
+// cached per-set binding info used at dispatch time.
+typedef struct iree_hal_webgpu_pipeline_layout_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  WGPUPipelineLayout handle;
+  // Number of push constants emulated via the staging buffer bind group.
+  iree_host_size_t push_constant_count;
+  iree_hal_webgpu_set_binding_info_t set_binding_info;
+  iree_host_size_t set_layout_count;
+  // Tail-allocated array of retained set layouts (set_layout_count entries).
+  iree_hal_descriptor_set_layout_t* set_layouts[];
+} iree_hal_webgpu_pipeline_layout_t;
+
+extern const iree_hal_pipeline_layout_vtable_t
+ iree_hal_webgpu_pipeline_layout_vtable;
+
+// Casts |base_value| to the WebGPU implementation type, asserting that the
+// resource's vtable matches.
+static iree_hal_webgpu_pipeline_layout_t* iree_hal_webgpu_pipeline_layout_cast(
+    iree_hal_pipeline_layout_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_pipeline_layout_vtable);
+  return (iree_hal_webgpu_pipeline_layout_t*)base_value;
+}
+
+// Creates a WGPUPipelineLayout from up to
+// IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX descriptor set layouts. When push
+// constants are in use the staging buffer's bind group layout is placed at
+// the reserved PARAMS index, with any gap between the user sets and that
+// index padded by the staging buffer's empty layout. Set layouts are
+// retained for the lifetime of the pipeline layout.
+iree_status_t iree_hal_webgpu_pipeline_layout_create(
+    WGPUDevice device, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t* const* set_layouts,
+    iree_host_size_t push_constant_count,
+    iree_hal_webgpu_staging_buffer_t* staging_buffer,
+    iree_allocator_t host_allocator,
+    iree_hal_pipeline_layout_t** out_pipeline_layout) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(!set_layout_count || set_layouts);
+  IREE_ASSERT_ARGUMENT(!push_constant_count || staging_buffer);
+  IREE_ASSERT_ARGUMENT(out_pipeline_layout);
+  *out_pipeline_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  if (set_layout_count > IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_OUT_OF_RANGE,
+        "set_layout_count must be <= %d, as bind group index %d is reserved",
+        IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX,
+        IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX);
+  }
+
+  // Pad to IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX for push constant emulation.
+  iree_host_size_t bind_group_layouts_count =
+      push_constant_count > 0 ? IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX + 1
+                              : set_layout_count;
+
+  // Populate a WGPUBindGroupLayout array with the provided set layouts, then
+  // set the staging buffer's bind group layout at the right index, padding
+  // with an empty bind layout as needed.
+  iree_inline_array(WGPUBindGroupLayout, bind_group_layouts,
+                    bind_group_layouts_count, host_allocator);
+  for (iree_host_size_t i = 0; i < set_layout_count; ++i) {
+    *iree_inline_array_at(bind_group_layouts, i) =
+        iree_hal_webgpu_descriptor_set_layout_handle(set_layouts[i]);
+  }
+  // NOTE: the condition is `i + 1 < count` rather than `i < count - 1`: the
+  // count is unsigned and may be zero (no sets, no push constants), in which
+  // case `count - 1` would wrap around and the loop would run out of bounds.
+  for (iree_host_size_t i = set_layout_count;
+       i + 1 < bind_group_layouts_count; ++i) {
+    *iree_inline_array_at(bind_group_layouts, i) =
+        staging_buffer->empty_bind_group_layout;
+  }
+  if (push_constant_count > 0) {
+    *iree_inline_array_at(bind_group_layouts,
+                          IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX) =
+        staging_buffer->bind_group_layout;
+  }
+  const WGPUPipelineLayoutDescriptor descriptor = {
+      .nextInChain = NULL,
+      .label = NULL,
+      .bindGroupLayoutCount = (uint32_t)bind_group_layouts_count,
+      .bindGroupLayouts = iree_inline_array_at(bind_group_layouts, 0),
+  };
+  WGPUPipelineLayout handle =
+      wgpuDeviceCreatePipelineLayout(device, &descriptor);
+  iree_inline_array_deinitialize(bind_group_layouts);
+
+  if (!handle) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "wgpuDeviceCreatePipelineLayout failed");
+  }
+
+  // Tail-allocate storage for the retained set layout references.
+  iree_hal_webgpu_pipeline_layout_t* pipeline_layout = NULL;
+  iree_host_size_t total_size =
+      sizeof(*pipeline_layout) +
+      set_layout_count * sizeof(*pipeline_layout->set_layouts);
+  iree_status_t status = iree_allocator_malloc(host_allocator, total_size,
+                                               (void**)&pipeline_layout);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_webgpu_pipeline_layout_vtable,
+                                 &pipeline_layout->resource);
+    pipeline_layout->host_allocator = host_allocator;
+    pipeline_layout->handle = handle;
+    pipeline_layout->push_constant_count = push_constant_count;
+
+    pipeline_layout->set_layout_count = set_layout_count;
+    pipeline_layout->set_binding_info.set_count = set_layout_count;
+    for (iree_host_size_t i = 0; i < set_layout_count; ++i) {
+      pipeline_layout->set_layouts[i] = set_layouts[i];
+      // Retained until the pipeline layout is destroyed.
+      iree_hal_descriptor_set_layout_retain(set_layouts[i]);
+      pipeline_layout->set_binding_info.set_layouts[i] =
+          iree_hal_webgpu_descriptor_set_layout_handle(set_layouts[i]);
+      pipeline_layout->set_binding_info.set_masks[i] =
+          iree_hal_webgpu_descriptor_set_layout_binding_mask(set_layouts[i]);
+    }
+    // Note: not tracking the empty/padding layout or the staging buffer layout.
+
+    *out_pipeline_layout = (iree_hal_pipeline_layout_t*)pipeline_layout;
+  } else {
+    // Don't leak the WebGPU object if wrapping failed.
+    iree_wgpuPipelineLayoutDrop(handle);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Drops the WebGPU pipeline layout, releases the retained set layouts, and
+// frees the wrapper. The allocator is captured first as it lives inside the
+// allocation being freed.
+static void iree_hal_webgpu_pipeline_layout_destroy(
+    iree_hal_pipeline_layout_t* base_pipeline_layout) {
+  iree_hal_webgpu_pipeline_layout_t* pipeline_layout =
+      iree_hal_webgpu_pipeline_layout_cast(base_pipeline_layout);
+  iree_allocator_t host_allocator = pipeline_layout->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_wgpuPipelineLayoutDrop(pipeline_layout->handle);
+  for (iree_host_size_t i = 0; i < pipeline_layout->set_layout_count; ++i) {
+    iree_hal_descriptor_set_layout_release(pipeline_layout->set_layouts[i]);
+  }
+  iree_allocator_free(host_allocator, pipeline_layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the backing WGPUPipelineLayout for |layout| (no ownership
+// transferred).
+WGPUPipelineLayout iree_hal_webgpu_pipeline_layout_handle(
+    iree_hal_pipeline_layout_t* layout) {
+  IREE_ASSERT_ARGUMENT(layout);
+  iree_hal_webgpu_pipeline_layout_t* pipeline_layout =
+      iree_hal_webgpu_pipeline_layout_cast(layout);
+  return pipeline_layout->handle;
+}
+
+// Returns the number of push constants the layout was created with.
+iree_host_size_t iree_hal_webgpu_pipeline_layout_push_constant_count(
+    iree_hal_pipeline_layout_t* layout) {
+  IREE_ASSERT_ARGUMENT(layout);
+  iree_hal_webgpu_pipeline_layout_t* pipeline_layout =
+      iree_hal_webgpu_pipeline_layout_cast(layout);
+  return pipeline_layout->push_constant_count;
+}
+
+// Returns the cached per-set binding info; the pointer remains valid for the
+// lifetime of |base_layout|.
+const iree_hal_webgpu_set_binding_info_t*
+iree_hal_webgpu_pipeline_layout_set_binding_info(
+    iree_hal_pipeline_layout_t* base_layout) {
+  IREE_ASSERT_ARGUMENT(base_layout);
+  return &iree_hal_webgpu_pipeline_layout_cast(base_layout)->set_binding_info;
+}
+
+// Only destruction is routed through the vtable; all other queries use the
+// typed accessors above.
+const iree_hal_pipeline_layout_vtable_t iree_hal_webgpu_pipeline_layout_vtable =
+    {
+        .destroy = iree_hal_webgpu_pipeline_layout_destroy,
+};
diff --git a/experimental/webgpu/pipeline_layout.h b/experimental/webgpu/pipeline_layout.h
new file mode 100644
index 0000000..e15b3bf
--- /dev/null
+++ b/experimental/webgpu/pipeline_layout.h
@@ -0,0 +1,83 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_PIPELINE_LAYOUT_H_
+#define IREE_HAL_DRIVERS_WEBGPU_PIPELINE_LAYOUT_H_
+
+#include "experimental/webgpu/platform/webgpu.h"
+#include "experimental/webgpu/staging_buffer.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_COUNT 4
+#define IREE_HAL_WEBGPU_MAX_PUSH_CONSTANT_COUNT 64
+#define IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX 3
+
+//===----------------------------------------------------------------------===//
+// iree_hal_webgpu_descriptor_set_layout_t
+//===----------------------------------------------------------------------===//
+
+// TODO(benvanik): query from runtime? almost all devices support 16+ and
+// that's what our compiler is assuming.
+#define IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT 8
+
+typedef uint32_t iree_hal_webgpu_binding_mask_t;
+static_assert(sizeof(iree_hal_webgpu_binding_mask_t) * 8 >=
+ IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT,
+ "mask must have capacity for one bit per binding in a group");
+
+iree_status_t iree_hal_webgpu_descriptor_set_layout_create(
+ WGPUDevice device, iree_hal_descriptor_set_layout_flags_t flags,
+ iree_host_size_t binding_count,
+ const iree_hal_descriptor_set_layout_binding_t* bindings,
+ iree_allocator_t host_allocator,
+ iree_hal_descriptor_set_layout_t** out_descriptor_set_layout);
+
+WGPUBindGroupLayout iree_hal_webgpu_descriptor_set_layout_handle(
+ iree_hal_descriptor_set_layout_t* layout);
+
+iree_hal_webgpu_binding_mask_t
+iree_hal_webgpu_descriptor_set_layout_binding_mask(
+ iree_hal_descriptor_set_layout_t* layout);
+
+//===----------------------------------------------------------------------===//
+// iree_hal_webgpu_pipeline_layout_t
+//===----------------------------------------------------------------------===//
+
+// Per-set bind group layout handles and binding masks cached on the pipeline
+// layout for use when recording dispatches.
+typedef struct iree_hal_webgpu_set_binding_info_t {
+  iree_host_size_t set_count;
+  WGPUBindGroupLayout set_layouts[IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_COUNT];
+  // Mask of binding ordinals used within each corresponding set layout.
+  iree_hal_webgpu_binding_mask_t
+      set_masks[IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_COUNT];
+} iree_hal_webgpu_set_binding_info_t;
+
+iree_status_t iree_hal_webgpu_pipeline_layout_create(
+ WGPUDevice device, iree_host_size_t set_layout_count,
+ iree_hal_descriptor_set_layout_t* const* set_layouts,
+ iree_host_size_t push_constant_count,
+ iree_hal_webgpu_staging_buffer_t* staging_buffer,
+ iree_allocator_t host_allocator,
+ iree_hal_pipeline_layout_t** out_pipeline_layout);
+
+WGPUPipelineLayout iree_hal_webgpu_pipeline_layout_handle(
+ iree_hal_pipeline_layout_t* layout);
+
+iree_host_size_t iree_hal_webgpu_pipeline_layout_push_constant_count(
+ iree_hal_pipeline_layout_t* layout);
+
+const iree_hal_webgpu_set_binding_info_t*
+iree_hal_webgpu_pipeline_layout_set_binding_info(
+ iree_hal_pipeline_layout_t* layout);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_PIPELINE_LAYOUT_H_
diff --git a/experimental/webgpu/platform/BUILD.bazel b/experimental/webgpu/platform/BUILD.bazel
new file mode 100644
index 0000000..63994b4
--- /dev/null
+++ b/experimental/webgpu/platform/BUILD.bazel
@@ -0,0 +1,29 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["layering_check"],
+ licenses = ["notice"], # Apache 2.0
+)
+
+iree_runtime_cc_library(
+ name = "platform",
+ hdrs = [
+ "webgpu.h",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//runtime/src/iree/base",
+ "//runtime/src/iree/base:core_headers",
+ "//runtime/src/iree/base:tracing",
+ "//runtime/src/iree/base/internal",
+ "//runtime/src/iree/hal",
+ "@webgpu_headers",
+ ],
+)
diff --git a/experimental/webgpu/platform/CMakeLists.txt b/experimental/webgpu/platform/CMakeLists.txt
new file mode 100644
index 0000000..ea0c3de
--- /dev/null
+++ b/experimental/webgpu/platform/CMakeLists.txt
@@ -0,0 +1,27 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
+# runtime/src/iree/hal/drivers/webgpu/platform/BUILD #
+# #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
+# CMake-only content. #
+# #
+# To disable autogeneration for this file entirely, delete this header. #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+ NAME
+ platform
+ HDRS
+ "webgpu.h"
+ DEPS
+ iree::base
+ iree::base::core_headers
+ iree::base::internal
+ iree::base::tracing
+ iree::hal
+ PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/experimental/webgpu/platform/emscripten/CMakeLists.txt b/experimental/webgpu/platform/emscripten/CMakeLists.txt
new file mode 100644
index 0000000..1cc7008
--- /dev/null
+++ b/experimental/webgpu/platform/emscripten/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+if(NOT EMSCRIPTEN)
+ return()
+endif()
+
+iree_cc_library(
+ NAME
+ emscripten
+ HDRS
+ "emscripten_driver.h"
+ SRCS
+ "emscripten_driver.c"
+ "emscripten_util.c"
+ DEPS
+ iree::base
+ iree::base::core_headers
+ iree::base::internal
+ iree::base::tracing
+ iree::hal
+ iree::experimental::webgpu::platform
+ PUBLIC
+)
diff --git a/experimental/webgpu/platform/emscripten/emscripten_driver.c b/experimental/webgpu/platform/emscripten/emscripten_driver.c
new file mode 100644
index 0000000..de2036f
--- /dev/null
+++ b/experimental/webgpu/platform/emscripten/emscripten_driver.c
@@ -0,0 +1,260 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/platform/emscripten/emscripten_driver.h"
+
+#include <emscripten.h>
+
+#include "iree/base/tracing.h"
+
+#define IREE_HAL_WEBGPU_DEVICE_ID_DEFAULT 0
+
+//===----------------------------------------------------------------------===//
+// Driver and device options
+//===----------------------------------------------------------------------===//
+
+// Zero-initializes |out_options| and then sets explicit defaults for the
+// Emscripten driver, including default device options.
+// NOTE(review): memset requires <string.h>; confirm it is reachable through
+// the existing includes.
+IREE_API_EXPORT void iree_hal_webgpu_driver_options_initialize(
+    iree_hal_webgpu_driver_options_t* out_options) {
+  IREE_ASSERT_ARGUMENT(out_options);
+  memset(out_options, 0, sizeof(*out_options));
+
+  out_options->backend_preference = IREE_HAL_WEBGPU_DRIVER_BACKEND_ANY;
+  out_options->log_level = IREE_HAL_WEBGPU_DRIVER_LOG_LEVEL_OFF;
+
+  // TODO(benvanik): coming in future spec update. For now go high-perf.
+  // out_options->power_preference = WGPUPowerPreference_Undefined;
+  out_options->power_preference = WGPUPowerPreference_HighPerformance;
+
+  iree_hal_webgpu_device_options_initialize(&out_options->device_options);
+}
+
+//===----------------------------------------------------------------------===//
+// Synchronous adapter and device request functions
+//===----------------------------------------------------------------------===//
+
+// We use Asyncify for convenience - for now.
+// https://emscripten.org/docs/porting/asyncify.html
+// https://github.com/emscripten-core/emscripten/issues/15746
+// https://github.com/juj/wasm_webgpu/blob/main/lib/lib_webgpu.h
+//
+// The requestAdapter and requestDevice functions are asynchronous, while the
+// HAL API has synchronous driver and device creation. We want to support both
+// platform-independent tests (CTS tests, 'check' framework tests) and user
+// applications. For platform-independent tests, we keep the APIs synchronous
+// by using Asyncify. For user applications, we could pass in a loop or add
+// asynchronous APIs that let those applications use async/await directly.
+//
+// An even simpler solution for user applications is to request an adapter and
+// and device purely up in JavaScript, then to pass the already created device
+// in via preinitializedWebGPUDevice / emscripten_webgpu_get_device().
+
+#ifdef EM_ASYNC_JS
+
+// Blocks (via Asyncify) until navigator.gpu.requestAdapter() resolves and
+// returns the adapter ID registered with Emscripten's WebGPU object manager.
+EM_ASYNC_JS(WGPUAdapter, wgpuInstanceRequestAdapterSync, (), {
+  // TODO(scotttodd): WGPURequestAdapterOptions struct
+  const adapter = await navigator['gpu']['requestAdapter']();
+  // WARNING: this calls functions directly on Emscripten's library_webgpu.js.
+  // This is not a stable API!
+  const adapterId = WebGPU.mgrAdapter.create(adapter);
+  return adapterId;
+});
+
+// Blocks (via Asyncify) until adapter.requestDevice() resolves; registers the
+// device and its default queue with Emscripten's object managers and returns
+// the device ID.
+EM_ASYNC_JS(WGPUDevice, wgpuAdapterRequestDeviceSync, (WGPUAdapter adapterId), {
+  // WARNING: this calls functions directly on Emscripten's library_webgpu.js.
+  // This is not a stable API!
+  const adapter = WebGPU.mgrAdapter.get(adapterId);
+
+  // TODO(scotttodd): WGPUDeviceDescriptor struct
+  const descriptor = {};
+  const device = await adapter['requestDevice'](descriptor);
+
+  const deviceWrapper = {queueId : WebGPU.mgrQueue.create(device["queue"])};
+  const deviceId = WebGPU.mgrDevice.create(device, deviceWrapper);
+  return deviceId;
+});
+
+#else
+
+// Fallback stubs for builds without Asyncify: always fail with a diagnostic.
+// NOTE(review): these rely on fprintf/stderr; confirm <stdio.h> is reachable
+// through the existing includes in this configuration.
+WGPUAdapter wgpuInstanceRequestAdapterSync() {
+  fprintf(stderr, "wgpuInstanceRequestAdapterSync requires -sASYNCIFY\n");
+  return NULL;
+}
+
+WGPUDevice wgpuAdapterRequestDeviceSync(WGPUAdapter adapterId) {
+  fprintf(stderr, "wgpuAdapterRequestDeviceSync requires -sASYNCIFY\n");
+  return NULL;
+}
+
+#endif  // EM_ASYNC_JS
+
+//===----------------------------------------------------------------------===//
+// iree_hal_webgpu_emscripten_driver_t
+//===----------------------------------------------------------------------===//
+
+// Driver state: the WebGPU instance and the single adapter obtained at driver
+// creation time. The identifier string is stored inline after the struct in
+// the same allocation.
+typedef struct iree_hal_webgpu_emscripten_driver_t {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+
+  iree_string_view_t identifier;
+  // Defaults applied to every device created from this driver.
+  iree_hal_webgpu_device_options_t default_options;
+
+  WGPUInstance instance;
+  WGPUAdapter adapter;
+} iree_hal_webgpu_emscripten_driver_t;
+
+static const iree_hal_driver_vtable_t iree_hal_webgpu_emscripten_driver_vtable;
+
+// Casts |base_value| to the Emscripten driver type, asserting that the
+// resource's vtable matches.
+static iree_hal_webgpu_emscripten_driver_t*
+iree_hal_webgpu_emscripten_driver_cast(iree_hal_driver_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_emscripten_driver_vtable);
+  return (iree_hal_webgpu_emscripten_driver_t*)base_value;
+}
+
+// Creates the Emscripten WebGPU driver: allocates the driver struct (with the
+// identifier characters stored inline after it), creates the WebGPU instance,
+// and synchronously requests an adapter via Asyncify. On any failure the
+// partially-initialized driver is released, which tears down whatever was
+// set up so far.
+iree_status_t iree_hal_webgpu_emscripten_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_webgpu_driver_options_t* options,
+    iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) {
+  IREE_ASSERT_ARGUMENT(options);
+  IREE_ASSERT_ARGUMENT(out_driver);
+  *out_driver = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_webgpu_emscripten_driver_t* driver = NULL;
+  iree_host_size_t total_size = sizeof(*driver) + identifier.size + /*NUL=*/1;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, total_size, (void**)&driver));
+  iree_hal_resource_initialize(&iree_hal_webgpu_emscripten_driver_vtable,
+                               &driver->resource);
+  driver->host_allocator = host_allocator;
+
+  // The identifier characters live in the tail of the driver allocation.
+  iree_string_view_append_to_buffer(identifier, &driver->identifier,
+                                    (char*)driver + sizeof(*driver));
+  memcpy(&driver->default_options, &options->device_options,
+         sizeof(driver->default_options));
+
+  const WGPUInstanceDescriptor instance_descriptor = {
+      .nextInChain = NULL,
+  };
+  driver->instance = wgpuCreateInstance(&instance_descriptor);
+  if (!driver->instance) {
+    iree_hal_driver_release((iree_hal_driver_t*)driver);
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_UNAVAILABLE,
+        "WebGPU implementation not present or failed to load");
+  }
+
+  // Request an adapter from the implementation. We only get one of these and it
+  // may expose multiple devices so it's effectively what we consider a driver.
+  // HACKS: sync via Asyncify
+  WGPUAdapter adapter = wgpuInstanceRequestAdapterSync();
+  if (!adapter) {
+    iree_hal_driver_release((iree_hal_driver_t*)driver);
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_UNAVAILABLE,
+        "WebGPU requestAdapter() failed to return a WGPUAdapter");
+  }
+  driver->adapter = adapter;
+
+  // NOTE(review): the adapter properties are queried but not used anywhere in
+  // this function - presumably kept for future device info reporting; confirm.
+  WGPUAdapterProperties adapter_props;
+  memset(&adapter_props, 0, sizeof(adapter_props));
+  wgpuAdapterGetProperties(driver->adapter, &adapter_props);
+
+  *out_driver = (iree_hal_driver_t*)driver;
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Frees the driver allocation. Adapter/instance handles are not explicitly
+// released here (see TODO); their cleanup on Emscripten is unresolved.
+static void iree_hal_webgpu_emscripten_driver_destroy(
+    iree_hal_driver_t* base_driver) {
+  iree_hal_webgpu_emscripten_driver_t* driver =
+      iree_hal_webgpu_emscripten_driver_cast(base_driver);
+  iree_allocator_t host_allocator = driver->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(scotttodd): emscripten teardown?
+  // driver->adapter = NULL;
+  // driver->instance = NULL;
+
+  iree_allocator_free(host_allocator, driver);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Reports the single "default" device. The static info table is cloned into
+// |allocator| so the caller owns (and must free) the returned array.
+static iree_status_t iree_hal_webgpu_emscripten_driver_query_available_devices(
+    iree_hal_driver_t* base_driver, iree_allocator_t allocator,
+    iree_host_size_t* out_device_info_count,
+    iree_hal_device_info_t** out_device_infos) {
+  // Unfortunately no queries in WebGPU; we can only request a single device.
+  static const iree_hal_device_info_t device_infos[1] = {
+      {
+          .device_id = IREE_HAL_WEBGPU_DEVICE_ID_DEFAULT,
+          .name = iree_string_view_literal("default"),
+      },
+  };
+  *out_device_info_count = IREE_ARRAYSIZE(device_infos);
+  return iree_allocator_clone(
+      allocator, iree_make_const_byte_span(device_infos, sizeof(device_infos)),
+      (void**)out_device_infos);
+}
+
+// Placeholder: emits nothing into |builder| yet.
+static iree_status_t iree_hal_webgpu_emscripten_driver_dump_device_info(
+    iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id,
+    iree_string_builder_t* builder) {
+  iree_hal_webgpu_emscripten_driver_t* driver =
+      iree_hal_webgpu_emscripten_driver_cast(base_driver);
+  // TODO(scotttodd): dump detailed device info.
+  (void)driver;
+  return iree_ok_status();
+}
+
+// Synchronously (via Asyncify) requests a WGPUDevice from the driver's
+// adapter and wraps it as a HAL device. |device_id| is unused since only a
+// single default device exists.
+// NOTE(review): iree_hal_webgpu_wrap_device presumably takes ownership of
+// |device| on both success and failure - confirm to rule out a leak here.
+static iree_status_t iree_hal_webgpu_emscripten_driver_create_device_by_id(
+    iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id,
+    iree_host_size_t param_count, const iree_string_pair_t* params,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  iree_hal_webgpu_emscripten_driver_t* driver =
+      iree_hal_webgpu_emscripten_driver_cast(base_driver);
+
+  // HACKS: sync via Asyncify
+  WGPUDevice device = wgpuAdapterRequestDeviceSync(driver->adapter);
+  if (!device) {
+    return iree_make_status(
+        IREE_STATUS_UNAVAILABLE,
+        "WebGPU requestDevice() failed to return a WGPUDevice");
+  }
+
+  return iree_hal_webgpu_wrap_device(driver->identifier,
+                                     &driver->default_options, device,
+                                     driver->host_allocator, out_device);
+}
+
+// Resolves a device by path. Only the empty (default) path is supported; it
+// forwards to creation by ID with the default device.
+static iree_status_t iree_hal_webgpu_emscripten_driver_create_device_by_path(
+    iree_hal_driver_t* base_driver, iree_string_view_t driver_name,
+    iree_string_view_t device_path, iree_host_size_t param_count,
+    const iree_string_pair_t* params, iree_allocator_t host_allocator,
+    iree_hal_device_t** out_device) {
+  if (iree_string_view_is_empty(device_path)) {
+    return iree_hal_webgpu_emscripten_driver_create_device_by_id(
+        base_driver, IREE_HAL_DEVICE_ID_DEFAULT, param_count, params,
+        host_allocator, out_device);
+  }
+  return iree_make_status(IREE_STATUS_NOT_FOUND,
+                          "device paths not yet implemented");
+}
+
+// HAL driver dispatch table for the Emscripten WebGPU backend.
+static const iree_hal_driver_vtable_t iree_hal_webgpu_emscripten_driver_vtable =
+    {
+        .destroy = iree_hal_webgpu_emscripten_driver_destroy,
+        .query_available_devices =
+            iree_hal_webgpu_emscripten_driver_query_available_devices,
+        .dump_device_info = iree_hal_webgpu_emscripten_driver_dump_device_info,
+        .create_device_by_id =
+            iree_hal_webgpu_emscripten_driver_create_device_by_id,
+        .create_device_by_path =
+            iree_hal_webgpu_emscripten_driver_create_device_by_path,
+};
diff --git a/experimental/webgpu/platform/emscripten/emscripten_driver.h b/experimental/webgpu/platform/emscripten/emscripten_driver.h
new file mode 100644
index 0000000..71a392f
--- /dev/null
+++ b/experimental/webgpu/platform/emscripten/emscripten_driver.h
@@ -0,0 +1,28 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_PLATFORM_EMSCRIPTEN_EMSCRIPTEN_DRIVER_H_
+#define IREE_HAL_DRIVERS_WEBGPU_PLATFORM_EMSCRIPTEN_EMSCRIPTEN_DRIVER_H_
+
+#include "experimental/webgpu/api.h"
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+iree_status_t iree_hal_webgpu_emscripten_driver_create(
+ iree_string_view_t identifier,
+ const iree_hal_webgpu_driver_options_t* options,
+ iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_PLATFORM_EMSCRIPTEN_EMSCRIPTEN_DRIVER_H_
diff --git a/experimental/webgpu/platform/emscripten/emscripten_util.c b/experimental/webgpu/platform/emscripten/emscripten_util.c
new file mode 100644
index 0000000..bf5aa00
--- /dev/null
+++ b/experimental/webgpu/platform/emscripten/emscripten_util.c
@@ -0,0 +1,54 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/platform/webgpu.h"
+
+//===----------------------------------------------------------------------===//
+// Implementation compatibility layer
+//===----------------------------------------------------------------------===//
+
+#define WGPU_EMSCRIPTEN_INSTANCE ((WGPUInstance)((uintptr_t)0xABADD00Du))
+
+// Shim: |descriptor| is ignored and a fixed non-NULL sentinel is returned.
+WGPUInstance wgpuCreateInstance(WGPUInstanceDescriptor const* descriptor) {
+  // Emscripten does not have instances (yet?)
+  // We use a sentinel value here so that we can do null checks in places for
+  // implementations that do use instances.
+  return WGPU_EMSCRIPTEN_INSTANCE;
+}
+
+// iree_wgpu*Drop shims: explicit release entry points used by the HAL. Most
+// WebGPU object types have no release call in Emscripten's bindings, so those
+// shims are intentionally no-ops (object lifetime is presumably managed on
+// the JS side - confirm against library_webgpu.js). Buffers and query sets do
+// have destroy entry points and are destroyed eagerly.
+void iree_wgpuBindGroupDrop(WGPUBindGroup bindGroup) {
+  // Not implemented on the web / Emscripten.
+}
+
+void iree_wgpuBindGroupLayoutDrop(WGPUBindGroupLayout bindGroupLayout) {
+  // Not implemented on the web / Emscripten.
+}
+
+void iree_wgpuBufferDrop(WGPUBuffer buffer) { wgpuBufferDestroy(buffer); }
+
+void iree_wgpuCommandBufferDrop(WGPUCommandBuffer commandBuffer) {
+  // Not implemented on the web / Emscripten.
+}
+
+void iree_wgpuCommandEncoderDrop(WGPUCommandEncoder commandEncoder) {
+  // Not implemented on the web / Emscripten.
+}
+
+void iree_wgpuComputePipelineDrop(WGPUComputePipeline computePipeline) {
+  // Not implemented on the web / Emscripten.
+}
+
+void iree_wgpuPipelineLayoutDrop(WGPUPipelineLayout pipelineLayout) {
+  // Not implemented on the web / Emscripten.
+}
+
+void iree_wgpuQuerySetDrop(WGPUQuerySet querySet) {
+  wgpuQuerySetDestroy(querySet);
+}
+
+void iree_wgpuShaderModuleDrop(WGPUShaderModule shaderModule) {
+  // Not implemented on the web / Emscripten.
+}
diff --git a/experimental/webgpu/platform/native/CMakeLists.txt b/experimental/webgpu/platform/native/CMakeLists.txt
new file mode 100644
index 0000000..66e90dc
--- /dev/null
+++ b/experimental/webgpu/platform/native/CMakeLists.txt
@@ -0,0 +1,22 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_cc_library(
+ NAME
+ native
+ HDRS
+ "native_driver.h"
+ SRCS
+ "native_driver.c"
+ DEPS
+ iree::base
+ iree::base::core_headers
+ iree::base::internal
+ iree::base::tracing
+ iree::hal
+ iree::experimental::webgpu::platform
+ PUBLIC
+)
diff --git a/experimental/webgpu/platform/native/native_driver.c b/experimental/webgpu/platform/native/native_driver.c
new file mode 100644
index 0000000..297531f
--- /dev/null
+++ b/experimental/webgpu/platform/native/native_driver.c
@@ -0,0 +1,29 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/platform/native/native_driver.h"
+
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// Driver and device options
+//===----------------------------------------------------------------------===//
+
+IREE_API_EXPORT void iree_hal_webgpu_driver_options_initialize(
+ iree_hal_webgpu_driver_options_t* out_options) {
+ IREE_ASSERT_ARGUMENT(out_options);
+ memset(out_options, 0, sizeof(*out_options));
+
+ out_options->backend_preference = IREE_HAL_WEBGPU_DRIVER_BACKEND_ANY;
+
+ out_options->log_level = IREE_HAL_WEBGPU_DRIVER_LOG_LEVEL_OFF;
+
+  // TODO(benvanik): use the spec's undefined preference once it's available:
+ // out_options->power_preference = WGPUPowerPreference_Undefined;
+ out_options->power_preference = WGPUPowerPreference_HighPerformance;
+
+ iree_hal_webgpu_device_options_initialize(&out_options->device_options);
+}
diff --git a/experimental/webgpu/platform/native/native_driver.h b/experimental/webgpu/platform/native/native_driver.h
new file mode 100644
index 0000000..f9d6b79
--- /dev/null
+++ b/experimental/webgpu/platform/native/native_driver.h
@@ -0,0 +1,30 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_PLATFORM_NATIVE_NATIVE_DRIVER_H_
+#define IREE_HAL_DRIVERS_WEBGPU_PLATFORM_NATIVE_NATIVE_DRIVER_H_
+
+#include "experimental/webgpu/api.h"
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// A stub WebGPU driver that builds on native platforms, used to verify that
+// this code compiles without Emscripten.
+iree_status_t iree_hal_webgpu_native_driver_create(
+ iree_string_view_t identifier,
+ const iree_hal_webgpu_driver_options_t* options,
+ iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_PLATFORM_NATIVE_NATIVE_DRIVER_H_
diff --git a/experimental/webgpu/platform/webgpu.h b/experimental/webgpu/platform/webgpu.h
new file mode 100644
index 0000000..3034663
--- /dev/null
+++ b/experimental/webgpu/platform/webgpu.h
@@ -0,0 +1,61 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_PLATFORM_WEBGPU_H_
+#define IREE_HAL_DRIVERS_WEBGPU_PLATFORM_WEBGPU_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#define WGPU_SKIP_PROCS 1
+#if defined(IREE_PLATFORM_EMSCRIPTEN)
+#include <emscripten/html5_webgpu.h>
+#else
+#include "third_party/webgpu-headers/webgpu.h" // IWYU pragma: export
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// WebGPU API utilities
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+#define WGPU_DEBUG_LABEL(str) str
+#else
+#define WGPU_DEBUG_LABEL(str) NULL
+#endif // NDEBUG
+
+//===----------------------------------------------------------------------===//
+// Implementation compatibility layer
+//===----------------------------------------------------------------------===//
+// The webgpu-native headers don't yet line up across implementations or expose
+// everything we need. These methods attempt to paper over that such that we
+// can avoid including implementation-specific headers and #ifdefing everywhere.
+
+// Methods for dropping references to objects.
+// The base header does have some *Destroy methods but they are not implemented
+// anywhere yet and the naming is incorrect as they are just dropping the user
+// reference to the object (the implementation retains them as long as needed).
+// Discussion here: https://github.com/webgpu-native/webgpu-headers/pull/15
+
+void iree_wgpuBindGroupDrop(WGPUBindGroup bindGroup);
+void iree_wgpuBindGroupLayoutDrop(WGPUBindGroupLayout bindGroupLayout);
+void iree_wgpuBufferDrop(WGPUBuffer buffer);
+void iree_wgpuCommandBufferDrop(WGPUCommandBuffer commandBuffer);
+void iree_wgpuCommandEncoderDrop(WGPUCommandEncoder commandEncoder);
+void iree_wgpuComputePipelineDrop(WGPUComputePipeline computePipeline);
+void iree_wgpuPipelineLayoutDrop(WGPUPipelineLayout pipelineLayout);
+void iree_wgpuQuerySetDrop(WGPUQuerySet querySet);
+void iree_wgpuShaderModuleDrop(WGPUShaderModule shaderModule);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_PLATFORM_WEBGPU_H_
diff --git a/experimental/webgpu/registration/CMakeLists.txt b/experimental/webgpu/registration/CMakeLists.txt
new file mode 100644
index 0000000..a6b4332
--- /dev/null
+++ b/experimental/webgpu/registration/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_cc_library(
+ NAME
+ registration
+ HDRS
+ "driver_module.h"
+ DEPS
+ iree::base
+ iree::base::internal::flags
+ iree::hal
+ iree::experimental::webgpu
+ DEFINES
+ "IREE_HAL_HAVE_EXPERIMENTAL_WEBGPU_DRIVER_MODULE=1"
+ PUBLIC
+)
+
+if(${EMSCRIPTEN})
+ target_sources(
+ iree_experimental_webgpu_registration_registration
+ PUBLIC
+ "driver_module_emscripten.c"
+ )
+ target_link_libraries(
+ iree_experimental_webgpu_registration_registration
+ INTERFACE
+ iree::experimental::webgpu::platform::emscripten
+ )
+else()
+ target_sources(
+ iree_experimental_webgpu_registration_registration
+ PUBLIC
+ "driver_module_native.c"
+ )
+ target_link_libraries(
+ iree_experimental_webgpu_registration_registration
+ INTERFACE
+ iree::experimental::webgpu::platform::native
+ )
+endif()
diff --git a/experimental/webgpu/registration/driver_module.h b/experimental/webgpu/registration/driver_module.h
new file mode 100644
index 0000000..ff15be1
--- /dev/null
+++ b/experimental/webgpu/registration/driver_module.h
@@ -0,0 +1,24 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_REGISTRATION_DRIVER_MODULE_H_
+#define IREE_HAL_DRIVERS_WEBGPU_REGISTRATION_DRIVER_MODULE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+IREE_API_EXPORT iree_status_t
+iree_hal_webgpu_driver_module_register(iree_hal_driver_registry_t* registry);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_REGISTRATION_DRIVER_MODULE_H_
diff --git a/experimental/webgpu/registration/driver_module_emscripten.c b/experimental/webgpu/registration/driver_module_emscripten.c
new file mode 100644
index 0000000..e162197
--- /dev/null
+++ b/experimental/webgpu/registration/driver_module_emscripten.c
@@ -0,0 +1,52 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/platform/emscripten/emscripten_driver.h"
+#include "experimental/webgpu/registration/driver_module.h"
+#include "iree/base/api.h"
+
+// TODO(#4298): remove this driver registration and wrapper.
+
+static iree_status_t iree_hal_webgpu_emscripten_driver_factory_enumerate(
+ void* self, iree_host_size_t* out_driver_info_count,
+ const iree_hal_driver_info_t** out_driver_infos) {
+ static const iree_hal_driver_info_t driver_infos[1] = {
+ {
+ .driver_name = iree_string_view_literal("webgpu"),
+ .full_name = iree_string_view_literal("Experimental WebGPU"),
+ },
+ };
+ *out_driver_info_count = IREE_ARRAYSIZE(driver_infos);
+ *out_driver_infos = driver_infos;
+ return iree_ok_status();
+}
+
+static iree_status_t iree_hal_webgpu_emscripten_driver_factory_try_create(
+ void* self, iree_string_view_t driver_name, iree_allocator_t host_allocator,
+ iree_hal_driver_t** out_driver) {
+ if (!iree_string_view_equal(driver_name, IREE_SV("webgpu"))) {
+ return iree_make_status(IREE_STATUS_UNAVAILABLE,
+ "no driver '%.*s' is provided by this factory",
+ (int)driver_name.size, driver_name.data);
+ }
+
+ iree_hal_webgpu_driver_options_t options;
+ iree_hal_webgpu_driver_options_initialize(&options);
+
+ return iree_hal_webgpu_emscripten_driver_create(
+ iree_make_cstring_view("webgpu-emscripten"), &options, host_allocator,
+ out_driver);
+}
+
+IREE_API_EXPORT iree_status_t
+iree_hal_webgpu_driver_module_register(iree_hal_driver_registry_t* registry) {
+ static const iree_hal_driver_factory_t factory = {
+ .self = NULL,
+ .enumerate = iree_hal_webgpu_emscripten_driver_factory_enumerate,
+ .try_create = iree_hal_webgpu_emscripten_driver_factory_try_create,
+ };
+ return iree_hal_driver_registry_register_factory(registry, &factory);
+}
diff --git a/experimental/webgpu/registration/driver_module_native.c b/experimental/webgpu/registration/driver_module_native.c
new file mode 100644
index 0000000..24d6ab4
--- /dev/null
+++ b/experimental/webgpu/registration/driver_module_native.c
@@ -0,0 +1,37 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/platform/native/native_driver.h"
+#include "experimental/webgpu/registration/driver_module.h"
+#include "iree/base/api.h"
+
+// TODO(#4298): remove this driver registration and wrapper.
+
+static iree_status_t iree_hal_webgpu_native_driver_factory_enumerate(
+ void* self, const iree_hal_driver_info_t** out_driver_infos,
+ iree_host_size_t* out_driver_info_count) {
+ return iree_make_status(
+ IREE_STATUS_UNIMPLEMENTED,
+ "WebGPU native driver is only for testing compilation");
+}
+
+static iree_status_t iree_hal_webgpu_native_driver_factory_try_create(
+ void* self, iree_hal_driver_id_t driver_id, iree_allocator_t host_allocator,
+ iree_hal_driver_t** out_driver) {
+ return iree_make_status(
+ IREE_STATUS_UNIMPLEMENTED,
+ "WebGPU native driver is only for testing compilation");
+}
+
+IREE_API_EXPORT iree_status_t
+iree_hal_webgpu_driver_module_register(iree_hal_driver_registry_t* registry) {
+ static const iree_hal_driver_factory_t factory = {
+ .self = NULL,
+ .enumerate = iree_hal_webgpu_native_driver_factory_enumerate,
+ .try_create = iree_hal_webgpu_native_driver_factory_try_create,
+ };
+ return iree_hal_driver_registry_register_factory(registry, &factory);
+}
diff --git a/experimental/webgpu/shaders/BUILD.bazel b/experimental/webgpu/shaders/BUILD.bazel
new file mode 100644
index 0000000..9acb239
--- /dev/null
+++ b/experimental/webgpu/shaders/BUILD.bazel
@@ -0,0 +1,24 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["layering_check"],
+ licenses = ["notice"], # Apache 2.0
+)
+
+c_embed_data(
+ name = "shaders",
+ srcs = [
+ "fill_buffer.wgsl",
+ ],
+ c_file_output = "builtin_shaders.c",
+ flatten = True,
+ h_file_output = "builtin_shaders.h",
+ identifier = "iree_hal_wgsl_builtin_shaders",
+)
diff --git a/experimental/webgpu/shaders/CMakeLists.txt b/experimental/webgpu/shaders/CMakeLists.txt
new file mode 100644
index 0000000..78cbc8c
--- /dev/null
+++ b/experimental/webgpu/shaders/CMakeLists.txt
@@ -0,0 +1,28 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
+# experimental/webgpu/shaders/BUILD                                           #
+# #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
+# CMake-only content. #
+# #
+# To disable autogeneration for this file entirely, delete this header. #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_c_embed_data(
+ NAME
+ shaders
+ SRCS
+ "fill_buffer.wgsl"
+ C_FILE_OUTPUT
+ "builtin_shaders.c"
+ H_FILE_OUTPUT
+ "builtin_shaders.h"
+ IDENTIFIER
+ "iree_hal_wgsl_builtin_shaders"
+ FLATTEN
+ PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/experimental/webgpu/shaders/fill_buffer.wgsl b/experimental/webgpu/shaders/fill_buffer.wgsl
new file mode 100644
index 0000000..946d302
--- /dev/null
+++ b/experimental/webgpu/shaders/fill_buffer.wgsl
@@ -0,0 +1,21 @@
+struct Params {
+ offset : u32,
+ length : u32,
+ pattern : u32,
+};
+@binding(0) @group(0) var<uniform> params: Params;
+
+struct OpaqueBuffer {
+ data : array<u32>,
+};
+@binding(0) @group(1) var<storage, read_write> buffer: OpaqueBuffer;
+
+@compute @workgroup_size(64, 1, 1)
+fn main(@builtin(global_invocation_id) GlobalInvocationID : vec3<u32>) {
+ var element_index : u32 = GlobalInvocationID.x;
+ var element_range_start : u32 = element_index * 64u;
+ var element_range_end : u32 = min(element_range_start + 64u, params.length / 4u);
+ for (var i : u32 = element_range_start; i < element_range_end; i = i + 1u) {
+ buffer.data[i] = params.pattern;
+ }
+}
diff --git a/experimental/webgpu/simple_allocator.c b/experimental/webgpu/simple_allocator.c
new file mode 100644
index 0000000..79b0b5d
--- /dev/null
+++ b/experimental/webgpu/simple_allocator.c
@@ -0,0 +1,272 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/simple_allocator.h"
+
+#include <stddef.h>
+
+#include "experimental/webgpu/buffer.h"
+#include "experimental/webgpu/webgpu_device.h"
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+typedef struct iree_hal_webgpu_simple_allocator_t {
+ iree_hal_resource_t resource;
+ iree_allocator_t host_allocator;
+ iree_hal_device_t* device;
+ iree_string_view_t identifier;
+ IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
+} iree_hal_webgpu_simple_allocator_t;
+
+extern const iree_hal_allocator_vtable_t
+ iree_hal_webgpu_simple_allocator_vtable;
+
+static iree_hal_webgpu_simple_allocator_t*
+iree_hal_webgpu_simple_allocator_cast(iree_hal_allocator_t* base_value) {
+ IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_simple_allocator_vtable);
+ return (iree_hal_webgpu_simple_allocator_t*)base_value;
+}
+
+iree_status_t iree_hal_webgpu_simple_allocator_create(
+ iree_hal_device_t* device, iree_string_view_t identifier,
+ iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) {
+ IREE_ASSERT_ARGUMENT(device);
+ IREE_ASSERT_ARGUMENT(out_allocator);
+ *out_allocator = NULL;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_hal_webgpu_simple_allocator_t* allocator = NULL;
+ iree_host_size_t struct_size = iree_sizeof_struct(*allocator);
+ iree_host_size_t total_size = struct_size + identifier.size;
+ iree_status_t status =
+ iree_allocator_malloc(host_allocator, total_size, (void**)&allocator);
+ if (iree_status_is_ok(status)) {
+ iree_hal_resource_initialize(&iree_hal_webgpu_simple_allocator_vtable,
+ &allocator->resource);
+ allocator->host_allocator = host_allocator;
+ allocator->device = device;
+ iree_string_view_append_to_buffer(identifier, &allocator->identifier,
+ (char*)allocator + struct_size);
+ *out_allocator = (iree_hal_allocator_t*)allocator;
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+static void iree_hal_webgpu_simple_allocator_destroy(
+ iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+ iree_hal_webgpu_simple_allocator_t* allocator =
+ iree_hal_webgpu_simple_allocator_cast(base_allocator);
+ iree_allocator_t host_allocator = allocator->host_allocator;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_allocator_free(host_allocator, allocator);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_allocator_t iree_hal_webgpu_simple_allocator_host_allocator(
+ const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+ iree_hal_webgpu_simple_allocator_t* allocator =
+ (iree_hal_webgpu_simple_allocator_t*)base_allocator;
+ return allocator->host_allocator;
+}
+
+static iree_status_t iree_hal_webgpu_simple_allocator_trim(
+ iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+ return iree_ok_status();
+}
+
+static void iree_hal_webgpu_simple_allocator_query_statistics(
+ iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+ iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
+ IREE_STATISTICS({
+ iree_hal_webgpu_simple_allocator_t* allocator =
+ iree_hal_webgpu_simple_allocator_cast(base_allocator);
+ memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
+ });
+}
+
+static iree_hal_buffer_compatibility_t
+iree_hal_webgpu_simple_allocator_query_buffer_compatibility(
+ iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+ iree_hal_buffer_params_t* IREE_RESTRICT params,
+ iree_device_size_t* IREE_RESTRICT allocation_size) {
+ // TODO(benvanik): check to ensure the allocator can serve the memory type.
+
+ // All buffers can be allocated on the heap.
+ iree_hal_buffer_compatibility_t compatibility =
+ IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE;
+
+ // Buffers can only be used on the queue if they are device visible.
+ if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+ if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+ compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
+ }
+ if (iree_any_bit_set(params->usage,
+ IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) {
+ compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH;
+ }
+ }
+
+ // WebGPU does not support synchronous buffer mapping, so disallow.
+ if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_MAPPING)) {
+ return IREE_HAL_BUFFER_COMPATIBILITY_NONE;
+ }
+
+ // Guard against the corner case where the requested buffer size is 0. The
+ // application is unlikely to do anything when requesting a 0-byte buffer; but
+ // it can happen in real world use cases. So we should at least not crash.
+ if (*allocation_size == 0) *allocation_size = 4;
+
+ return compatibility;
+}
+
+static iree_status_t iree_hal_webgpu_simple_allocator_allocate_buffer(
+ iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+ const iree_hal_buffer_params_t* IREE_RESTRICT params,
+ iree_host_size_t allocation_size, iree_const_byte_span_t initial_data,
+ iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+ IREE_ASSERT_ARGUMENT(base_allocator);
+ IREE_ASSERT_ARGUMENT(params);
+ IREE_ASSERT_ARGUMENT(out_buffer);
+ *out_buffer = NULL;
+ iree_hal_webgpu_simple_allocator_t* allocator =
+ iree_hal_webgpu_simple_allocator_cast(base_allocator);
+
+ // Guard against the corner case where the requested buffer size is 0. The
+ // application is unlikely to do anything when requesting a 0-byte buffer; but
+ // it can happen in real world use cases. So we should at least not crash.
+ if (allocation_size == 0) allocation_size = 4;
+
+ WGPUBufferUsageFlags usage_flags = WGPUBufferUsage_None;
+ if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+ usage_flags |= WGPUBufferUsage_CopySrc;
+ usage_flags |= WGPUBufferUsage_CopyDst;
+ }
+ if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_MAPPING)) {
+ // Requirements from https://gpuweb.github.io/gpuweb/#buffer-usage:
+ // * MAP_WRITE can only be combined with COPY_SRC
+ // * MAP_READ can only be combined with COPY_DST
+ //
+ // We don't have copy source/dest modeled in IREE's HAL (yet) so for now
+ // we only enable mapping if transfer is set and hope it's not a copy dest.
+ // Any copy dest buffers (such as for readback) must be allocated directly:
+ // WGPUBufferDescriptor descriptor = {
+ // ...
+ // .usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst,
+ // };
+ // buffer = wgpuDeviceCreateBuffer(device, descriptor);
+ // iree_hal_webgpu_buffer_wrap(..., buffer, ...);
+ if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER) &&
+ !iree_any_bit_set(params->usage,
+ IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) {
+ usage_flags |= WGPUBufferUsage_MapWrite;
+ usage_flags &= ~(WGPUBufferUsage_CopyDst); // Clear CopyDst
+ }
+ }
+ if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) {
+ usage_flags |= WGPUBufferUsage_Storage;
+ }
+ if (iree_any_bit_set(params->usage,
+ IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ)) {
+ usage_flags |= WGPUBufferUsage_Uniform;
+ }
+ if (iree_any_bit_set(params->usage,
+ IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS)) {
+ usage_flags |= WGPUBufferUsage_Indirect;
+ }
+
+ const bool has_initial_data = !iree_const_byte_span_is_empty(initial_data);
+ WGPUBufferDescriptor descriptor = {
+ .nextInChain = NULL,
+ .label = NULL,
+ .usage = usage_flags,
+ .size = allocation_size,
+ .mappedAtCreation = has_initial_data,
+ };
+ WGPUBuffer buffer_handle = wgpuDeviceCreateBuffer(
+ iree_hal_webgpu_device_handle(allocator->device), &descriptor);
+ if (!buffer_handle) {
+ return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+ "unable to allocate buffer of size %" PRIdsz,
+ allocation_size);
+ }
+
+ // Upload the initial data into the mapped buffer. In WebGPU the only
+ // _somewhat_ efficient path for buffer initialization is setting
+ // mappedAtCreation and populating it before unmapping.
+ if (has_initial_data) {
+ IREE_TRACE_ZONE_BEGIN(z1);
+ IREE_TRACE_ZONE_APPEND_VALUE(z1, (uint64_t)initial_data.data_length);
+ void* mapped_ptr =
+ wgpuBufferGetMappedRange(buffer_handle, 0, initial_data.data_length);
+ memcpy(mapped_ptr, initial_data.data, initial_data.data_length);
+ wgpuBufferUnmap(buffer_handle);
+ IREE_TRACE_ZONE_END(z1);
+ }
+
+ iree_status_t status = iree_hal_webgpu_buffer_wrap(
+ allocator->device, base_allocator, params->type, params->access,
+ params->usage, allocation_size,
+ /*byte_offset=*/0,
+ /*byte_length=*/allocation_size, buffer_handle, allocator->host_allocator,
+ out_buffer);
+ if (iree_status_is_ok(status)) {
+ IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc(
+ &allocator->statistics, params->type, allocation_size));
+ } else {
+ wgpuBufferDestroy(buffer_handle);
+ }
+ return status;
+}
+
+static void iree_hal_webgpu_simple_allocator_deallocate_buffer(
+ iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+ iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
+ iree_hal_webgpu_simple_allocator_t* allocator =
+ iree_hal_webgpu_simple_allocator_cast(base_allocator);
+
+ IREE_STATISTICS(iree_hal_allocator_statistics_record_free(
+ &allocator->statistics, iree_hal_buffer_memory_type(base_buffer),
+ iree_hal_buffer_allocation_size(base_buffer)));
+
+ iree_hal_buffer_destroy(base_buffer);
+}
+
+static iree_status_t iree_hal_webgpu_allocator_import_buffer(
+ iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+ const iree_hal_buffer_params_t* IREE_RESTRICT params,
+ iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
+ iree_hal_buffer_release_callback_t release_callback,
+ iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+ return iree_make_status(IREE_STATUS_UNAVAILABLE,
+ "importing from external buffers not supported");
+}
+
+static iree_status_t iree_hal_webgpu_allocator_export_buffer(
+ iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+ iree_hal_buffer_t* IREE_RESTRICT buffer,
+ iree_hal_external_buffer_type_t requested_type,
+ iree_hal_external_buffer_flags_t requested_flags,
+ iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
+ return iree_make_status(IREE_STATUS_UNAVAILABLE,
+ "exporting to external buffers not supported");
+}
+
+const iree_hal_allocator_vtable_t iree_hal_webgpu_simple_allocator_vtable = {
+ .destroy = iree_hal_webgpu_simple_allocator_destroy,
+ .host_allocator = iree_hal_webgpu_simple_allocator_host_allocator,
+ .trim = iree_hal_webgpu_simple_allocator_trim,
+ .query_statistics = iree_hal_webgpu_simple_allocator_query_statistics,
+ .query_buffer_compatibility =
+ iree_hal_webgpu_simple_allocator_query_buffer_compatibility,
+ .allocate_buffer = iree_hal_webgpu_simple_allocator_allocate_buffer,
+ .deallocate_buffer = iree_hal_webgpu_simple_allocator_deallocate_buffer,
+ .import_buffer = iree_hal_webgpu_allocator_import_buffer,
+ .export_buffer = iree_hal_webgpu_allocator_export_buffer,
+};
diff --git a/experimental/webgpu/simple_allocator.h b/experimental/webgpu/simple_allocator.h
new file mode 100644
index 0000000..58192dc
--- /dev/null
+++ b/experimental/webgpu/simple_allocator.h
@@ -0,0 +1,26 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_SIMPLE_ALLOCATOR_H_
+#define IREE_HAL_DRIVERS_WEBGPU_SIMPLE_ALLOCATOR_H_
+
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+iree_status_t iree_hal_webgpu_simple_allocator_create(
+ iree_hal_device_t* device, iree_string_view_t identifier,
+ iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_SIMPLE_ALLOCATOR_H_
diff --git a/experimental/webgpu/staging_buffer.c b/experimental/webgpu/staging_buffer.c
new file mode 100644
index 0000000..f1e7b5d
--- /dev/null
+++ b/experimental/webgpu/staging_buffer.c
@@ -0,0 +1,177 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/staging_buffer.h"
+
+#include <stdint.h>
+
+#include "experimental/webgpu/buffer.h"
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+
+iree_status_t iree_hal_webgpu_staging_buffer_initialize(
+ WGPUDevice device, const WGPULimits* limits,
+ iree_hal_allocator_t* device_allocator, uint8_t* host_buffer,
+ iree_host_size_t host_buffer_capacity,
+ iree_hal_webgpu_staging_buffer_t* out_staging_buffer) {
+ IREE_ASSERT_ARGUMENT(device);
+ IREE_ASSERT_ARGUMENT(device_allocator);
+ IREE_ASSERT_ARGUMENT(host_buffer);
+ IREE_ASSERT_ARGUMENT(out_staging_buffer);
+ IREE_TRACE_ZONE_BEGIN(z0);
+ memset(out_staging_buffer, 0, sizeof(*out_staging_buffer));
+
+ if ((host_buffer_capacity % limits->minUniformBufferOffsetAlignment) != 0) {
+ IREE_TRACE_ZONE_END(z0);
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+ "host buffer capacity (%zu) must match the buffer "
+ "offset alignment (%d)",
+ host_buffer_capacity,
+ limits->minUniformBufferOffsetAlignment);
+ }
+
+ out_staging_buffer->alignment = limits->minUniformBufferOffsetAlignment;
+ out_staging_buffer->capacity = (uint32_t)host_buffer_capacity;
+ out_staging_buffer->host_buffer = host_buffer;
+
+ const iree_hal_buffer_params_t buffer_params = {
+ .usage = IREE_HAL_BUFFER_USAGE_TRANSFER |
+ IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ |
+ IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE_READ,
+ .access = IREE_HAL_MEMORY_ACCESS_ALL,
+ .type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE |
+ IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
+ };
+ iree_hal_buffer_t* device_buffer = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_hal_allocator_allocate_buffer(
+ device_allocator, buffer_params, out_staging_buffer->capacity,
+ iree_const_byte_span_empty(), &device_buffer));
+ out_staging_buffer->device_buffer = device_buffer;
+ iree_hal_buffer_retain(device_buffer);
+ out_staging_buffer->device_buffer_handle =
+ iree_hal_webgpu_buffer_handle(device_buffer);
+
+ const WGPUBindGroupLayoutEntry buffer_bindings[] = {
+ {
+ .nextInChain = NULL,
+ .binding = 0,
+ .visibility = WGPUShaderStage_Compute,
+ .buffer =
+ {
+ .nextInChain = NULL,
+ .type = WGPUBufferBindingType_Uniform,
+ .hasDynamicOffset = true,
+ .minBindingSize = out_staging_buffer->alignment,
+ },
+ },
+ };
+ const WGPUBindGroupLayoutDescriptor group_layout_descriptor = {
+ .nextInChain = NULL,
+ .label = WGPU_DEBUG_LABEL("_staging_buffer_binding"),
+ .entryCount = IREE_ARRAYSIZE(buffer_bindings),
+ .entries = buffer_bindings,
+ };
+ out_staging_buffer->bind_group_layout =
+ wgpuDeviceCreateBindGroupLayout(device, &group_layout_descriptor);
+
+ const WGPUBindGroupEntry group_entries[] = {
+ {
+ .nextInChain = NULL,
+ .binding = 0,
+ .buffer = out_staging_buffer->device_buffer_handle,
+ .offset = 0,
+ .size = limits->maxUniformBufferBindingSize,
+ },
+ };
+ const WGPUBindGroupDescriptor descriptor = {
+ .nextInChain = NULL,
+ .label = WGPU_DEBUG_LABEL("_staging_buffer"),
+ .layout = out_staging_buffer->bind_group_layout,
+ .entryCount = IREE_ARRAYSIZE(group_entries),
+ .entries = group_entries,
+ };
+ out_staging_buffer->bind_group =
+ wgpuDeviceCreateBindGroup(device, &descriptor);
+
+ const WGPUBindGroupLayoutDescriptor empty_group_layout_descriptor = {
+ .nextInChain = NULL,
+ .label = WGPU_DEBUG_LABEL("_empty_binding"),
+ .entryCount = 0,
+ .entries = NULL,
+ };
+ out_staging_buffer->empty_bind_group_layout =
+ wgpuDeviceCreateBindGroupLayout(device, &empty_group_layout_descriptor);
+ const WGPUBindGroupDescriptor empty_descriptor = {
+ .nextInChain = NULL,
+ .label = WGPU_DEBUG_LABEL("_empty"),
+ .layout = out_staging_buffer->empty_bind_group_layout,
+ .entryCount = 0,
+ .entries = NULL,
+ };
+ out_staging_buffer->empty_bind_group =
+ wgpuDeviceCreateBindGroup(device, &empty_descriptor);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+void iree_hal_webgpu_staging_buffer_deinitialize(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer) {
+ iree_wgpuBindGroupLayoutDrop(staging_buffer->empty_bind_group_layout);
+ iree_wgpuBindGroupDrop(staging_buffer->bind_group);
+ iree_wgpuBindGroupLayoutDrop(staging_buffer->bind_group_layout);
+ iree_wgpuBindGroupDrop(staging_buffer->empty_bind_group);
+ iree_hal_buffer_release(staging_buffer->device_buffer);
+}
+
+iree_status_t iree_hal_webgpu_staging_buffer_reserve(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer, iree_host_size_t length,
+ iree_byte_span_t* out_reservation, uint32_t* out_offset) {
+ iree_host_size_t aligned_length =
+ iree_host_align(length, staging_buffer->alignment);
+ if (aligned_length > staging_buffer->capacity) {
+ // Will never fit in the staging buffer.
+ return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+ "reservation (%" PRIhsz
+ ") exceeds the maximum capacity of "
+ "the staging buffer (%" PRIu32 ")",
+ length, staging_buffer->capacity);
+ } else if (staging_buffer->offset + aligned_length >
+ staging_buffer->capacity) {
+    // A flush is required first; this is a signal to the caller, not an error.
+ return iree_status_from_code(IREE_STATUS_RESOURCE_EXHAUSTED);
+ }
+ *out_reservation = iree_make_byte_span(
+ staging_buffer->host_buffer + staging_buffer->offset, aligned_length);
+ *out_offset = staging_buffer->offset;
+ staging_buffer->offset += aligned_length;
+ return iree_ok_status();
+}
+
+iree_status_t iree_hal_webgpu_staging_buffer_append(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer,
+ iree_const_byte_span_t source, uint32_t* out_offset) {
+ iree_byte_span_t reservation;
+ IREE_RETURN_IF_ERROR(iree_hal_webgpu_staging_buffer_reserve(
+ staging_buffer, source.data_length, &reservation, out_offset));
+ memcpy(reservation.data, source.data, source.data_length);
+ return iree_ok_status();
+}
+
+void iree_hal_webgpu_staging_buffer_flush(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer, void** out_source_buffer,
+ WGPUBuffer* out_target_buffer, iree_host_size_t* out_length) {
+ *out_source_buffer = staging_buffer->host_buffer;
+ *out_target_buffer = staging_buffer->device_buffer_handle;
+ *out_length = staging_buffer->offset;
+ staging_buffer->offset = 0;
+}
+
+void iree_hal_webgpu_staging_buffer_reset(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer) {
+ staging_buffer->offset = 0;
+}
diff --git a/experimental/webgpu/staging_buffer.h b/experimental/webgpu/staging_buffer.h
new file mode 100644
index 0000000..1debf59
--- /dev/null
+++ b/experimental/webgpu/staging_buffer.h
@@ -0,0 +1,107 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_STAGING_BUFFER_H_
+#define IREE_HAL_DRIVERS_WEBGPU_STAGING_BUFFER_H_
+
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Size, in bytes, of the device-local staging buffer.
+// An equivalent amount of host memory will be reserved as a scratchpad used
+// to build up the buffer prior to submission into the queue timeline.
+//
+// Larger values here will use more memory while reducing the frequency at which
+// the staging buffer needs to be flushed. As most models that run in these
+// environments are only a few hundred dispatches per command buffer we can
+// approximate an average consumption of 500 dispatches x worst-case 256b per
+// dispatch of parameters and get 128KB.
+#define IREE_HAL_WEBGPU_STAGING_BUFFER_DEFAULT_CAPACITY (128 * 1024)
+
+// A staging uniform buffer used for uploading parameters to the device.
+// This allows for high-frequency writes of parameters at appropriate alignment.
+//
+// Intended usage is to retain one of these per device queue and use them during
+// command buffer recording targeting that particular queue. Parameters are
+// scribbled into the staging buffer host memory and then prior to submission
+// an upload is scheduled from host->device into the device-local buffer. This
+// puts the writes into queue timeline immediately before the commands that use
+// it are submitted, and as there is only in-order execution per WebGPU queue
+// this provides us a completely queue-ordered set of memory.
+typedef struct iree_hal_webgpu_staging_buffer_t {
+  // Alignment required on offsets into the buffer.
+  // Uniform bindings with dynamic offsets must satisfy this alignment and on
+  // some devices it can be as large as 256b.
+  uint32_t alignment;
+  // Maximum number of bytes in the buffer.
+  uint32_t capacity;
+
+  // Host-local buffer pointer.
+  // Storage is provided by the owner at initialize time; the staging buffer
+  // does not own or free it.
+  uint8_t* host_buffer;
+  // Device-local HAL buffer - retains ownership.
+  iree_hal_buffer_t* device_buffer;
+  // Device-local buffer handle.
+  // NOTE(review): presumably borrowed from |device_buffer| (not separately
+  // retained) - confirm against staging_buffer.c initialization.
+  WGPUBuffer device_buffer_handle;
+
+  // Layout of a bind group with a single dynamic uniform buffer binding 0.
+  WGPUBindGroupLayout bind_group_layout;
+  // Bind group containing the buffer as dynamic at base offset 0.
+  WGPUBindGroup bind_group;
+
+  // Layout of an empty bind group, useful for padding within pipeline layouts.
+  WGPUBindGroupLayout empty_bind_group_layout;
+  // Empty bind group.
+  WGPUBindGroup empty_bind_group;
+
+  // Current write offset in the device buffer.
+  // Always a multiple of |alignment| (reservations are rounded up) and reset
+  // to 0 by flush/reset.
+  uint32_t offset;
+} iree_hal_webgpu_staging_buffer_t;
+
+// Initializes |out_staging_buffer| using the given |host_buffer| memory.
+iree_status_t iree_hal_webgpu_staging_buffer_initialize(
+ WGPUDevice device, const WGPULimits* limits,
+ iree_hal_allocator_t* device_allocator, uint8_t* host_buffer,
+ iree_host_size_t host_buffer_capacity,
+ iree_hal_webgpu_staging_buffer_t* out_staging_buffer);
+
+void iree_hal_webgpu_staging_buffer_deinitialize(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer);
+
+// Reserves |length| bytes from the staging buffer and returns a pointer to it
+// in |out_reservation|.
+// Returns RESOURCE_EXHAUSTED if the staging buffer is full and must be flushed
+// with iree_hal_webgpu_staging_buffer_flush first.
+iree_status_t iree_hal_webgpu_staging_buffer_reserve(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer, iree_host_size_t length,
+ iree_byte_span_t* out_reservation, uint32_t* out_offset);
+
+// Appends |data| of |length| bytes to the staging buffer.
+// Returns RESOURCE_EXHAUSTED if the staging buffer is full and must be flushed
+// with iree_hal_webgpu_staging_buffer_flush first.
+iree_status_t iree_hal_webgpu_staging_buffer_append(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer,
+ iree_const_byte_span_t source, uint32_t* out_offset);
+
+// Flushes any pending uploads and returns the source buffer, target buffer,
+// and length to upload. |out_length| may be 0 if there is nothing to flush.
+void iree_hal_webgpu_staging_buffer_flush(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer, void** out_source_buffer,
+ WGPUBuffer* out_target_buffer, iree_host_size_t* out_length);
+
+// Resets the staging buffer to clear any pending writes.
+void iree_hal_webgpu_staging_buffer_reset(
+ iree_hal_webgpu_staging_buffer_t* staging_buffer);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_STAGING_BUFFER_H_
diff --git a/experimental/webgpu/webgpu_device.c b/experimental/webgpu/webgpu_device.c
new file mode 100644
index 0000000..6a48f16
--- /dev/null
+++ b/experimental/webgpu/webgpu_device.c
@@ -0,0 +1,407 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/webgpu/webgpu_device.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "experimental/webgpu/bind_group_cache.h"
+#include "experimental/webgpu/builtins.h"
+#include "experimental/webgpu/command_buffer.h"
+#include "experimental/webgpu/nop_event.h"
+#include "experimental/webgpu/nop_executable_cache.h"
+#include "experimental/webgpu/nop_semaphore.h"
+#include "experimental/webgpu/pipeline_layout.h"
+#include "experimental/webgpu/simple_allocator.h"
+#include "experimental/webgpu/staging_buffer.h"
+#include "iree/base/internal/arena.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/utils/buffer_transfer.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_webgpu_device_t
+//===----------------------------------------------------------------------===//
+
+#define IREE_HAL_WEBGPU_SMALL_POOL_BLOCK_SIZE (4 * 1024)
+#define IREE_HAL_WEBGPU_LARGE_POOL_BLOCK_SIZE (32 * 1024)
+
+// Fills |out_options| with the default device configuration: reserved flags
+// and the default staging/uniform buffer capacity.
+IREE_API_EXPORT void iree_hal_webgpu_device_options_initialize(
+    iree_hal_webgpu_device_options_t* out_options) {
+  IREE_ASSERT_ARGUMENT(out_options);
+  out_options->queue_uniform_buffer_size =
+      IREE_HAL_WEBGPU_STAGING_BUFFER_DEFAULT_CAPACITY;
+  out_options->flags = IREE_HAL_WEBGPU_DEVICE_RESERVED;
+}
+
+typedef struct iree_hal_webgpu_device_t {
+  // Base HAL resource (vtable + refcount); initialized in
+  // iree_hal_webgpu_wrap_device.
+  iree_hal_resource_t resource;
+  // Allocator that backed this struct and all host-side objects it creates.
+  iree_allocator_t host_allocator;
+  // Device identifier; the character storage lives in the trailing bytes of
+  // this same allocation (after the staging scratchpad).
+  iree_string_view_t identifier;
+
+  // When false (the wrap path) the WGPUDevice is owned by the hosting
+  // application and is not destroyed on teardown.
+  bool owns_device_handle;
+  WGPUDevice handle;
+  // Default queue obtained via wgpuDeviceGetQueue; all submissions target it.
+  WGPUQueue queue;
+
+  // Block pool used for small allocations like submissions and callbacks.
+  iree_arena_block_pool_t small_block_pool;
+  // Block pool used for command buffers with a large block size (as command
+  // buffers can contain inlined data uploads).
+  iree_arena_block_pool_t large_block_pool;
+
+  // Device memory allocator; created during wrap and released on destroy.
+  iree_hal_allocator_t* device_allocator;
+
+  // Builtin shaders emulating functionality not present in WebGPU.
+  iree_hal_webgpu_builtins_t builtins;
+
+  // Cached bind groups used during command buffer recording.
+  iree_hal_webgpu_bind_group_cache_t bind_group_cache;
+
+  // Staging buffer for parameter uploads.
+  // Host storage is allocated as part of the device structure.
+  iree_hal_webgpu_staging_buffer_t staging_buffer;
+  // Flexible array member: options->queue_uniform_buffer_size bytes of
+  // staging scratchpad, followed by the identifier characters.
+  uint8_t staging_buffer_host_data[];
+} iree_hal_webgpu_device_t;
+
+extern const iree_hal_device_vtable_t iree_hal_webgpu_device_vtable;
+
+// Asserts (in debug builds) that |base_value| is a WebGPU device and
+// downcasts it to the implementation type.
+static iree_hal_webgpu_device_t* iree_hal_webgpu_device_cast(
+    iree_hal_device_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_device_vtable);
+  return (iree_hal_webgpu_device_t*)base_value;
+}
+
+// Wraps an externally-owned |handle| in a HAL device. The device struct, the
+// staging buffer host scratchpad, and the identifier string are carved out of
+// a single allocation; on any failure the partially-initialized device is
+// released through the normal destroy path.
+IREE_API_EXPORT iree_status_t iree_hal_webgpu_wrap_device(
+    iree_string_view_t identifier,
+    const iree_hal_webgpu_device_options_t* options, WGPUDevice handle,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  IREE_ASSERT_ARGUMENT(options);
+  IREE_ASSERT_ARGUMENT(handle);
+  IREE_ASSERT_ARGUMENT(out_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  *out_device = NULL;
+
+  // Query limits; staging buffer initialization below consumes the minimum
+  // buffer offset alignments.
+  WGPUSupportedLimits supported_limits;
+  memset(&supported_limits, 0, sizeof(supported_limits));
+  if (!wgpuDeviceGetLimits(handle, &supported_limits)) {
+    // Failed to query limits - conservatively set what we need.
+    // TODO(benvanik): see if it's realistic for this to fail - it cannot in
+    // the browser implementation so I'm not sure why it returns bool here.
+    supported_limits.limits.minStorageBufferOffsetAlignment = 256;
+    supported_limits.limits.minUniformBufferOffsetAlignment = 256;
+  }
+
+  // Single allocation: struct + staging scratchpad (flexible array member) +
+  // identifier characters.
+  iree_hal_webgpu_device_t* device = NULL;
+  iree_host_size_t total_size =
+      sizeof(*device) + options->queue_uniform_buffer_size + identifier.size;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, total_size, (void**)&device));
+  iree_hal_resource_initialize(&iree_hal_webgpu_device_vtable,
+                               &device->resource);
+  device->host_allocator = host_allocator;
+  // Copy the identifier into the trailing storage after the scratchpad bytes.
+  uint8_t* buffer_ptr = (uint8_t*)device + sizeof(*device);
+  buffer_ptr += options->queue_uniform_buffer_size;
+  buffer_ptr += iree_string_view_append_to_buffer(
+      identifier, &device->identifier, (char*)buffer_ptr);
+
+  // Wrapped handles stay owned by the hosting application (see destroy).
+  device->owns_device_handle = false;
+  device->handle = handle;
+  device->queue = wgpuDeviceGetQueue(handle);
+
+  iree_arena_block_pool_initialize(IREE_HAL_WEBGPU_SMALL_POOL_BLOCK_SIZE,
+                                   host_allocator, &device->small_block_pool);
+  iree_arena_block_pool_initialize(IREE_HAL_WEBGPU_LARGE_POOL_BLOCK_SIZE,
+                                   host_allocator, &device->large_block_pool);
+
+  iree_hal_webgpu_bind_group_cache_initialize(device->handle,
+                                              &device->bind_group_cache);
+
+  iree_status_t status = iree_hal_webgpu_simple_allocator_create(
+      (iree_hal_device_t*)device, device->identifier, device->host_allocator,
+      &device->device_allocator);
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_webgpu_staging_buffer_initialize(
+        device->handle, &supported_limits.limits, device->device_allocator,
+        device->staging_buffer_host_data, options->queue_uniform_buffer_size,
+        &device->staging_buffer);
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_webgpu_builtins_initialize(
+        device->handle, &device->staging_buffer, &device->builtins);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_device = (iree_hal_device_t*)device;
+  } else {
+    // NOTE(review): release runs the full destroy path even when later init
+    // steps were skipped; this appears to rely on iree_allocator_malloc
+    // zero-initializing the allocation so deinit sees empty state - confirm.
+    iree_hal_device_release((iree_hal_device_t*)device);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Tears the device down in reverse dependency order and frees the single
+// backing allocation. Called when the last reference is released.
+static void iree_hal_webgpu_device_destroy(iree_hal_device_t* base_device) {
+  iree_hal_webgpu_device_t* device = iree_hal_webgpu_device_cast(base_device);
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Builtins may be retaining resources; must be dropped first.
+  iree_hal_webgpu_builtins_deinitialize(&device->builtins);
+
+  // Bind groups can retain buffers.
+  iree_hal_webgpu_bind_group_cache_deinitialize(&device->bind_group_cache);
+
+  // There must be no more buffers live that use the allocator.
+  iree_hal_webgpu_staging_buffer_deinitialize(&device->staging_buffer);
+  iree_hal_allocator_release(device->device_allocator);
+
+  // All outstanding blocks must have been returned to the pool (all command
+  // buffers/submissions/etc disposed).
+  iree_arena_block_pool_deinitialize(&device->small_block_pool);
+  iree_arena_block_pool_deinitialize(&device->large_block_pool);
+
+  // If we wrapped an existing device we don't want to destroy it on shutdown as
+  // it may still be in use by the hosting application.
+  if (device->owns_device_handle) {
+    // NOTE: this destroys the device immediately (vs dropping it); that's fine
+    // as we have the same requirement in the HAL.
+    wgpuDeviceDestroy(device->handle);
+  }
+
+  iree_allocator_free(host_allocator, device);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the identifier string stored inline in the device allocation.
+static iree_string_view_t iree_hal_webgpu_device_id(
+    iree_hal_device_t* base_device) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return webgpu_device->identifier;
+}
+
+// Returns the allocator used for host-side objects owned by this device.
+static iree_allocator_t iree_hal_webgpu_device_host_allocator(
+    iree_hal_device_t* base_device) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return webgpu_device->host_allocator;
+}
+
+// Returns the device memory allocator (unretained; owned by the device).
+static iree_hal_allocator_t* iree_hal_webgpu_device_allocator(
+    iree_hal_device_t* base_device) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return webgpu_device->device_allocator;
+}
+
+// Exposes the underlying WGPUDevice for interop with native WebGPU code.
+WGPUDevice iree_hal_webgpu_device_handle(iree_hal_device_t* base_device) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return webgpu_device->handle;
+}
+
+// Releases pooled/cached memory back to the system where possible: both arena
+// block pools and the device allocator are trimmed.
+static iree_status_t iree_hal_webgpu_device_trim(
+    iree_hal_device_t* base_device) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  iree_arena_block_pool_trim(&webgpu_device->small_block_pool);
+  iree_arena_block_pool_trim(&webgpu_device->large_block_pool);
+  return iree_hal_allocator_trim(webgpu_device->device_allocator);
+}
+
+// Answers configuration queries. Currently only reports whether an executable
+// format is supported: 1 for 'webgpu-wgsl-fb', 0 otherwise.
+static iree_status_t iree_hal_webgpu_device_query_i64(
+    iree_hal_device_t* base_device, iree_string_view_t category,
+    iree_string_view_t key, int64_t* out_value) {
+  *out_value = 0;
+
+  if (iree_string_view_equal(category,
+                             iree_make_cstring_view("hal.executable.format"))) {
+    if (iree_string_view_equal(key,
+                               iree_make_cstring_view("webgpu-wgsl-fb"))) {
+      *out_value = 1;
+    }
+    return iree_ok_status();
+  }
+
+  return iree_make_status(
+      IREE_STATUS_NOT_FOUND,
+      "unknown device configuration key value '%.*s :: %.*s'",
+      (int)category.size, category.data, (int)key.size, key.data);
+}
+
+// Creates a command buffer that records against this device's large block
+// pool, staging buffer, bind group cache, and builtin shaders.
+static iree_status_t iree_hal_webgpu_device_create_command_buffer(
+    iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return iree_hal_webgpu_command_buffer_create(
+      base_device, webgpu_device->handle, mode, command_categories,
+      queue_affinity, binding_capacity, &webgpu_device->large_block_pool,
+      &webgpu_device->staging_buffer, &webgpu_device->bind_group_cache,
+      &webgpu_device->builtins, webgpu_device->host_allocator,
+      out_command_buffer);
+}
+
+// Forwards descriptor set layout creation to the WebGPU implementation.
+static iree_status_t iree_hal_webgpu_device_create_descriptor_set_layout(
+    iree_hal_device_t* base_device,
+    iree_hal_descriptor_set_layout_flags_t flags,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t* bindings,
+    iree_hal_descriptor_set_layout_t** out_descriptor_set_layout) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return iree_hal_webgpu_descriptor_set_layout_create(
+      webgpu_device->handle, flags, binding_count, bindings,
+      webgpu_device->host_allocator, out_descriptor_set_layout);
+}
+
+// Events are no-ops on WebGPU; creates the host-only placeholder event.
+static iree_status_t iree_hal_webgpu_device_create_event(
+    iree_hal_device_t* base_device, iree_hal_event_t** out_event) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return iree_hal_webgpu_nop_event_create(webgpu_device->host_allocator,
+                                          out_event);
+}
+
+// Creates the pass-through (non-caching) executable cache for this device.
+static iree_status_t iree_hal_webgpu_device_create_executable_cache(
+    iree_hal_device_t* base_device, iree_string_view_t identifier,
+    iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return iree_hal_webgpu_nop_executable_cache_create(
+      webgpu_device->handle, identifier, loop, webgpu_device->host_allocator,
+      out_executable_cache);
+}
+
+// Forwards pipeline layout creation, threading through the staging buffer
+// used for push-constant emulation.
+static iree_status_t iree_hal_webgpu_device_create_pipeline_layout(
+    iree_hal_device_t* base_device, iree_host_size_t push_constants,
+    iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t* const* set_layouts,
+    iree_hal_pipeline_layout_t** out_pipeline_layout) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return iree_hal_webgpu_pipeline_layout_create(
+      webgpu_device->handle, set_layout_count, set_layouts, push_constants,
+      &webgpu_device->staging_buffer, webgpu_device->host_allocator,
+      out_pipeline_layout);
+}
+
+// Creates the host-only placeholder semaphore (no device-side signaling).
+static iree_status_t iree_hal_webgpu_device_create_semaphore(
+    iree_hal_device_t* base_device, uint64_t initial_value,
+    iree_hal_semaphore_t** out_semaphore) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+  return iree_hal_webgpu_nop_semaphore_create(
+      initial_value, webgpu_device->host_allocator, out_semaphore);
+}
+
+// All semaphores are host-only until device-side synchronization lands.
+static iree_hal_semaphore_compatibility_t
+iree_hal_webgpu_device_query_semaphore_compatibility(
+    iree_hal_device_t* base_device, iree_hal_semaphore_t* semaphore) {
+  // TODO(benvanik): implement webgpu semaphores.
+  return IREE_HAL_SEMAPHORE_COMPATIBILITY_HOST_ONLY;
+}
+
+// Synchronous stand-in for queue-ordered allocation: blocks the host on the
+// wait semaphores, allocates from the device allocator, then signals.
+static iree_status_t iree_hal_webgpu_device_queue_alloca(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
+    iree_device_size_t allocation_size,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  // TODO(benvanik): queue-ordered allocations.
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
+                                                    iree_infinite_timeout()));
+  // NOTE(review): on allocation failure the signal semaphores are left
+  // unsignaled, so downstream waiters may hang until errors are propagated to
+  // semaphores (see the TODO in queue_execute) - confirm intended.
+  IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer(
+      iree_hal_device_allocator(base_device), params, allocation_size,
+      iree_const_byte_span_empty(), out_buffer));
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list));
+  return iree_ok_status();
+}
+
+// Queue-ordered deallocation placeholder: no memory is released here; the
+// call reduces to a wait/signal barrier on the queue timeline.
+static iree_status_t iree_hal_webgpu_device_queue_dealloca(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* buffer) {
+  // TODO(benvanik): queue-ordered allocations.
+  IREE_RETURN_IF_ERROR(iree_hal_device_queue_barrier(
+      base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list));
+  return iree_ok_status();
+}
+
+// Submits |command_buffers| to the single WebGPU queue in order. Host-blocks
+// on the wait semaphores first and signals on completion of issue.
+static iree_status_t iree_hal_webgpu_device_queue_execute(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_host_size_t command_buffer_count,
+    iree_hal_command_buffer_t* const* command_buffers) {
+  iree_hal_webgpu_device_t* webgpu_device =
+      iree_hal_webgpu_device_cast(base_device);
+
+  // TODO(benvanik): this currently assumes we are synchronizing on semaphores
+  // and that any passed in to wait on will already be signaled. This would need
+  // to change a bit to properly support waiting on host-signaled semaphores.
+  // All work is ordered against the WebGPU queues and there's only one queue so
+  // there's really not much to do.
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
+                                                    iree_infinite_timeout()));
+
+  // TODO(benvanik): propagate errors to semaphores.
+  for (iree_host_size_t ordinal = 0; ordinal < command_buffer_count;
+       ++ordinal) {
+    IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_issue(
+        command_buffers[ordinal], webgpu_device->queue));
+  }
+
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list));
+
+  return iree_ok_status();
+}
+
+// No-op: work is flushed eagerly as submissions are made in queue_execute.
+static iree_status_t iree_hal_webgpu_device_queue_flush(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity) {
+  // Currently unused; we flush as submissions are made.
+  return iree_ok_status();
+}
+
+// Always fails: host-side multi-semaphore waits are not yet implemented for
+// the placeholder nop semaphores.
+static iree_status_t iree_hal_webgpu_device_wait_semaphores(
+    iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) {
+  return iree_make_status(
+      IREE_STATUS_UNIMPLEMENTED,
+      "iree_hal_webgpu_device_wait_semaphores not yet implemented");
+}
+
+// Profiling capture is optional for HAL devices; silently succeeds here.
+static iree_status_t iree_hal_webgpu_device_profiling_begin(
+    iree_hal_device_t* device,
+    const iree_hal_device_profiling_options_t* options) {
+  // Unimplemented (and that's ok).
+  return iree_ok_status();
+}
+
+// Matching no-op for iree_hal_webgpu_device_profiling_begin.
+static iree_status_t iree_hal_webgpu_device_profiling_end(
+    iree_hal_device_t* device) {
+  // Unimplemented (and that's ok).
+  return iree_ok_status();
+}
+
+// Wires the generic HAL device interface to the WebGPU implementations above.
+// transfer_range reuses the shared submit-and-wait utility from
+// iree/hal/utils/buffer_transfer.h.
+const iree_hal_device_vtable_t iree_hal_webgpu_device_vtable = {
+    .destroy = iree_hal_webgpu_device_destroy,
+    .id = iree_hal_webgpu_device_id,
+    .host_allocator = iree_hal_webgpu_device_host_allocator,
+    .device_allocator = iree_hal_webgpu_device_allocator,
+    .trim = iree_hal_webgpu_device_trim,
+    .query_i64 = iree_hal_webgpu_device_query_i64,
+    .create_command_buffer = iree_hal_webgpu_device_create_command_buffer,
+    .create_descriptor_set_layout =
+        iree_hal_webgpu_device_create_descriptor_set_layout,
+    .create_event = iree_hal_webgpu_device_create_event,
+    .create_executable_cache = iree_hal_webgpu_device_create_executable_cache,
+    .create_pipeline_layout = iree_hal_webgpu_device_create_pipeline_layout,
+    .create_semaphore = iree_hal_webgpu_device_create_semaphore,
+    .query_semaphore_compatibility =
+        iree_hal_webgpu_device_query_semaphore_compatibility,
+    .transfer_range = iree_hal_device_submit_transfer_range_and_wait,
+    .queue_alloca = iree_hal_webgpu_device_queue_alloca,
+    .queue_dealloca = iree_hal_webgpu_device_queue_dealloca,
+    .queue_execute = iree_hal_webgpu_device_queue_execute,
+    .queue_flush = iree_hal_webgpu_device_queue_flush,
+    .wait_semaphores = iree_hal_webgpu_device_wait_semaphores,
+    .profiling_begin = iree_hal_webgpu_device_profiling_begin,
+    .profiling_end = iree_hal_webgpu_device_profiling_end,
+};
diff --git a/experimental/webgpu/webgpu_device.h b/experimental/webgpu/webgpu_device.h
new file mode 100644
index 0000000..a4b7f77
--- /dev/null
+++ b/experimental/webgpu/webgpu_device.h
@@ -0,0 +1,27 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_WEBGPU_WEBGPU_DEVICE_H_
+#define IREE_HAL_DRIVERS_WEBGPU_WEBGPU_DEVICE_H_
+
+#include "experimental/webgpu/api.h"
+#include "experimental/webgpu/platform/webgpu.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// NOTE: wrapping is currently exposed via experimental/webgpu/api.h.
+
+WGPUDevice iree_hal_webgpu_device_handle(iree_hal_device_t* device);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_WEBGPU_WEBGPU_DEVICE_H_