Adding iree_hal_device_queue_host_call and emulation.  (#21653)

This allows for both blocking and non-blocking device->host calls.

Emulation is provided for targets that aren't yet using their native
features (CUDA/HIP/Metal) or don't have them (Vulkan), but it should
never be used once we start relying on this for programs as the
performance is terrible. The CPU sync and task implementations are done
here as the emulation is incompatible with sync semantics and it's
possible to implement it on the task system fairly easily.

I split the existing queue emulation utilities out of device.c so
high-fidelity backends can eventually not even link that code in. I
added the host call emulation in its own target so we can avoid
introducing threading dependencies into `iree::hal` for the emulation
(which also makes it clearer what's part of the API vs what's an
implementation detail).

I suspect there may be some HIP flakes and we can disable the CTS
there/file issues if they pop up. I think there are a few cases in the HIP
semaphore that don't quite work but it'd be better to improve the
semaphore tests first. HIP is using an emulated host call here and
that's pretty much just a thread and some semaphores and should be
possible to test independent of the host call logic.

Fixes #21631.
diff --git a/build_tools/cmake/iree_copts.cmake b/build_tools/cmake/iree_copts.cmake
index 9f80d49..5b9b9ba 100644
--- a/build_tools/cmake/iree_copts.cmake
+++ b/build_tools/cmake/iree_copts.cmake
@@ -376,6 +376,13 @@
   string(REPLACE "/GR" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 endif()
 
+if(IREE_ENABLE_THREADING)
+  iree_select_compiler_opts(IREE_DEFAULT_COPTS
+    ALL
+      "-DIREE_THREADING_ENABLE=1"
+  )
+endif()
+
 # Find and add threads as dependency.
 if(NOT ANDROID AND IREE_ENABLE_THREADING)
   set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
@@ -386,7 +393,6 @@
   # Android provides its own pthreads support with no linking required.
 endif()
 
-
 # Emscripten needs -pthread specified in link _and_ compile options when using
 # atomics, shared memory, or pthreads. If we bring our own threading impl and
 # try to omit this, we get this error:
diff --git a/experimental/webgpu/BUILD.bazel b/experimental/webgpu/BUILD.bazel
index 12d1a44..8484e96 100644
--- a/experimental/webgpu/BUILD.bazel
+++ b/experimental/webgpu/BUILD.bazel
@@ -56,6 +56,8 @@
         "//runtime/src/iree/hal/utils:executable_debug_info",
         "//runtime/src/iree/hal/utils:file_transfer",
         "//runtime/src/iree/hal/utils:files",
+        "//runtime/src/iree/hal/utils:queue_emulation",
+        "//runtime/src/iree/hal/utils:queue_host_call_emulation",
         "//runtime/src/iree/schemas:executable_debug_info_c_fbs",
         "//runtime/src/iree/schemas:webgpu_executable_def_c_fbs",
         "@webgpu_headers",
diff --git a/experimental/webgpu/CMakeLists.txt b/experimental/webgpu/CMakeLists.txt
index fa04820..dec651f 100644
--- a/experimental/webgpu/CMakeLists.txt
+++ b/experimental/webgpu/CMakeLists.txt
@@ -50,6 +50,8 @@
     iree::experimental::webgpu::shaders
     iree::hal::utils::file_transfer
     iree::hal::utils::files
+    iree::hal::utils::queue_emulation
+    iree::hal::utils::queue_host_call_emulation
     iree::schemas::webgpu_executable_def_c_fbs
   PUBLIC
 )
diff --git a/experimental/webgpu/webgpu_device.c b/experimental/webgpu/webgpu_device.c
index d80e7ac..a4c0414 100644
--- a/experimental/webgpu/webgpu_device.c
+++ b/experimental/webgpu/webgpu_device.c
@@ -22,6 +22,8 @@
 #include "iree/base/internal/arena.h"
 #include "iree/hal/utils/file_registry.h"
 #include "iree/hal/utils/file_transfer.h"
+#include "iree/hal/utils/queue_emulation.h"
+#include "iree/hal/utils/queue_host_call_emulation.h"
 
 //===----------------------------------------------------------------------===//
 // iree_hal_webgpu_device_t
@@ -474,6 +476,7 @@
     .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_webgpu_device_queue_read,
     .queue_write = iree_hal_webgpu_device_queue_write,
+    .queue_host_call = iree_hal_device_queue_emulated_host_call,
     .queue_dispatch = iree_hal_device_queue_emulated_dispatch,
     .queue_execute = iree_hal_webgpu_device_queue_execute,
     .queue_flush = iree_hal_webgpu_device_queue_flush,
diff --git a/runtime/src/CMakeLists.txt b/runtime/src/CMakeLists.txt
index 037124a..30e6237 100644
--- a/runtime/src/CMakeLists.txt
+++ b/runtime/src/CMakeLists.txt
@@ -20,7 +20,7 @@
 if(IREE_ENABLE_RUNTIME_COVERAGE)
   message(WARNING
     "IREE_ENABLE_RUNTIME_COVERAGE enabling coverage in all runtime libraries. "
-    "All runtime binaries are instrumented and should not be used for"
+    "All runtime binaries are instrumented and should not be used for "
     "benchmarking."
   )
   add_compile_options(
diff --git a/runtime/src/iree/base/config.h b/runtime/src/iree/base/config.h
index 041c393..cf92e82 100644
--- a/runtime/src/iree/base/config.h
+++ b/runtime/src/iree/base/config.h
@@ -141,6 +141,13 @@
 #define IREE_SYNCHRONIZATION_DISABLE_UNSAFE 0
 #endif  // !IREE_SYNCHRONIZATION_DISABLE_UNSAFE
 
+#if !defined(IREE_THREADING_ENABLE)
+// Threading support is enabled by default. On platforms without threads (no
+// pthreads or equivalent available) or in applications where no threads are
+// transitively used, set this to 0 so thread support code can be stripped.
+#define IREE_THREADING_ENABLE 1
+#endif  // !IREE_THREADING_ENABLE
+
 //===----------------------------------------------------------------------===//
 // File I/O
 //===----------------------------------------------------------------------===//
diff --git a/runtime/src/iree/hal/cts/CMakeLists.txt b/runtime/src/iree/hal/cts/CMakeLists.txt
index 1e7ea8f..b57df89 100644
--- a/runtime/src/iree/hal/cts/CMakeLists.txt
+++ b/runtime/src/iree/hal/cts/CMakeLists.txt
@@ -17,6 +17,7 @@
   "event"
   "executable_cache"
   "file"
+  "queue_host_call"
   "semaphore"
   "semaphore_submission"
   PARENT_SCOPE
@@ -211,6 +212,19 @@
 
 iree_cc_library(
   NAME
+    queue_host_call_test_library
+  HDRS
+    "queue_host_call_test.h"
+  DEPS
+    ::cts_test_base
+    iree::base
+    iree::hal
+    iree::testing::gtest
+  TESTONLY
+)
+
+iree_cc_library(
+  NAME
     semaphore_test_library
   HDRS
     "semaphore_test.h"
diff --git a/runtime/src/iree/hal/cts/queue_host_call_test.h b/runtime/src/iree/hal/cts/queue_host_call_test.h
new file mode 100644
index 0000000..49e0dc1
--- /dev/null
+++ b/runtime/src/iree/hal/cts/queue_host_call_test.h
@@ -0,0 +1,446 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_CTS_QUEUE_HOST_CALL_TEST_H_
+#define IREE_HAL_CTS_QUEUE_HOST_CALL_TEST_H_
+
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <thread>
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree::hal::cts {
+
+using iree::testing::status::StatusIs;
+using ::testing::ContainerEq;
+
+struct SemaphoreList {
+  SemaphoreList() = default;
+  SemaphoreList(iree_hal_device_t* device, std::vector<uint64_t> initial_values,
+                std::vector<uint64_t> desired_values) {
+    for (size_t i = 0; i < initial_values.size(); ++i) {
+      iree_hal_semaphore_t* semaphore = NULL;
+      IREE_EXPECT_OK(iree_hal_semaphore_create(
+          device, initial_values[i], IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore));
+      semaphores.push_back(semaphore);
+    }
+    payload_values = desired_values;
+    assert(semaphores.size() == payload_values.size());
+  }
+
+  // Builds from a raw iree_hal_semaphore_list_t, retaining each semaphore.
+  SemaphoreList(const iree_hal_semaphore_list_t& list) {
+    semaphores.reserve(list.count);
+    payload_values.reserve(list.count);
+    for (iree_host_size_t i = 0; i < list.count; ++i) {
+      semaphores.push_back(list.semaphores[i]);
+      payload_values.push_back(list.payload_values[i]);
+    }
+    // Retain all semaphores; the destructor releases them.
+    iree_hal_semaphore_list_retain(*this);
+  }
+
+  // Copy constructor: shares other's semaphores and retains each of them.
+  SemaphoreList(const SemaphoreList& other) {
+    semaphores = other.semaphores;
+    payload_values = other.payload_values;
+    // Retain all semaphores.
+    iree_hal_semaphore_list_retain(*this);
+  }
+
+  // Copy assignment: releases the current semaphores, then copies and retains.
+  SemaphoreList& operator=(const SemaphoreList& other) {
+    if (this != &other) {
+      // Release old semaphores.
+      iree_hal_semaphore_list_release((iree_hal_semaphore_list_t)(*this));
+      // Copy new ones.
+      semaphores = other.semaphores;
+      payload_values = other.payload_values;
+      // Retain new semaphores.
+      iree_hal_semaphore_list_retain(*this);
+    }
+    return *this;
+  }
+
+  SemaphoreList(SemaphoreList&& other) noexcept
+      : semaphores(std::move(other.semaphores)),
+        payload_values(std::move(other.payload_values)) {
+    other.semaphores.clear();
+    other.payload_values.clear();
+  }
+
+  SemaphoreList& operator=(SemaphoreList&& other) noexcept {
+    if (this != &other) {
+      iree_hal_semaphore_list_release((iree_hal_semaphore_list_t)(*this));
+      semaphores = std::move(other.semaphores);
+      payload_values = std::move(other.payload_values);
+      other.semaphores.clear();
+      other.payload_values.clear();
+    }
+    return *this;
+  }
+
+  ~SemaphoreList() {
+    iree_hal_semaphore_list_release((iree_hal_semaphore_list_t)(*this));
+  }
+
+  operator iree_hal_semaphore_list_t() {
+    iree_hal_semaphore_list_t list;
+    list.count = semaphores.size();
+    list.semaphores = semaphores.data();
+    list.payload_values = payload_values.data();
+    return list;
+  }
+
+  std::vector<iree_hal_semaphore_t*> semaphores;
+  std::vector<uint64_t> payload_values;
+};
+
+class QueueHostCallTest : public CTSTestBase<> {};
+
+// Enqueues a host call on a wait condition that will not be satisfied until
+// after the enqueue request completes. This ensures that host calls properly
+// park themselves and get rescheduled as their dependencies resolve.
+TEST_F(QueueHostCallTest, EnqueueBeforeSignal) {
+  IREE_TRACE_SCOPE();
+
+  struct state_t {
+    std::atomic<int> did_call;
+    std::atomic<uint64_t> args[4];
+  } state = {0};
+  auto call = iree_hal_make_host_call(
+      +[](void* user_data, const uint64_t args[4],
+          iree_hal_host_call_context_t* context) {
+        auto* state = (state_t*)user_data;
+        ++state->did_call;
+        memcpy(state->args, args, sizeof(state->args));
+        return iree_ok_status();
+      },
+      &state);
+
+  SemaphoreList wait_semaphore_list(device_, {0}, {1});
+  SemaphoreList signal_semaphore_list(device_, {0}, {1});
+
+  EXPECT_EQ(state.did_call, 0);
+
+  // NOTE: the waker thread is started before issuing the host call so we can
+  // still function in synchronous contexts.
+  std::thread waker([&]() {
+    EXPECT_EQ(state.did_call, 0);
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    EXPECT_EQ(state.did_call, 0);
+    IREE_EXPECT_OK(iree_hal_semaphore_list_signal(wait_semaphore_list));
+  });
+
+  uint64_t args[4] = {10, 20, 30, UINT64_MAX};
+  IREE_EXPECT_OK(iree_hal_device_queue_host_call(
+      device_, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
+      signal_semaphore_list, call, args, IREE_HAL_HOST_CALL_FLAG_NONE));
+
+  IREE_EXPECT_OK(iree_hal_semaphore_list_wait(signal_semaphore_list,
+                                              iree_make_timeout_ms(5000)));
+
+  EXPECT_EQ(state.did_call, 1);
+  EXPECT_EQ(state.args[0], args[0]);
+  EXPECT_EQ(state.args[1], args[1]);
+  EXPECT_EQ(state.args[2], args[2]);
+  EXPECT_EQ(state.args[3], args[3]);
+
+  waker.join();
+}
+
+// Tests that a host call with no wait semaphores gets called ASAP.
+// The call may not be immediate but should execute without waiting.
+TEST_F(QueueHostCallTest, NoWaitSemaphores) {
+  IREE_TRACE_SCOPE();
+
+  struct state_t {
+    std::atomic<int> did_call;
+    std::atomic<uint64_t> args[4];
+  } state = {0};
+  auto call = iree_hal_make_host_call(
+      +[](void* user_data, const uint64_t args[4],
+          iree_hal_host_call_context_t* context) {
+        auto* state = (state_t*)user_data;
+        ++state->did_call;
+        memcpy(state->args, args, sizeof(state->args));
+        return iree_ok_status();
+      },
+      &state);
+
+  // Default-constructed (empty) wait list - should execute ASAP.
+  SemaphoreList wait_semaphore_list;
+  SemaphoreList signal_semaphore_list(device_, {0}, {1});
+
+  uint64_t args[4] = {100, 200, 300, 400};
+  IREE_EXPECT_OK(iree_hal_device_queue_host_call(
+      device_, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
+      signal_semaphore_list, call, args, IREE_HAL_HOST_CALL_FLAG_NONE));
+
+  // Wait for completion - the host call should complete quickly.
+  IREE_EXPECT_OK(iree_hal_semaphore_list_wait(signal_semaphore_list,
+                                              iree_make_timeout_ms(5000)));
+
+  EXPECT_EQ(state.did_call, 1);
+  EXPECT_EQ(state.args[0], args[0]);
+  EXPECT_EQ(state.args[1], args[1]);
+  EXPECT_EQ(state.args[2], args[2]);
+  EXPECT_EQ(state.args[3], args[3]);
+}
+
+// Tests that the NON_BLOCKING flag causes signal_semaphore_list to be omitted.
+// The callback should not receive the semaphores and they should be signaled
+// before the callback returns.
+TEST_F(QueueHostCallTest, NonBlockingFlag) {
+  IREE_TRACE_SCOPE();
+
+  struct state_t {
+    std::atomic<int> did_call;
+    std::atomic<bool> received_semaphores;
+    SemaphoreList sideband_semaphore_list;
+  } state = {0, false, {device_, {0}, {1}}};
+  auto call = iree_hal_make_host_call(
+      +[](void* user_data, const uint64_t args[4],
+          iree_hal_host_call_context_t* context) {
+        IREE_TRACE_SCOPE_NAMED("callback");
+        auto* state = (state_t*)user_data;
+        ++state->did_call;
+        // With NON_BLOCKING flag, signal_semaphore_list should be empty.
+        state->received_semaphores =
+            !iree_hal_semaphore_list_is_empty(context->signal_semaphore_list);
+        // The enqueuing thread should have been signaled already, but we need
+        // to make sure it made it to at least here for the test to not be
+        // flaky (since in NON_BLOCKING we may still not have executed this
+        // callback by the time any waiters have executed).
+        IREE_EXPECT_OK(
+            iree_hal_semaphore_list_signal(state->sideband_semaphore_list));
+        return iree_ok_status();
+      },
+      &state);
+
+  SemaphoreList wait_semaphore_list(device_, {0}, {1});
+  SemaphoreList signal_semaphore_list(device_, {0}, {1});
+
+  // Signal the wait semaphore so the call can proceed.
+  IREE_EXPECT_OK(iree_hal_semaphore_list_signal(wait_semaphore_list));
+
+  uint64_t args[4] = {1, 2, 3, 4};
+  IREE_EXPECT_OK(iree_hal_device_queue_host_call(
+      device_, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
+      signal_semaphore_list, call, args, IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING));
+
+  // Wait for the signal semaphores - they should be signaled prior to the
+  // callback having executed, but it's hard to verify that. Instead we just
+  // wait for the signal and then wait again to join the thread.
+  IREE_EXPECT_OK(iree_hal_semaphore_list_wait(signal_semaphore_list,
+                                              iree_make_timeout_ms(5000)));
+  IREE_EXPECT_OK(iree_hal_semaphore_list_wait(state.sideband_semaphore_list,
+                                              iree_make_timeout_ms(5000)));
+
+  EXPECT_EQ(state.did_call, 1);
+  EXPECT_FALSE(state.received_semaphores)
+      << "Callback should not receive semaphores with NON_BLOCKING flag";
+}
+
+// Tests async callback that clones signal_semaphore_list and signals from a
+// thread. The semaphores should not be signaled until the spawned thread runs.
+TEST_F(QueueHostCallTest, AsyncCallback) {
+  IREE_TRACE_SCOPE();
+
+  struct state_t {
+    std::atomic<int> did_call;
+    std::thread* signal_thread;
+    SemaphoreList* cloned_list;
+    std::atomic<bool> thread_started;
+    std::atomic<bool> thread_completed;
+  } state = {0, nullptr, nullptr, false, false};
+  auto call = iree_hal_make_host_call(
+      +[](void* user_data, const uint64_t args[4],
+          iree_hal_host_call_context_t* context) {
+        auto* state = (state_t*)user_data;
+        ++state->did_call;
+
+        // Clone the signal semaphore list so the spawned thread can complete
+        // the operation asynchronously.
+        auto& list = context->signal_semaphore_list;
+
+        if (list.count > 0) {
+          // The converting constructor retains each semaphore in the list.
+          state->cloned_list = new SemaphoreList(list);
+
+          // Launch thread to signal after a delay.
+          state->signal_thread = new std::thread([state]() {
+            IREE_TRACE_SCOPE_NAMED("signal_thread");
+            state->thread_started = true;
+            std::this_thread::sleep_for(std::chrono::milliseconds(10));
+            state->thread_completed = true;
+
+            // Signal all semaphores, surfacing any failure to the test.
+            IREE_EXPECT_OK(iree_hal_semaphore_list_signal(*state->cloned_list));
+
+            // Clean up the cloned list.
+            delete state->cloned_list;
+            state->cloned_list = nullptr;
+          });
+        }
+
+        // Notify that we are an asynchronous operation.
+        return iree_status_from_code(IREE_STATUS_DEFERRED);
+      },
+      &state);
+
+  SemaphoreList wait_semaphore_list(device_, {0}, {1});
+  SemaphoreList signal_semaphore_list(device_, {0, 0}, {1, 2});
+
+  // Signal wait semaphore to let the call proceed.
+  IREE_EXPECT_OK(iree_hal_semaphore_list_signal(wait_semaphore_list));
+
+  uint64_t args[4] = {5, 6, 7, 8};
+  IREE_EXPECT_OK(iree_hal_device_queue_host_call(
+      device_, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
+      signal_semaphore_list, call, args, IREE_HAL_HOST_CALL_FLAG_NONE));
+
+  // Now wait for the semaphores to be signaled by the thread.
+  IREE_EXPECT_OK(iree_hal_semaphore_list_wait(signal_semaphore_list,
+                                              iree_make_timeout_ms(5000)));
+
+  EXPECT_EQ(state.did_call, 1);
+  EXPECT_TRUE(state.thread_started);
+  EXPECT_TRUE(state.thread_completed);
+
+  // Clean up thread.
+  if (state.signal_thread) {
+    state.signal_thread->join();
+    delete state.signal_thread;
+  }
+}
+
+// Tests that a callback returning an error signals semaphores with error state.
+TEST_F(QueueHostCallTest, CallbackReturnsError) {
+  IREE_TRACE_SCOPE();
+
+  struct state_t {
+    std::atomic<int> did_call;
+  } state = {0};
+  auto call = iree_hal_make_host_call(
+      +[](void* user_data, const uint64_t args[4],
+          iree_hal_host_call_context_t* context) {
+        auto* state = (state_t*)user_data;
+        ++state->did_call;
+        // Return an error - this should cause signal semaphores to fail.
+        return iree_make_status(IREE_STATUS_PERMISSION_DENIED,
+                                "test error from callback");
+      },
+      &state);
+
+  SemaphoreList wait_semaphore_list(device_, {0}, {1});
+  SemaphoreList signal_semaphore_list(device_, {0, 0}, {1, 2});
+
+  // Signal wait semaphore to let the call proceed.
+  IREE_EXPECT_OK(iree_hal_semaphore_list_signal(wait_semaphore_list));
+
+  uint64_t args[4] = {9, 10, 11, 12};
+  IREE_EXPECT_OK(iree_hal_device_queue_host_call(
+      device_, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
+      signal_semaphore_list, call, args, IREE_HAL_HOST_CALL_FLAG_NONE));
+
+  // Wait for semaphores - this should fail with ABORTED because the callback
+  // returned an error.
+  EXPECT_THAT(Status(iree_hal_semaphore_list_wait(signal_semaphore_list,
+                                                  iree_make_timeout_ms(5000))),
+              StatusIs(StatusCode::kAborted));
+
+  // Query each semaphore to verify it carries the callback's error code.
+  uint64_t value0 = 0;
+  EXPECT_THAT(Status(iree_hal_semaphore_query(
+                  signal_semaphore_list.semaphores[0], &value0)),
+              StatusIs(StatusCode::kPermissionDenied));
+  uint64_t value1 = 0;
+  EXPECT_THAT(Status(iree_hal_semaphore_query(
+                  signal_semaphore_list.semaphores[1], &value1)),
+              StatusIs(StatusCode::kPermissionDenied));
+
+  EXPECT_EQ(state.did_call, 1);
+}
+
+// Tests that a callback returning an error after waiting for dependencies
+// properly signals semaphores with error state.
+TEST_F(QueueHostCallTest, CallbackReturnsErrorAfterWait) {
+  IREE_TRACE_SCOPE();
+
+  struct state_t {
+    std::atomic<int> did_call;
+    std::atomic<bool> wait_completed;
+  } state = {0, false};
+  auto call = iree_hal_make_host_call(
+      +[](void* user_data, const uint64_t args[4],
+          iree_hal_host_call_context_t* context) {
+        auto* state = (state_t*)user_data;
+        ++state->did_call;
+        state->wait_completed = true;
+        // Return an error which should cause signal semaphores to fail.
+        return iree_make_status(IREE_STATUS_PERMISSION_DENIED,
+                                "test error after waiting");
+      },
+      &state);
+
+  SemaphoreList wait_semaphore_list(device_, {0}, {1});
+  SemaphoreList signal_semaphore_list(device_, {0, 0}, {1, 2});
+
+  // Verify the callback hasn't been called yet.
+  EXPECT_EQ(state.did_call, 0);
+  EXPECT_FALSE(state.wait_completed);
+
+  // Start a thread that will signal the wait semaphore after a delay.
+  // NOTE: we do this before issuing the host call so we can still function in
+  // synchronous contexts.
+  std::thread waker([&]() {
+    EXPECT_EQ(state.did_call, 0);
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    EXPECT_EQ(state.did_call, 0);
+    EXPECT_FALSE(state.wait_completed);
+    // Signal the wait semaphore to unblock the host call.
+    IREE_EXPECT_OK(iree_hal_semaphore_list_signal(wait_semaphore_list));
+  });
+
+  uint64_t args[4] = {13, 14, 15, 16};
+  IREE_EXPECT_OK(iree_hal_device_queue_host_call(
+      device_, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
+      signal_semaphore_list, call, args, IREE_HAL_HOST_CALL_FLAG_NONE));
+
+  // Wait for signal semaphores - this should fail with ABORTED because the
+  // callback returned an error after waiting.
+  EXPECT_THAT(Status(iree_hal_semaphore_list_wait(signal_semaphore_list,
+                                                  iree_make_timeout_ms(5000))),
+              StatusIs(StatusCode::kAborted));
+
+  // Verify the callback was called after waiting.
+  EXPECT_EQ(state.did_call, 1);
+  EXPECT_TRUE(state.wait_completed);
+
+  // Query each semaphore to verify it carries the callback's error code.
+  uint64_t value0 = 0;
+  EXPECT_THAT(Status(iree_hal_semaphore_query(
+                  signal_semaphore_list.semaphores[0], &value0)),
+              StatusIs(StatusCode::kPermissionDenied));
+  uint64_t value1 = 0;
+  EXPECT_THAT(Status(iree_hal_semaphore_query(
+                  signal_semaphore_list.semaphores[1], &value1)),
+              StatusIs(StatusCode::kPermissionDenied));
+
+  waker.join();
+}
+
+}  // namespace iree::hal::cts
+
+#endif  // IREE_HAL_CTS_QUEUE_HOST_CALL_TEST_H_
diff --git a/runtime/src/iree/hal/device.c b/runtime/src/iree/hal/device.c
index ad160a9..77b0fb7 100644
--- a/runtime/src/iree/hal/device.c
+++ b/runtime/src/iree/hal/device.c
@@ -171,56 +171,6 @@
   return status;
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_fill(
-    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
-    const iree_hal_semaphore_list_t wait_semaphore_list,
-    const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
-    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
-  IREE_ASSERT_ARGUMENT(device);
-  IREE_ASSERT_ARGUMENT(target_buffer);
-  IREE_ASSERT_ARGUMENT(pattern);
-  IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
-
-  // If we are starting execution immediately then we can reduce latency by
-  // allowing inline command buffer execution.
-  iree_hal_command_buffer_mode_t command_buffer_mode =
-      IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
-  if (wait_semaphore_list.count == 0) {
-    command_buffer_mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
-  }
-
-  iree_hal_transfer_command_t command = {
-      .type = IREE_HAL_TRANSFER_COMMAND_TYPE_FILL,
-      .fill =
-          {
-              .target_buffer = target_buffer,
-              .target_offset = target_offset,
-              .length = length,
-              .pattern = pattern,
-              .pattern_length = pattern_length,
-          },
-  };
-
-  iree_hal_command_buffer_t* command_buffer = NULL;
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_create_transfer_command_buffer(device, command_buffer_mode,
-                                                  queue_affinity, 1, &command,
-                                                  &command_buffer));
-
-  iree_status_t status = iree_hal_device_queue_execute(
-      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
-      command_buffer, iree_hal_buffer_binding_table_empty(),
-      IREE_HAL_EXECUTE_FLAG_NONE);
-
-  iree_hal_command_buffer_release(command_buffer);
-
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
 IREE_API_EXPORT iree_status_t iree_hal_device_queue_update(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -247,66 +197,6 @@
   return status;
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
-    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
-    const iree_hal_semaphore_list_t wait_semaphore_list,
-    const iree_hal_semaphore_list_t signal_semaphore_list,
-    const void* source_buffer, iree_host_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, iree_hal_update_flags_t flags) {
-  IREE_ASSERT_ARGUMENT(device);
-  IREE_ASSERT_ARGUMENT(source_buffer);
-  IREE_ASSERT_ARGUMENT(target_buffer);
-  IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
-
-  // If we are starting execution immediately then we can reduce latency by
-  // allowing inline command buffer execution.
-  iree_hal_command_buffer_mode_t command_buffer_mode =
-      IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
-  if (wait_semaphore_list.count == 0) {
-    command_buffer_mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
-  }
-
-  // TODO(benvanik): support splitting the update into multiple chunks to fit
-  // under the max command buffer update size limit. This provisional API is
-  // intended only for updating dispatch parameters today.
-  if (length > UINT16_MAX) {
-    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                            "queue buffer updates currently limited to 64KB, "
-                            "tried to update %" PRIhsz " bytes",
-                            length);
-  }
-
-  iree_hal_transfer_command_t command = {
-      .type = IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE,
-      .update =
-          {
-              .source_buffer = source_buffer,
-              .source_offset = source_offset,
-              .target_buffer = target_buffer,
-              .target_offset = target_offset,
-              .length = length,
-          },
-  };
-
-  iree_hal_command_buffer_t* command_buffer = NULL;
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_create_transfer_command_buffer(device, command_buffer_mode,
-                                                  queue_affinity, 1, &command,
-                                                  &command_buffer));
-
-  iree_status_t status = iree_hal_device_queue_execute(
-      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
-      command_buffer, iree_hal_buffer_binding_table_empty(),
-      IREE_HAL_EXECUTE_FLAG_NONE);
-
-  iree_hal_command_buffer_release(command_buffer);
-
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
 IREE_API_EXPORT iree_status_t iree_hal_device_queue_copy(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -333,56 +223,6 @@
   return status;
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
-    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
-    const iree_hal_semaphore_list_t wait_semaphore_list,
-    const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, iree_hal_copy_flags_t flags) {
-  IREE_ASSERT_ARGUMENT(device);
-  IREE_ASSERT_ARGUMENT(source_buffer);
-  IREE_ASSERT_ARGUMENT(target_buffer);
-  IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
-
-  // If we are starting execution immediately then we can reduce latency by
-  // allowing inline command buffer execution.
-  iree_hal_command_buffer_mode_t command_buffer_mode =
-      IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
-  if (wait_semaphore_list.count == 0) {
-    command_buffer_mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
-  }
-
-  iree_hal_transfer_command_t command = {
-      .type = IREE_HAL_TRANSFER_COMMAND_TYPE_COPY,
-      .copy =
-          {
-              .source_buffer = source_buffer,
-              .source_offset = source_offset,
-              .target_buffer = target_buffer,
-              .target_offset = target_offset,
-              .length = length,
-          },
-  };
-
-  iree_hal_command_buffer_t* command_buffer = NULL;
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_create_transfer_command_buffer(device, command_buffer_mode,
-                                                  queue_affinity, 1, &command,
-                                                  &command_buffer));
-
-  iree_status_t status = iree_hal_device_queue_execute(
-      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
-      command_buffer, iree_hal_buffer_binding_table_empty(),
-      IREE_HAL_EXECUTE_FLAG_NONE);
-
-  iree_hal_command_buffer_release(command_buffer);
-
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
 IREE_API_EXPORT iree_status_t iree_hal_device_queue_read(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -431,6 +271,28 @@
   return status;
 }
 
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_host_call(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_host_call_t call, const uint64_t args[4],
+    iree_hal_host_call_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(
+      !wait_semaphore_list.count ||
+      (wait_semaphore_list.semaphores && wait_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(!signal_semaphore_list.count ||
+                       (signal_semaphore_list.semaphores &&
+                        signal_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(call.fn);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_status_t status = _VTABLE_DISPATCH(device, queue_host_call)(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list, call,
+      args, flags);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
 IREE_API_EXPORT iree_status_t iree_hal_device_queue_dispatch(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -455,57 +317,6 @@
   return status;
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_dispatch(
-    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
-    const iree_hal_semaphore_list_t wait_semaphore_list,
-    const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_hal_executable_t* executable, int32_t entry_point,
-    const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
-    const iree_hal_buffer_ref_list_t bindings,
-    iree_hal_dispatch_flags_t flags) {
-  IREE_ASSERT_ARGUMENT(device);
-  IREE_ASSERT_ARGUMENT(executable);
-  IREE_TRACE_ZONE_BEGIN(z0);
-
-  // If we are starting execution immediately then we can reduce latency by
-  // allowing inline command buffer execution.
-  iree_hal_command_buffer_mode_t command_buffer_mode =
-      IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
-  if (wait_semaphore_list.count == 0) {
-    command_buffer_mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
-  }
-
-  iree_hal_command_buffer_t* command_buffer = NULL;
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_command_buffer_create(
-              device, command_buffer_mode, IREE_HAL_COMMAND_CATEGORY_DISPATCH,
-              queue_affinity, /*binding_capacity=*/0, &command_buffer));
-
-  iree_status_t status = iree_hal_command_buffer_begin(command_buffer);
-
-  if (iree_status_is_ok(status)) {
-    status = iree_hal_command_buffer_dispatch(command_buffer, executable,
-                                              entry_point, config, constants,
-                                              bindings, flags);
-  }
-
-  if (iree_status_is_ok(status)) {
-    status = iree_hal_command_buffer_end(command_buffer);
-  }
-
-  if (iree_status_is_ok(status)) {
-    status = iree_hal_device_queue_execute(
-        device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
-        command_buffer, iree_hal_buffer_binding_table_empty(),
-        IREE_HAL_EXECUTE_FLAG_NONE);
-  }
-
-  iree_hal_command_buffer_release(command_buffer);
-
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
 IREE_API_EXPORT iree_status_t iree_hal_device_queue_execute(
     iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
diff --git a/runtime/src/iree/hal/device.h b/runtime/src/iree/hal/device.h
index 2021745..3780003 100644
--- a/runtime/src/iree/hal/device.h
+++ b/runtime/src/iree/hal/device.h
@@ -151,6 +151,87 @@
   IREE_HAL_WRITE_FLAG_NONE = 0,
 };
 
+// Bitfield specifying flags controlling a host call operation.
+typedef uint64_t iree_hal_host_call_flags_t;
+enum iree_hal_host_call_flag_bits_e {
+  IREE_HAL_HOST_CALL_FLAG_NONE = 0ull,
+
+  // The call will not block the queue it is executing on.
+  // The signal semaphores provided to iree_hal_device_queue_host_call will be
+  // signaled immediately after the queue has issued the call so that work can
+  // progress. The queue will not wait for the call to be made and it's possible
+  // for it to happen out of order with respect to subsequent work on the queue.
+  // The application itself must ensure that any references captured by the call
+  // (user_data or args) are valid until the callback has completed.
+  //
+  // This is intended primarily for use as an optimization for custom signaling
+  // behavior or notifications.
+  IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING = 1ull << 0,
+
+  // Hints that the host call is expected to be very short and that the issuing
+  // queue may want to spin (possibly with backoff) until the host call has
+  // signaled completion.
+  IREE_HAL_HOST_CALL_FLAG_WAIT_ACTIVE = 1ull << 1,
+
+  // Hints that the host call does not require the device to flush/invalidate
+  // caches. Use if the call does not consume any device resources that may have
+  // been produced but not yet flushed to host memory and does not produce any
+  // device resources that will be consumed without invalidation.
+  IREE_HAL_HOST_CALL_FLAG_RELAXED = 1ull << 2,
+};
+
+// Provides context to a host call about where it was made from as well as any
+// additional data requested.
+typedef struct iree_hal_host_call_context_t {
+  // The device the call was issued on.
+  iree_hal_device_t* device;
+  // The queue the call was issued on.
+  // This is guaranteed to be equal-to or a subset-of the queue affinity
+  // provided when the call was enqueued. Implementations are allowed to pick a
+  // single queue to call the operation on and block that or block entire groups
+  // of queues if there is some internal aliasing that introduces progress
+  // issues if only one queue is treated as blocked.
+  iree_hal_queue_affinity_t queue_affinity;
+  // A list of semaphores that must be signaled once the call has completed.
+  // Omitted if IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING was requested.
+  //
+  // The list lives on the stack and must be copied and each semaphore retained
+  // if the call function does not immediately signal them inline. Asynchronous
+  // completion would clone the list, retain the semaphores, fire off the async
+  // operation, and then upon completion signal the semaphores and release them.
+  iree_hal_semaphore_list_t signal_semaphore_list;
+} iree_hal_host_call_context_t;
+
+// Executes a user-requested host call in queue order.
+// If the call succeeds and returns OK the semaphores will be signaled and
+// otherwise they will be failed. In non-blocking mode any error returned is
+// ignored and no semaphores are available.
+//
+// To implement asynchronous callbacks the signal_semaphore_list provided in
+// |context| should be cloned (list of pointers and retains on semaphores) and
+// stored for later signaling. The callback must return IREE_STATUS_DEFERRED to
+// indicate the asynchronous operation and when the operation has completed use
+// iree_hal_semaphore_list_signal or iree_hal_semaphore_list_fail based on
+// result.
+typedef iree_status_t(IREE_API_PTR* iree_hal_host_call_fn_t)(
+    void* user_data, const uint64_t args[4],
+    iree_hal_host_call_context_t* context);
+
+// Bound host call function and user data.
+typedef struct iree_hal_host_call_t {
+  // Callback function pointer in the host program.
+  iree_hal_host_call_fn_t fn;
+  // User data passed to the callback function. Unowned.
+  void* user_data;
+} iree_hal_host_call_t;
+
+// Returns a host call bound to the given function pointer and user data.
+static inline iree_hal_host_call_t iree_hal_make_host_call(
+    iree_hal_host_call_fn_t fn, void* user_data) {
+  iree_hal_host_call_t call = {fn, user_data};
+  return call;
+}
+
 // Bitfield specifying flags controlling an execution operation.
 typedef uint64_t iree_hal_execute_flags_t;
 enum iree_hal_execute_flag_bits_t {
@@ -385,6 +466,66 @@
     iree_hal_file_t* target_file, uint64_t target_offset,
     iree_device_size_t length, iree_hal_write_flags_t flags);
 
+// Enqueues a host call request.
+// The device will issue the host call once all waits are satisfied. Host calls
+// receive the signal semaphores provided and can be either synchronous (signal
+// inline) or asynchronous (signal at any point in the future). A non-blocking
+// mode is provided for unidirectional/post-style calls.
+//
+// WARNING: re-entrancy is not supported. It is safe to perform semaphore
+// queries and signals and synchronously allocate/deallocate buffers and
+// resources but queue operations _may_ lead to hangs/crashes. Avoid using any
+// iree_hal_device_queue_* API or performing any blocking waits. If queuing is
+// required then bounce the call to another thread and have it performed there.
+//
+// Arguments are passed without modification from the enqueue operation to the
+// callback. If the arguments contain pointers those must remain live until the
+// host call has executed.
+//
+// Calls block dependent work by default. Once all waits have been satisfied the
+// queue will issue the call to the host with the signals provided. The host
+// call is responsible for either completing its work and returning OK, which
+// automatically signals the semaphores, or completing asynchronously: cloning
+// and retaining the signal semaphores it is provided, returning
+// IREE_STATUS_DEFERRED, and signaling them at any point in the future (from an
+// async completion callback, another queue, etc). Note that other independent
+// work in the queue is allowed to progress while the host call is in-flight.
+//
+// The IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING flag can be used to instead have the
+// queue issue the call after waits have been satisfied and then immediately
+// signal dependencies prior to the host call being executed. This allows post
+// style notifications without blocking subsequent device work and can be used
+// as a generic signaling mechanism.
+//
+// Call lifetime in both modes:
+// ```
+// BLOCKING (signaled on OK return, or by the call if it returns DEFERRED):
+//   [alloc state]->[wait]->[call on host]->[signal]->[free state]
+//                            ^             ^
+//                            |             |
+//                            |             Signaled on OK or later if DEFERRED
+//                            Call receives signal_semaphore_list
+//
+// NON_BLOCKING (queue signals, call runs detached):
+//   [alloc state]->[wait]->[signal]->[call on host]->[free state]
+//                            ^       ^
+//                            |       |
+//                            |       Call receives empty signal_semaphore_list
+//                            Queue signals immediately
+// ```
+//
+// NOTE: host calls can be extremely expensive and result in significant
+// performance issues. Some implementations are not able to natively support
+// host calls and require emulation with poller threads and other techniques
+// that add non-trivial latency in device->host->device situations. Avoid host
+// calls if at all possible.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_host_call(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_host_call_t call, const uint64_t args[4],
+    iree_hal_host_call_flags_t flags);
+
 // Enqueues a dispatch over a 3D grid of workgroups.
 // The request may execute overlapped with any other queue operations. The
 // executable specified must be registered for use with the device driver owning
@@ -653,6 +794,13 @@
       iree_hal_file_t* target_file, uint64_t target_offset,
       iree_device_size_t length, iree_hal_write_flags_t flags);
 
+  iree_status_t(IREE_API_PTR* queue_host_call)(
+      iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+      const iree_hal_semaphore_list_t wait_semaphore_list,
+      const iree_hal_semaphore_list_t signal_semaphore_list,
+      iree_hal_host_call_t call, const uint64_t args[4],
+      iree_hal_host_call_flags_t flags);
+
   iree_status_t(IREE_API_PTR* queue_dispatch)(
       iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
       const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -688,38 +836,6 @@
 
 IREE_API_EXPORT void iree_hal_device_destroy(iree_hal_device_t* device);
 
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_fill(
-    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
-    const iree_hal_semaphore_list_t wait_semaphore_list,
-    const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, const void* pattern,
-    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
-
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
-    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
-    const iree_hal_semaphore_list_t wait_semaphore_list,
-    const iree_hal_semaphore_list_t signal_semaphore_list,
-    const void* source_buffer, iree_host_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, iree_hal_update_flags_t flags);
-
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
-    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
-    const iree_hal_semaphore_list_t wait_semaphore_list,
-    const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
-    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
-    iree_device_size_t length, iree_hal_copy_flags_t flags);
-
-IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_dispatch(
-    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
-    const iree_hal_semaphore_list_t wait_semaphore_list,
-    const iree_hal_semaphore_list_t signal_semaphore_list,
-    iree_hal_executable_t* executable, int32_t entry_point,
-    const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
-    const iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
index 5a29e02..432330e 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
@@ -822,10 +822,6 @@
   return iree_ok_status();
 }
 
-// GNUC #define's alloca!
-// <michael> Why should I change, they're the ones that suck! </michael>
-#undef alloca
-
 static iree_status_t iree_hal_amdgpu_logical_device_queue_alloca(
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
diff --git a/runtime/src/iree/hal/drivers/amdgpu/virtual_queue.h b/runtime/src/iree/hal/drivers/amdgpu/virtual_queue.h
index a119984..5c28e67 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/virtual_queue.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/virtual_queue.h
@@ -126,6 +126,10 @@
   const iree_hal_amdgpu_virtual_queue_vtable_t* vtable;
 } iree_hal_amdgpu_virtual_queue_t;
 
+// GNUC #define's alloca!
+// <michael> Why should I change, they're the ones that suck! </michael>
+#undef alloca
+
 typedef struct iree_hal_amdgpu_virtual_queue_vtable_t {
   // Deinitializes the queue on shutdown.
   void(IREE_API_PTR* deinitialize)(iree_hal_amdgpu_virtual_queue_t* queue);
diff --git a/runtime/src/iree/hal/drivers/cuda/BUILD.bazel b/runtime/src/iree/hal/drivers/cuda/BUILD.bazel
index 29cbec7..940d592 100644
--- a/runtime/src/iree/hal/drivers/cuda/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/cuda/BUILD.bazel
@@ -64,6 +64,8 @@
         "//runtime/src/iree/hal/utils:executable_debug_info",
         "//runtime/src/iree/hal/utils:file_transfer",
         "//runtime/src/iree/hal/utils:files",
+        "//runtime/src/iree/hal/utils:queue_emulation",
+        "//runtime/src/iree/hal/utils:queue_host_call_emulation",
         "//runtime/src/iree/hal/utils:resource_set",
         "//runtime/src/iree/hal/utils:semaphore_base",
         "//runtime/src/iree/hal/utils:stream_tracing",
diff --git a/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt b/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt
index 03ec177..4005932 100644
--- a/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt
@@ -61,6 +61,8 @@
     iree::hal::utils::executable_debug_info
     iree::hal::utils::file_transfer
     iree::hal::utils::files
+    iree::hal::utils::queue_emulation
+    iree::hal::utils::queue_host_call_emulation
     iree::hal::utils::resource_set
     iree::hal::utils::semaphore_base
     iree::hal::utils::stream_tracing
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
index 4459030..c2f2eb8 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
@@ -29,6 +29,8 @@
 #include "iree/hal/utils/deferred_work_queue.h"
 #include "iree/hal/utils/file_registry.h"
 #include "iree/hal/utils/file_transfer.h"
+#include "iree/hal/utils/queue_emulation.h"
+#include "iree/hal/utils/queue_host_call_emulation.h"
 #include "iree/hal/utils/stream_tracing.h"
 
 //===----------------------------------------------------------------------===//
@@ -1132,6 +1134,7 @@
     .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_cuda_device_queue_read,
     .queue_write = iree_hal_cuda_device_queue_write,
+    .queue_host_call = iree_hal_device_queue_emulated_host_call,
     .queue_dispatch = iree_hal_device_queue_emulated_dispatch,
     .queue_execute = iree_hal_cuda_device_queue_execute,
     .queue_flush = iree_hal_cuda_device_queue_flush,
diff --git a/runtime/src/iree/hal/drivers/hip/CMakeLists.txt b/runtime/src/iree/hal/drivers/hip/CMakeLists.txt
index dc93558..484eb98 100644
--- a/runtime/src/iree/hal/drivers/hip/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/hip/CMakeLists.txt
@@ -71,6 +71,8 @@
     iree::hal::utils::deferred_command_buffer
     iree::hal::utils::file_transfer
     iree::hal::utils::files
+    iree::hal::utils::queue_emulation
+    iree::hal::utils::queue_host_call_emulation
     iree::hal::utils::resource_set
     iree::hal::utils::semaphore_base
     iree::hal::utils::stream_tracing
diff --git a/runtime/src/iree/hal/drivers/hip/event_semaphore.c b/runtime/src/iree/hal/drivers/hip/event_semaphore.c
index 277c9e6..c223929 100644
--- a/runtime/src/iree/hal/drivers/hip/event_semaphore.c
+++ b/runtime/src/iree/hal/drivers/hip/event_semaphore.c
@@ -905,6 +905,8 @@
 
   if (iree_status_is_ok(status)) {
     semaphore->current_visible_value = new_value;
+    semaphore->max_value_to_be_signaled =
+        iree_max(new_value, semaphore->max_value_to_be_signaled);
   }
 
   iree_slim_mutex_unlock(&semaphore->mutex);
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index 4c26a28..d7a7dd1 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -33,6 +33,8 @@
 #include "iree/hal/utils/deferred_command_buffer.h"
 #include "iree/hal/utils/file_registry.h"
 #include "iree/hal/utils/file_transfer.h"
+#include "iree/hal/utils/queue_emulation.h"
+#include "iree/hal/utils/queue_host_call_emulation.h"
 #include "iree/hal/utils/stream_tracing.h"
 
 #define IREE_HAL_DEVICE_TRANSFER_DEFAULT_BUFFER_SIZE (128 * 1024 * 1024)
@@ -2701,6 +2703,7 @@
     .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_hip_device_queue_read,
     .queue_write = iree_hal_hip_device_queue_write,
+    .queue_host_call = iree_hal_device_queue_emulated_host_call,
     .queue_dispatch = iree_hal_device_queue_emulated_dispatch,
     .queue_execute = iree_hal_hip_device_queue_execute,
     .queue_flush = iree_hal_hip_device_queue_flush,
diff --git a/runtime/src/iree/hal/drivers/local_sync/BUILD.bazel b/runtime/src/iree/hal/drivers/local_sync/BUILD.bazel
index 5650640..dd86e71 100644
--- a/runtime/src/iree/hal/drivers/local_sync/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/local_sync/BUILD.bazel
@@ -38,6 +38,7 @@
         "//runtime/src/iree/hal/utils:deferred_command_buffer",
         "//runtime/src/iree/hal/utils:file_transfer",
         "//runtime/src/iree/hal/utils:files",
+        "//runtime/src/iree/hal/utils:queue_emulation",
         "//runtime/src/iree/hal/utils:semaphore_base",
     ],
 )
diff --git a/runtime/src/iree/hal/drivers/local_sync/CMakeLists.txt b/runtime/src/iree/hal/drivers/local_sync/CMakeLists.txt
index 5fc8af6..bca06c7 100644
--- a/runtime/src/iree/hal/drivers/local_sync/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/local_sync/CMakeLists.txt
@@ -35,6 +35,7 @@
     iree::hal::utils::deferred_command_buffer
     iree::hal::utils::file_transfer
     iree::hal::utils::files
+    iree::hal::utils::queue_emulation
     iree::hal::utils::semaphore_base
   PUBLIC
 )
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_device.c b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
index 7f4f211..7f5e2bb 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_device.c
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
@@ -20,6 +20,7 @@
 #include "iree/hal/utils/deferred_command_buffer.h"
 #include "iree/hal/utils/file_registry.h"
 #include "iree/hal/utils/file_transfer.h"
+#include "iree/hal/utils/queue_emulation.h"
 
 typedef struct iree_hal_sync_device_t {
   iree_hal_resource_t resource;
@@ -361,6 +362,55 @@
   return loop_status;
 }
 
+// Synchronously services a host call: waits on |wait_semaphore_list|, invokes
+// |call| inline on the calling thread, and resolves |signal_semaphore_list|
+// (signaled before the call when NON_BLOCKING, otherwise based on its result).
+static iree_status_t iree_hal_sync_device_queue_host_call(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_host_call_t call, const uint64_t args[4],
+    iree_hal_host_call_flags_t flags) {
+  // Wait for all dependencies.
+  IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
+                                                    iree_infinite_timeout()));
+
+  // If non-blocking then immediately signal the dependencies instead of letting
+  // the call do it. We don't expect this to allow more work to proceed in the
+  // sync device case _on this device_ but it may on others.
+  const bool is_nonblocking =
+      iree_any_bit_set(flags, IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING);
+  if (is_nonblocking) {
+    // NOTE: the signals can fail in which case we never perform the call.
+    // That's ok as failure to signal is considered a device-loss/death
+    // situation as there's no telling what has gone wrong.
+    IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list));
+  }
+
+  // Issue the call.
+  iree_hal_host_call_context_t context = {
+      .device = base_device,
+      .queue_affinity = queue_affinity,
+      .signal_semaphore_list = is_nonblocking ? iree_hal_semaphore_list_empty()
+                                              : signal_semaphore_list,
+  };
+  iree_status_t call_status = call.fn(call.user_data, args, &context);
+
+  if (is_nonblocking || iree_status_is_deferred(call_status)) {
+    // Fire-and-forget (already signaled above) or the callback will signal in
+    // the future. The status is not propagated in either case so it must be
+    // released here or any failure/deferred status storage would leak.
+    iree_status_ignore(call_status);
+    return iree_ok_status();
+  } else if (iree_status_is_ok(call_status)) {
+    // Call completed synchronously: signal on its behalf.
+    return iree_hal_semaphore_list_signal(signal_semaphore_list);
+  }
+  // Blocking call failed: fail all dependent semaphores to propagate the error.
+  iree_hal_semaphore_list_fail(signal_semaphore_list, call_status);
+  return iree_ok_status();
+}
+
 static iree_status_t iree_hal_sync_device_apply_deferred_command_buffer(
     iree_hal_sync_device_t* device, iree_hal_command_buffer_t* command_buffer,
     iree_hal_buffer_binding_table_t binding_table) {
@@ -508,6 +558,7 @@
     .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_sync_device_queue_read,
     .queue_write = iree_hal_sync_device_queue_write,
+    .queue_host_call = iree_hal_sync_device_queue_host_call,
     .queue_dispatch = iree_hal_device_queue_emulated_dispatch,
     .queue_execute = iree_hal_sync_device_queue_execute,
     .queue_flush = iree_hal_sync_device_queue_flush,
diff --git a/runtime/src/iree/hal/drivers/local_task/BUILD.bazel b/runtime/src/iree/hal/drivers/local_task/BUILD.bazel
index 48b78c8..3038016 100644
--- a/runtime/src/iree/hal/drivers/local_task/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/local_task/BUILD.bazel
@@ -50,6 +50,7 @@
         "//runtime/src/iree/hal/utils:deferred_command_buffer",
         "//runtime/src/iree/hal/utils:file_transfer",
         "//runtime/src/iree/hal/utils:files",
+        "//runtime/src/iree/hal/utils:queue_emulation",
         "//runtime/src/iree/hal/utils:resource_set",
         "//runtime/src/iree/hal/utils:semaphore_base",
         "//runtime/src/iree/task",
diff --git a/runtime/src/iree/hal/drivers/local_task/CMakeLists.txt b/runtime/src/iree/hal/drivers/local_task/CMakeLists.txt
index 042ff6b..c7f5a6e 100644
--- a/runtime/src/iree/hal/drivers/local_task/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/local_task/CMakeLists.txt
@@ -44,6 +44,7 @@
     iree::hal::utils::deferred_command_buffer
     iree::hal::utils::file_transfer
     iree::hal::utils::files
+    iree::hal::utils::queue_emulation
     iree::hal::utils::resource_set
     iree::hal::utils::semaphore_base
     iree::task
diff --git a/runtime/src/iree/hal/drivers/local_task/task_device.c b/runtime/src/iree/hal/drivers/local_task/task_device.c
index 45e4c8e..030e7e4 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_device.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_device.c
@@ -21,6 +21,7 @@
 #include "iree/hal/utils/deferred_command_buffer.h"
 #include "iree/hal/utils/file_registry.h"
 #include "iree/hal/utils/file_transfer.h"
+#include "iree/hal/utils/queue_emulation.h"
 
 typedef struct iree_hal_task_device_t {
   iree_hal_resource_t resource;
@@ -448,6 +449,20 @@
   return loop_status;
 }
 
+static iree_status_t iree_hal_task_device_queue_host_call(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_host_call_t call, const uint64_t args[4],
+    iree_hal_host_call_flags_t flags) {
+  iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
+  const iree_host_size_t queue_index = iree_hal_task_device_select_queue(
+      device, IREE_HAL_COMMAND_CATEGORY_ANY, queue_affinity);
+  return iree_hal_task_queue_submit_host_call(
+      &device->queues[queue_index], base_device, 1ull << queue_index,
+      wait_semaphore_list, signal_semaphore_list, call, args, flags);
+}
+
 static iree_status_t iree_hal_task_device_queue_execute(
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -457,7 +472,7 @@
     iree_hal_execute_flags_t flags) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
   // NOTE: today we are not discriminating queues based on command type.
-  iree_host_size_t queue_index = iree_hal_task_device_select_queue(
+  const iree_host_size_t queue_index = iree_hal_task_device_select_queue(
       device, IREE_HAL_COMMAND_CATEGORY_ANY, queue_affinity);
   if (command_buffer == NULL) {
     // Fast-path for barriers (fork/join/sequence).
@@ -542,6 +557,7 @@
     .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_task_device_queue_read,
     .queue_write = iree_hal_task_device_queue_write,
+    .queue_host_call = iree_hal_task_device_queue_host_call,
     .queue_dispatch = iree_hal_device_queue_emulated_dispatch,
     .queue_execute = iree_hal_task_device_queue_execute,
     .queue_flush = iree_hal_task_device_queue_flush,
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.c b/runtime/src/iree/hal/drivers/local_task/task_queue.c
index cdd4c22..9cb06ae 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.c
@@ -64,6 +64,11 @@
 static iree_status_t iree_hal_semaphore_list_clone(
     const iree_hal_semaphore_list_t* source_list, iree_arena_allocator_t* arena,
     iree_hal_semaphore_list_t* out_target_list) {
+  if (iree_hal_semaphore_list_is_empty(*source_list)) {
+    *out_target_list = iree_hal_semaphore_list_empty();
+    return iree_ok_status();
+  }
+
   iree_host_size_t semaphores_size =
       source_list->count * sizeof(out_target_list->semaphores[0]);
   iree_host_size_t payload_values_size =
@@ -86,12 +91,6 @@
   return iree_ok_status();
 }
 
-static void iree_hal_semaphore_list_release(iree_hal_semaphore_list_t* list) {
-  for (iree_host_size_t i = 0; i < list->count; ++i) {
-    iree_hal_semaphore_release(list->semaphores[i]);
-  }
-}
-
 //===----------------------------------------------------------------------===//
 // iree_hal_task_queue_wait_cmd_t
 //===----------------------------------------------------------------------===//
@@ -142,7 +141,7 @@
   iree_hal_task_queue_wait_cmd_t* cmd = (iree_hal_task_queue_wait_cmd_t*)task;
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  iree_hal_semaphore_list_release(&cmd->wait_semaphores);
+  iree_hal_semaphore_list_release(cmd->wait_semaphores);
 
   IREE_TRACE_ZONE_END(z0);
 }
@@ -394,13 +393,7 @@
   // Signal all semaphores to their new values.
   // Note that if any signal fails then the whole command will fail and all
   // semaphores will be signaled to the failure state.
-  iree_status_t status = iree_ok_status();
-  for (iree_host_size_t i = 0; i < cmd->signal_semaphores.count; ++i) {
-    status =
-        iree_hal_semaphore_signal(cmd->signal_semaphores.semaphores[i],
-                                  cmd->signal_semaphores.payload_values[i]);
-    if (IREE_UNLIKELY(!iree_status_is_ok(status))) break;
-  }
+  iree_status_t status = iree_hal_semaphore_list_signal(cmd->signal_semaphores);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
@@ -426,14 +419,12 @@
   // If the command failed then fail all semaphores to ensure future
   // submissions fail as well (including those on other queues).
   if (IREE_UNLIKELY(status_code != IREE_STATUS_OK)) {
-    for (iree_host_size_t i = 0; i < cmd->signal_semaphores.count; ++i) {
-      iree_hal_semaphore_fail(cmd->signal_semaphores.semaphores[i],
-                              iree_status_from_code(status_code));
-    }
+    iree_hal_semaphore_list_fail(cmd->signal_semaphores,
+                                 iree_status_from_code(status_code));
   }
 
   // Release all semaphores.
-  iree_hal_semaphore_list_release(&cmd->signal_semaphores);
+  iree_hal_semaphore_list_release(cmd->signal_semaphores);
 
   // Drop all memory used by the submission (**including cmd**).
   iree_arena_allocator_t arena = cmd->arena;
@@ -493,7 +484,167 @@
   } else {
     if (cmd) {
       iree_hal_resource_set_free(cmd->resource_set);
-      iree_hal_semaphore_list_release(&cmd->signal_semaphores);
+      iree_hal_semaphore_list_release(cmd->signal_semaphores);
+    }
+    iree_arena_deinitialize(&arena);
+  }
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_task_queue_host_call_cmd_t
+//===----------------------------------------------------------------------===//
+
+// Task to call a user-defined function with host call semantics.
+// This is an optimized version of the internal callback path that avoids the
+// retire command latency and the resource set allocation as host calls need
+// neither. Host calls also have a NON_BLOCKING mode that requires that we
+// signal _before_ calling the user function and that doesn't fit with the
+// normal wait-issue-execute-retire model.
+typedef struct iree_hal_task_queue_host_call_cmd_t {
+  // Call task for iree_hal_task_queue_host_call_cmd; task pointers cast to cmd.
+  iree_task_call_t task;
+
+  // Arena the command is allocated from; it is deinitialized from a stack copy
+  // by the cleanup function as that frees the command itself.
+  iree_arena_allocator_t arena;
+
+  // Device the call was scheduled on. Unowned.
+  iree_hal_device_t* device;
+  // Queue affinity as originally requested.
+  // We don't know where we'd actually run so we pass through without
+  // modification.
+  iree_hal_queue_affinity_t queue_affinity;
+  // Target function to call (invoked as call.fn with call.user_data).
+  iree_hal_host_call_t call;
+  // User arguments passed to the target function.
+  uint64_t args[4];
+  // Flags controlling call behavior (IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING).
+  iree_hal_host_call_flags_t flags;
+
+  // Semaphores to signal for the call; cloned on allocate, released in cleanup.
+  iree_hal_semaphore_list_t signal_semaphores;
+} iree_hal_task_queue_host_call_cmd_t;
+
+// Task entry point that issues a host call submission.
+// Blocking calls run the user function and then signal (or fail) the signal
+// semaphores unless the function defers completion; NON_BLOCKING calls signal
+// eagerly and then run the user function with an empty signal list.
+// Returns the result of signaling; user function failures are not returned.
+static iree_status_t iree_hal_task_queue_host_call_cmd(
+    void* user_context, iree_task_t* task,
+    iree_task_submission_t* pending_submission) {
+  iree_hal_task_queue_host_call_cmd_t* cmd =
+      (iree_hal_task_queue_host_call_cmd_t*)task;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // When non-blocking we eagerly signal all waiters prior to issuing the call.
+  // If any signal fails the whole command fails and the call is skipped.
+  const bool is_nonblocking =
+      iree_any_bit_set(cmd->flags, IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING);
+  iree_status_t status = iree_ok_status();
+  if (is_nonblocking) {
+    status = iree_hal_semaphore_list_signal(cmd->signal_semaphores);
+  }
+
+  // Issue the call.
+  if (iree_status_is_ok(status)) {
+    iree_hal_host_call_context_t context = {
+        .device = cmd->device,
+        .queue_affinity = cmd->queue_affinity,
+        .signal_semaphore_list = is_nonblocking
+                                     ? iree_hal_semaphore_list_empty()
+                                     : cmd->signal_semaphores,
+    };
+    iree_status_t call_status =
+        cmd->call.fn(cmd->call.user_data, cmd->args, &context);
+    if (is_nonblocking || iree_status_is_deferred(call_status)) {
+      // User callback will signal in the future (or they are fire-and-forget).
+      // Nobody else consumes the returned status so drop it here; otherwise a
+      // failing non-blocking call would leak its status storage.
+      iree_status_ignore(call_status);
+    } else if (iree_status_is_ok(call_status)) {
+      // Call completed synchronously; a failed signal fails the command.
+      status = iree_hal_semaphore_list_signal(cmd->signal_semaphores);
+    } else {
+      // Blocking call failed: hand call_status over to the semaphores.
+      iree_hal_semaphore_list_fail(cmd->signal_semaphores, call_status);
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Task cleanup for iree_hal_task_queue_host_call_cmd_t.
+// Propagates any failure to the signal semaphores, drops the references held
+// on them, and deinitializes the arena that holds the command itself.
+static void iree_hal_task_queue_host_call_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_task_queue_host_call_cmd_t* cmd =
+      (iree_hal_task_queue_host_call_cmd_t*)task;
+
+  // Snapshot what we need up front: dropping the arena drops all memory used
+  // by the submission (**including cmd**).
+  iree_hal_semaphore_list_t signal_semaphores = cmd->signal_semaphores;
+  iree_arena_allocator_t arena = cmd->arena;
+  cmd = NULL;
+
+  // A failed command fails all semaphores so that future submissions
+  // (including those on other queues) fail as well.
+  if (IREE_UNLIKELY(status_code != IREE_STATUS_OK)) {
+    iree_hal_semaphore_list_fail(signal_semaphores,
+                                 iree_status_from_code(status_code));
+  }
+  iree_hal_semaphore_list_release(signal_semaphores);
+  iree_arena_deinitialize(&arena);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Allocates and initializes a iree_hal_task_queue_host_call_cmd_t task.
+// The command will own an arena that can be used for other submission-related
+// allocations.
+static iree_status_t iree_hal_task_queue_host_call_cmd_allocate(
+    iree_task_scope_t* scope,
+    const iree_hal_semaphore_list_t* signal_semaphores,
+    iree_arena_block_pool_t* block_pool,
+    iree_hal_task_queue_host_call_cmd_t** out_cmd) {
+  // Make an arena we'll use for allocating the command itself.
+  iree_arena_allocator_t arena;
+  iree_arena_initialize(block_pool, &arena);
+
+  // Allocate the command from the arena.
+  iree_hal_task_queue_host_call_cmd_t* cmd = NULL;
+  iree_status_t status =
+      iree_arena_allocate(&arena, sizeof(*cmd), (void**)&cmd);
+  if (!iree_status_is_ok(status)) {
+    iree_arena_deinitialize(&arena);
+    return status;
+  }
+
+  iree_task_call_initialize(
+      scope, iree_task_make_call_closure(iree_hal_task_queue_host_call_cmd, 0),
+      &cmd->task);
+  iree_task_set_cleanup_fn(&cmd->task.header,
+                           iree_hal_task_queue_host_call_cmd_cleanup);
+  cmd->signal_semaphores = iree_hal_semaphore_list_empty();
+
+  // Clone the signal semaphores from the batch - we retain them and their
+  // payloads.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_semaphore_list_clone(signal_semaphores, &arena,
+                                           &cmd->signal_semaphores);
+  }
+
+  if (iree_status_is_ok(status)) {
+    // Transfer ownership of the arena to command.
+    memcpy(&cmd->arena, &arena, sizeof(cmd->arena));
+    *out_cmd = cmd;
+  } else {
+    if (cmd) {
+      iree_hal_semaphore_list_release(cmd->signal_semaphores);
     }
     iree_arena_deinitialize(&arena);
   }
@@ -698,10 +849,81 @@
   iree_status_t status = iree_hal_task_queue_submit(
       queue, wait_semaphores, signal_semaphores, resource_count, resources,
       iree_hal_task_queue_callback_cmd_allocate, &callback);
+  if (iree_status_is_ok(status)) {
+    iree_task_executor_flush(queue->executor);
+  }
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
+// Submits |call| to run once |wait_semaphores| are satisfied. |args| are
+// copied and |device| is stored unowned. On failure nothing is enqueued.
+iree_status_t iree_hal_task_queue_submit_host_call(
+    iree_hal_task_queue_t* queue, iree_hal_device_t* device,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_semaphore_list_t wait_semaphores,
+    iree_hal_semaphore_list_t signal_semaphores, iree_hal_host_call_t call,
+    const uint64_t args[4], iree_hal_host_call_flags_t flags) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Allocate the task that tracks the host call state and dependencies.
+  // NOTE: unlike most other submissions host calls do not use a retire command.
+  iree_hal_task_queue_host_call_cmd_t* call_cmd = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_task_queue_host_call_cmd_allocate(
+              &queue->scope, &signal_semaphores, queue->small_block_pool,
+              &call_cmd));
+  call_cmd->device = device;  // unowned
+  call_cmd->queue_affinity = queue_affinity;
+  call_cmd->call = call;
+  memcpy(call_cmd->args, args, sizeof(call_cmd->args));
+  call_cmd->flags = flags;
+
+  // A fence we'll use to detect when the entire submission has completed and
+  // to keep the scope live during the callback (even in non-blocking mode).
+  // TODO(benvanik): fold into the host call command.
+  iree_task_fence_t* fence = NULL;
+  iree_status_t status =
+      iree_task_executor_acquire_fence(queue->executor, &queue->scope, &fence);
+  if (iree_status_is_ok(status)) {
+    iree_task_set_completion_task(&call_cmd->task.header, &fence->header);
+  }
+
+  // Optional task that waits on any semaphore dependencies before the call.
+  iree_task_t* wait_task = NULL;
+  if (iree_status_is_ok(status) && wait_semaphores.count > 0) {
+    status = iree_hal_task_queue_wait_cmd_allocate(
+        &queue->scope, &wait_semaphores, &call_cmd->arena, &wait_task);
+  }
+
+  // Last chance for failure - from here on we are submitting.
+  if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+    // call_cmd lives inside its arena: free from a stack copy like cleanup.
+    iree_hal_semaphore_list_release(call_cmd->signal_semaphores);
+    iree_arena_allocator_t arena = call_cmd->arena;
+    iree_arena_deinitialize(&arena);
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  // Sequencing: wait on semaphores or go directly into the executor queue.
+  iree_task_submission_t submission;
+  iree_task_submission_initialize(&submission);
+  if (wait_task != NULL) {
+    // Ensure that we only issue the host call after all waits have completed.
+    iree_task_set_completion_task(wait_task, &call_cmd->task.header);
+    iree_task_submission_enqueue(&submission, wait_task);
+  } else {
+    iree_task_submission_enqueue(&submission, &call_cmd->task.header);
+  }
+
+  // Submit the tasks and flush the executor right after.
+  iree_task_executor_submit(queue->executor, &submission);
+  iree_task_executor_flush(queue->executor);
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
 iree_status_t iree_hal_task_queue_wait_idle(iree_hal_task_queue_t* queue,
                                             iree_timeout_t timeout) {
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.h b/runtime/src/iree/hal/drivers/local_task/task_queue.h
index 91065ff..d5daa5c 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.h
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.h
@@ -92,6 +92,13 @@
     iree_host_size_t resource_count, iree_hal_resource_t* const* resources,
     iree_task_call_closure_t callback);
 
+iree_status_t iree_hal_task_queue_submit_host_call(
+    iree_hal_task_queue_t* queue, iree_hal_device_t* device,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_semaphore_list_t wait_semaphores,
+    iree_hal_semaphore_list_t signal_semaphores, iree_hal_host_call_t call,
+    const uint64_t args[4], iree_hal_host_call_flags_t flags);
+
 iree_status_t iree_hal_task_queue_wait_idle(iree_hal_task_queue_t* queue,
                                             iree_timeout_t timeout);
 
diff --git a/runtime/src/iree/hal/drivers/local_task/task_semaphore.c b/runtime/src/iree/hal/drivers/local_task/task_semaphore.c
index 62cd17d..c745493 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_semaphore.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_semaphore.c
@@ -285,7 +285,8 @@
 
   iree_slim_mutex_lock(&semaphore->mutex);
 
-  if (!iree_status_is_ok(semaphore->failure_status)) {
+  if (semaphore->current_value == IREE_HAL_SEMAPHORE_FAILURE_VALUE ||
+      !iree_status_is_ok(semaphore->failure_status)) {
     // Fastest path: failed; return an error to tell callers to query for it.
     iree_slim_mutex_unlock(&semaphore->mutex);
     return iree_status_from_code(IREE_STATUS_ABORTED);
@@ -318,6 +319,20 @@
   }
   iree_event_pool_release(semaphore->event_pool, 1, &timepoint.event);
 
+  // Recheck conditions.
+  if (iree_status_is_ok(status)) {
+    iree_slim_mutex_lock(&semaphore->mutex);
+    if (semaphore->current_value == IREE_HAL_SEMAPHORE_FAILURE_VALUE ||
+        !iree_status_is_ok(semaphore->failure_status)) {
+      status = iree_status_from_code(IREE_STATUS_ABORTED);
+    } else if (semaphore->current_value >= value) {
+      status = iree_ok_status();
+    } else if (iree_timeout_is_immediate(timeout)) {
+      status = iree_status_from_code(IREE_STATUS_DEADLINE_EXCEEDED);
+    }
+    iree_slim_mutex_unlock(&semaphore->mutex);
+  }
+
   return status;
 }
 
diff --git a/runtime/src/iree/hal/drivers/metal/CMakeLists.txt b/runtime/src/iree/hal/drivers/metal/CMakeLists.txt
index 9331d2d..c2a991e 100644
--- a/runtime/src/iree/hal/drivers/metal/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/metal/CMakeLists.txt
@@ -43,6 +43,8 @@
     iree::hal::utils::executable_debug_info
     iree::hal::utils::file_transfer
     iree::hal::utils::files
+    iree::hal::utils::queue_emulation
+    iree::hal::utils::queue_host_call_emulation
     iree::hal::utils::resource_set
     iree::schemas::executable_debug_info_c_fbs
     iree::schemas::metal_executable_def_c_fbs
diff --git a/runtime/src/iree/hal/drivers/metal/metal_device.m b/runtime/src/iree/hal/drivers/metal/metal_device.m
index 33028dd..ac264b9 100644
--- a/runtime/src/iree/hal/drivers/metal/metal_device.m
+++ b/runtime/src/iree/hal/drivers/metal/metal_device.m
@@ -19,6 +19,8 @@
 #include "iree/hal/utils/deferred_command_buffer.h"
 #include "iree/hal/utils/file_registry.h"
 #include "iree/hal/utils/file_transfer.h"
+#include "iree/hal/utils/queue_emulation.h"
+#include "iree/hal/utils/queue_host_call_emulation.h"
 #include "iree/hal/utils/resource_set.h"
 
 typedef struct iree_hal_metal_device_t {
@@ -620,6 +622,7 @@
     .queue_copy = iree_hal_device_queue_emulated_copy,
     .queue_read = iree_hal_metal_device_queue_read,
     .queue_write = iree_hal_metal_device_queue_write,
+    .queue_host_call = iree_hal_device_queue_emulated_host_call,
     .queue_dispatch = iree_hal_device_queue_emulated_dispatch,
     .queue_execute = iree_hal_metal_device_queue_execute,
     .queue_flush = iree_hal_metal_device_queue_flush,
diff --git a/runtime/src/iree/hal/drivers/null/BUILD.bazel b/runtime/src/iree/hal/drivers/null/BUILD.bazel
index 5343034..b28ba50 100644
--- a/runtime/src/iree/hal/drivers/null/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/null/BUILD.bazel
@@ -45,6 +45,8 @@
         "//runtime/src/iree/hal",
         "//runtime/src/iree/hal/utils:file_transfer",
         "//runtime/src/iree/hal/utils:files",
+        "//runtime/src/iree/hal/utils:queue_emulation",
+        "//runtime/src/iree/hal/utils:queue_host_call_emulation",
         "//runtime/src/iree/hal/utils:semaphore_base",
     ],
 )
diff --git a/runtime/src/iree/hal/drivers/null/CMakeLists.txt b/runtime/src/iree/hal/drivers/null/CMakeLists.txt
index fa9f96a..dbdc925 100644
--- a/runtime/src/iree/hal/drivers/null/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/null/CMakeLists.txt
@@ -42,6 +42,8 @@
     iree::hal
     iree::hal::utils::file_transfer
     iree::hal::utils::files
+    iree::hal::utils::queue_emulation
+    iree::hal::utils::queue_host_call_emulation
     iree::hal::utils::semaphore_base
   PUBLIC
 )
diff --git a/runtime/src/iree/hal/drivers/null/device.c b/runtime/src/iree/hal/drivers/null/device.c
index 0d9cc72..9ddedc1 100644
--- a/runtime/src/iree/hal/drivers/null/device.c
+++ b/runtime/src/iree/hal/drivers/null/device.c
@@ -16,6 +16,8 @@
 #include "iree/hal/drivers/null/semaphore.h"
 #include "iree/hal/utils/file_registry.h"
 #include "iree/hal/utils/file_transfer.h"
+#include "iree/hal/utils/queue_emulation.h"
+#include "iree/hal/utils/queue_host_call_emulation.h"
 
 //===----------------------------------------------------------------------===//
 // iree_hal_null_device_options_t
@@ -459,6 +461,22 @@
   return loop_status;
 }
 
+static iree_status_t iree_hal_null_device_queue_host_call(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_host_call_t call, const uint64_t args[4],
+    iree_hal_host_call_flags_t flags) {
+  // TODO(null): prefer a native queue host call operation when the target has
+  // one. The emulation spawns a brand new thread for each requested host call
+  // which is horrendous; even without native support an implementation should
+  // do _anything_ better (polling threads, worker pools, etc).
+  iree_status_t status = iree_hal_device_queue_emulated_host_call(
+      base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      call, args, flags);
+  return status;
+}
+
 static iree_status_t iree_hal_null_device_queue_dispatch(
     iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
     const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -621,6 +639,7 @@
     .queue_copy = iree_hal_null_device_queue_copy,
     .queue_read = iree_hal_null_device_queue_read,
     .queue_write = iree_hal_null_device_queue_write,
+    .queue_host_call = iree_hal_null_device_queue_host_call,
     .queue_dispatch = iree_hal_null_device_queue_dispatch,
     .queue_execute = iree_hal_null_device_queue_execute,
     .queue_flush = iree_hal_null_device_queue_flush,
diff --git a/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel b/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel
index ef91748..15d7924 100644
--- a/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel
@@ -82,6 +82,8 @@
         "//runtime/src/iree/hal/utils:executable_debug_info",
         "//runtime/src/iree/hal/utils:file_transfer",
         "//runtime/src/iree/hal/utils:files",
+        "//runtime/src/iree/hal/utils:queue_emulation",
+        "//runtime/src/iree/hal/utils:queue_host_call_emulation",
         "//runtime/src/iree/hal/utils:resource_set",
         "//runtime/src/iree/hal/utils:semaphore_base",
         "//runtime/src/iree/schemas:executable_debug_info_c_fbs",
diff --git a/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt b/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt
index 7444ef3..0bd351a 100644
--- a/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt
@@ -77,6 +77,8 @@
     iree::hal::utils::executable_debug_info
     iree::hal::utils::file_transfer
     iree::hal::utils::files
+    iree::hal::utils::queue_emulation
+    iree::hal::utils::queue_host_call_emulation
     iree::hal::utils::resource_set
     iree::hal::utils::semaphore_base
     iree::schemas::executable_debug_info_c_fbs
diff --git a/runtime/src/iree/hal/drivers/vulkan/native_semaphore.cc b/runtime/src/iree/hal/drivers/vulkan/native_semaphore.cc
index 20ac806..2d51410 100644
--- a/runtime/src/iree/hal/drivers/vulkan/native_semaphore.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/native_semaphore.cc
@@ -254,13 +254,34 @@
   // TODO(benvanik): on success optimize this to notify of reaching the new
   // values instead of a full poll; it'll avoid a bunch of additional API
   // queries.
+  bool any_failed = false;
   for (iree_host_size_t i = 0; i < semaphore_list->count; ++i) {
+    // Query from Vulkan source-of-truth.
+    iree_hal_vulkan_native_semaphore_t* semaphore =
+        iree_hal_vulkan_native_semaphore_cast(semaphore_list->semaphores[i]);
+
     uint64_t value = 0;
-    iree_status_ignore(iree_hal_vulkan_native_semaphore_query(
-        semaphore_list->semaphores[i], &value));
+    IREE_RETURN_IF_ERROR(VK_RESULT_TO_STATUS(
+        semaphore->logical_device->syms()->vkGetSemaphoreCounterValue(
+            *semaphore->logical_device, semaphore->handle, &value),
+        "vkGetSemaphoreCounterValue"));
+    if (value >= IREE_HAL_SEMAPHORE_FAILURE_VALUE) {
+      any_failed = true;
+    }
+
+    // Notify timepoints on the query as we aren't notified by Vulkan when a
+    // device-side signal occurs. This helps us keep latencies lower by flushing
+    // timepoints without needing waits at the risk of making queries slower.
+    iree_hal_semaphore_notify(&semaphore->base, value,
+                              value < IREE_HAL_SEMAPHORE_FAILURE_VALUE
+                                  ? IREE_STATUS_OK
+                                  : IREE_STATUS_ABORTED);
   }
 
-  if (result == VK_SUCCESS) {
+  if (any_failed) {
+    return iree_make_status(IREE_STATUS_ABORTED,
+                            "one or more semaphores have failed");
+  } else if (result == VK_SUCCESS) {
     return iree_ok_status();
   } else if (result == VK_ERROR_DEVICE_LOST) {
     // Nothing we do now matters.
diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
index b9a42f7..dd6f24b 100644
--- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
@@ -33,6 +33,8 @@
 #include "iree/hal/utils/deferred_command_buffer.h"
 #include "iree/hal/utils/file_registry.h"
 #include "iree/hal/utils/file_transfer.h"
+#include "iree/hal/utils/queue_emulation.h"
+#include "iree/hal/utils/queue_host_call_emulation.h"
 
 using namespace iree::hal::vulkan;
 
@@ -1897,6 +1899,7 @@
     /*.queue_copy=*/iree_hal_device_queue_emulated_copy,
     /*.queue_read=*/iree_hal_vulkan_device_queue_read,
     /*.queue_write=*/iree_hal_vulkan_device_queue_write,
+    /*.queue_host_call=*/iree_hal_device_queue_emulated_host_call,
     /*.queue_dispatch=*/iree_hal_device_queue_emulated_dispatch,
     /*.queue_execute=*/iree_hal_vulkan_device_queue_execute,
     /*.queue_flush=*/iree_hal_vulkan_device_queue_flush,
diff --git a/runtime/src/iree/hal/semaphore.c b/runtime/src/iree/hal/semaphore.c
index 58da6ac..c29001c 100644
--- a/runtime/src/iree/hal/semaphore.c
+++ b/runtime/src/iree/hal/semaphore.c
@@ -200,6 +200,39 @@
 // iree_hal_semaphore_list_t
 //===----------------------------------------------------------------------===//
 
+IREE_API_EXPORT void iree_hal_semaphore_list_retain(
+    iree_hal_semaphore_list_t semaphore_list) {
+  // Retains every entry once, walking the list front to back.
+  iree_host_size_t i = 0, count = semaphore_list.count;
+  while (i < count) iree_hal_semaphore_retain(semaphore_list.semaphores[i++]);
+}
+
+IREE_API_EXPORT void iree_hal_semaphore_list_release(
+    iree_hal_semaphore_list_t semaphore_list) {
+  // Releases every entry once, walking the list front to back.
+  iree_host_size_t i = 0, count = semaphore_list.count;
+  while (i < count) iree_hal_semaphore_release(semaphore_list.semaphores[i++]);
+}
+
+IREE_API_EXPORT bool iree_hal_semaphore_list_poll(
+    iree_hal_semaphore_list_t semaphore_list) {
+  for (iree_host_size_t i = 0; i < semaphore_list.count; ++i) {
+    uint64_t observed = 0;
+    iree_status_t query_status =
+        iree_hal_semaphore_query(semaphore_list.semaphores[i], &observed);
+    if (iree_status_is_ok(query_status)) {
+      if (observed >= semaphore_list.payload_values[i]) continue;  // reached
+      return false;  // not yet reached
+    }
+    // NOTE: failures are unfortunately expensive here as the query returns a
+    // clone (or maybe the original!) status that we only discard. We rely on
+    // failures being exceptional to make this acceptable.
+    iree_status_ignore(query_status);
+    return false;
+  }
+  return true;
+}
+
 IREE_API_EXPORT iree_status_t
 iree_hal_semaphore_list_signal(iree_hal_semaphore_list_t semaphore_list) {
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/semaphore.h b/runtime/src/iree/hal/semaphore.h
index 1392762..b246e6a 100644
--- a/runtime/src/iree/hal/semaphore.h
+++ b/runtime/src/iree/hal/semaphore.h
@@ -444,6 +444,25 @@
   return list;
 }
 
+// Returns true when |semaphore_list| has a count of zero.
+static inline bool iree_hal_semaphore_list_is_empty(
+    iree_hal_semaphore_list_t semaphore_list) {
+  return !(semaphore_list.count > 0);
+}
+
+// Retains each semaphore in the semaphore list.
+IREE_API_EXPORT void iree_hal_semaphore_list_retain(
+    iree_hal_semaphore_list_t semaphore_list);
+
+// Releases each semaphore in the semaphore list.
+IREE_API_EXPORT void iree_hal_semaphore_list_release(
+    iree_hal_semaphore_list_t semaphore_list);
+
+// Returns true if all semaphores in the list have reached the specified payload
+// values and false otherwise (or if any have failed).
+IREE_API_EXPORT bool iree_hal_semaphore_list_poll(
+    iree_hal_semaphore_list_t semaphore_list);
+
 // Signals each semaphore in |semaphore_list| to the defined timepoint.
 IREE_API_EXPORT iree_status_t
 iree_hal_semaphore_list_signal(iree_hal_semaphore_list_t semaphore_list);
diff --git a/runtime/src/iree/hal/utils/BUILD.bazel b/runtime/src/iree/hal/utils/BUILD.bazel
index 61cd73d..309e19e 100644
--- a/runtime/src/iree/hal/utils/BUILD.bazel
+++ b/runtime/src/iree/hal/utils/BUILD.bazel
@@ -161,6 +161,17 @@
 )
 
 iree_runtime_cc_library(
+    name = "queue_emulation",
+    srcs = ["queue_emulation.c"],
+    hdrs = ["queue_emulation.h"],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/hal",
+    ],
+)
+
+iree_runtime_cc_library(
     name = "resource_set",
     srcs = ["resource_set.c"],
     hdrs = ["resource_set.h"],
@@ -256,3 +267,16 @@
         "//runtime/src/iree/hal",
     ],
 )
+
+iree_runtime_cc_library(
+    name = "queue_host_call_emulation",
+    srcs = ["queue_host_call_emulation.c"],
+    hdrs = ["queue_host_call_emulation.h"],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base/internal",
+        "//runtime/src/iree/base/internal:synchronization",
+        "//runtime/src/iree/base/internal:threading",
+        "//runtime/src/iree/hal",
+    ],
+)
diff --git a/runtime/src/iree/hal/utils/CMakeLists.txt b/runtime/src/iree/hal/utils/CMakeLists.txt
index e47c9ab..bc7bfb3 100644
--- a/runtime/src/iree/hal/utils/CMakeLists.txt
+++ b/runtime/src/iree/hal/utils/CMakeLists.txt
@@ -187,6 +187,20 @@
 
 iree_cc_library(
   NAME
+    queue_emulation
+  HDRS
+    "queue_emulation.h"
+  SRCS
+    "queue_emulation.c"
+  DEPS
+    iree::base
+    iree::base::internal
+    iree::hal
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
     resource_set
   HDRS
     "resource_set.h"
@@ -293,4 +307,20 @@
   PUBLIC
 )
 
+iree_cc_library(
+  NAME
+    queue_host_call_emulation
+  HDRS
+    "queue_host_call_emulation.h"
+  SRCS
+    "queue_host_call_emulation.c"
+  DEPS
+    iree::base
+    iree::base::internal
+    iree::base::internal::synchronization
+    iree::base::internal::threading
+    iree::hal
+  PUBLIC
+)
+
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/hal/utils/queue_emulation.c b/runtime/src/iree/hal/utils/queue_emulation.c
new file mode 100644
index 0000000..aa88ea6
--- /dev/null
+++ b/runtime/src/iree/hal/utils/queue_emulation.c
@@ -0,0 +1,222 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/utils/queue_emulation.h"
+
+//===----------------------------------------------------------------------===//
+// Emulated Queue Operations
+//===----------------------------------------------------------------------===//
+
+// Emulates a queue fill by executing a one-shot transfer command buffer.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_fill(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_ASSERT_ARGUMENT(pattern);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
+
+  iree_hal_transfer_command_t fill_command = {
+      .type = IREE_HAL_TRANSFER_COMMAND_TYPE_FILL,
+      .fill =
+          {
+              .target_buffer = target_buffer,
+              .target_offset = target_offset,
+              .length = length,
+              .pattern = pattern,
+              .pattern_length = pattern_length,
+          },
+  };
+
+  // With nothing to wait on execution starts immediately and allowing inline
+  // command buffer execution reduces latency.
+  const bool start_immediately = wait_semaphore_list.count == 0;
+  const iree_hal_command_buffer_mode_t mode =
+      start_immediately ? (IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
+                           IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)
+                        : IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
+
+  iree_hal_command_buffer_t* transfer_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_create_transfer_command_buffer(device, mode, queue_affinity,
+                                                  1, &fill_command,
+                                                  &transfer_buffer));
+
+  iree_status_t status = iree_hal_device_queue_execute(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      transfer_buffer, iree_hal_buffer_binding_table_empty(),
+      IREE_HAL_EXECUTE_FLAG_NONE);
+  iree_hal_command_buffer_release(transfer_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Emulates a queue update by executing a one-shot transfer command buffer.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+
+  // TODO(benvanik): support splitting the update into multiple chunks to fit
+  // under the max command buffer update size limit. This provisional API is
+  // intended only for updating dispatch parameters today.
+  // NOTE: checked before the trace zone opens so this return stays balanced.
+  if (length > UINT16_MAX) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "queue buffer updates currently limited to 64KB, "
+                            "tried to update %" PRIdsz " bytes",
+                            length);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
+  // If we are starting execution immediately then we can reduce latency by
+  // allowing inline command buffer execution.
+  iree_hal_command_buffer_mode_t command_buffer_mode =
+      IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
+  if (wait_semaphore_list.count == 0) {
+    command_buffer_mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
+  }
+
+  iree_hal_transfer_command_t command = {
+      .type = IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE,
+      .update =
+          {
+              .source_buffer = source_buffer,
+              .source_offset = source_offset,
+              .target_buffer = target_buffer,
+              .target_offset = target_offset,
+              .length = length,
+          },
+  };
+
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_create_transfer_command_buffer(device, command_buffer_mode,
+                                                  queue_affinity, 1, &command,
+                                                  &command_buffer));
+
+  iree_status_t status = iree_hal_device_queue_execute(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      command_buffer, iree_hal_buffer_binding_table_empty(),
+      IREE_HAL_EXECUTE_FLAG_NONE);
+  iree_hal_command_buffer_release(command_buffer);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Emulates a queue copy by executing a one-shot transfer command buffer.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_copy_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
+
+  iree_hal_transfer_command_t copy_command = {
+      .type = IREE_HAL_TRANSFER_COMMAND_TYPE_COPY,
+      .copy =
+          {
+              .source_buffer = source_buffer,
+              .source_offset = source_offset,
+              .target_buffer = target_buffer,
+              .target_offset = target_offset,
+              .length = length,
+          },
+  };
+
+  // With nothing to wait on execution starts immediately and allowing inline
+  // command buffer execution reduces latency.
+  const bool start_immediately = wait_semaphore_list.count == 0;
+  const iree_hal_command_buffer_mode_t mode =
+      start_immediately ? (IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
+                           IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)
+                        : IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
+
+  iree_hal_command_buffer_t* transfer_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_create_transfer_command_buffer(device, mode, queue_affinity,
+                                                  1, &copy_command,
+                                                  &transfer_buffer));
+
+  iree_status_t status = iree_hal_device_queue_execute(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      transfer_buffer, iree_hal_buffer_binding_table_empty(),
+      IREE_HAL_EXECUTE_FLAG_NONE);
+  iree_hal_command_buffer_release(transfer_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_dispatch(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
+    const iree_hal_buffer_ref_list_t bindings,
+    iree_hal_dispatch_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(executable);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Nothing to wait on: allow inline execution to reduce latency.
+  const iree_hal_command_buffer_mode_t mode =
+      wait_semaphore_list.count == 0
+          ? (IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
+             IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)
+          : IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
+
+  // The dispatch is recorded into a one-shot dispatch command buffer.
+  iree_hal_command_buffer_t* recording = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_command_buffer_create(
+              device, mode, IREE_HAL_COMMAND_CATEGORY_DISPATCH, queue_affinity,
+              /*binding_capacity=*/0, &recording));
+
+  // Record begin -> dispatch -> end and then submit; the first failure stops
+  // the sequence and is the status that gets returned.
+  iree_status_t result = iree_hal_command_buffer_begin(recording);
+  if (iree_status_is_ok(result)) {
+    result = iree_hal_command_buffer_dispatch(
+        recording, executable, entry_point, config, constants, bindings,
+        flags);
+  }
+  if (iree_status_is_ok(result)) {
+    result = iree_hal_command_buffer_end(recording);
+  }
+  if (iree_status_is_ok(result)) {
+    result = iree_hal_device_queue_execute(
+        device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+        recording, iree_hal_buffer_binding_table_empty(),
+        IREE_HAL_EXECUTE_FLAG_NONE);
+  }
+
+  // Our reference is dropped whether or not recording/submission succeeded.
+  iree_hal_command_buffer_release(recording);
+
+  IREE_TRACE_ZONE_END(z0);
+  return result;
+}
diff --git a/runtime/src/iree/hal/utils/queue_emulation.h b/runtime/src/iree/hal/utils/queue_emulation.h
new file mode 100644
index 0000000..2854b4f
--- /dev/null
+++ b/runtime/src/iree/hal/utils/queue_emulation.h
@@ -0,0 +1,57 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_UTILS_QUEUE_EMULATION_H_
+#define IREE_HAL_UTILS_QUEUE_EMULATION_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Emulated Queue Operations
+//===----------------------------------------------------------------------===//
+// Emulated queue fill. NOTE(review): definition not in view; confirm.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_fill(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
+// Emulated queue update; lengths above UINT16_MAX return UNIMPLEMENTED.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags);
+// Emulated queue copy: one-shot transfer command buffer + queue_execute.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_copy_flags_t flags);
+// Emulated queue dispatch: one-shot command buffer + queue_execute.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_dispatch(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_executable_t* executable, int32_t entry_point,
+    const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
+    const iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_UTILS_QUEUE_EMULATION_H_
diff --git a/runtime/src/iree/hal/utils/queue_host_call_emulation.c b/runtime/src/iree/hal/utils/queue_host_call_emulation.c
new file mode 100644
index 0000000..8241c90
--- /dev/null
+++ b/runtime/src/iree/hal/utils/queue_host_call_emulation.c
@@ -0,0 +1,242 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/utils/queue_host_call_emulation.h"
+
+#if IREE_THREADING_ENABLE
+
+#include "iree/base/internal/threading.h"
+
+//===----------------------------------------------------------------------===//
+// Emulated Host Call
+//===----------------------------------------------------------------------===//
+
+// Issues the host call on the calling thread and signals the semaphore list.
+// Returns errors only if signaling fails; user call errors are propagated to
+// the semaphore list (blocking) or dropped (non-blocking).
+static iree_status_t iree_hal_emulated_host_call_issue(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_host_call_t call, const uint64_t args[4],
+    iree_hal_host_call_flags_t flags) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Non-blocking mode signals the semaphore list first.
+  const bool is_nonblocking =
+      iree_any_bit_set(flags, IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING);
+  if (is_nonblocking) {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_semaphore_list_signal(signal_semaphore_list));
+  }
+
+  // Call the user function.
+  iree_hal_host_call_context_t context = {
+      .device = device,
+      .queue_affinity = queue_affinity,
+      .signal_semaphore_list = is_nonblocking ? iree_hal_semaphore_list_empty()
+                                              : signal_semaphore_list,
+  };
+  iree_status_t call_status = call.fn(call.user_data, args, &context);
+
+  // Only signaling failures are returned to the caller.
+  iree_status_t status = iree_ok_status();
+  if (is_nonblocking || iree_status_is_deferred(call_status)) {
+    // Fire-and-forget, or the callback will signal later. The call status
+    // has no consumer: drop it so a non-blocking failure is not leaked.
+    iree_status_ignore(call_status);
+  } else if (iree_status_is_ok(call_status)) {
+    // Callback completed synchronously; report signaling failures.
+    status = iree_hal_semaphore_list_signal(signal_semaphore_list);
+  } else {
+    // Blocking call failed: propagate the error to the semaphore list, which
+    // takes ownership of the status.
+    iree_hal_semaphore_list_fail(signal_semaphore_list, call_status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Heap-allocated state for an in-flight host call; freed by the waiter thread.
+typedef struct iree_hal_emulated_host_call_state_t {
+  // Device the call was scheduled on. Unowned.
+  iree_hal_device_t* device;
+  // Queue affinity as originally requested.
+  // We don't know where we'd actually run so we pass through without
+  // modification.
+  iree_hal_queue_affinity_t queue_affinity;
+  // Transient waiter thread; it releases this handle itself before exiting.
+  iree_thread_t* thread;
+  // Target function to call.
+  iree_hal_host_call_t call;
+  // User arguments, copied from the caller when the call is enqueued.
+  uint64_t args[4];
+  // Flags controlling call behavior.
+  iree_hal_host_call_flags_t flags;
+  // Wait semaphores (retained); arrays live in this allocation's tail.
+  iree_hal_semaphore_list_t wait_semaphore_list;
+  // Signal semaphores (retained); arrays live in this allocation's tail.
+  iree_hal_semaphore_list_t signal_semaphore_list;
+} iree_hal_emulated_host_call_state_t;
+
+// Waits, calls, and signals a host call.
+// Resources will be released and the state will be deallocated prior to
+// returning.
+static int iree_hal_emulated_host_call_main(void* entry_arg) {
+  iree_hal_emulated_host_call_state_t* state =
+      (iree_hal_emulated_host_call_state_t*)entry_arg;
+
+  // Wait for all semaphores to be reached.
+  iree_status_t status = iree_hal_semaphore_list_wait(
+      state->wait_semaphore_list, iree_infinite_timeout());
+
+  // Release wait semaphores early.
+  iree_hal_semaphore_list_release(state->wait_semaphore_list);
+
+  // If non-blocking then immediately signal the dependencies instead of letting
+  // the call do it. If there's dependent work in the queue it should be able to
+  // progress after this point regardless of how long the host call takes.
+  // This only happens when the wait succeeded: a failed wait must not be
+  // overwritten (leaking it) nor reported to dependents as success.
+  const bool is_nonblocking =
+      iree_any_bit_set(state->flags, IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING);
+  if (is_nonblocking && iree_status_is_ok(status)) {
+    // NOTE: the signals can fail in which case we never perform the call.
+    // That's ok as failure to signal is considered a device-loss/death
+    // situation as there's no telling what has gone wrong.
+    status = iree_hal_semaphore_list_signal(state->signal_semaphore_list);
+  }
+
+  // Issue the call.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_emulated_host_call_issue(
+        state->device, state->queue_affinity,
+        is_nonblocking ? iree_hal_semaphore_list_empty()
+                       : state->signal_semaphore_list,
+        state->call, state->args, state->flags);
+  }
+
+  // If anything (wait, call, or signal) failed we need to fail all dependent
+  // semaphores to propagate the error.
+  if (!iree_status_is_ok(status)) {
+    // Transfers status ownership.
+    iree_hal_semaphore_list_fail(state->signal_semaphore_list, status);
+  }
+  // NOTE: on failure status ownership now belongs to the semaphore list.
+
+  // Release signal semaphores.
+  iree_hal_semaphore_list_release(state->signal_semaphore_list);
+
+  // Deallocate state (note that we must take the thread handle locally).
+  iree_allocator_t host_allocator =
+      iree_hal_device_host_allocator(state->device);
+  iree_thread_t* thread = state->thread;
+  iree_allocator_free(host_allocator, state);
+
+  // Release the thread and return.
+  iree_thread_release(thread);
+  return 0;
+}
+
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_host_call(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_host_call_t call, const uint64_t args[4],
+    iree_hal_host_call_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // If there are no wait semaphores, or the poll returns true (presumably all
+  // already reached - confirm), issue the call from the calling thread now.
+  // The non-blocking flag is still honored there by signaling early when set.
+  if (wait_semaphore_list.count == 0 ||
+      iree_hal_semaphore_list_poll(wait_semaphore_list)) {
+    iree_status_t status = iree_hal_emulated_host_call_issue(
+        device, queue_affinity, signal_semaphore_list, call, args, flags);
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  // Single allocation holding the state and both semaphore lists:
+  //   [state][wait+signal semaphore pointers][wait+signal payload values]
+  iree_hal_emulated_host_call_state_t* state = NULL;
+  const iree_host_size_t semaphore_list_size = iree_host_align(
+      (wait_semaphore_list.count + signal_semaphore_list.count) *
+          sizeof(iree_hal_semaphore_t*),
+      iree_max_align_t);
+  const iree_host_size_t payload_list_size = iree_host_align(
+      (wait_semaphore_list.count + signal_semaphore_list.count) *
+          sizeof(uint64_t),
+      iree_max_align_t);
+  const iree_host_size_t total_length =
+      iree_host_align(sizeof(*state), iree_max_align_t) + semaphore_list_size +
+      payload_list_size;
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(device);
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, total_length, (void**)&state));
+
+  state->device = device;
+  state->queue_affinity = queue_affinity;
+  state->call = call;
+  memcpy(state->args, args, sizeof(state->args));
+  state->flags = flags;
+
+  uint8_t* state_ptr =
+      (uint8_t*)state + iree_host_align(sizeof(*state), iree_max_align_t);
+  iree_hal_semaphore_t** semaphore_list_ptr = (iree_hal_semaphore_t**)state_ptr;
+  state_ptr += semaphore_list_size;
+  uint64_t* payload_list_ptr = (uint64_t*)state_ptr;
+  state_ptr += payload_list_size;
+
+  state->wait_semaphore_list.count = wait_semaphore_list.count;
+  state->wait_semaphore_list.semaphores = semaphore_list_ptr;
+  state->wait_semaphore_list.payload_values = payload_list_ptr;
+  memcpy(state->wait_semaphore_list.semaphores, wait_semaphore_list.semaphores,
+         wait_semaphore_list.count * sizeof(*semaphore_list_ptr));
+  memcpy(state->wait_semaphore_list.payload_values,
+         wait_semaphore_list.payload_values,
+         wait_semaphore_list.count * sizeof(*payload_list_ptr));
+  iree_hal_semaphore_list_retain(state->wait_semaphore_list);
+
+  state->signal_semaphore_list.count = signal_semaphore_list.count;
+  state->signal_semaphore_list.semaphores =
+      semaphore_list_ptr + wait_semaphore_list.count;
+  state->signal_semaphore_list.payload_values =
+      payload_list_ptr + wait_semaphore_list.count;
+  memcpy(state->signal_semaphore_list.semaphores,
+         signal_semaphore_list.semaphores,
+         signal_semaphore_list.count * sizeof(*semaphore_list_ptr));
+  memcpy(state->signal_semaphore_list.payload_values,
+         signal_semaphore_list.payload_values,
+         signal_semaphore_list.count * sizeof(*payload_list_ptr));
+  iree_hal_semaphore_list_retain(state->signal_semaphore_list);
+
+  // Launch the thread that waits, issues the call, and frees the state.
+  const iree_thread_create_params_t thread_params = {
+      .name = iree_make_cstring_view("iree-hal-host-call"),
+      .stack_size = 0,  // default
+      .create_suspended = false,
+      .priority_class = IREE_THREAD_PRIORITY_CLASS_HIGH,
+  };
+  iree_status_t status =
+      iree_thread_create(iree_hal_emulated_host_call_main, state, thread_params,
+                         host_allocator, &state->thread);
+
+  // NOTE: if thread creation fails we never enqueued the waits and thus can
+  // treat the failure like a failure to enqueue. We need to clean up the state
+  // but do not need to signal dependencies as failures.
+  if (!iree_status_is_ok(status)) {
+    iree_hal_semaphore_list_release(state->wait_semaphore_list);
+    iree_hal_semaphore_list_release(state->signal_semaphore_list);
+    iree_allocator_free(host_allocator, state);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+#endif  // IREE_THREADING_ENABLE
diff --git a/runtime/src/iree/hal/utils/queue_host_call_emulation.h b/runtime/src/iree/hal/utils/queue_host_call_emulation.h
new file mode 100644
index 0000000..45201d8
--- /dev/null
+++ b/runtime/src/iree/hal/utils/queue_host_call_emulation.h
@@ -0,0 +1,34 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_UTILS_QUEUE_HOST_CALL_EMULATION_H_
+#define IREE_HAL_UTILS_QUEUE_HOST_CALL_EMULATION_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Emulated Host Call
+//===----------------------------------------------------------------------===//
+// Declared only when IREE_THREADING_ENABLE is set (waits run on a thread).
+#if IREE_THREADING_ENABLE
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_host_call(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    iree_hal_host_call_t call, const uint64_t args[4],
+    iree_hal_host_call_flags_t flags);
+#endif  // IREE_THREADING_ENABLE
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_UTILS_QUEUE_HOST_CALL_EMULATION_H_