Merge pull request #9698 from iree-org/benvanik-device-cleanup

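Cleaning up some iree_hal_device_t methods: removes iree_hal_device_submit_and_wait (API and vtable entry) and makes iree_hal_device_transfer_and_wait a file-local utility in buffer_transfer.c.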
diff --git a/experimental/rocm/rocm_device.c b/experimental/rocm/rocm_device.c
index 7ddd495..1dee08c 100644
--- a/experimental/rocm/rocm_device.c
+++ b/experimental/rocm/rocm_device.c
@@ -270,21 +270,6 @@
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_rocm_device_submit_and_wait(
-    iree_hal_device_t* base_device,
-    iree_hal_command_category_t command_categories,
-    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches,
-    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-    iree_timeout_t timeout) {
-  // Submit...
-  IREE_RETURN_IF_ERROR(iree_hal_rocm_device_queue_submit(
-      base_device, command_categories, queue_affinity, batch_count, batches));
-
-  // ...and wait.
-  return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
-}
-
 static iree_status_t iree_hal_rocm_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
@@ -323,7 +308,6 @@
         iree_hal_rocm_device_query_semaphore_compatibility,
     .transfer_range = iree_hal_device_submit_transfer_range_and_wait,
     .queue_submit = iree_hal_rocm_device_queue_submit,
-    .submit_and_wait = iree_hal_rocm_device_submit_and_wait,
     .wait_semaphores = iree_hal_rocm_device_wait_semaphores,
     .wait_idle = iree_hal_rocm_device_wait_idle,
 };
diff --git a/runtime/src/iree/hal/device.c b/runtime/src/iree/hal/device.c
index 7cad656..06f9142 100644
--- a/runtime/src/iree/hal/device.c
+++ b/runtime/src/iree/hal/device.c
@@ -152,72 +152,6 @@
       flags, timeout);
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_device_transfer_and_wait(
-    iree_hal_device_t* device, iree_hal_semaphore_t* wait_semaphore,
-    uint64_t wait_value, iree_host_size_t transfer_count,
-    const iree_hal_transfer_command_t* transfer_commands,
-    iree_timeout_t timeout) {
-  IREE_ASSERT_ARGUMENT(device);
-  IREE_ASSERT_ARGUMENT(!transfer_count || transfer_commands);
-  IREE_TRACE_ZONE_BEGIN(z0);
-
-  // We only want to allow inline execution if we have not been instructed to
-  // wait on a semaphore and it hasn't yet been signaled.
-  iree_hal_command_buffer_mode_t mode = IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
-  if (wait_semaphore) {
-    uint64_t current_value = 0ull;
-    IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0, iree_hal_semaphore_query(wait_semaphore, &current_value));
-    if (current_value >= wait_value) {
-      mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
-    }
-  } else {
-    mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
-  }
-
-  // Create a command buffer performing all of the transfer operations.
-  iree_hal_command_buffer_t* command_buffer = NULL;
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_create_transfer_command_buffer(
-              device, mode, IREE_HAL_QUEUE_AFFINITY_ANY, transfer_count,
-              transfer_commands, &command_buffer));
-
-  // Perform a full submit-and-wait. On devices with multiple queues this can
-  // run out-of-order/overlapped with other work and return earlier than device
-  // idle.
-  iree_hal_semaphore_t* fence_semaphore = NULL;
-  iree_status_t status =
-      iree_hal_semaphore_create(device, 0ull, &fence_semaphore);
-  uint64_t signal_value = 1ull;
-  if (iree_status_is_ok(status)) {
-    iree_hal_submission_batch_t batch = {
-        .wait_semaphores =
-            {
-                .count = wait_semaphore != NULL ? 1 : 0,
-                .semaphores = &wait_semaphore,
-                .payload_values = &wait_value,
-            },
-        .command_buffer_count = 1,
-        .command_buffers = &command_buffer,
-        .signal_semaphores =
-            {
-                .count = 1,
-                .semaphores = &fence_semaphore,
-                .payload_values = &signal_value,
-            },
-    };
-    status = iree_hal_device_submit_and_wait(
-        device, IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_HAL_QUEUE_AFFINITY_ANY,
-        1, &batch, fence_semaphore, signal_value, timeout);
-  }
-
-  iree_hal_command_buffer_release(command_buffer);
-  iree_hal_semaphore_release(fence_semaphore);
-
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
 // Validates that the submission is well-formed.
 static iree_status_t iree_hal_device_validate_submission(
     iree_host_size_t batch_count, const iree_hal_submission_batch_t* batches) {
@@ -256,24 +190,6 @@
   return status;
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_device_submit_and_wait(
-    iree_hal_device_t* device, iree_hal_command_category_t command_categories,
-    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches,
-    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-    iree_timeout_t timeout) {
-  IREE_ASSERT_ARGUMENT(device);
-  IREE_ASSERT_ARGUMENT(!batch_count || batches);
-  IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_device_validate_submission(batch_count, batches));
-  iree_status_t status = _VTABLE_DISPATCH(device, submit_and_wait)(
-      device, command_categories, queue_affinity, batch_count, batches,
-      wait_semaphore, wait_value, timeout);
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
 IREE_API_EXPORT iree_status_t iree_hal_device_wait_semaphores(
     iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
diff --git a/runtime/src/iree/hal/device.h b/runtime/src/iree/hal/device.h
index c5b53f8..97568e4 100644
--- a/runtime/src/iree/hal/device.h
+++ b/runtime/src/iree/hal/device.h
@@ -302,30 +302,6 @@
     iree_device_size_t target_offset, iree_device_size_t data_length,
     iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout);
 
-// Synchronously executes one or more transfer operations against a queue.
-// All buffers must be compatible with |device| and ranges must not overlap
-// (same as with memcpy).
-//
-// This is a blocking operation and may incur significant overheads as
-// internally it issues a command buffer with the transfer operations and waits
-// for it to complete. Users should do that themselves so that the work can be
-// issued concurrently and batched effectively. This is only useful as a
-// fallback for implementations that require it or tools where things like I/O
-// are transferred without worrying about performance. When submitting other
-// work it's preferable to use iree_hal_create_transfer_command_buffer and a
-// normal queue submission that allows for more fine-grained sequencing and
-// amortizes the submission cost by batching other work.
-//
-// The transfer will begin after the optional |wait_semaphore| reaches
-// |wait_value|. Behavior is undefined if no semaphore is provided and there are
-// in-flight operations concurrently using the buffer ranges.
-// Returns only after all transfers have completed and been flushed.
-IREE_API_EXPORT iree_status_t iree_hal_device_transfer_and_wait(
-    iree_hal_device_t* device, iree_hal_semaphore_t* wait_semaphore,
-    uint64_t wait_value, iree_host_size_t transfer_count,
-    const iree_hal_transfer_command_t* transfer_commands,
-    iree_timeout_t timeout);
-
 // Submits one or more batches of work to a device queue.
 //
 // The queue is selected based on the flags set in |command_categories| and the
@@ -346,23 +322,6 @@
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
     const iree_hal_submission_batch_t* batches);
 
-// Submits batches of work and waits until |wait_semaphore| reaches or exceeds
-// |wait_value|.
-//
-// This is equivalent to following iree_hal_device_queue_submit with a
-// iree_hal_semaphore_wait on |wait_timeout|/|wait_value| but
-// may help to reduce overhead by preventing thread wakeups, kernel calls, and
-// internal tracking.
-//
-// See iree_hal_device_queue_submit for more information about the queuing
-// behavior and iree_hal_semaphore_wait for the waiting  behavior.
-IREE_API_EXPORT iree_status_t iree_hal_device_submit_and_wait(
-    iree_hal_device_t* device, iree_hal_command_category_t command_categories,
-    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches,
-    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-    iree_timeout_t timeout);
-
 // Blocks the caller until the semaphores reach or exceed the specified payload
 // values or the |timeout| elapses. All semaphores in |semaphore_list| must be
 // created from this device (or be imported into it).
@@ -469,13 +428,6 @@
       iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
       const iree_hal_submission_batch_t* batches);
 
-  iree_status_t(IREE_API_PTR* submit_and_wait)(
-      iree_hal_device_t* device, iree_hal_command_category_t command_categories,
-      iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
-      const iree_hal_submission_batch_t* batches,
-      iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-      iree_timeout_t timeout);
-
   iree_status_t(IREE_API_PTR* wait_semaphores)(
       iree_hal_device_t* device, iree_hal_wait_mode_t wait_mode,
       const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout);
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
index 6fe8f13..99a2e32 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
@@ -357,21 +357,6 @@
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_cuda_device_submit_and_wait(
-    iree_hal_device_t* base_device,
-    iree_hal_command_category_t command_categories,
-    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches,
-    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-    iree_timeout_t timeout) {
-  // Submit...
-  IREE_RETURN_IF_ERROR(iree_hal_cuda_device_queue_submit(
-      base_device, command_categories, queue_affinity, batch_count, batches));
-
-  // ...and wait.
-  return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
-}
-
 static iree_status_t iree_hal_cuda_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
@@ -410,7 +395,6 @@
         iree_hal_cuda_device_query_semaphore_compatibility,
     .transfer_range = iree_hal_device_submit_transfer_range_and_wait,
     .queue_submit = iree_hal_cuda_device_queue_submit,
-    .submit_and_wait = iree_hal_cuda_device_submit_and_wait,
     .wait_semaphores = iree_hal_cuda_device_wait_semaphores,
     .wait_idle = iree_hal_cuda_device_wait_idle,
 };
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_device.c b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
index a59f5c7..8390e62 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_device.c
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
@@ -277,21 +277,6 @@
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_sync_device_submit_and_wait(
-    iree_hal_device_t* base_device,
-    iree_hal_command_category_t command_categories,
-    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches,
-    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-    iree_timeout_t timeout) {
-  // Submit...
-  IREE_RETURN_IF_ERROR(iree_hal_sync_device_queue_submit(
-      base_device, command_categories, queue_affinity, batch_count, batches));
-
-  // ...and wait.
-  return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
-}
-
 static iree_status_t iree_hal_sync_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
@@ -327,7 +312,6 @@
         iree_hal_sync_device_query_semaphore_compatibility,
     .transfer_range = iree_hal_device_transfer_mappable_range,
     .queue_submit = iree_hal_sync_device_queue_submit,
-    .submit_and_wait = iree_hal_sync_device_submit_and_wait,
     .wait_semaphores = iree_hal_sync_device_wait_semaphores,
     .wait_idle = iree_hal_sync_device_wait_idle,
 };
diff --git a/runtime/src/iree/hal/drivers/local_task/task_device.c b/runtime/src/iree/hal/drivers/local_task/task_device.c
index 261790b..5f6c577 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_device.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_device.c
@@ -331,21 +331,6 @@
                                     batches);
 }
 
-static iree_status_t iree_hal_task_device_submit_and_wait(
-    iree_hal_device_t* base_device,
-    iree_hal_command_category_t command_categories,
-    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches,
-    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-    iree_timeout_t timeout) {
-  // Submit...
-  IREE_RETURN_IF_ERROR(iree_hal_task_device_queue_submit(
-      base_device, command_categories, queue_affinity, batch_count, batches));
-
-  // ...and wait.
-  return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
-}
-
 static iree_status_t iree_hal_task_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
@@ -388,7 +373,6 @@
         iree_hal_task_device_query_semaphore_compatibility,
     .transfer_range = iree_hal_device_transfer_mappable_range,
     .queue_submit = iree_hal_task_device_queue_submit,
-    .submit_and_wait = iree_hal_task_device_submit_and_wait,
     .wait_semaphores = iree_hal_task_device_wait_semaphores,
     .wait_idle = iree_hal_task_device_wait_idle,
 };
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.c b/runtime/src/iree/hal/drivers/local_task/task_queue.c
index 6521842..307820f 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.c
@@ -506,30 +506,6 @@
   return status;
 }
 
-iree_status_t iree_hal_task_queue_submit_and_wait(
-    iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches,
-    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-    iree_timeout_t timeout) {
-  IREE_TRACE_ZONE_BEGIN(z0);
-
-  iree_convert_timeout_to_absolute(&timeout);
-
-  // Queue all of the batches.
-  iree_status_t status =
-      iree_hal_task_queue_submit_batches(queue, batch_count, batches);
-  if (iree_status_is_ok(status)) {
-    // Flush the pending submissions and begin processing, then wait until idle.
-    // TODO(benvanik): get a wait_handle we can pass to
-    // iree_task_executor_donate_caller - it'll flush + do work.
-    iree_task_executor_flush(queue->executor);
-    status = iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
-  }
-
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
 iree_status_t iree_hal_task_queue_wait_idle(iree_hal_task_queue_t* queue,
                                             iree_timeout_t timeout) {
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.h b/runtime/src/iree/hal/drivers/local_task/task_queue.h
index ee255d7..b46586f 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.h
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.h
@@ -50,12 +50,6 @@
     iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
     const iree_hal_submission_batch_t* batches);
 
-iree_status_t iree_hal_task_queue_submit_and_wait(
-    iree_hal_task_queue_t* queue, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches,
-    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-    iree_timeout_t timeout);
-
 iree_status_t iree_hal_task_queue_wait_idle(iree_hal_task_queue_t* queue,
                                             iree_timeout_t timeout);
 
diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
index af0eb41..7a4480b 100644
--- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
@@ -1087,21 +1087,6 @@
   return queue->Submit(batch_count, batches);
 }
 
-static iree_status_t iree_hal_vulkan_device_submit_and_wait(
-    iree_hal_device_t* base_device,
-    iree_hal_command_category_t command_categories,
-    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
-    const iree_hal_submission_batch_t* batches,
-    iree_hal_semaphore_t* wait_semaphore, uint64_t wait_value,
-    iree_timeout_t timeout) {
-  // Submit...
-  IREE_RETURN_IF_ERROR(iree_hal_vulkan_device_queue_submit(
-      base_device, command_categories, queue_affinity, batch_count, batches));
-
-  // ...and wait.
-  return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
-}
-
 static iree_status_t iree_hal_vulkan_device_wait_semaphores(
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
@@ -1145,8 +1130,6 @@
     iree_hal_vulkan_device_query_semaphore_compatibility,
     /*.transfer_range=*/iree_hal_device_submit_transfer_range_and_wait,
     /*.queue_submit=*/iree_hal_vulkan_device_queue_submit,
-    /*.submit_and_wait=*/
-    iree_hal_vulkan_device_submit_and_wait,
     /*.wait_semaphores=*/iree_hal_vulkan_device_wait_semaphores,
     /*.wait_idle=*/iree_hal_vulkan_device_wait_idle,
 };
diff --git a/runtime/src/iree/hal/utils/buffer_transfer.c b/runtime/src/iree/hal/utils/buffer_transfer.c
index b271d79..0d5a045 100644
--- a/runtime/src/iree/hal/utils/buffer_transfer.c
+++ b/runtime/src/iree/hal/utils/buffer_transfer.c
@@ -9,6 +9,97 @@
 #include "iree/base/tracing.h"
 
 //===----------------------------------------------------------------------===//
+// Transfer utilities
+//===----------------------------------------------------------------------===//
+
+// Synchronously executes one or more transfer operations against a queue.
+// All buffers must be compatible with |device| and ranges must not overlap
+// (same as with memcpy).
+//
+// This is a blocking operation and may incur significant overheads as
+// internally it issues a command buffer with the transfer operations and waits
+// for it to complete. Users should do that themselves so that the work can be
+// issued concurrently and batched effectively. This is only useful as a
+// fallback for implementations that require it or tools where things like I/O
+// are transferred without worrying about performance. When submitting other
+// work it's preferable to use iree_hal_create_transfer_command_buffer and a
+// normal queue submission that allows for more fine-grained sequencing and
+// amortizes the submission cost by batching other work.
+//
+// The transfer will begin after the optional |wait_semaphore| reaches
+// |wait_value|. Behavior is undefined if no semaphore is provided and there are
+// in-flight operations concurrently using the buffer ranges.
+// Returns only after all transfers have completed and been flushed.
+static iree_status_t iree_hal_device_transfer_and_wait(
+    iree_hal_device_t* device, iree_hal_semaphore_t* wait_semaphore,
+    uint64_t wait_value, iree_host_size_t transfer_count,
+    const iree_hal_transfer_command_t* transfer_commands,
+    iree_timeout_t timeout) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(!transfer_count || transfer_commands);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Only allow inline execution when there is nothing left to wait on: either
+  // no wait semaphore was provided or it has already reached |wait_value|.
+  iree_hal_command_buffer_mode_t mode = IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
+  if (wait_semaphore) {
+    uint64_t current_value = 0ull;
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_semaphore_query(wait_semaphore, &current_value));
+    if (current_value >= wait_value) {
+      mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
+    }
+  } else {
+    mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
+  }
+
+  // Create a command buffer performing all of the transfer operations.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_create_transfer_command_buffer(
+              device, mode, IREE_HAL_QUEUE_AFFINITY_ANY, transfer_count,
+              transfer_commands, &command_buffer));
+
+  // Submit the transfer and signal a fresh fence semaphore on completion. On
+  // devices with multiple queues this can run out-of-order/overlapped with
+  // other work and return earlier than device idle.
+  iree_hal_semaphore_t* fence_semaphore = NULL;
+  iree_status_t status =
+      iree_hal_semaphore_create(device, 0ull, &fence_semaphore);
+  uint64_t signal_value = 1ull;
+  if (iree_status_is_ok(status)) {
+    iree_hal_submission_batch_t batch = {
+        .wait_semaphores =
+            {
+                .count = wait_semaphore != NULL ? 1 : 0,
+                .semaphores = &wait_semaphore,
+                .payload_values = &wait_value,
+            },
+        .command_buffer_count = 1,
+        .command_buffers = &command_buffer,
+        .signal_semaphores =
+            {
+                .count = 1,
+                .semaphores = &fence_semaphore,
+                .payload_values = &signal_value,
+            },
+    };
+    status =
+        iree_hal_device_queue_submit(device, IREE_HAL_COMMAND_CATEGORY_TRANSFER,
+                                     IREE_HAL_QUEUE_AFFINITY_ANY, 1, &batch);
+  }
+  if (iree_status_is_ok(status)) {  // Only wait if the submit succeeded.
+    status = iree_hal_semaphore_wait(fence_semaphore, signal_value, timeout);
+  }
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_semaphore_release(fence_semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
 // iree_hal_device_transfer_range implementations
 //===----------------------------------------------------------------------===//
 
diff --git a/runtime/src/iree/modules/hal/module.c b/runtime/src/iree/modules/hal/module.c
index 1630b50..e82f700 100644
--- a/runtime/src/iree/modules/hal/module.c
+++ b/runtime/src/iree/modules/hal/module.c
@@ -232,14 +232,14 @@
   batch.signal_semaphores.semaphores = signal_semaphore_ptrs;
   batch.signal_semaphores.payload_values = signal_semaphore_values;
 
-  iree_status_t status = iree_hal_device_submit_and_wait(
-      device, IREE_HAL_COMMAND_CATEGORY_ANY, 0, 1, &batch,
-      state->submit_semaphore, next_semaphore_value, iree_infinite_timeout());
-  if (!iree_status_is_ok(status)) {
-    return status;
+  iree_status_t status = iree_hal_device_queue_submit(
+      device, IREE_HAL_COMMAND_CATEGORY_ANY, 0, 1, &batch);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_semaphore_wait(
+        state->submit_semaphore, next_semaphore_value, iree_infinite_timeout());
   }
 
-  return iree_ok_status();
+  return status;
 }
 
 //===----------------------------------------------------------------------===//