Hiding iree_hal_device_transfer_and_wait as it should not be used.
diff --git a/runtime/src/iree/hal/device.c b/runtime/src/iree/hal/device.c
index 6167187..06f9142 100644
--- a/runtime/src/iree/hal/device.c
+++ b/runtime/src/iree/hal/device.c
@@ -152,75 +152,6 @@
       flags, timeout);
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_device_transfer_and_wait(
-    iree_hal_device_t* device, iree_hal_semaphore_t* wait_semaphore,
-    uint64_t wait_value, iree_host_size_t transfer_count,
-    const iree_hal_transfer_command_t* transfer_commands,
-    iree_timeout_t timeout) {
-  IREE_ASSERT_ARGUMENT(device);
-  IREE_ASSERT_ARGUMENT(!transfer_count || transfer_commands);
-  IREE_TRACE_ZONE_BEGIN(z0);
-
-  // We only want to allow inline execution if we have not been instructed to
-  // wait on a semaphore and it hasn't yet been signaled.
-  iree_hal_command_buffer_mode_t mode = IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
-  if (wait_semaphore) {
-    uint64_t current_value = 0ull;
-    IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0, iree_hal_semaphore_query(wait_semaphore, &current_value));
-    if (current_value >= wait_value) {
-      mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
-    }
-  } else {
-    mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
-  }
-
-  // Create a command buffer performing all of the transfer operations.
-  iree_hal_command_buffer_t* command_buffer = NULL;
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_create_transfer_command_buffer(
-              device, mode, IREE_HAL_QUEUE_AFFINITY_ANY, transfer_count,
-              transfer_commands, &command_buffer));
-
-  // Perform a full submit-and-wait. On devices with multiple queues this can
-  // run out-of-order/overlapped with other work and return earlier than device
-  // idle.
-  iree_hal_semaphore_t* fence_semaphore = NULL;
-  iree_status_t status =
-      iree_hal_semaphore_create(device, 0ull, &fence_semaphore);
-  uint64_t signal_value = 1ull;
-  if (iree_status_is_ok(status)) {
-    iree_hal_submission_batch_t batch = {
-        .wait_semaphores =
-            {
-                .count = wait_semaphore != NULL ? 1 : 0,
-                .semaphores = &wait_semaphore,
-                .payload_values = &wait_value,
-            },
-        .command_buffer_count = 1,
-        .command_buffers = &command_buffer,
-        .signal_semaphores =
-            {
-                .count = 1,
-                .semaphores = &fence_semaphore,
-                .payload_values = &signal_value,
-            },
-    };
-    status =
-        iree_hal_device_queue_submit(device, IREE_HAL_COMMAND_CATEGORY_TRANSFER,
-                                     IREE_HAL_QUEUE_AFFINITY_ANY, 1, &batch);
-  }
-  if (iree_status_is_ok(status)) {
-    status = iree_hal_semaphore_wait(fence_semaphore, signal_value, timeout);
-  }
-
-  iree_hal_command_buffer_release(command_buffer);
-  iree_hal_semaphore_release(fence_semaphore);
-
-  IREE_TRACE_ZONE_END(z0);
-  return status;
-}
-
 // Validates that the submission is well-formed.
 static iree_status_t iree_hal_device_validate_submission(
     iree_host_size_t batch_count, const iree_hal_submission_batch_t* batches) {
diff --git a/runtime/src/iree/hal/device.h b/runtime/src/iree/hal/device.h
index 48f6108..97568e4 100644
--- a/runtime/src/iree/hal/device.h
+++ b/runtime/src/iree/hal/device.h
@@ -302,30 +302,6 @@
     iree_device_size_t target_offset, iree_device_size_t data_length,
     iree_hal_transfer_buffer_flags_t flags, iree_timeout_t timeout);
 
-// Synchronously executes one or more transfer operations against a queue.
-// All buffers must be compatible with |device| and ranges must not overlap
-// (same as with memcpy).
-//
-// This is a blocking operation and may incur significant overheads as
-// internally it issues a command buffer with the transfer operations and waits
-// for it to complete. Users should do that themselves so that the work can be
-// issued concurrently and batched effectively. This is only useful as a
-// fallback for implementations that require it or tools where things like I/O
-// are transferred without worrying about performance. When submitting other
-// work it's preferable to use iree_hal_create_transfer_command_buffer and a
-// normal queue submission that allows for more fine-grained sequencing and
-// amortizes the submission cost by batching other work.
-//
-// The transfer will begin after the optional |wait_semaphore| reaches
-// |wait_value|. Behavior is undefined if no semaphore is provided and there are
-// in-flight operations concurrently using the buffer ranges.
-// Returns only after all transfers have completed and been flushed.
-IREE_API_EXPORT iree_status_t iree_hal_device_transfer_and_wait(
-    iree_hal_device_t* device, iree_hal_semaphore_t* wait_semaphore,
-    uint64_t wait_value, iree_host_size_t transfer_count,
-    const iree_hal_transfer_command_t* transfer_commands,
-    iree_timeout_t timeout);
-
 // Submits one or more batches of work to a device queue.
 //
 // The queue is selected based on the flags set in |command_categories| and the
diff --git a/runtime/src/iree/hal/utils/buffer_transfer.c b/runtime/src/iree/hal/utils/buffer_transfer.c
index b271d79..0d5a045 100644
--- a/runtime/src/iree/hal/utils/buffer_transfer.c
+++ b/runtime/src/iree/hal/utils/buffer_transfer.c
@@ -9,6 +9,97 @@
 #include "iree/base/tracing.h"
 
 //===----------------------------------------------------------------------===//
+// Transfer utilities
+//===----------------------------------------------------------------------===//
+
+// Synchronously executes one or more transfer operations against a queue.
+// All buffers must be compatible with |device| and ranges must not overlap
+// (same as with memcpy).
+//
+// This is a blocking operation and may incur significant overheads as
+// internally it issues a command buffer with the transfer operations and waits
+// for it to complete. Users should do that themselves so that the work can be
+// issued concurrently and batched effectively. This is only useful as a
+// fallback for implementations that require it or tools where things like I/O
+// are transferred without worrying about performance. When submitting other
+// work it's preferable to use iree_hal_create_transfer_command_buffer and a
+// normal queue submission that allows for more fine-grained sequencing and
+// amortizes the submission cost by batching other work.
+//
+// The transfer will begin after the optional |wait_semaphore| reaches
+// |wait_value|. Behavior is undefined if no semaphore is provided and there are
+// in-flight operations concurrently using the buffer ranges.
+// Returns only after all transfers have completed and been flushed.
+static iree_status_t iree_hal_device_transfer_and_wait(
+    iree_hal_device_t* device, iree_hal_semaphore_t* wait_semaphore,
+    uint64_t wait_value, iree_host_size_t transfer_count,
+    const iree_hal_transfer_command_t* transfer_commands,
+    iree_timeout_t timeout) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(!transfer_count || transfer_commands);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // We only want to allow inline execution if we have not been instructed to
+  // wait on a semaphore and it hasn't yet been signaled.
+  iree_hal_command_buffer_mode_t mode = IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
+  if (wait_semaphore) {
+    uint64_t current_value = 0ull;
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_semaphore_query(wait_semaphore, &current_value));
+    if (current_value >= wait_value) {
+      mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
+    }
+  } else {
+    mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
+  }
+
+  // Create a command buffer performing all of the transfer operations.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_create_transfer_command_buffer(
+              device, mode, IREE_HAL_QUEUE_AFFINITY_ANY, transfer_count,
+              transfer_commands, &command_buffer));
+
+  // Perform a full submit-and-wait. On devices with multiple queues this can
+  // run out-of-order/overlapped with other work and return earlier than device
+  // idle.
+  iree_hal_semaphore_t* fence_semaphore = NULL;
+  iree_status_t status =
+      iree_hal_semaphore_create(device, 0ull, &fence_semaphore);
+  uint64_t signal_value = 1ull;
+  if (iree_status_is_ok(status)) {
+    iree_hal_submission_batch_t batch = {
+        .wait_semaphores =
+            {
+                .count = wait_semaphore != NULL ? 1 : 0,
+                .semaphores = &wait_semaphore,
+                .payload_values = &wait_value,
+            },
+        .command_buffer_count = 1,
+        .command_buffers = &command_buffer,
+        .signal_semaphores =
+            {
+                .count = 1,
+                .semaphores = &fence_semaphore,
+                .payload_values = &signal_value,
+            },
+    };
+    status =
+        iree_hal_device_queue_submit(device, IREE_HAL_COMMAND_CATEGORY_TRANSFER,
+                                     IREE_HAL_QUEUE_AFFINITY_ANY, 1, &batch);
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_semaphore_wait(fence_semaphore, signal_value, timeout);
+  }
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_semaphore_release(fence_semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
 // iree_hal_device_transfer_range implementations
 //===----------------------------------------------------------------------===//