Adding iree_hal_device_queue_update.
As with all queue DMA operations, it's best if things are batched into
command buffers, but it's bad to have a command buffer with only a single
DMA operation. This completes the set of fill/update/copy operations
at the queue level to match the command buffer DMA operations.
Practically this is useful when combined with reusable/indirect command
buffers for uploading new parameters in queue order prior to issuing
a command buffer that references them. The compiler will use this to
turn push constants into uniform buffers.
An emulated version is added; implementations are encouraged to do
better, though they currently don't.
diff --git a/experimental/webgpu/webgpu_device.c b/experimental/webgpu/webgpu_device.c
index 4d26911..c9a2457 100644
--- a/experimental/webgpu/webgpu_device.c
+++ b/experimental/webgpu/webgpu_device.c
@@ -470,6 +470,7 @@
.queue_alloca = iree_hal_webgpu_device_queue_alloca,
.queue_dealloca = iree_hal_webgpu_device_queue_dealloca,
.queue_fill = iree_hal_device_queue_emulated_fill,
+ .queue_update = iree_hal_device_queue_emulated_update,
.queue_copy = iree_hal_device_queue_emulated_copy,
.queue_read = iree_hal_webgpu_device_queue_read,
.queue_write = iree_hal_webgpu_device_queue_write,
diff --git a/runtime/src/iree/hal/command_buffer.c b/runtime/src/iree/hal/command_buffer.c
index 58fda97..7f26263 100644
--- a/runtime/src/iree/hal/command_buffer.c
+++ b/runtime/src/iree/hal/command_buffer.c
@@ -662,6 +662,15 @@
transfer_command->fill.pattern,
transfer_command->fill.pattern_length, IREE_HAL_FILL_FLAG_NONE);
break;
+ case IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE:
+ status = iree_hal_command_buffer_update_buffer(
+ command_buffer, transfer_command->update.source_buffer,
+ transfer_command->update.source_offset,
+ iree_hal_make_buffer_ref(transfer_command->update.target_buffer,
+ transfer_command->update.target_offset,
+ transfer_command->update.length),
+ IREE_HAL_UPDATE_FLAG_NONE);
+ break;
case IREE_HAL_TRANSFER_COMMAND_TYPE_COPY:
status = iree_hal_command_buffer_copy_buffer(
command_buffer,
@@ -673,15 +682,6 @@
transfer_command->copy.length),
IREE_HAL_COPY_FLAG_NONE);
break;
- case IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE:
- status = iree_hal_command_buffer_update_buffer(
- command_buffer, transfer_command->update.source_buffer,
- transfer_command->update.source_offset,
- iree_hal_make_buffer_ref(transfer_command->update.target_buffer,
- transfer_command->update.target_offset,
- transfer_command->update.length),
- IREE_HAL_UPDATE_FLAG_NONE);
- break;
default:
status =
iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
diff --git a/runtime/src/iree/hal/command_buffer.h b/runtime/src/iree/hal/command_buffer.h
index 9152106..f15f74a 100644
--- a/runtime/src/iree/hal/command_buffer.h
+++ b/runtime/src/iree/hal/command_buffer.h
@@ -230,7 +230,7 @@
IREE_HAL_FILL_FLAG_NONE = 0,
};
-// Bitfield specifying flags controlling a update operation.
+// Bitfield specifying flags controlling an update operation.
typedef uint64_t iree_hal_update_flags_t;
enum iree_hal_update_flag_bits_t {
IREE_HAL_UPDATE_FLAG_NONE = 0,
@@ -802,10 +802,10 @@
typedef enum iree_hal_transfer_command_type_t {
// iree_hal_command_buffer_fill_buffer
IREE_HAL_TRANSFER_COMMAND_TYPE_FILL = 0u,
- // iree_hal_command_buffer_copy_buffer
- IREE_HAL_TRANSFER_COMMAND_TYPE_COPY = 1u,
// iree_hal_command_buffer_update_buffer
- IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE = 2u,
+ IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE = 1u,
+ // iree_hal_command_buffer_copy_buffer
+ IREE_HAL_TRANSFER_COMMAND_TYPE_COPY = 2u,
} iree_hal_transfer_command_type_t;
// Represents a single transfer command within a batch of commands.
@@ -821,14 +821,6 @@
const void* pattern;
iree_host_size_t pattern_length;
} fill;
- // IREE_HAL_TRANSFER_COMMAND_TYPE_COPY
- struct {
- iree_hal_buffer_t* source_buffer;
- iree_device_size_t source_offset;
- iree_hal_buffer_t* target_buffer;
- iree_device_size_t target_offset;
- iree_device_size_t length;
- } copy;
// IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE
struct {
const void* source_buffer;
@@ -837,6 +829,14 @@
iree_device_size_t target_offset;
iree_device_size_t length;
} update;
+ // IREE_HAL_TRANSFER_COMMAND_TYPE_COPY
+ struct {
+ iree_hal_buffer_t* source_buffer;
+ iree_device_size_t source_offset;
+ iree_hal_buffer_t* target_buffer;
+ iree_device_size_t target_offset;
+ iree_device_size_t length;
+ } copy;
};
} iree_hal_transfer_command_t;
diff --git a/runtime/src/iree/hal/device.c b/runtime/src/iree/hal/device.c
index 5923ba4..f2b7d78 100644
--- a/runtime/src/iree/hal/device.c
+++ b/runtime/src/iree/hal/device.c
@@ -196,6 +196,91 @@
return status;
}
+// Emulates iree_hal_device_queue_update by building a one-shot transfer command
+// buffer that contains a single update command and executing it on the queue
+// with the provided wait/signal semaphore lists.
+//
+// Returns IREE_STATUS_UNIMPLEMENTED when |length| exceeds UINT16_MAX bytes.
+// |flags| is not consulted by this implementation.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
+
+  // If we are starting execution immediately then we can reduce latency by
+  // allowing inline command buffer execution.
+  iree_hal_command_buffer_mode_t command_buffer_mode =
+      IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT;
+  if (wait_semaphore_list.count == 0) {
+    command_buffer_mode |= IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION;
+  }
+
+  // TODO(benvanik): support splitting the update into multiple chunks to fit
+  // under the max command buffer update size limit. This provisional API is
+  // intended only for updating dispatch parameters today.
+  if (length > UINT16_MAX) {
+    // The trace zone opened above must be closed on every exit path.
+    IREE_TRACE_ZONE_END(z0);
+    // |length| is an iree_device_size_t so it needs the device-size format
+    // specifier rather than the host-size one.
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "queue buffer updates currently limited to 64KB, "
+                            "tried to update %" PRIdsz " bytes",
+                            length);
+  }
+
+  iree_hal_transfer_command_t command = {
+      .type = IREE_HAL_TRANSFER_COMMAND_TYPE_UPDATE,
+      .update =
+          {
+              .source_buffer = source_buffer,
+              .source_offset = source_offset,
+              .target_buffer = target_buffer,
+              .target_offset = target_offset,
+              .length = length,
+          },
+  };
+
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_create_transfer_command_buffer(device, command_buffer_mode,
+                                                  queue_affinity, 1, &command,
+                                                  &command_buffer));
+
+  iree_status_t status = iree_hal_device_queue_execute(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      command_buffer, iree_hal_buffer_binding_table_empty());
+
+  // Released unconditionally, whether or not the submission succeeded.
+  iree_hal_command_buffer_release(command_buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Public entry point for queue-ordered buffer updates: checks the arguments and
+// forwards them unchanged to the device's queue_update vtable entry, wrapped in
+// a trace zone annotated with |length|.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_update(
+    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags) {
+  IREE_ASSERT_ARGUMENT(device);
+  // A non-empty semaphore list must supply both of its arrays.
+  IREE_ASSERT_ARGUMENT(
+      !wait_semaphore_list.count ||
+      (wait_semaphore_list.semaphores && wait_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(!signal_semaphore_list.count ||
+                       (signal_semaphore_list.semaphores &&
+                        signal_semaphore_list.payload_values));
+  IREE_ASSERT_ARGUMENT(source_buffer);
+  IREE_ASSERT_ARGUMENT(target_buffer);
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)length);
+
+  iree_status_t dispatch_status = _VTABLE_DISPATCH(device, queue_update)(
+      device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      source_buffer, source_offset, target_buffer, target_offset, length,
+      flags);
+
+  IREE_TRACE_ZONE_END(z0);
+  return dispatch_status;
+}
+
IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
diff --git a/runtime/src/iree/hal/device.h b/runtime/src/iree/hal/device.h
index 145537a..4511fa7 100644
--- a/runtime/src/iree/hal/device.h
+++ b/runtime/src/iree/hal/device.h
@@ -312,11 +312,12 @@
iree_hal_buffer_t* buffer);
// Enqueues a single queue-ordered fill operation.
+// The |target_buffer| must be visible to the device queue performing the fill.
//
// WARNING: individual fills have a high overhead and batching should be
// performed by the caller instead of calling this multiple times. The
// iree_hal_create_transfer_command_buffer utility makes it easy to create
-// batches of transfer operations (fill, copy, update) and is only a few lines
+// batches of transfer operations (fill, update, copy) and is only a few lines
// more code.
IREE_API_EXPORT iree_status_t iree_hal_device_queue_fill(
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
@@ -326,12 +327,36 @@
iree_device_size_t length, const void* pattern,
iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
-// Enqueues a single queue-ordered copy operation.
+// Enqueues a single queue-ordered buffer update operation.
+// The provided |source_buffer| will be captured and need not remain live or
+// unchanged while the operation is queued. The |target_buffer| must be visible
+// to the device queue performing the update.
+//
+// Some implementations may have limits on the size of the update or may perform
+// poorly if the size is larger than an implementation-defined limit. Updates
+// should be kept as small and infrequent as possible.
//
-// WARNING: individual copies have a high overhead and batching should be
+// WARNING: individual updates have a high overhead and batching should be
// performed by the caller instead of calling this multiple times. The
// iree_hal_create_transfer_command_buffer utility makes it easy to create
-// batches of transfer operations (fill, copy, update) and is only a few lines
+// batches of transfer operations (fill, update, copy) and is only a few lines
+// more code.
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_update(
+ iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+ const iree_hal_semaphore_list_t wait_semaphore_list,
+ const iree_hal_semaphore_list_t signal_semaphore_list,
+ const void* source_buffer, iree_host_size_t source_offset,
+ iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+ iree_device_size_t length, iree_hal_update_flags_t flags);
+
+// Enqueues a single queue-ordered copy operation.
+// The |source_buffer| and |target_buffer| must both be visible to the device
+// queue performing the copy.
+//
+// WARNING: individual copies have a high overhead and batching should be
+// performed by the caller instead of calling this multiple times. The
+// iree_hal_create_transfer_command_buffer utility makes it easy to create
+// batches of transfer operations (fill, update, copy) and is only a few lines
// more code.
IREE_API_EXPORT iree_status_t iree_hal_device_queue_copy(
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
@@ -578,6 +603,14 @@
iree_device_size_t length, const void* pattern,
iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
+ iree_status_t(IREE_API_PTR* queue_update)(
+ iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+ const iree_hal_semaphore_list_t wait_semaphore_list,
+ const iree_hal_semaphore_list_t signal_semaphore_list,
+ const void* source_buffer, iree_host_size_t source_offset,
+ iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+ iree_device_size_t length, iree_hal_update_flags_t flags);
+
iree_status_t(IREE_API_PTR* queue_copy)(
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -634,6 +667,14 @@
iree_device_size_t length, const void* pattern,
iree_host_size_t pattern_length, iree_hal_fill_flags_t flags);
+IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_update(
+ iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
+ const iree_hal_semaphore_list_t wait_semaphore_list,
+ const iree_hal_semaphore_list_t signal_semaphore_list,
+ const void* source_buffer, iree_host_size_t source_offset,
+ iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+ iree_device_size_t length, iree_hal_update_flags_t flags);
+
IREE_API_EXPORT iree_status_t iree_hal_device_queue_emulated_copy(
iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
index 1669510..2ed014b 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
@@ -1130,6 +1130,7 @@
.queue_alloca = iree_hal_cuda_device_queue_alloca,
.queue_dealloca = iree_hal_cuda_device_queue_dealloca,
.queue_fill = iree_hal_device_queue_emulated_fill,
+ .queue_update = iree_hal_device_queue_emulated_update,
.queue_copy = iree_hal_device_queue_emulated_copy,
.queue_read = iree_hal_cuda_device_queue_read,
.queue_write = iree_hal_cuda_device_queue_write,
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index d065d2c..7f42e8d 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -1127,6 +1127,7 @@
.queue_alloca = iree_hal_hip_device_queue_alloca,
.queue_dealloca = iree_hal_hip_device_queue_dealloca,
.queue_fill = iree_hal_device_queue_emulated_fill,
+ .queue_update = iree_hal_device_queue_emulated_update,
.queue_copy = iree_hal_device_queue_emulated_copy,
.queue_read = iree_hal_hip_device_queue_read,
.queue_write = iree_hal_hip_device_queue_write,
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_device.c b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
index a543d55..7283e58 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_device.c
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
@@ -504,6 +504,7 @@
.queue_alloca = iree_hal_sync_device_queue_alloca,
.queue_dealloca = iree_hal_sync_device_queue_dealloca,
.queue_fill = iree_hal_device_queue_emulated_fill,
+ .queue_update = iree_hal_device_queue_emulated_update,
.queue_copy = iree_hal_device_queue_emulated_copy,
.queue_read = iree_hal_sync_device_queue_read,
.queue_write = iree_hal_sync_device_queue_write,
diff --git a/runtime/src/iree/hal/drivers/local_task/task_device.c b/runtime/src/iree/hal/drivers/local_task/task_device.c
index d52b08a..8aa0925 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_device.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_device.c
@@ -539,6 +539,7 @@
.queue_alloca = iree_hal_task_device_queue_alloca,
.queue_dealloca = iree_hal_task_device_queue_dealloca,
.queue_fill = iree_hal_device_queue_emulated_fill,
+ .queue_update = iree_hal_device_queue_emulated_update,
.queue_copy = iree_hal_device_queue_emulated_copy,
.queue_read = iree_hal_task_device_queue_read,
.queue_write = iree_hal_task_device_queue_write,
diff --git a/runtime/src/iree/hal/drivers/metal/metal_device.m b/runtime/src/iree/hal/drivers/metal/metal_device.m
index 593abd1..4f8b4fd 100644
--- a/runtime/src/iree/hal/drivers/metal/metal_device.m
+++ b/runtime/src/iree/hal/drivers/metal/metal_device.m
@@ -618,6 +618,7 @@
.queue_alloca = iree_hal_metal_device_queue_alloca,
.queue_dealloca = iree_hal_metal_device_queue_dealloca,
.queue_fill = iree_hal_device_queue_emulated_fill,
+ .queue_update = iree_hal_device_queue_emulated_update,
.queue_copy = iree_hal_device_queue_emulated_copy,
.queue_read = iree_hal_metal_device_queue_read,
.queue_write = iree_hal_metal_device_queue_write,
diff --git a/runtime/src/iree/hal/drivers/null/device.c b/runtime/src/iree/hal/drivers/null/device.c
index 935d9a0..1195364 100644
--- a/runtime/src/iree/hal/drivers/null/device.c
+++ b/runtime/src/iree/hal/drivers/null/device.c
@@ -372,6 +372,26 @@
target_buffer, target_offset, length, pattern, pattern_length, flags);
}
+// Queue update entry for the null device; currently forwards every argument to
+// the shared emulated implementation.
+static iree_status_t iree_hal_null_device_queue_update(
+    iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
+    const iree_hal_semaphore_list_t wait_semaphore_list,
+    const iree_hal_semaphore_list_t signal_semaphore_list,
+    const void* source_buffer, iree_host_size_t source_offset,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, iree_hal_update_flags_t flags) {
+  // TODO(null): prefer a native queue update operation when one is available.
+  // The emulated path creates and executes a command buffer, and that extra
+  // recording/upload/allocation time is best avoided. Command buffers also
+  // have a limited capacity for embedded data, so the emulated version may
+  // need to allocate buffers, split the update into multiple commands, or
+  // commit other sins a native implementation would be able to avoid.
+  iree_status_t status = iree_hal_device_queue_emulated_update(
+      base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list,
+      source_buffer, source_offset, target_buffer, target_offset, length,
+      flags);
+  return status;
+}
+
static iree_status_t iree_hal_null_device_queue_copy(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -580,6 +600,7 @@
.queue_alloca = iree_hal_null_device_queue_alloca,
.queue_dealloca = iree_hal_null_device_queue_dealloca,
.queue_fill = iree_hal_null_device_queue_fill,
+ .queue_update = iree_hal_null_device_queue_update,
.queue_copy = iree_hal_null_device_queue_copy,
.queue_read = iree_hal_null_device_queue_read,
.queue_write = iree_hal_null_device_queue_write,
diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
index 2edd31d..6db27bc 100644
--- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
@@ -1893,6 +1893,7 @@
/*.queue_alloca=*/iree_hal_vulkan_device_queue_alloca,
/*.queue_dealloca=*/iree_hal_vulkan_device_queue_dealloca,
/*.queue_fill=*/iree_hal_device_queue_emulated_fill,
+ /*.queue_update=*/iree_hal_device_queue_emulated_update,
/*.queue_copy=*/iree_hal_device_queue_emulated_copy,
/*.queue_read=*/iree_hal_vulkan_device_queue_read,
/*.queue_write=*/iree_hal_vulkan_device_queue_write,