Adding IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED flag. (#21755)
This (*dangerously*) disables lifetime management of objects recorded
into command buffers. In the rare case where lifetime management is not
required, this disables all resource set tracking during recording and
allows implementations to fire-and-forget command buffers without
needing to release resources post-execution. This is primarily useful
internally when replaying command buffers against an immediate-mode
target, or for future HIP/CUDA graph capture where resource lifetime
is not managed. The compiler will never use this.
Fixes #21749.
diff --git a/runtime/src/iree/hal/command_buffer.c b/runtime/src/iree/hal/command_buffer.c
index 4f0abec..f2dfe1d 100644
--- a/runtime/src/iree/hal/command_buffer.c
+++ b/runtime/src/iree/hal/command_buffer.c
@@ -116,6 +116,7 @@
{IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION,
IREE_SVL("ALLOW_INLINE_EXECUTION")},
{IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED, IREE_SVL("UNVALIDATED")},
+ {IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED, IREE_SVL("UNRETAINED")},
};
return iree_bitfield_format_inline(value, IREE_ARRAYSIZE(mappings), mappings,
out_temp);
diff --git a/runtime/src/iree/hal/command_buffer.h b/runtime/src/iree/hal/command_buffer.h
index dea5bb8..2ff64a3 100644
--- a/runtime/src/iree/hal/command_buffer.h
+++ b/runtime/src/iree/hal/command_buffer.h
@@ -63,6 +63,15 @@
// `IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE=1` - if shimming command buffers
// or performing replay this validation can be disabled per-command buffer.
IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED = 1u << 5,
+
+ // Disables resource lifetime management.
+ // ***DANGER***: all resources used in the command buffer will not be retained
+ // and **MUST** remain valid for the lifetime of the command buffer. This is
+ // not safe in IREE as all code assumes proper retain semantics. If layering
+ // on top of the HAL with a different programming model that manages resource
+ // lifetime itself, this flag disables the internal resource tracking to
+ // reduce overhead.
+ IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED = 1u << 6,
};
typedef uint32_t iree_hal_command_buffer_mode_t;
@@ -142,6 +151,12 @@
const iree_hal_buffer_ref_t* values;
} iree_hal_buffer_ref_list_t;
+// Returns an empty buffer ref list.
+static inline iree_hal_buffer_ref_list_t iree_hal_buffer_ref_list_empty(void) {
+ iree_hal_buffer_ref_list_t list = {0};
+ return list;
+}
+
// Bitfield specifying which execution stage a barrier should start/end at.
//
// Maps to VkPipelineStageFlagBits.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/command_buffer.c b/runtime/src/iree/hal/drivers/amdgpu/command_buffer.c
index dd14b24..7175b2d 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/command_buffer.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/command_buffer.c
@@ -973,7 +973,9 @@
// would want to repack/trim the resource set when freezing (if not one-shot
// where it doesn't matter). The risk with large allocs is that a user with
// 10000 reusable command buffers will eat all that memory forever.
- if (iree_status_is_ok(status)) {
+ if (iree_status_is_ok(status) &&
+ !iree_all_bits_set(options->mode,
+ IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED)) {
status = iree_hal_resource_set_allocate(&options->host_block_pools->small,
&command_buffer->resource_set);
}
diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
index 698d0aa..58e4c6d 100644
--- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
@@ -204,8 +204,11 @@
command_buffer->cu_barrier_node = NULL;
command_buffer->graph_node_count = 0;
- iree_status_t status =
- iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set);
+ iree_status_t status = iree_ok_status();
+ if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED)) {
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
if (iree_status_is_ok(status)) {
iree_hal_collective_batch_initialize(&command_buffer->arena,
diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
index eedb403..67839ac 100644
--- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c
@@ -95,8 +95,11 @@
command_buffer->cu_stream = stream;
iree_arena_initialize(block_pool, &command_buffer->arena);
- iree_status_t status =
- iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set);
+ iree_status_t status = iree_ok_status();
+ if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED)) {
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
if (iree_status_is_ok(status)) {
iree_hal_collective_batch_initialize(&command_buffer->arena,
diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
index f6875c5..5e88a3f 100644
--- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c
@@ -209,8 +209,11 @@
command_buffer->hip_barrier_node = NULL;
command_buffer->graph_node_count = 0;
- iree_status_t status =
- iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set);
+ iree_status_t status = iree_ok_status();
+ if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED)) {
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
if (iree_status_is_ok(status)) {
iree_hal_collective_batch_initialize(&command_buffer->arena,
diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
index 1cc16a5..66594cb 100644
--- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c
@@ -97,8 +97,11 @@
command_buffer->hip_stream = stream;
iree_arena_initialize(block_pool, &command_buffer->arena);
- iree_status_t status =
- iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set);
+ iree_status_t status = iree_ok_status();
+ if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED)) {
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
if (iree_status_is_ok(status)) {
iree_hal_collective_batch_initialize(&command_buffer->arena,
diff --git a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
index 4d1360c..2a51be5 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
@@ -136,8 +136,10 @@
iree_task_list_initialize(&command_buffer->root_tasks);
iree_task_list_initialize(&command_buffer->leaf_tasks);
memset(&command_buffer->state, 0, sizeof(command_buffer->state));
- status = iree_hal_resource_set_allocate(block_pool,
- &command_buffer->resource_set);
+ if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED)) {
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
}
if (iree_status_is_ok(status)) {
*out_command_buffer = &command_buffer->base;
diff --git a/runtime/src/iree/hal/drivers/local_task/task_device.c b/runtime/src/iree/hal/drivers/local_task/task_device.c
index 030e7e4..1bea00e 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_device.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_device.c
@@ -296,7 +296,8 @@
iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
iree_hal_command_buffer_t** out_command_buffer) {
iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
- if (binding_capacity > 0) {
+ if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT) ||
+ binding_capacity > 0) {
// TODO(indirect-cmd): natively support reusable task command buffers. For
// now we emulate by recording into a deferred command buffer and
// recording/issuing at submission time. The task system needs some
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.c b/runtime/src/iree/hal/drivers/local_task/task_queue.c
index 9cb06ae..34b4c4c 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.c
@@ -215,6 +215,7 @@
cmd->queue->device_allocator, &cmd->queue->scope,
iree_hal_command_buffer_mode(command_buffer) |
IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
+ IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED |
// NOTE: we need to validate if a binding table is provided as the
// bindings were not known when it was originally recorded.
(iree_hal_buffer_binding_table_is_empty(binding_table)
diff --git a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
index 1509d70..da4ae00 100644
--- a/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
+++ b/runtime/src/iree/hal/drivers/metal/direct_command_buffer.m
@@ -357,7 +357,10 @@
iree_arena_initialize(block_pool, &command_buffer->arena);
command_buffer->staging_buffer = staging_buffer;
command_buffer->host_allocator = host_allocator;
- iree_status_t status = iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set);
+ iree_status_t status = iree_ok_status();
+ if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED)) {
+ status = iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set);
+ }
if (iree_status_is_ok(status)) {
iree_hal_metal_command_segment_list_reset(&command_buffer->segments);
@autoreleasepool { // Use @autoreleasepool to trigger the autorelease within encoder creation.
diff --git a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
index 9787b4b..ae830f5 100644
--- a/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/direct_command_buffer.cc
@@ -128,8 +128,10 @@
new (&command_buffer->descriptor_set_group) DescriptorSetGroup();
command_buffer->builtin_executables = builtin_executables;
- status = iree_hal_resource_set_allocate(block_pool,
- &command_buffer->resource_set);
+ if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED)) {
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
}
if (iree_status_is_ok(status)) {
diff --git a/runtime/src/iree/hal/utils/deferred_command_buffer.c b/runtime/src/iree/hal/utils/deferred_command_buffer.c
index f593779..908aadd 100644
--- a/runtime/src/iree/hal/utils/deferred_command_buffer.c
+++ b/runtime/src/iree/hal/utils/deferred_command_buffer.c
@@ -182,8 +182,10 @@
command_buffer->host_allocator = host_allocator;
iree_hal_cmd_list_initialize(block_pool, &command_buffer->cmd_list);
- status = iree_hal_resource_set_allocate(block_pool,
- &command_buffer->resource_set);
+ if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_UNRETAINED)) {
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
}
if (iree_status_is_ok(status)) {
diff --git a/runtime/src/iree/hal/utils/resource_set.c b/runtime/src/iree/hal/utils/resource_set.c
index a807fd6..6cf543b 100644
--- a/runtime/src/iree/hal/utils/resource_set.c
+++ b/runtime/src/iree/hal/utils/resource_set.c
@@ -277,7 +277,6 @@
IREE_API_EXPORT iree_status_t
iree_hal_resource_set_insert(iree_hal_resource_set_t* set,
iree_host_size_t count, const void* resources) {
- IREE_ASSERT_ARGUMENT(set);
return iree_hal_resource_set_insert_strided(set, count, resources, 0,
sizeof(iree_hal_resource_t*));
}
@@ -285,7 +284,7 @@
IREE_API_EXPORT iree_status_t iree_hal_resource_set_insert_strided(
iree_hal_resource_set_t* set, iree_host_size_t count, const void* elements,
iree_host_size_t offset, iree_host_size_t stride) {
- IREE_ASSERT_ARGUMENT(set);
+ if (!set) return iree_ok_status();
// For now we process one at a time. We should have a stride that lets us
// amortize the cost of doing the MRU update and insertion allocation by
// say slicing off 4/8/16/32 resources at a time etc. Today each miss that