Merge pull request #7998 from google/benvanik-resource-retain
Command buffers that execute asynchronously now need to ensure the lifetime of all referenced resources extends for as long as the command buffer lives. Future changes will extend this to device queues/semaphores.
Tracking lifetime on command buffers lets us remove the deferred release list that was performing lifetime tracking in the HAL VM module. That list used a generic VM list (24 bytes per entry vs 8 on 64-bit) and limited us to a single device/command buffer and synchronous submission. Now, once we get around to attaching lifetime to timepoints, we can correctly track lifetime in asynchronous programs 🎉
An important user-visible reason for bringing lifetime tracking into the HAL is that once we start to mix asynchronous host application code, compiler-generated code, and custom module code, we have no good way of plumbing lifetimes through the current interface. A bonus of this change is that we now only pay for lifetime tracking when it's required: when running against implementations that perform their own resource lifetime tracking (WebGPU/Metal) we may be able to avoid doing the tracking ourselves. We can also avoid tracking when we are wrapping command buffers in things like the deferred command buffer (or future RPC systems) that perform the tracking themselves.
This change drops our memory consumption high-water mark in dylib-sync/embedded cases by 16-32KB, as we no longer need tracking there, and improves performance (~4us -> ~3us per hello-world invocation) since no tracking logic runs. It is also better for the other backends that do require tracking (task system/GPUs/etc) thanks to reduced memory consumption, fewer allocations, and a more optimized implementation.
diff --git a/experimental/rocm/rocm_device.c b/experimental/rocm/rocm_device.c
index 026650f..6666bd4 100644
--- a/experimental/rocm/rocm_device.c
+++ b/experimental/rocm/rocm_device.c
@@ -172,7 +172,7 @@
static iree_status_t iree_hal_rocm_device_trim(iree_hal_device_t* base_device) {
iree_hal_rocm_device_t* device = iree_hal_rocm_device_cast(base_device);
- // TODO(benvanik): trim of ROCM resources, whenever we care.
+ iree_arena_block_pool_trim(&device->block_pool);
return iree_hal_allocator_trim(device->device_allocator);
}
diff --git a/iree/hal/command_buffer.h b/iree/hal/command_buffer.h
index a219948..8f6d8fe 100644
--- a/iree/hal/command_buffer.h
+++ b/iree/hal/command_buffer.h
@@ -243,11 +243,13 @@
// Commands are recorded by the implementation for later submission to command
// queues.
//
-// Buffers and synchronization objects referenced must remain valid and not be
+// Buffers, events, and programs referenced must remain valid and not be
// modified or read while there are commands in-flight. The usual flow is to
-// populate input buffers, Dispatch using those buffers, wait on a Semaphore
-// until the buffers are guaranteed to no longer be in use, and then reuse or
-// release the buffers.
+// populate input buffers, dispatch using those buffers, wait on a semaphore
+// until the buffers are guaranteed to no longer be in use, and then reuse the
+// buffers. Lifetimes are managed by the command buffer and all used resources
+// will be retained for as long as the command buffer is live or until it is
+// reset.
//
// Errors that can be recognized when operations are enqueued will be returned
// immediately, such as invalid argument errors. Errors that can only be
diff --git a/iree/hal/cuda/BUILD b/iree/hal/cuda/BUILD
index 0522dba..3ab575b 100644
--- a/iree/hal/cuda/BUILD
+++ b/iree/hal/cuda/BUILD
@@ -67,6 +67,7 @@
"//iree/hal",
"//iree/hal/utils:buffer_transfer",
"//iree/hal/utils:deferred_command_buffer",
+ "//iree/hal/utils:resource_set",
"//iree/schemas:cuda_executable_def_c_fbs",
],
)
diff --git a/iree/hal/cuda/CMakeLists.txt b/iree/hal/cuda/CMakeLists.txt
index 9b91207..6fe645f 100644
--- a/iree/hal/cuda/CMakeLists.txt
+++ b/iree/hal/cuda/CMakeLists.txt
@@ -59,6 +59,7 @@
iree::hal
iree::hal::utils::buffer_transfer
iree::hal::utils::deferred_command_buffer
+ iree::hal::utils::resource_set
iree::schemas::cuda_executable_def_c_fbs
PUBLIC
)
diff --git a/iree/hal/cuda/cuda_device.c b/iree/hal/cuda/cuda_device.c
index eab84ee..677fb5b 100644
--- a/iree/hal/cuda/cuda_device.c
+++ b/iree/hal/cuda/cuda_device.c
@@ -202,7 +202,7 @@
static iree_status_t iree_hal_cuda_device_trim(iree_hal_device_t* base_device) {
iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
- // TODO(benvanik): trim of CUDA resources, whenever we care.
+ iree_arena_block_pool_trim(&device->block_pool);
return iree_hal_allocator_trim(device->device_allocator);
}
@@ -237,7 +237,7 @@
case IREE_HAL_CUDA_COMMAND_BUFFER_MODE_GRAPH:
return iree_hal_cuda_graph_command_buffer_create(
base_device, &device->context_wrapper, mode, command_categories,
- queue_affinity, out_command_buffer);
+ queue_affinity, &device->block_pool, out_command_buffer);
case IREE_HAL_CUDA_COMMAND_BUFFER_MODE_STREAM:
return iree_hal_deferred_command_buffer_create(
base_device, mode, command_categories, &device->block_pool,
@@ -324,8 +324,8 @@
}
}
}
- // TODO(thomasraoux): Conservatively syncronize after every submit until we
- // support semaphores.
+ // TODO(thomasraoux): implement semaphores - for now this conservatively
+ // synchronizes after every submit.
CUDA_RETURN_IF_ERROR(device->context_wrapper.syms,
cuStreamSynchronize(device->stream),
"cuStreamSynchronize");
diff --git a/iree/hal/cuda/graph_command_buffer.c b/iree/hal/cuda/graph_command_buffer.c
index d55ac91..592cbd9 100644
--- a/iree/hal/cuda/graph_command_buffer.c
+++ b/iree/hal/cuda/graph_command_buffer.c
@@ -17,6 +17,7 @@
#include "iree/hal/cuda/executable_layout.h"
#include "iree/hal/cuda/native_executable.h"
#include "iree/hal/cuda/status_util.h"
+#include "iree/hal/utils/resource_set.h"
#define IREE_HAL_CUDA_MAX_BINDING_COUNT 64
// Kernel arguments contains binding and push constants.
@@ -28,6 +29,11 @@
typedef struct iree_hal_cuda_graph_command_buffer_t {
iree_hal_command_buffer_t base;
iree_hal_cuda_context_wrapper_t* context;
+ iree_arena_block_pool_t* block_pool;
+
+ // Maintains a reference to all resources used within the command buffer.
+ // Reset on each begin.
+ iree_hal_resource_set_t* resource_set;
CUgraph graph;
CUgraphExec exec;
@@ -54,8 +60,10 @@
iree_hal_command_buffer_mode_t mode,
iree_hal_command_category_t command_categories,
iree_hal_queue_affinity_t queue_affinity,
+ iree_arena_block_pool_t* block_pool,
iree_hal_command_buffer_t** out_command_buffer) {
IREE_ASSERT_ARGUMENT(context);
+ IREE_ASSERT_ARGUMENT(block_pool);
IREE_ASSERT_ARGUMENT(out_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
@@ -73,6 +81,7 @@
device, mode, command_categories, queue_affinity,
&iree_hal_cuda_graph_command_buffer_vtable, &command_buffer->base);
command_buffer->context = context;
+ command_buffer->block_pool = block_pool;
command_buffer->graph = graph;
command_buffer->exec = NULL;
command_buffer->last_node = NULL;
@@ -84,6 +93,11 @@
command_buffer->current_descriptor[i] = &device_ptrs[i];
}
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
+
+ if (iree_status_is_ok(status)) {
*out_command_buffer = &command_buffer->base;
} else {
context->syms->cuGraphDestroy(graph);
@@ -107,6 +121,7 @@
CUDA_IGNORE_ERROR(command_buffer->context->syms,
cuGraphExecDestroy(command_buffer->exec));
}
+ iree_hal_resource_set_free(command_buffer->resource_set);
iree_allocator_free(command_buffer->context->host_allocator, command_buffer);
IREE_TRACE_ZONE_END(z0);
@@ -136,7 +151,15 @@
static iree_status_t iree_hal_cuda_graph_command_buffer_begin(
iree_hal_command_buffer_t* base_command_buffer) {
- // Nothing to do.
+ iree_hal_cuda_graph_command_buffer_t* command_buffer =
+ iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+ // TODO(thomasraoux): reset existing state, if present. Right now this leaks.
+ if (command_buffer->graph) {
+ return iree_make_status(
+ IREE_STATUS_UNIMPLEMENTED,
+ "rerecording of command buffers not yet supported with CUDA graphs");
+ }
+ iree_hal_resource_set_reset(command_buffer->resource_set);
return iree_ok_status();
}
@@ -257,6 +280,9 @@
iree_hal_cuda_graph_command_buffer_t* command_buffer =
iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &target_buffer));
+
CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
iree_hal_buffer_allocated_buffer(target_buffer));
target_offset += iree_hal_buffer_byte_offset(target_buffer);
@@ -297,6 +323,10 @@
iree_hal_cuda_graph_command_buffer_t* command_buffer =
iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+ const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
+
CUdeviceptr target_device_buffer = iree_hal_cuda_buffer_device_pointer(
iree_hal_buffer_allocated_buffer(target_buffer));
target_offset += iree_hal_buffer_byte_offset(target_buffer);
@@ -376,16 +406,19 @@
}
qsort(binding_used, binding_count, sizeof(iree_hal_cuda_binding_mapping_t),
compare_binding_index);
- assert(binding_count < IREE_HAL_CUDA_MAX_BINDING_COUNT &&
- "binding count larger than the max expected.");
+ IREE_ASSERT_LT(binding_count, IREE_HAL_CUDA_MAX_BINDING_COUNT,
+ "binding count larger than the max expected");
for (iree_host_size_t i = 0; i < binding_count; i++) {
- iree_hal_descriptor_set_binding_t binding = bindings[binding_used[i].index];
+ const iree_hal_descriptor_set_binding_t* binding =
+ &bindings[binding_used[i].index];
CUdeviceptr device_ptr =
iree_hal_cuda_buffer_device_pointer(
- iree_hal_buffer_allocated_buffer(binding.buffer)) +
- iree_hal_buffer_byte_offset(binding.buffer) + binding.offset;
+ iree_hal_buffer_allocated_buffer(binding->buffer)) +
+ iree_hal_buffer_byte_offset(binding->buffer) + binding->offset;
*((CUdeviceptr*)command_buffer->current_descriptor[i + base_binding]) =
device_ptr;
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &binding->buffer));
}
return iree_ok_status();
}
@@ -406,6 +439,8 @@
uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
iree_hal_cuda_graph_command_buffer_t* command_buffer =
iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &executable));
iree_hal_executable_layout_t* layout =
iree_hal_cuda_executable_get_layout(executable, entry_point);
iree_host_size_t num_constants =
diff --git a/iree/hal/cuda/graph_command_buffer.h b/iree/hal/cuda/graph_command_buffer.h
index 7ad5251..8ef4fda 100644
--- a/iree/hal/cuda/graph_command_buffer.h
+++ b/iree/hal/cuda/graph_command_buffer.h
@@ -17,12 +17,18 @@
extern "C" {
#endif // __cplusplus
-// Creates a cuda graph.
+typedef struct iree_arena_block_pool_t iree_arena_block_pool_t;
+
+// Creates a command buffer that records into a CUDA graph.
+//
+// NOTE: the |block_pool| must remain live for the lifetime of the command
+// buffers that use it.
iree_status_t iree_hal_cuda_graph_command_buffer_create(
iree_hal_device_t* device, iree_hal_cuda_context_wrapper_t* context,
iree_hal_command_buffer_mode_t mode,
iree_hal_command_category_t command_categories,
iree_hal_queue_affinity_t queue_affinity,
+ iree_arena_block_pool_t* block_pool,
iree_hal_command_buffer_t** out_command_buffer);
// Returns true if |command_buffer| is a CUDA graph-based command buffer.
diff --git a/iree/hal/local/BUILD b/iree/hal/local/BUILD
index 3fc61bd..7b06009 100644
--- a/iree/hal/local/BUILD
+++ b/iree/hal/local/BUILD
@@ -170,6 +170,7 @@
"//iree/base/internal:wait_handle",
"//iree/hal",
"//iree/hal/utils:buffer_transfer",
+ "//iree/hal/utils:resource_set",
"//iree/task",
],
)
diff --git a/iree/hal/local/CMakeLists.txt b/iree/hal/local/CMakeLists.txt
index 60c9588..42fb8d2 100644
--- a/iree/hal/local/CMakeLists.txt
+++ b/iree/hal/local/CMakeLists.txt
@@ -157,6 +157,7 @@
iree::base::tracing
iree::hal
iree::hal::utils::buffer_transfer
+ iree::hal::utils::resource_set
iree::task
PUBLIC
)
diff --git a/iree/hal/local/task_command_buffer.c b/iree/hal/local/task_command_buffer.c
index d8745b9..ed5cb96 100644
--- a/iree/hal/local/task_command_buffer.c
+++ b/iree/hal/local/task_command_buffer.c
@@ -17,6 +17,7 @@
#include "iree/hal/local/local_descriptor_set_layout.h"
#include "iree/hal/local/local_executable.h"
#include "iree/hal/local/local_executable_layout.h"
+#include "iree/hal/utils/resource_set.h"
#include "iree/task/affinity_set.h"
#include "iree/task/list.h"
#include "iree/task/submission.h"
@@ -43,6 +44,10 @@
// Arena used for all allocations; references the shared device block pool.
iree_arena_allocator_t arena;
+ // Maintains a reference to all resources used within the command buffer.
+ // Reset on each begin.
+ iree_hal_resource_set_t* resource_set;
+
// One or more tasks at the root of the command buffer task DAG.
// These tasks are all able to execute concurrently and will be the initial
// ready task set in the submission.
@@ -139,7 +144,13 @@
iree_task_list_initialize(&command_buffer->root_tasks);
iree_task_list_initialize(&command_buffer->leaf_tasks);
memset(&command_buffer->state, 0, sizeof(command_buffer->state));
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
+ if (iree_status_is_ok(status)) {
*out_command_buffer = &command_buffer->base;
+ } else {
+ iree_hal_command_buffer_release(&command_buffer->base);
}
IREE_TRACE_ZONE_END(z0);
@@ -151,6 +162,7 @@
memset(&command_buffer->state, 0, sizeof(command_buffer->state));
iree_task_list_discard(&command_buffer->leaf_tasks);
iree_task_list_discard(&command_buffer->root_tasks);
+ iree_hal_resource_set_reset(command_buffer->resource_set);
iree_arena_reset(&command_buffer->arena);
}
@@ -163,6 +175,7 @@
iree_hal_task_command_buffer_reset(command_buffer);
iree_arena_deinitialize(&command_buffer->arena);
+ iree_hal_resource_set_free(command_buffer->resource_set);
iree_allocator_free(host_allocator, command_buffer);
IREE_TRACE_ZONE_END(z0);
@@ -518,6 +531,9 @@
iree_hal_task_command_buffer_t* command_buffer =
iree_hal_task_command_buffer_cast(base_command_buffer);
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &target_buffer));
+
iree_hal_cmd_fill_buffer_t* cmd = NULL;
IREE_RETURN_IF_ERROR(
iree_arena_allocate(&command_buffer->arena, sizeof(*cmd), (void**)&cmd));
@@ -577,6 +593,9 @@
iree_hal_task_command_buffer_t* command_buffer =
iree_hal_task_command_buffer_cast(base_command_buffer);
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &target_buffer));
+
iree_host_size_t total_cmd_size =
sizeof(iree_hal_cmd_update_buffer_t) + length;
@@ -651,6 +670,10 @@
iree_hal_task_command_buffer_t* command_buffer =
iree_hal_task_command_buffer_cast(base_command_buffer);
+ const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
+
iree_hal_cmd_copy_buffer_t* cmd = NULL;
IREE_RETURN_IF_ERROR(
iree_arena_allocate(&command_buffer->arena, sizeof(*cmd), (void**)&cmd));
@@ -732,6 +755,10 @@
}
iree_host_size_t binding_ordinal = binding_base + bindings[i].binding;
+ // TODO(benvanik): batch insert by getting the resources in their own list.
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &bindings[i].buffer));
+
// TODO(benvanik): track mapping so we can properly map/unmap/flush/etc.
iree_hal_buffer_mapping_t buffer_mapping = {{0}};
IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
@@ -922,6 +949,10 @@
iree_hal_command_buffer_t* base_command_buffer,
iree_hal_executable_t* executable, int32_t entry_point,
uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+ iree_hal_task_command_buffer_t* command_buffer =
+ iree_hal_task_command_buffer_cast(base_command_buffer);
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &executable));
iree_hal_cmd_dispatch_t* cmd = NULL;
return iree_hal_task_command_buffer_build_dispatch(
base_command_buffer, executable, entry_point, workgroup_x, workgroup_y,
@@ -933,6 +964,13 @@
iree_hal_executable_t* executable, int32_t entry_point,
iree_hal_buffer_t* workgroups_buffer,
iree_device_size_t workgroups_offset) {
+ iree_hal_task_command_buffer_t* command_buffer =
+ iree_hal_task_command_buffer_cast(base_command_buffer);
+
+ const void* resources[2] = {executable, workgroups_buffer};
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 2, resources));
+
// TODO(benvanik): track mapping so we can properly map/unmap/flush/etc.
iree_hal_buffer_mapping_t buffer_mapping = {{0}};
IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
diff --git a/iree/hal/utils/BUILD b/iree/hal/utils/BUILD
index 31b157c..42e00dc 100644
--- a/iree/hal/utils/BUILD
+++ b/iree/hal/utils/BUILD
@@ -28,6 +28,7 @@
hdrs = ["deferred_command_buffer.h"],
visibility = ["//visibility:public"],
deps = [
+ ":resource_set",
"//iree/base",
"//iree/base:tracing",
"//iree/base/internal:arena",
diff --git a/iree/hal/utils/CMakeLists.txt b/iree/hal/utils/CMakeLists.txt
index 99e890f..cd18b36 100644
--- a/iree/hal/utils/CMakeLists.txt
+++ b/iree/hal/utils/CMakeLists.txt
@@ -32,6 +32,7 @@
SRCS
"deferred_command_buffer.c"
DEPS
+ ::resource_set
iree::base
iree::base::internal::arena
iree::base::tracing
diff --git a/iree/hal/utils/deferred_command_buffer.c b/iree/hal/utils/deferred_command_buffer.c
index 8f2a05a..347a222 100644
--- a/iree/hal/utils/deferred_command_buffer.c
+++ b/iree/hal/utils/deferred_command_buffer.c
@@ -8,6 +8,7 @@
#include "iree/base/internal/arena.h"
#include "iree/base/tracing.h"
+#include "iree/hal/utils/resource_set.h"
//===----------------------------------------------------------------------===//
// Command recording structures
@@ -134,6 +135,12 @@
typedef struct iree_hal_deferred_command_buffer_t {
iree_hal_command_buffer_t base;
iree_allocator_t host_allocator;
+
+ // Maintains a reference to all resources used within the command buffer.
+ // Reset on each begin.
+ iree_hal_resource_set_t* resource_set;
+
+ // All commands in encoding order.
iree_hal_cmd_list_t cmd_list;
} iree_hal_deferred_command_buffer_t;
@@ -165,9 +172,16 @@
&iree_hal_deferred_command_buffer_vtable, &command_buffer->base);
command_buffer->host_allocator = host_allocator;
iree_hal_cmd_list_initialize(block_pool, &command_buffer->cmd_list);
+
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
}
- *out_command_buffer = &command_buffer->base;
+ if (iree_status_is_ok(status)) {
+ *out_command_buffer = &command_buffer->base;
+ } else {
+ iree_hal_command_buffer_destroy(&command_buffer->base);
+ }
IREE_TRACE_ZONE_END(z0);
return status;
}
@@ -180,6 +194,7 @@
IREE_TRACE_ZONE_BEGIN(z0);
iree_hal_cmd_list_deinitialize(&command_buffer->cmd_list);
+ iree_hal_resource_set_free(command_buffer->resource_set);
iree_allocator_free(host_allocator, command_buffer);
IREE_TRACE_ZONE_END(z0);
@@ -199,6 +214,7 @@
iree_hal_deferred_command_buffer_t* command_buffer =
iree_hal_deferred_command_buffer_cast(base_command_buffer);
iree_hal_cmd_list_reset(&command_buffer->cmd_list);
+ iree_hal_resource_set_reset(command_buffer->resource_set);
return iree_ok_status();
}
@@ -280,8 +296,11 @@
static iree_status_t iree_hal_deferred_command_buffer_signal_event(
iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
iree_hal_execution_stage_t source_stage_mask) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 1, &event));
iree_hal_cmd_signal_event_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_SIGNAL_EVENT, sizeof(*cmd), (void**)&cmd));
@@ -310,8 +329,11 @@
static iree_status_t iree_hal_deferred_command_buffer_reset_event(
iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
iree_hal_execution_stage_t source_stage_mask) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 1, &event));
iree_hal_cmd_reset_event_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_RESET_EVENT, sizeof(*cmd), (void**)&cmd));
@@ -352,8 +374,11 @@
const iree_hal_memory_barrier_t* memory_barriers,
iree_host_size_t buffer_barrier_count,
const iree_hal_buffer_barrier_t* buffer_barriers) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, event_count, events));
iree_hal_cmd_wait_events_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_WAIT_EVENTS,
@@ -402,8 +427,11 @@
static iree_status_t iree_hal_deferred_command_buffer_discard_buffer(
iree_hal_command_buffer_t* base_command_buffer, iree_hal_buffer_t* buffer) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 1, &buffer));
iree_hal_cmd_discard_buffer_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_DISCARD_BUFFER, sizeof(*cmd), (void**)&cmd));
@@ -436,13 +464,16 @@
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, const void* pattern,
iree_host_size_t pattern_length) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
iree_hal_cmd_fill_buffer_t* cmd = NULL;
if (pattern_length > sizeof(cmd->pattern)) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"fill patterns must be < 8 bytes");
}
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &target_buffer));
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_FILL_BUFFER, sizeof(*cmd), (void**)&cmd));
cmd->target_buffer = target_buffer;
@@ -477,8 +508,11 @@
iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
iree_host_size_t source_offset, iree_hal_buffer_t* target_buffer,
iree_device_size_t target_offset, iree_device_size_t length) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &target_buffer));
iree_hal_cmd_update_buffer_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_UPDATE_BUFFER,
@@ -517,8 +551,12 @@
iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ const void* buffers[2] = {source_buffer, target_buffer};
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
iree_hal_cmd_copy_buffer_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_COPY_BUFFER, sizeof(*cmd), (void**)&cmd));
@@ -554,8 +592,11 @@
iree_hal_command_buffer_t* base_command_buffer,
iree_hal_executable_layout_t* executable_layout, iree_host_size_t offset,
const void* values, iree_host_size_t values_length) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &executable_layout));
iree_hal_cmd_push_constants_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_PUSH_CONSTANTS,
@@ -592,8 +633,15 @@
iree_hal_executable_layout_t* executable_layout, uint32_t set,
iree_host_size_t binding_count,
const iree_hal_descriptor_set_binding_t* bindings) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &executable_layout));
+ for (iree_host_size_t i = 0; i < binding_count; ++i) {
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &bindings[i].buffer));
+ }
iree_hal_cmd_push_descriptor_set_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_PUSH_DESCRIPTOR_SET,
@@ -632,8 +680,12 @@
iree_hal_descriptor_set_t* descriptor_set,
iree_host_size_t dynamic_offset_count,
const iree_device_size_t* dynamic_offsets) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ const void* resources[2] = {executable_layout, descriptor_set};
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 2, resources));
iree_hal_cmd_bind_descriptor_set_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_BIND_DESCRIPTOR_SET,
@@ -673,8 +725,11 @@
iree_hal_command_buffer_t* base_command_buffer,
iree_hal_executable_t* executable, int32_t entry_point,
uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &executable));
iree_hal_cmd_dispatch_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_DISPATCH, sizeof(*cmd), (void**)&cmd));
@@ -711,8 +766,12 @@
iree_hal_executable_t* executable, int32_t entry_point,
iree_hal_buffer_t* workgroups_buffer,
iree_device_size_t workgroups_offset) {
- iree_hal_cmd_list_t* cmd_list =
- &iree_hal_deferred_command_buffer_cast(base_command_buffer)->cmd_list;
+ iree_hal_deferred_command_buffer_t* command_buffer =
+ iree_hal_deferred_command_buffer_cast(base_command_buffer);
+ iree_hal_cmd_list_t* cmd_list = &command_buffer->cmd_list;
+ const void* resources[2] = {executable, workgroups_buffer};
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 2, resources));
iree_hal_cmd_dispatch_indirect_t* cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_cmd_list_append_command(
cmd_list, IREE_HAL_CMD_DISPATCH_INDIRECT, sizeof(*cmd), (void**)&cmd));
diff --git a/iree/hal/utils/resource_set.c b/iree/hal/utils/resource_set.c
index f42d77c..14e5871 100644
--- a/iree/hal/utils/resource_set.c
+++ b/iree/hal/utils/resource_set.c
@@ -8,6 +8,24 @@
#include "iree/base/tracing.h"
+// Inlines the first chunk into the block using all of the remaining space.
+// This is a special case chunk that is released back to the pool with the
+// resource set and lets us avoid an additional allocation.
+static void iree_hal_resource_set_setup_inline_chunk(
+ iree_hal_resource_set_t* set) {
+ uint8_t* block_ptr = (uint8_t*)set + sizeof(*set);
+ iree_hal_resource_set_chunk_t* inlined_chunk =
+ (iree_hal_resource_set_chunk_t*)block_ptr;
+ inlined_chunk->flags = IREE_HAL_RESOURCE_SET_CHUNK_FLAG_INLINE;
+ inlined_chunk->capacity = (set->block_pool->total_block_size - sizeof(*set) -
+ sizeof(*inlined_chunk)) /
+ sizeof(iree_hal_resource_t*);
+ inlined_chunk->capacity = iree_min(inlined_chunk->capacity,
+ IREE_HAL_RESOURCE_SET_CHUNK_MAX_CAPACITY);
+ inlined_chunk->count = 0;
+ set->chunk_head = inlined_chunk;
+}
+
IREE_API_EXPORT iree_status_t iree_hal_resource_set_allocate(
iree_arena_block_pool_t* block_pool, iree_hal_resource_set_t** out_set) {
IREE_TRACE_ZONE_BEGIN(z0);
@@ -25,30 +43,15 @@
iree_hal_resource_set_t* set = (iree_hal_resource_set_t*)block_ptr;
memset(set, 0, sizeof(*set));
set->block_pool = block_pool;
- block_ptr += sizeof(*set);
-
- // Inline the first chunk into the block using all of the remaining space.
- // This is a special case chunk that is released back to the pool with the
- // resource set and lets us avoid an additional allocation.
- iree_hal_resource_set_chunk_t* inlined_chunk =
- (iree_hal_resource_set_chunk_t*)block_ptr;
- inlined_chunk->flags = IREE_HAL_RESOURCE_SET_CHUNK_FLAG_INLINE;
- inlined_chunk->capacity =
- (block_pool->total_block_size - sizeof(*set) - sizeof(*inlined_chunk)) /
- sizeof(iree_hal_resource_t*);
- inlined_chunk->capacity = iree_min(inlined_chunk->capacity,
- IREE_HAL_RESOURCE_SET_CHUNK_MAX_CAPACITY);
- inlined_chunk->count = 0;
- set->chunk_head = inlined_chunk;
+ iree_hal_resource_set_setup_inline_chunk(set);
*out_set = set;
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
-IREE_API_EXPORT void iree_hal_resource_set_free(iree_hal_resource_set_t* set) {
- IREE_TRACE_ZONE_BEGIN(z0);
-
+static void iree_hal_resource_set_release_blocks(iree_hal_resource_set_t* set,
+ bool preserve_set) {
// Release all resources in all chunks and stitch together the blocks in a
// linked list. We do this first so that we can release all of the chunks back
// to the block pool in one operation. Ideally we'd maintain the linked list
@@ -67,10 +70,16 @@
iree_arena_block_t* block = NULL;
if (iree_hal_resource_set_chunk_is_stored_inline(chunk)) {
// This is the inlined first chunk that also stores the set header.
- // We use the easily-available set pointer as the base.
- block = (iree_arena_block_t*)((uint8_t*)set +
- set->block_pool->usable_block_size);
- next_chunk = NULL;
+ // If we are not freeing the set then we don't release the block back to
+ // the pool.
+ if (preserve_set) {
+ // Don't release the block.
+ break;
+ } else {
+ block = (iree_arena_block_t*)((uint8_t*)set +
+ set->block_pool->usable_block_size);
+ next_chunk = NULL;
+ }
} else {
// A chunk acquired after the set was acquired.
block = (iree_arena_block_t*)((uint8_t*)chunk +
@@ -86,6 +95,27 @@
// NOTE: this invalidates the |set| memory.
iree_arena_block_pool_t* block_pool = set->block_pool;
iree_arena_block_pool_release(block_pool, block_head, block_tail);
+}
+
+IREE_API_EXPORT void iree_hal_resource_set_free(iree_hal_resource_set_t* set) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Release all resources and the arena block used by the set.
+ // The set pointer is invalid after this call returns.
+ iree_hal_resource_set_release_blocks(set, /*preserve_set=*/false);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+IREE_API_EXPORT void iree_hal_resource_set_reset(iree_hal_resource_set_t* set) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Release all resources and the blocks besides the base set.
+ iree_hal_resource_set_release_blocks(set, /*preserve_set=*/true);
+
+ // Reset the set state.
+ memset(set->mru, 0, sizeof(set->mru));
+ iree_hal_resource_set_setup_inline_chunk(set);
IREE_TRACE_ZONE_END(z0);
}
@@ -228,17 +258,19 @@
return iree_ok_status();
}
-IREE_API_EXPORT iree_status_t iree_hal_resource_set_insert(
- iree_hal_resource_set_t* set, iree_host_size_t count,
- iree_hal_resource_t* const* resources) {
+IREE_API_EXPORT iree_status_t
+iree_hal_resource_set_insert(iree_hal_resource_set_t* set,
+ iree_host_size_t count, const void* resources) {
// For now we process one at a time. We should have a stride that lets us
// amortize the cost of doing the MRU update and insertion allocation by
// say slicing off 4/8/16/32 resources at a time etc. Today each miss that
// requires a full insertion goes down the whole path of checking chunk
// capacity and such.
+ iree_hal_resource_t* const* typed_resources =
+ (iree_hal_resource_t* const*)resources;
for (iree_host_size_t i = 0; i < count; ++i) {
- iree_hal_resource_t* resource = resources[i];
- IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert_1(set, resource));
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert_1(set, typed_resources[i]));
}
return iree_ok_status();
}
diff --git a/iree/hal/utils/resource_set.h b/iree/hal/utils/resource_set.h
index 0bf04fc..6f63ced 100644
--- a/iree/hal/utils/resource_set.h
+++ b/iree/hal/utils/resource_set.h
@@ -123,11 +123,14 @@
// from.
IREE_API_EXPORT void iree_hal_resource_set_free(iree_hal_resource_set_t* set);
+// Resets the set to its initial empty state by releasing all owned resources.
+IREE_API_EXPORT void iree_hal_resource_set_reset(iree_hal_resource_set_t* set);
+
// Inserts zero or more resources into the set.
// Each resource will be retained for at least the lifetime of the set.
-IREE_API_EXPORT iree_status_t iree_hal_resource_set_insert(
- iree_hal_resource_set_t* set, iree_host_size_t count,
- iree_hal_resource_t* const* resources);
+IREE_API_EXPORT iree_status_t
+iree_hal_resource_set_insert(iree_hal_resource_set_t* set,
+ iree_host_size_t count, const void* resources);
#ifdef __cplusplus
} // extern "C"
diff --git a/iree/hal/vulkan/BUILD b/iree/hal/vulkan/BUILD
index 2184d1a..40b56c1 100644
--- a/iree/hal/vulkan/BUILD
+++ b/iree/hal/vulkan/BUILD
@@ -91,10 +91,12 @@
"//iree/base:logging",
"//iree/base:tracing",
"//iree/base/internal",
+ "//iree/base/internal:arena",
"//iree/base/internal:synchronization",
"//iree/base/internal/flatcc:parsing",
"//iree/hal",
"//iree/hal/utils:buffer_transfer",
+ "//iree/hal/utils:resource_set",
"//iree/hal/vulkan/builtin",
"//iree/hal/vulkan/util:arena",
"//iree/hal/vulkan/util:intrusive_list",
diff --git a/iree/hal/vulkan/CMakeLists.txt b/iree/hal/vulkan/CMakeLists.txt
index 9d27e03..37fe297 100644
--- a/iree/hal/vulkan/CMakeLists.txt
+++ b/iree/hal/vulkan/CMakeLists.txt
@@ -79,12 +79,14 @@
iree::base::cc
iree::base::core_headers
iree::base::internal
+ iree::base::internal::arena
iree::base::internal::flatcc::parsing
iree::base::internal::synchronization
iree::base::logging
iree::base::tracing
iree::hal
iree::hal::utils::buffer_transfer
+ iree::hal::utils::resource_set
iree::hal::vulkan::builtin
iree::hal::vulkan::util::arena
iree::hal::vulkan::util::intrusive_list
diff --git a/iree/hal/vulkan/direct_command_buffer.cc b/iree/hal/vulkan/direct_command_buffer.cc
index 9e6d799..015250c 100644
--- a/iree/hal/vulkan/direct_command_buffer.cc
+++ b/iree/hal/vulkan/direct_command_buffer.cc
@@ -15,6 +15,7 @@
#include "iree/base/logging.h"
#include "iree/base/status_cc.h"
#include "iree/base/tracing.h"
+#include "iree/hal/utils/resource_set.h"
#include "iree/hal/vulkan/descriptor_set_arena.h"
#include "iree/hal/vulkan/dynamic_symbols.h"
#include "iree/hal/vulkan/native_descriptor_set.h"
@@ -34,12 +35,17 @@
iree_hal_command_buffer_t base;
VkDeviceHandle* logical_device;
iree_hal_vulkan_tracing_context_t* tracing_context;
+ iree_arena_block_pool_t* block_pool;
VkCommandPoolHandle* command_pool;
VkCommandBuffer handle;
DynamicSymbols* syms;
+ // Maintains a reference to all resources used within the command buffer.
+ // Reset on each begin.
+ iree_hal_resource_set_t* resource_set;
+
// TODO(benvanik): may grow large - should try to reclaim or reuse.
DescriptorSetArena descriptor_set_arena;
@@ -81,10 +87,12 @@
iree_hal_vulkan_tracing_context_t* tracing_context,
iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
iree::hal::vulkan::BuiltinExecutables* builtin_executables,
+ iree_arena_block_pool_t* block_pool,
iree_hal_command_buffer_t** out_command_buffer) {
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(command_pool);
IREE_ASSERT_ARGUMENT(descriptor_pool_cache);
+ IREE_ASSERT_ARGUMENT(block_pool);
IREE_ASSERT_ARGUMENT(out_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
@@ -109,6 +117,7 @@
&iree_hal_vulkan_direct_command_buffer_vtable, &command_buffer->base);
command_buffer->logical_device = logical_device;
command_buffer->tracing_context = tracing_context;
+ command_buffer->block_pool = block_pool;
command_buffer->command_pool = command_pool;
command_buffer->handle = handle;
command_buffer->syms = logical_device->syms().get();
@@ -118,7 +127,11 @@
new (&command_buffer->descriptor_set_group) DescriptorSetGroup();
command_buffer->builtin_executables = builtin_executables;
+ status = iree_hal_resource_set_allocate(block_pool,
+ &command_buffer->resource_set);
+ }
+ if (iree_status_is_ok(status)) {
*out_command_buffer = &command_buffer->base;
} else {
command_pool->Free(handle);
@@ -133,6 +146,7 @@
// NOTE: we require that command buffers not be recorded while they are
// in-flight so this is safe.
IREE_IGNORE_ERROR(command_buffer->descriptor_set_group.Reset());
+ iree_hal_resource_set_reset(command_buffer->resource_set);
}
bool iree_hal_vulkan_direct_command_buffer_isa(
@@ -164,6 +178,7 @@
command_buffer->descriptor_set_group.~DescriptorSetGroup();
command_buffer->descriptor_set_arena.~DescriptorSetArena();
+ iree_hal_resource_set_free(command_buffer->resource_set);
iree_allocator_free(host_allocator, command_buffer);
IREE_TRACE_ZONE_END(z0);
@@ -399,6 +414,9 @@
iree_hal_vulkan_direct_command_buffer_t* command_buffer =
iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 1, &event));
+
command_buffer->syms->vkCmdSetEvent(
command_buffer->handle, iree_hal_vulkan_native_event_handle(event),
iree_hal_vulkan_convert_pipeline_stage_flags(source_stage_mask));
@@ -412,6 +430,9 @@
iree_hal_vulkan_direct_command_buffer_t* command_buffer =
iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 1, &event));
+
command_buffer->syms->vkCmdResetEvent(
command_buffer->handle, iree_hal_vulkan_native_event_handle(event),
iree_hal_vulkan_convert_pipeline_stage_flags(source_stage_mask));
@@ -433,6 +454,9 @@
iree_allocator_t host_allocator =
command_buffer->logical_device->host_allocator();
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, event_count, events));
+
iree_inline_array(VkEvent, event_handles, event_count, host_allocator);
for (int i = 0; i < event_count; ++i) {
*iree_inline_array_at(event_handles, i) =
@@ -526,6 +550,9 @@
VkBuffer target_device_buffer = iree_hal_vulkan_vma_buffer_handle(
iree_hal_buffer_allocated_buffer(target_buffer));
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &target_buffer));
+
// vkCmdFillBuffer requires a 4 byte alignment for the offset, pattern, and
// length. We use a polyfill here that fills the unaligned start and end of
// fill operations, if needed.
@@ -580,6 +607,9 @@
VkBuffer target_device_buffer = iree_hal_vulkan_vma_buffer_handle(
iree_hal_buffer_allocated_buffer(target_buffer));
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &target_buffer));
+
// Vulkan only allows updates of <= 65536 because you really, really, really
// shouldn't do large updates like this (as it wastes command buffer space and
// may be slower than just using write-through mapped memory). The
@@ -613,6 +643,10 @@
VkBuffer target_device_buffer = iree_hal_vulkan_vma_buffer_handle(
iree_hal_buffer_allocated_buffer(target_buffer));
+ const iree_hal_buffer_t* buffers[2] = {source_buffer, target_buffer};
+ IREE_RETURN_IF_ERROR(
+ iree_hal_resource_set_insert(command_buffer->resource_set, 2, buffers));
+
VkBufferCopy region;
region.srcOffset = iree_hal_buffer_byte_offset(source_buffer) + source_offset;
region.dstOffset = iree_hal_buffer_byte_offset(target_buffer) + target_offset;
@@ -655,6 +689,12 @@
iree_hal_vulkan_direct_command_buffer_t* command_buffer =
iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+ // TODO(benvanik): batch insert by getting the resources in their own list.
+ for (iree_host_size_t i = 0; i < binding_count; ++i) {
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &bindings[i].buffer));
+ }
+
// Either allocate, update, and bind a descriptor set or use push descriptor
// sets to use the command buffer pool when supported.
return command_buffer->descriptor_set_arena.BindDescriptorSet(
@@ -672,6 +712,9 @@
iree_allocator_t host_allocator =
command_buffer->logical_device->host_allocator();
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &descriptor_set));
+
// Vulkan takes uint32_t as the size here, unlike everywhere else.
iree_inline_array(uint32_t, dynamic_offsets_i32, dynamic_offset_count,
host_allocator);
@@ -713,6 +756,9 @@
source_location.func_name.data, source_location.func_name.size);
});
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, 1, &executable));
+
// Get the compiled and linked pipeline for the specified entry point and
// bind it to the command buffer.
VkPipeline pipeline_handle = VK_NULL_HANDLE;
@@ -739,6 +785,10 @@
iree_hal_vulkan_direct_command_buffer_t* command_buffer =
iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+ const void* resources[2] = {executable, workgroups_buffer};
+ IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert(
+ command_buffer->resource_set, IREE_ARRAYSIZE(resources), resources));
+
iree_hal_vulkan_source_location_t source_location;
iree_hal_vulkan_native_executable_entry_point_source_location(
executable, entry_point, &source_location);
diff --git a/iree/hal/vulkan/direct_command_buffer.h b/iree/hal/vulkan/direct_command_buffer.h
index 071d369..57c15ad 100644
--- a/iree/hal/vulkan/direct_command_buffer.h
+++ b/iree/hal/vulkan/direct_command_buffer.h
@@ -18,7 +18,12 @@
extern "C" {
#endif // __cplusplus
+typedef struct iree_arena_block_pool_t iree_arena_block_pool_t;
+
// Creates a command buffer that directly records into a VkCommandBuffer.
+//
+// NOTE: the |block_pool| must remain live for the lifetime of the command
+// buffers that use it.
iree_status_t iree_hal_vulkan_direct_command_buffer_allocate(
iree_hal_device_t* device,
iree::hal::vulkan::VkDeviceHandle* logical_device,
@@ -29,6 +34,7 @@
iree_hal_vulkan_tracing_context_t* tracing_context,
iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
iree::hal::vulkan::BuiltinExecutables* builtin_executables,
+ iree_arena_block_pool_t* block_pool,
iree_hal_command_buffer_t** out_command_buffer);
// Returns the native Vulkan VkCommandBuffer handle.
diff --git a/iree/hal/vulkan/vulkan_device.cc b/iree/hal/vulkan/vulkan_device.cc
index 73e4a82..966e4e3 100644
--- a/iree/hal/vulkan/vulkan_device.cc
+++ b/iree/hal/vulkan/vulkan_device.cc
@@ -11,6 +11,7 @@
#include <cstring>
#include <vector>
+#include "iree/base/internal/arena.h"
#include "iree/base/internal/math.h"
#include "iree/base/tracing.h"
#include "iree/hal/utils/buffer_transfer.h"
@@ -362,6 +363,10 @@
VkCommandPoolHandle* dispatch_command_pool;
VkCommandPoolHandle* transfer_command_pool;
+ // Block pool used for command buffers with a larger block size (as command
+ // buffers can contain inlined data uploads).
+ iree_arena_block_pool_t block_pool;
+
// Used only for emulated timeline semaphores.
TimePointSemaphorePool* semaphore_pool;
TimePointFencePool* fence_pool;
@@ -563,6 +568,9 @@
device->logical_device = logical_device;
device->logical_device->AddReference();
+ iree_arena_block_pool_initialize(32 * 1024, host_allocator,
+ &device->block_pool);
+
// Point the queue storage into the new device allocation. The queues
// themselves are populated
device->queues = (CommandQueue**)buffer_ptr;
@@ -667,6 +675,9 @@
// There should be no more buffers live that use the allocator.
iree_hal_allocator_release(device->device_allocator);
+ // All arena blocks should have been returned.
+ iree_arena_block_pool_deinitialize(&device->block_pool);
+
// Finally, destroy the device.
device->logical_device->ReleaseReference();
iree_hal_driver_release(device->driver);
@@ -919,7 +930,7 @@
static iree_status_t iree_hal_vulkan_device_trim(
iree_hal_device_t* base_device) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
- // TODO(benvanik): trim of vulkan resources, whenever we care.
+ iree_arena_block_pool_trim(&device->block_pool);
return iree_hal_allocator_trim(device->device_allocator);
}
@@ -1007,7 +1018,7 @@
base_device, device->logical_device, command_pool, mode,
command_categories, queue_affinity, queue->tracing_context(),
device->descriptor_pool_cache, device->builtin_executables,
- out_command_buffer);
+ &device->block_pool, out_command_buffer);
}
static iree_status_t iree_hal_vulkan_device_create_descriptor_set(
diff --git a/iree/modules/hal/module.c b/iree/modules/hal/module.c
index 2a63d22..b0b0a7f 100644
--- a/iree/modules/hal/module.c
+++ b/iree/modules/hal/module.c
@@ -130,9 +130,6 @@
iree_hal_semaphore_t* submit_semaphore;
uint64_t submit_value;
-
- void* deferred_lru[6];
- iree_vm_list_t* deferred_releases;
} iree_hal_module_state_t;
static void IREE_API_PTR iree_hal_module_destroy(void* base_module) {
@@ -156,11 +153,6 @@
iree_hal_device_retain(state->shared_device);
IREE_RETURN_AND_END_ZONE_IF_ERROR(
- z0, iree_vm_list_create(
- /*element_type=*/NULL, /*initial_capacity=*/32,
- state->host_allocator, &state->deferred_releases));
-
- IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_executable_cache_create(state->shared_device,
iree_string_view_empty(),
&state->executable_cache));
@@ -181,7 +173,6 @@
iree_hal_module_state_t* state = (iree_hal_module_state_t*)module_state;
iree_hal_semaphore_release(state->submit_semaphore);
- iree_vm_list_release(state->deferred_releases);
iree_hal_executable_cache_release(state->executable_cache);
iree_hal_device_release(state->shared_device);
iree_allocator_free(state->host_allocator, state);
@@ -195,8 +186,6 @@
switch (signal) {
case IREE_VM_SIGNAL_SUSPEND:
case IREE_VM_SIGNAL_LOW_MEMORY:
- // TODO(benvanik): trims for the deferred_releases list and our other
- // tables.
return iree_hal_device_trim(state->shared_device);
default:
return iree_ok_status();
@@ -216,30 +205,6 @@
return iree_ok_status();
}
-void iree_hal_module_ex_defer_release(iree_hal_module_state_t* state,
- const iree_vm_ref_t value) {
- // A bulk of the calls to this are for the same (or very recently same)
- // objects, such as constant pool or transient buffer storage that may be
- // bound 4-10 times per dispatch. This tiny LRU lets us avoid adding such
- // repeated patterns in the common case.
- for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(state->deferred_lru); ++i) {
- if (state->deferred_lru[i] == value.ptr) {
- // Hit - keep the list sorted by most->least recently used.
- state->deferred_lru[i] = state->deferred_lru[0];
- state->deferred_lru[0] = value.ptr;
- return;
- }
- }
- // Miss - shift the list down and insert the new item at the head.
- memmove(&state->deferred_lru[1], &state->deferred_lru[0],
- sizeof(state->deferred_lru[0]) *
- (IREE_ARRAYSIZE(state->deferred_lru) - 1));
- state->deferred_lru[0] = value.ptr;
-
- IREE_IGNORE_ERROR(
- iree_vm_list_push_ref_retain(state->deferred_releases, &value));
-}
-
IREE_VM_ABI_EXPORT(iree_hal_module_ex_submit_and_wait, //
iree_hal_module_state_t, //
rr, v) {
@@ -271,12 +236,6 @@
return status;
}
- // Drop all pending deferred releases (references to everything in flight).
- // This will be replaced with resource sets in the future that are attached to
- // each command buffer.
- IREE_RETURN_IF_ERROR(iree_vm_list_resize(state->deferred_releases, 0));
- memset(state->deferred_lru, 0, sizeof(state->deferred_lru));
-
return iree_ok_status();
}
@@ -980,9 +939,6 @@
iree_vm_size_t length = (iree_vm_size_t)args->i3;
uint32_t pattern = (uint32_t)args->i4;
uint32_t pattern_length = (uint32_t)args->i5;
-
- iree_hal_module_ex_defer_release(state, args->r1);
-
return iree_hal_command_buffer_fill_buffer(command_buffer, target_buffer,
target_offset, length, &pattern,
pattern_length);
@@ -1001,10 +957,6 @@
IREE_RETURN_IF_ERROR(iree_hal_buffer_check_deref(args->r3, &target_buffer));
iree_vm_size_t target_offset = (iree_vm_size_t)args->i4;
iree_vm_size_t length = (iree_vm_size_t)args->i5;
-
- iree_hal_module_ex_defer_release(state, args->r1);
- iree_hal_module_ex_defer_release(state, args->r3);
-
return iree_hal_command_buffer_copy_buffer(command_buffer, source_buffer,
source_offset, target_buffer,
target_offset, length);
@@ -1055,7 +1007,6 @@
bindings[i].binding = (uint32_t)args->a3[i].i0;
bindings[i].offset = (iree_device_size_t)args->a3[i].i2;
bindings[i].length = (iree_device_size_t)args->a3[i].i3;
- iree_hal_module_ex_defer_release(state, args->a3[i].r1);
}
return iree_hal_command_buffer_push_descriptor_set(
@@ -1079,9 +1030,6 @@
iree_device_size_t* dynamic_offsets = NULL;
IREE_VM_ABI_VLA_STACK_CAST(args, a4_count, a4, iree_device_size_t, 64,
&dynamic_offset_count, &dynamic_offsets);
-
- iree_hal_module_ex_defer_release(state, args->r3);
-
return iree_hal_command_buffer_bind_descriptor_set(
command_buffer, executable_layout, set, descriptor_set,
dynamic_offset_count, dynamic_offsets);
@@ -1099,9 +1047,6 @@
uint32_t workgroup_x = (uint32_t)args->i3;
uint32_t workgroup_y = (uint32_t)args->i4;
uint32_t workgroup_z = (uint32_t)args->i5;
-
- iree_hal_module_ex_defer_release(state, args->r1);
-
return iree_hal_command_buffer_dispatch(command_buffer, executable,
entry_point, workgroup_x, workgroup_y,
workgroup_z);
@@ -1120,10 +1065,6 @@
IREE_RETURN_IF_ERROR(
iree_hal_buffer_check_deref(args->r3, &workgroups_buffer));
iree_vm_size_t workgroups_offset = (iree_vm_size_t)args->i4;
-
- iree_hal_module_ex_defer_release(state, args->r1);
- iree_hal_module_ex_defer_release(state, args->r3);
-
return iree_hal_command_buffer_dispatch_indirect(
command_buffer, executable, entry_point, workgroups_buffer,
workgroups_offset);