[HIP] Add inline execution mode (#16951)
diff --git a/runtime/src/iree/hal/drivers/hip/api.h b/runtime/src/iree/hal/drivers/hip/api.h
index 2b210ec..c4a7a6d 100644
--- a/runtime/src/iree/hal/drivers/hip/api.h
+++ b/runtime/src/iree/hal/drivers/hip/api.h
@@ -87,6 +87,11 @@
// Parameters for each hipMemPool_t used for queue-ordered allocations.
iree_hal_hip_memory_pooling_params_t memory_pools;
+
+ // Allow executing command buffers against HIP streams as they are recorded.
+ // Only command buffers produced by the compiler that have the
+ // IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION bit set will use this.
+ bool allow_inline_execution;
} iree_hal_hip_device_params_t;
// Initializes |out_params| to default values.
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index 4052da7..ca566e7 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -107,6 +107,7 @@
out_params->command_buffer_mode = IREE_HAL_HIP_COMMAND_BUFFER_MODE_GRAPH;
out_params->stream_tracing = false;
out_params->async_allocations = true;
+ out_params->allow_inline_execution = false;
}
static iree_status_t iree_hal_hip_device_check_params(
@@ -440,7 +441,18 @@
iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
iree_hal_command_buffer_t** out_command_buffer) {
iree_hal_hip_device_t* device = iree_hal_hip_device_cast(base_device);
-
+ if (device->params.allow_inline_execution &&
+ iree_all_bits_set(mode,
+ IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) {
+ // The caller has indicated the command buffer can be executed as it is
+ // recorded, implying that the command buffer cannot be reused and doesn't
+ // need to be persisted. This lets us lower the execution delay as we can
+ // directly route commands to a HIP stream and let it eagerly flush.
+ return iree_hal_hip_stream_command_buffer_create(
+ base_device, device->hip_symbols, device->tracing_context, mode,
+ command_categories, binding_capacity, device->hip_dispatch_stream,
+ &device->block_pool, device->host_allocator, out_command_buffer);
+ }
switch (device->params.command_buffer_mode) {
case IREE_HAL_HIP_COMMAND_BUFFER_MODE_GRAPH:
return iree_hal_hip_graph_command_buffer_create(
diff --git a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
index be5a097..0fb0852 100644
--- a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
+++ b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
@@ -21,6 +21,7 @@
#include "iree/hal/drivers/hip/graph_command_buffer.h"
#include "iree/hal/drivers/hip/hip_device.h"
#include "iree/hal/drivers/hip/status_util.h"
+#include "iree/hal/drivers/hip/stream_command_buffer.h"
#include "iree/hal/utils/deferred_command_buffer.h"
#include "iree/hal/utils/resource_set.h"
@@ -596,7 +597,12 @@
for (iree_host_size_t i = 0; i < action->payload.command_buffers.count; ++i) {
iree_hal_command_buffer_t* command_buffer =
action->payload.command_buffers.ptr[i];
- if (iree_hal_hip_graph_command_buffer_isa(command_buffer)) {
+ if (iree_hal_hip_stream_command_buffer_isa(command_buffer)) {
+ // Nothing to do for an inline command buffer; all the work has already
+ // been submitted. When we support semaphores we'll still need to signal
+ // their completion but do not have to worry about any waits: if there
+ // were waits we wouldn't have been able to execute inline!
+ } else if (iree_hal_hip_graph_command_buffer_isa(command_buffer)) {
hipGraphExec_t exec = iree_hal_hip_graph_command_buffer_handle(
action->payload.command_buffers.ptr[i]);
IREE_HIP_RETURN_AND_END_ZONE_IF_ERROR(
diff --git a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c
index 99c11b4..b508f82 100644
--- a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c
+++ b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c
@@ -27,6 +27,10 @@
IREE_FLAG(bool, hip_use_streams, true,
"Use HIP streams (instead of graphs) for executing command buffers.");
+IREE_FLAG(bool, hip_allow_inline_execution, false,
+ "Allow command buffers to execute inline against HIP streams when \n"
+ "possible.");
+
IREE_FLAG(
bool, hip_async_allocations, true,
"Enables HIP asynchronous stream-ordered allocations when supported.");
@@ -63,6 +67,8 @@
iree_string_view_literal("hip_dylib_path");
static const iree_string_view_t key_hip_use_streams =
iree_string_view_literal("hip_use_streams");
+static const iree_string_view_t key_hip_allow_inline_execution =
+ iree_string_view_literal("hip_allow_inline_execution");
static const iree_string_view_t key_hip_async_allocations =
iree_string_view_literal("hip_async_allocations");
static const iree_string_view_t key_hip_tracing =
@@ -86,6 +92,9 @@
IREE_RETURN_IF_ERROR(iree_string_pair_builder_add_int32(
builder, key_hip_use_streams, FLAG_hip_use_streams));
IREE_RETURN_IF_ERROR(iree_string_pair_builder_add_int32(
+ builder, key_hip_allow_inline_execution,
+ FLAG_hip_allow_inline_execution));
+ IREE_RETURN_IF_ERROR(iree_string_pair_builder_add_int32(
builder, key_hip_async_allocations, FLAG_hip_async_allocations));
IREE_RETURN_IF_ERROR(iree_string_pair_builder_add_int32(
builder, key_hip_tracing, FLAG_hip_tracing));
@@ -147,6 +156,17 @@
device_params->command_buffer_mode =
IREE_HAL_HIP_COMMAND_BUFFER_MODE_STREAM;
}
+ } else if (iree_string_view_equal(key, key_hip_allow_inline_execution)) {
+ // Parsed as an integer: non-zero enables inline execution, zero disables.
+ if (!iree_string_view_atoi_int32(value, &ivalue)) {
+ return iree_make_status(
+ IREE_STATUS_FAILED_PRECONDITION,
+ "Option 'hip_allow_inline_execution' expected to be "
+ "int. Got: '%.*s'",
+ (int)value.size, value.data);
+ }
+ // Assign unconditionally so that an explicit 0 turns the option off.
+ device_params->allow_inline_execution = ivalue != 0;
} else if (iree_string_view_equal(key, key_hip_async_allocations)) {
if (!iree_string_view_atoi_int32(value, &ivalue)) {
return iree_make_status(