[HIP] Add inline execution mode (#16951)

diff --git a/runtime/src/iree/hal/drivers/hip/api.h b/runtime/src/iree/hal/drivers/hip/api.h
index 2b210ec..c4a7a6d 100644
--- a/runtime/src/iree/hal/drivers/hip/api.h
+++ b/runtime/src/iree/hal/drivers/hip/api.h
@@ -87,6 +87,11 @@
 
   // Parameters for each hipMemPool_t used for queue-ordered allocations.
   iree_hal_hip_memory_pooling_params_t memory_pools;
+
+  // Allow executing command buffers against HIP streams as they are recorded.
+  // Only command buffers produced by the compiler that have the
+  // IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION bit set will use this.
+  bool allow_inline_execution;
 } iree_hal_hip_device_params_t;
 
 // Initializes |out_params| to default values.
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index 4052da7..ca566e7 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -107,6 +107,7 @@
   out_params->command_buffer_mode = IREE_HAL_HIP_COMMAND_BUFFER_MODE_GRAPH;
   out_params->stream_tracing = false;
   out_params->async_allocations = true;
+  out_params->allow_inline_execution = false;
 }
 
 static iree_status_t iree_hal_hip_device_check_params(
@@ -440,7 +441,18 @@
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_hip_device_t* device = iree_hal_hip_device_cast(base_device);
-
+  if (device->params.allow_inline_execution &&
+      iree_all_bits_set(mode,
+                        IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION)) {
+    // The caller has indicated the command buffer can be executed as it is
+    // recorded, implying that the command buffer cannot be reused and doesn't
+    // need to be persisted. This lets us lower the execution delay as we can
+    // directly route commands to a HIP stream and let it eagerly flush.
+    return iree_hal_hip_stream_command_buffer_create(
+        base_device, device->hip_symbols, device->tracing_context, mode,
+        command_categories, binding_capacity, device->hip_dispatch_stream,
+        &device->block_pool, device->host_allocator, out_command_buffer);
+  }
   switch (device->params.command_buffer_mode) {
     case IREE_HAL_HIP_COMMAND_BUFFER_MODE_GRAPH:
       return iree_hal_hip_graph_command_buffer_create(
diff --git a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
index be5a097..0fb0852 100644
--- a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
+++ b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
@@ -21,6 +21,7 @@
 #include "iree/hal/drivers/hip/graph_command_buffer.h"
 #include "iree/hal/drivers/hip/hip_device.h"
 #include "iree/hal/drivers/hip/status_util.h"
+#include "iree/hal/drivers/hip/stream_command_buffer.h"
 #include "iree/hal/utils/deferred_command_buffer.h"
 #include "iree/hal/utils/resource_set.h"
 
@@ -596,7 +597,12 @@
   for (iree_host_size_t i = 0; i < action->payload.command_buffers.count; ++i) {
     iree_hal_command_buffer_t* command_buffer =
         action->payload.command_buffers.ptr[i];
-    if (iree_hal_hip_graph_command_buffer_isa(command_buffer)) {
+    if (iree_hal_hip_stream_command_buffer_isa(command_buffer)) {
+      // Nothing to do for an inline command buffer; all the work has already
+      // been submitted. When we support semaphores we'll still need to signal
+      // their completion but do not have to worry about any waits: if there
+      // were waits we wouldn't have been able to execute inline!
+    } else if (iree_hal_hip_graph_command_buffer_isa(command_buffer)) {
       hipGraphExec_t exec = iree_hal_hip_graph_command_buffer_handle(
           action->payload.command_buffers.ptr[i]);
       IREE_HIP_RETURN_AND_END_ZONE_IF_ERROR(
diff --git a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c
index 99c11b4..b508f82 100644
--- a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c
+++ b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c
@@ -27,6 +27,10 @@
 IREE_FLAG(bool, hip_use_streams, true,
           "Use HIP streams (instead of graphs) for executing command buffers.");
 
+IREE_FLAG(bool, hip_allow_inline_execution, false,
+          "Allow command buffers to execute inline against HIP streams when \n"
+          "possible.");
+
 IREE_FLAG(
     bool, hip_async_allocations, true,
     "Enables HIP asynchronous stream-ordered allocations when supported.");
@@ -63,6 +67,8 @@
     iree_string_view_literal("hip_dylib_path");
 static const iree_string_view_t key_hip_use_streams =
     iree_string_view_literal("hip_use_streams");
+static const iree_string_view_t key_hip_allow_inline_execution =
+    iree_string_view_literal("hip_allow_inline_execution");
 static const iree_string_view_t key_hip_async_allocations =
     iree_string_view_literal("hip_async_allocations");
 static const iree_string_view_t key_hip_tracing =
@@ -86,6 +92,9 @@
   IREE_RETURN_IF_ERROR(iree_string_pair_builder_add_int32(
       builder, key_hip_use_streams, FLAG_hip_use_streams));
   IREE_RETURN_IF_ERROR(iree_string_pair_builder_add_int32(
+      builder, key_hip_allow_inline_execution,
+      FLAG_hip_allow_inline_execution));
+  IREE_RETURN_IF_ERROR(iree_string_pair_builder_add_int32(
       builder, key_hip_async_allocations, FLAG_hip_async_allocations));
   IREE_RETURN_IF_ERROR(iree_string_pair_builder_add_int32(
       builder, key_hip_tracing, FLAG_hip_tracing));
@@ -147,6 +156,17 @@
         device_params->command_buffer_mode =
             IREE_HAL_HIP_COMMAND_BUFFER_MODE_STREAM;
       }
+    } else if (iree_string_view_equal(key, key_hip_allow_inline_execution)) {
+      if (!iree_string_view_atoi_int32(value, &ivalue)) {
+        return iree_make_status(
+            IREE_STATUS_FAILED_PRECONDITION,
+            "Option 'hip_allow_inline_execution' expected to be "
+            "int. Got: '%.*s'",
+            (int)value.size, value.data);
+      }
+      // Assign unconditionally: any non-zero value enables inline execution
+      // and an explicit 0 disables it instead of being silently ignored.
+      device_params->allow_inline_execution = ivalue != 0;
     } else if (iree_string_view_equal(key, key_hip_async_allocations)) {
       if (!iree_string_view_atoi_int32(value, &ivalue)) {
         return iree_make_status(