Converts the HIP target to support executable-create2. This produces a new flatbuffer format that supports multiple hipModule_t instances per HAL executable, reorganizes export information so that it is stored per export, and removes HAL pipeline layouts along with the existing stateful command recording.

diff --git a/tools/iree-benchmark-executable-main.c b/tools/iree-benchmark-executable-main.c
index da015df..ff76ade 100644
--- a/tools/iree-benchmark-executable-main.c
+++ b/tools/iree-benchmark-executable-main.c

@@ -205,7 +205,6 @@
 typedef struct iree_benchmark_executable_args_t {
   iree_hal_device_t* device;
   iree_hal_executable_t* executable;
-  iree_hal_pipeline_layout_t* pipeline_layout;
   const iree_hal_buffer_ref_t* bindings;
   uint32_t workgroup_count[3];
 } iree_benchmark_executable_args_t;
@@ -232,6 +231,34 @@
       .payload_values = &fence_value,
   };
 
+  // Record a command buffer with the dispatches.
+  // The same command buffer recording is reused on each benchmark step.
+  iree_hal_command_buffer_t* command_buffer = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create(
+      args->device, IREE_HAL_COMMAND_BUFFER_MODE_DEFAULT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      /*binding_capacity=*/0, &command_buffer));
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer));
+  iree_const_byte_span_t constants =
+      iree_make_const_byte_span(&parsed_params.push_constants[0].ui32,
+                                parsed_params.push_constant_count *
+                                    sizeof(parsed_params.push_constants[0]));
+  iree_hal_buffer_ref_list_t bindings = {
+      .count = parsed_params.binding_count,
+      .values = args->bindings,
+  };
+  for (int32_t i = 0; i < FLAG_batch_size; ++i) {
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch2(
+        command_buffer, args->executable, FLAG_entry_point,
+        args->workgroup_count, constants, bindings,
+        IREE_HAL_DISPATCH_FLAG_NONE));
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier(
+        command_buffer, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
+        IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE,
+        IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, 0, NULL, 0, NULL));
+  }
+  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer));
+
   // Start profiling now - all subsequent device operations will be what the
   // user wants to measure.
   IREE_RETURN_IF_ERROR(iree_hal_begin_profiling_from_flags(args->device));
@@ -244,48 +271,6 @@
   // number of workgroups executed.
   int64_t dispatch_count = 0;
   while (iree_benchmark_keep_running(benchmark_state, FLAG_batch_size)) {
-    // TODO(benvanik): record a secondary command buffer and just replay it
-    // here. This should fix the overhead at just primary command buffer
-    // creation. Most backends don't support reusable command buffers, yet, and
-    // some only support inline execution so we are conservatively doing that.
-    // In the future we should have an option (possibly based on device query)
-    // as to which path to use.
-
-    // Record a command buffer with the dispatches.
-    // Note that today we are doing this inside of the benchmark loop so that
-    // we can use inline execution. This is a boost to devices that support it
-    // like CUDA streams and synchronous CPU executors but a pessimization to
-    // devices that benefit from reusable command buffers like CUDA graphs.
-    // In the future we can add a flag that switches the mode between
-    // reusable and one-shot.
-    iree_hal_command_buffer_t* command_buffer = NULL;
-    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create(
-        args->device,
-        IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
-            IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION,
-        IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
-        /*binding_capacity=*/0, &command_buffer));
-    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer));
-    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_constants(
-        command_buffer, args->pipeline_layout, /*offset=*/0,
-        &parsed_params.push_constants[0].ui32,
-        parsed_params.push_constant_count *
-            sizeof(parsed_params.push_constants[0])));
-    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_descriptor_set(
-        command_buffer, args->pipeline_layout, /*set=*/0,
-        parsed_params.binding_count, args->bindings));
-    for (int32_t i = 0; i < FLAG_batch_size; ++i) {
-      IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch(
-          command_buffer, args->executable, FLAG_entry_point,
-          args->workgroup_count[0], args->workgroup_count[1],
-          args->workgroup_count[2], IREE_HAL_DISPATCH_FLAG_NONE));
-      IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier(
-          command_buffer, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
-          IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE,
-          IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, 0, NULL, 0, NULL));
-    }
-    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer));
-
     // Submit the command buffer; if the device could not start executing while
     // we were recording then this will kick off the execution.
     ++fence_value;
@@ -301,9 +286,6 @@
 
     iree_benchmark_pause_timing(benchmark_state);
 
-    // Don't count cleanup time in the benchmark.
-    iree_hal_command_buffer_release(command_buffer);
-
     // Accumulate the total number of dispatches executed.
     dispatch_count += FLAG_batch_size;
 
@@ -325,6 +307,7 @@
                               args->workgroup_count[2];
   iree_benchmark_set_items_processed(benchmark_state, total_invocations);
 
+  iree_hal_command_buffer_release(command_buffer);
   iree_hal_semaphore_release(fence_semaphore);
 
   return iree_ok_status();
@@ -435,19 +418,6 @@
       iree_make_cstring_view(FLAG_executable_format);
   executable_params.executable_data = file_contents->const_buffer;
 
-  // Setup the layouts defining how each entry point is interpreted.
-  iree_hal_pipeline_layout_t* pipeline_layout = NULL;
-  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
-  IREE_RETURN_IF_ERROR(iree_hal_descriptor_set_layout_create(
-      device, IREE_HAL_DESCRIPTOR_SET_LAYOUT_FLAG_NONE,
-      parsed_params.binding_count, parsed_params.binding_layouts,
-      &descriptor_set_layout));
-  IREE_RETURN_IF_ERROR(iree_hal_pipeline_layout_create(
-      device, parsed_params.push_constant_count,
-      /*set_layout_count=*/1, &descriptor_set_layout, &pipeline_layout));
-  executable_params.pipeline_layout_count = 1;
-  executable_params.pipeline_layouts = &pipeline_layout;
-
   // Executable-level constants allow us to perform some basic load-time value
   // propagation - usually dependent on device features or tuning parameters.
   executable_params.constant_count = parsed_params.executable_constant_count;
@@ -468,7 +438,6 @@
     args[i] = (iree_benchmark_executable_args_t){
         .device = device,
         .executable = executable,
-        .pipeline_layout = pipeline_layout,
         .bindings = bindings,
         .workgroup_count = {1, 1, 1},
     };
@@ -495,8 +464,6 @@
 
   iree_vm_list_release(binding_list);
   iree_hal_executable_release(executable);
-  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
-  iree_hal_pipeline_layout_release(pipeline_layout);
   iree_file_contents_free(file_contents);
   iree_hal_executable_cache_release(executable_cache);
   iree_hal_device_release(device);