Converting HIP target to support executable-create2. This produces a new flatbuffer that supports multiple hipModule_ts per HAL executable, reorganizes per-export information to be per-export, and removes HAL pipeline layouts and the existing stateful command recording.
diff --git a/tools/iree-benchmark-executable-main.c b/tools/iree-benchmark-executable-main.c index da015df..ff76ade 100644 --- a/tools/iree-benchmark-executable-main.c +++ b/tools/iree-benchmark-executable-main.c
@@ -205,7 +205,6 @@ typedef struct iree_benchmark_executable_args_t { iree_hal_device_t* device; iree_hal_executable_t* executable; - iree_hal_pipeline_layout_t* pipeline_layout; const iree_hal_buffer_ref_t* bindings; uint32_t workgroup_count[3]; } iree_benchmark_executable_args_t; @@ -232,6 +231,34 @@ .payload_values = &fence_value, }; + // Record a command buffer with the dispatches. + // The same command buffer recording is reused on each benchmark step. + iree_hal_command_buffer_t* command_buffer = NULL; + IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create( + args->device, IREE_HAL_COMMAND_BUFFER_MODE_DEFAULT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer)); + iree_const_byte_span_t constants = + iree_make_const_byte_span(&parsed_params.push_constants[0].ui32, + parsed_params.push_constant_count * + sizeof(parsed_params.push_constants[0])); + iree_hal_buffer_ref_list_t bindings = { + .count = parsed_params.binding_count, + .values = args->bindings, + }; + for (int32_t i = 0; i < FLAG_batch_size; ++i) { + IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch2( + command_buffer, args->executable, FLAG_entry_point, + args->workgroup_count, constants, bindings, + IREE_HAL_DISPATCH_FLAG_NONE)); + IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier( + command_buffer, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE, + IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE, + IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, 0, NULL, 0, NULL)); + } + IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer)); + // Start profiling now - all subsequent device operations will be what the // user wants to measure. IREE_RETURN_IF_ERROR(iree_hal_begin_profiling_from_flags(args->device)); @@ -244,48 +271,6 @@ // number of workgroups executed. int64_t dispatch_count = 0; while (iree_benchmark_keep_running(benchmark_state, FLAG_batch_size)) { - // TODO(benvanik): record a secondary command buffer and just replay it - // here. This should fix the overhead at just primary command buffer - // creation. Most backends don't support reusable command buffers, yet, and - // some only support inline execution so we are conservatively doing that. - // In the future we should have an option (possibly based on device query) - // as to which path to use. - - // Record a command buffer with the dispatches. - // Note that today we are doing this inside of the benchmark loop so that - // we can use inline execution. This is a boost to devices that support it - // like CUDA streams and synchronous CPU executors but a pessimization to - // devices that benefit from reusable command buffers like CUDA graphs. - // In the future we can add a flag that switches the mode between - // reusable and one-shot. - iree_hal_command_buffer_t* command_buffer = NULL; - IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create( - args->device, - IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT | - IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION, - IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, - /*binding_capacity=*/0, &command_buffer)); - IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer)); - IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_constants( - command_buffer, args->pipeline_layout, /*offset=*/0, - &parsed_params.push_constants[0].ui32, - parsed_params.push_constant_count * - sizeof(parsed_params.push_constants[0]))); - IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_descriptor_set( - command_buffer, args->pipeline_layout, /*set=*/0, - parsed_params.binding_count, args->bindings)); - for (int32_t i = 0; i < FLAG_batch_size; ++i) { - IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch( - command_buffer, args->executable, FLAG_entry_point, - args->workgroup_count[0], args->workgroup_count[1], - args->workgroup_count[2], IREE_HAL_DISPATCH_FLAG_NONE)); - IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier( - command_buffer, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE, - IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE, - IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, 0, NULL, 0, NULL)); - } - IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer)); - // Submit the command buffer; if the device could not start executing while // we were recording then this will kick off the execution. ++fence_value; @@ -301,9 +286,6 @@ iree_benchmark_pause_timing(benchmark_state); - // Don't count cleanup time in the benchmark. - iree_hal_command_buffer_release(command_buffer); - // Accumulate the total number of dispatches executed. dispatch_count += FLAG_batch_size; @@ -325,6 +307,7 @@ args->workgroup_count[2]; iree_benchmark_set_items_processed(benchmark_state, total_invocations); + iree_hal_command_buffer_release(command_buffer); iree_hal_semaphore_release(fence_semaphore); return iree_ok_status(); @@ -435,19 +418,6 @@ iree_make_cstring_view(FLAG_executable_format); executable_params.executable_data = file_contents->const_buffer; - // Setup the layouts defining how each entry point is interpreted. - iree_hal_pipeline_layout_t* pipeline_layout = NULL; - iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL; - IREE_RETURN_IF_ERROR(iree_hal_descriptor_set_layout_create( - device, IREE_HAL_DESCRIPTOR_SET_LAYOUT_FLAG_NONE, - parsed_params.binding_count, parsed_params.binding_layouts, - &descriptor_set_layout)); - IREE_RETURN_IF_ERROR(iree_hal_pipeline_layout_create( - device, parsed_params.push_constant_count, - /*set_layout_count=*/1, &descriptor_set_layout, &pipeline_layout)); - executable_params.pipeline_layout_count = 1; - executable_params.pipeline_layouts = &pipeline_layout; - // Executable-level constants allow us to perform some basic load-time value // propagation - usually dependent on device features or tuning parameters. executable_params.constant_count = parsed_params.executable_constant_count; @@ -468,7 +438,6 @@ args[i] = (iree_benchmark_executable_args_t){ .device = device, .executable = executable, - .pipeline_layout = pipeline_layout, .bindings = bindings, .workgroup_count = {1, 1, 1}, }; @@ -495,8 +464,6 @@ iree_vm_list_release(binding_list); iree_hal_executable_release(executable); - iree_hal_descriptor_set_layout_release(descriptor_set_layout); - iree_hal_pipeline_layout_release(pipeline_layout); iree_file_contents_free(file_contents); iree_hal_executable_cache_release(executable_cache); iree_hal_device_release(device);