| // Copyright 2021 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "iree/base/api.h" |
| #include "iree/base/internal/flags.h" |
| #include "iree/base/tracing.h" |
| #include "iree/hal/api.h" |
| #include "iree/hal/local/executable_library.h" |
| #include "iree/hal/local/executable_loader.h" |
| #include "iree/hal/local/local_descriptor_set_layout.h" |
| #include "iree/hal/local/local_executable.h" |
| #include "iree/hal/local/local_executable_layout.h" |
| #include "iree/testing/benchmark.h" |
| |
| IREE_FLAG(string, executable_format, "", |
| "Format of the executable file being loaded."); |
| IREE_FLAG(string, executable_file, "", |
| "Path to the executable library file to load."); |
| |
| IREE_FLAG(int32_t, entry_point, 0, "Entry point ordinal to run."); |
| |
| IREE_FLAG(int32_t, workgroup_count_x, 1, |
| "X dimension of the workgroup count defining the number of\n" |
| "workgroup invocations that will be run per benchmark iteration.\n" |
| "This is the fastest-changing dimension."); |
| IREE_FLAG(int32_t, workgroup_count_y, 1, |
| "Y dimension of the workgroup count defining the number of\n" |
| "workgroup invocations that will be run per benchmark iteration."); |
| IREE_FLAG(int32_t, workgroup_count_z, 1, |
| "Z dimension of the workgroup count defining the number of\n" |
| "workgroup invocations that will be run per benchmark iteration.\n" |
| "This is the slowest-changing dimension."); |
| IREE_FLAG(int32_t, workgroup_size_x, 1, |
| "X dimension of the workgroup size passed to the executable."); |
| IREE_FLAG(int32_t, workgroup_size_y, 1, |
| "Y dimension of the workgroup size passed to the executable."); |
| IREE_FLAG(int32_t, workgroup_size_z, 1, |
| "Z dimension of the workgroup size passed to the executable."); |
| |
| // Total number of bindings we (currently) allow any executable to have. |
| #define IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT \ |
| (IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT * \ |
| IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT) |
| |
| // Parsed parameters from flags. |
| // Used to construct the dispatch parameters for the benchmark invocation. |
| struct { |
| int32_t push_constant_count; |
| union { |
| uint32_t ui32; |
| } push_constants[IREE_HAL_LOCAL_MAX_PUSH_CONSTANT_COUNT]; |
| |
| int32_t binding_count; |
| iree_string_view_t bindings[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT]; |
| } dispatch_params = { |
| .push_constant_count = 0, |
| .binding_count = 0, |
| }; |
| |
| static iree_status_t parse_push_constant(iree_string_view_t flag_name, |
| void* storage, |
| iree_string_view_t value) { |
| IREE_ASSERT_LE(dispatch_params.push_constant_count + 1, |
| IREE_ARRAYSIZE(dispatch_params.push_constants), |
| "too many push constants"); |
| dispatch_params.push_constants[dispatch_params.push_constant_count++].ui32 = |
| atoi(value.data); |
| return iree_ok_status(); |
| } |
| static void print_push_constant(iree_string_view_t flag_name, void* storage, |
| FILE* file) { |
| if (dispatch_params.push_constant_count == 0) { |
| fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size, |
| flag_name.data); |
| return; |
| } |
| for (int32_t i = 0; i < dispatch_params.push_constant_count; ++i) { |
| fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data, |
| dispatch_params.push_constants[i].ui32); |
| if (i < dispatch_params.push_constant_count - 1) { |
| fprintf(file, "\n"); |
| } |
| } |
| } |
| IREE_FLAG_CALLBACK(parse_push_constant, print_push_constant, &dispatch_params, |
| push_constant_callback, |
| "Appends a uint32_t push constant value.\n"); |
| |
| static iree_status_t parse_binding(iree_string_view_t flag_name, void* storage, |
| iree_string_view_t value) { |
| IREE_ASSERT_LE(dispatch_params.binding_count + 1, |
| IREE_ARRAYSIZE(dispatch_params.bindings), "too many bindings"); |
| dispatch_params.bindings[dispatch_params.binding_count++] = value; |
| return iree_ok_status(); |
| } |
| static void print_binding(iree_string_view_t flag_name, void* storage, |
| FILE* file) { |
| if (dispatch_params.binding_count == 0) { |
| fprintf(file, "# --%.*s=\"shapextype[=values]\"\n", (int)flag_name.size, |
| flag_name.data); |
| return; |
| } |
| for (int32_t i = 0; i < dispatch_params.binding_count; ++i) { |
| const iree_string_view_t binding_str = dispatch_params.bindings[i]; |
| fprintf(file, "--%.*s=\"%.*s\"\n", (int)flag_name.size, flag_name.data, |
| (int)binding_str.size, binding_str.data); |
| } |
| } |
| IREE_FLAG_CALLBACK( |
| parse_binding, print_binding, &dispatch_params, binding, |
| "Appends a binding to the dispatch parameters.\n" |
| "Bindings are defined by their shape, element type, and their data.\n" |
| "Examples:\n" |
| " # 16 4-byte elements zero-initialized:\n" |
| " --binding=2x8xi32\n" |
| " # 10000 bytes all initialized to 123:\n" |
| " --binding=10000xi8=123\n" |
| " # 2 4-byte floating-point values with contents [[1.4], [2.1]]:\n" |
| " --binding=2x1xf32=1.4,2.1"); |
| |
| #if defined(IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER) |
| #include "iree/hal/local/loaders/embedded_library_loader.h" |
| #endif // IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER |
| |
| // Creates an executable loader based on the given format flag. |
| static iree_status_t iree_hal_executable_library_create_loader( |
| iree_allocator_t host_allocator, |
| iree_hal_executable_loader_t** out_executable_loader) { |
| #if defined(IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER) |
| if (strcmp(FLAG_executable_format, "EX_ELF") == 0) { |
| return iree_hal_embedded_library_loader_create( |
| iree_hal_executable_import_provider_null(), host_allocator, |
| out_executable_loader); |
| } |
| #endif // IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER |
| return iree_make_status( |
| IREE_STATUS_UNAVAILABLE, |
| "no loader available that can handle --executable_format=%s", |
| FLAG_executable_format); |
| } |
| |
| // TODO(benvanik): use this to replace file_io.cc. |
| static iree_status_t iree_file_read_contents(const char* path, |
| iree_allocator_t allocator, |
| iree_byte_span_t* out_contents) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| *out_contents = iree_make_byte_span(NULL, 0); |
| FILE* file = fopen(path, "rb"); |
| if (file == NULL) { |
| IREE_TRACE_ZONE_END(z0); |
| return iree_make_status(iree_status_code_from_errno(errno), |
| "failed to open file '%s'", path); |
| } |
| iree_status_t status = iree_ok_status(); |
| if (fseek(file, 0, SEEK_END) == -1) { |
| status = iree_make_status(iree_status_code_from_errno(errno), "seek (end)"); |
| } |
| size_t file_size = 0; |
| if (iree_status_is_ok(status)) { |
| file_size = ftell(file); |
| if (file_size == -1L) { |
| status = |
| iree_make_status(iree_status_code_from_errno(errno), "size query"); |
| } |
| } |
| if (iree_status_is_ok(status)) { |
| if (fseek(file, 0, SEEK_SET) == -1) { |
| status = |
| iree_make_status(iree_status_code_from_errno(errno), "seek (beg)"); |
| } |
| } |
| // Allocate +1 to force a trailing \0 in case this is a string. |
| char* contents = NULL; |
| if (iree_status_is_ok(status)) { |
| status = iree_allocator_malloc(allocator, file_size + 1, (void**)&contents); |
| } |
| if (iree_status_is_ok(status)) { |
| if (fread(contents, file_size, 1, file) != 1) { |
| status = |
| iree_make_status(iree_status_code_from_errno(errno), |
| "unable to read entire file contents of '%s'", path); |
| } |
| } |
| if (iree_status_is_ok(status)) { |
| contents[file_size] = 0; // NUL |
| *out_contents = iree_make_byte_span(contents, file_size); |
| } else { |
| iree_allocator_free(allocator, contents); |
| } |
| fclose(file); |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| // NOTE: error handling is here just for better diagnostics: it is not tracking |
| // allocations correctly and will leak. Don't use this as an example for how to |
| // write robust code. |
| static iree_status_t iree_hal_executable_library_run( |
| const iree_benchmark_def_t* benchmark_def, |
| iree_benchmark_state_t* benchmark_state) { |
| iree_allocator_t host_allocator = benchmark_state->host_allocator; |
| |
| // Register the loader used to load (or find) the executable. |
| iree_hal_executable_loader_t* executable_loader = NULL; |
| IREE_RETURN_IF_ERROR(iree_hal_executable_library_create_loader( |
| host_allocator, &executable_loader)); |
| |
| // Setup the specification used to perform the executable load. |
| // This information is normally used to select the appropriate loader but in |
| // this benchmark we only have a single one. |
| iree_hal_executable_spec_t executable_spec; |
| iree_hal_executable_spec_initialize(&executable_spec); |
| executable_spec.caching_mode = |
| IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION | |
| IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA | |
| IREE_HAL_EXECUTABLE_CACHING_MODE_DISABLE_VERIFICATION; |
| executable_spec.executable_format = |
| iree_make_cstring_view(FLAG_executable_format); |
| |
| // Load the executable data. |
| IREE_RETURN_IF_ERROR(iree_file_read_contents( |
| FLAG_executable_file, host_allocator, |
| (iree_byte_span_t*)&executable_spec.executable_data)); |
| |
| // Setup the layouts defining how each entry point is interpreted. |
| // NOTE: we know for the embedded library loader that this is not required. |
| // Other loaders may need it in which case it'll have to be provided. |
| executable_spec.executable_layout_count = 0; |
| executable_spec.executable_layouts = NULL; |
| |
| // Perform the load, which will fail if the executable cannot be loaded or |
| // there was an issue with the layouts. |
| iree_hal_executable_t* executable = NULL; |
| IREE_RETURN_IF_ERROR(iree_hal_executable_loader_try_load( |
| executable_loader, &executable_spec, &executable)); |
| iree_hal_local_executable_t* local_executable = |
| iree_hal_local_executable_cast(executable); |
| |
| // Allocate workgroup-local memory that each invocation can use. |
| iree_byte_span_t local_memory = iree_make_byte_span(NULL, 0); |
| iree_host_size_t local_memory_size = |
| local_executable->dispatch_attrs |
| ? local_executable->dispatch_attrs[FLAG_entry_point] |
| .local_memory_pages * |
| IREE_HAL_WORKGROUP_LOCAL_MEMORY_PAGE_SIZE |
| : 0; |
| if (local_memory_size > 0) { |
| IREE_RETURN_IF_ERROR(iree_allocator_malloc( |
| host_allocator, local_memory_size, (void**)&local_memory.data)); |
| local_memory.data_length = local_memory_size; |
| } |
| |
| // Allocate storage for buffers and populate them. |
| // They only need to remain valid for the duration of the invocation and all |
| // memory accessed by the invocation will come from here. |
| iree_hal_allocator_t* heap_allocator = NULL; |
| IREE_RETURN_IF_ERROR(iree_hal_allocator_create_heap( |
| iree_make_cstring_view("benchmark"), host_allocator, host_allocator, |
| &heap_allocator)); |
| iree_hal_buffer_view_t* buffer_views[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT]; |
| void* binding_ptrs[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT]; |
| size_t binding_lengths[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT]; |
| for (iree_host_size_t i = 0; i < dispatch_params.binding_count; ++i) { |
| IREE_RETURN_IF_ERROR(iree_hal_buffer_view_parse( |
| dispatch_params.bindings[i], heap_allocator, &buffer_views[i])); |
| iree_hal_buffer_t* buffer = iree_hal_buffer_view_buffer(buffer_views[i]); |
| iree_device_size_t buffer_length = |
| iree_hal_buffer_view_byte_length(buffer_views[i]); |
| iree_hal_buffer_mapping_t buffer_mapping = {{0}}; |
| IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range( |
| buffer, IREE_HAL_MAPPING_MODE_PERSISTENT, |
| IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE, 0, |
| buffer_length, &buffer_mapping)); |
| binding_ptrs[i] = buffer_mapping.contents.data; |
| binding_lengths[i] = (size_t)buffer_mapping.contents.data_length; |
| } |
| |
| // Setup dispatch state. |
| iree_hal_executable_dispatch_state_v0_t dispatch_state = { |
| .workgroup_count = {{ |
| .x = FLAG_workgroup_count_x, |
| .y = FLAG_workgroup_count_y, |
| .z = FLAG_workgroup_count_z, |
| }}, |
| .workgroup_size = {{ |
| .x = FLAG_workgroup_size_x, |
| .y = FLAG_workgroup_size_y, |
| .z = FLAG_workgroup_size_z, |
| }}, |
| .push_constant_count = dispatch_params.push_constant_count, |
| .push_constants = &dispatch_params.push_constants[0].ui32, |
| .binding_count = dispatch_params.binding_count, |
| .binding_ptrs = binding_ptrs, |
| .binding_lengths = binding_lengths, |
| .environment = &local_executable->environment, |
| }; |
| |
| // Execute benchmark the workgroup invocation. |
| // Note that each iteration runs through the whole grid as it's important that |
| // we are testing the memory access patterns: if we just ran the same single |
| // tile processing the same exact region of memory over and over we are not |
| // testing cache effects. |
| int64_t dispatch_count = 0; |
| while (iree_benchmark_keep_running(benchmark_state, /*batch_count=*/1)) { |
| IREE_RETURN_IF_ERROR(iree_hal_local_executable_issue_dispatch_inline( |
| local_executable, FLAG_entry_point, &dispatch_state, local_memory)); |
| ++dispatch_count; |
| } |
| |
| // To get a total time per invocation we set the item count to the total |
| // invocations dispatched. That gives us both total dispatch and single |
| // invocation times in the reporter output. |
| int64_t total_invocations = |
| dispatch_count * dispatch_state.workgroup_count.x * |
| dispatch_state.workgroup_count.y * dispatch_state.workgroup_count.z; |
| iree_benchmark_set_items_processed(benchmark_state, total_invocations); |
| |
| // Deallocate buffers. |
| for (iree_host_size_t i = 0; i < dispatch_params.binding_count; ++i) { |
| iree_hal_buffer_view_release(buffer_views[i]); |
| } |
| iree_hal_allocator_release(heap_allocator); |
| |
| // Unload. |
| iree_allocator_free(host_allocator, |
| (void*)executable_spec.executable_data.data); |
| iree_hal_executable_release(executable); |
| iree_hal_executable_loader_release(executable_loader); |
| |
| return iree_ok_status(); |
| } |
| |
| int main(int argc, char** argv) { |
| iree_flags_set_usage( |
| "executable_library_benchmark", |
| "Benchmarks a single entry point within an executable library.\n" |
| "Executable libraries can be found in your temp path when compiling\n" |
| "with `-iree-llvm-keep-linker-artifacts`. The parameters used can be\n" |
| "inferred from the entry point `hal.interface` and dispatches to it.\n" |
| "\n" |
| "Note that this tool is intentionally low level: you must specify all\n" |
| "of the push constant/binding parameters precisely as they are expected\n" |
| "by the executable. `iree-benchmark-module` is the user-friendly\n" |
| "benchmarking tool while this one favors direct access to the\n" |
| "executables (bypassing all of the IREE VM, HAL APIs, task system,\n" |
| "etc).\n" |
| "\n" |
| "Example --flagfile:\n" |
| " --executable_format=EX_ELF\n" |
| " --executable_file=iree/hal/local/elf/testdata/" |
| "elementwise_mul_x86_64.so\n" |
| " --entry_point=0\n" |
| " --workgroup_count_x=1\n" |
| " --workgroup_count_y=1\n" |
| " --workgroup_count_z=1\n" |
| " --workgroup_size_x=1\n" |
| " --workgroup_size_y=1\n" |
| " --workgroup_size_z=1\n" |
| " --binding=4xf32=1,2,3,4\n" |
| " --binding=4xf32=100,200,300,400\n" |
| " --binding=4xf32=0,0,0,0);\n" |
| "\n"); |
| |
| iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv); |
| iree_benchmark_initialize(&argc, argv); |
| |
| // TODO(benvanik): override these with our own flags. |
| iree_benchmark_def_t benchmark_def = { |
| .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME | |
| IREE_BENCHMARK_FLAG_USE_REAL_TIME, |
| .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND, |
| .minimum_duration_ns = 0, |
| .iteration_count = 0, |
| .run = iree_hal_executable_library_run, |
| }; |
| iree_benchmark_register(iree_make_cstring_view("dispatch"), &benchmark_def); |
| |
| iree_benchmark_run_specified(); |
| return 0; |
| } |