Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 1 | // Copyright 2024 The IREE Authors |
| 2 | // |
| 3 | // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | |
| 7 | #include <stdint.h> |
| 8 | #include <stdio.h> |
| 9 | #include <stdlib.h> |
| 10 | #include <string.h> |
| 11 | |
| 12 | #include "iree/base/api.h" |
| 13 | #include "iree/base/internal/file_io.h" |
| 14 | #include "iree/base/internal/flags.h" |
| 15 | #include "iree/hal/api.h" |
| 16 | #include "iree/modules/hal/types.h" |
| 17 | #include "iree/testing/benchmark.h" |
| 18 | #include "iree/tooling/device_util.h" |
| 19 | #include "iree/tooling/function_io.h" |
| 20 | #include "iree/vm/api.h" |
| 21 | |
// Number of dispatches recorded into the command buffer that is resubmitted on
// every benchmark step.
IREE_FLAG(
    int32_t, batch_size, 64,
    "Number of dispatches to perform per command buffer submission.\n"
    "Higher numbers will reduce the effect of submission overheads on the\n"
    "final timings but too high a value may result in hangs.");

// Format identifier and path of the executable to load. A path of `-` makes
// the tool read the executable contents from stdin instead of a file.
IREE_FLAG(string, executable_format, "",
          "Format of the executable file being loaded.");
IREE_FLAG(string, executable_file, "", "Path to the executable file to load.");

// Entry point ordinal passed to every recorded dispatch.
IREE_FLAG(int32_t, entry_point, 0, "Entry point ordinal to run.");

// Repeatable flag: one benchmark is registered for each value provided.
IREE_FLAG_LIST(
    string, workgroup_count,
    "`x,y,z` dimensions of the workgroup count defining the number of\n"
    "workgroup invocations that will be run per benchmark iteration.\n"
    "Each occurrence of the flag will run a benchmark with that set of\n"
    "workgroup count values.");
| 40 | |
// Total number of executable-level constants we (currently) allow; this is only
// a limitation of how much memory we allocate and we could make this
// dynamically growable.
#define IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT 512
// Total number of push constants we (currently) allow any executable to have.
// Sizes the fixed storage that --constant flag values are appended to.
#define IREE_HAL_MAX_CONSTANT_COUNT 64
// Total number of bindings we (currently) allow any executable to have.
// Sizes the fixed storage that --binding flag values are appended to and the
// buffer reference table built from them.
#define IREE_HAL_MAX_BINDING_COUNT 64
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 49 | |
// Parsed dispatch parameters from flags.
// Used to construct the dispatch parameters for the benchmark invocation.
// Each flag parse callback appends one entry to the matching array and bumps
// the corresponding count.
struct {
  // Number of valid entries in |executable_constants|.
  int32_t executable_constant_count;
  // Values from --executable_constant, provided to the executable at load time.
  union {
    uint32_t ui32;
  } executable_constants[IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT];

  // Number of valid entries in |constants|.
  int32_t constant_count;
  // Values from --constant, passed as the constant data of every dispatch.
  union {
    uint32_t ui32;
  } constants[IREE_HAL_MAX_CONSTANT_COUNT];

  // Number of valid entries in |binding_specs| and |binding_cconv|.
  int32_t binding_count;
  // Unparsed --binding flag values (views into the flag text, not copies).
  iree_string_view_t binding_specs[IREE_HAL_MAX_BINDING_COUNT];
  // One character per binding; handed to iree_tooling_parse_variants together
  // with |binding_specs|.
  char binding_cconv[IREE_HAL_MAX_BINDING_COUNT];
} parsed_params = {
    .executable_constant_count = 0,
    .constant_count = 0,
    .binding_count = 0,
};
| 71 | |
| 72 | static iree_status_t parse_executable_constant(iree_string_view_t flag_name, |
| 73 | void* storage, |
| 74 | iree_string_view_t value) { |
| 75 | IREE_ASSERT_LE(parsed_params.executable_constant_count + 1, |
| 76 | IREE_ARRAYSIZE(parsed_params.executable_constants), |
| 77 | "too many executable constants"); |
| 78 | uint32_t value_ui32 = 0; |
| 79 | if (!iree_string_view_atoi_uint32(value, &value_ui32)) { |
| 80 | return iree_make_status( |
| 81 | IREE_STATUS_INVALID_ARGUMENT, |
| 82 | "invalid executable constant value `%.*s`; expects uint32_t", |
| 83 | (int)value.size, value.data); |
| 84 | } |
| 85 | parsed_params.executable_constants[parsed_params.executable_constant_count++] |
| 86 | .ui32 = value_ui32; |
| 87 | return iree_ok_status(); |
| 88 | } |
| 89 | static void print_executable_constant(iree_string_view_t flag_name, |
| 90 | void* storage, FILE* file) { |
| 91 | if (parsed_params.executable_constant_count == 0) { |
| 92 | fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size, |
| 93 | flag_name.data); |
| 94 | return; |
| 95 | } |
| 96 | for (int32_t i = 0; i < parsed_params.executable_constant_count; ++i) { |
| 97 | fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data, |
| 98 | parsed_params.executable_constants[i].ui32); |
| 99 | if (i < parsed_params.executable_constant_count - 1) { |
| 100 | fprintf(file, "\n"); |
| 101 | } |
| 102 | } |
| 103 | } |
// Registers the `executable_constant` flag with parse_executable_constant and
// print_executable_constant as its callbacks. Each occurrence of the flag
// appends another value.
IREE_FLAG_CALLBACK(parse_executable_constant, print_executable_constant,
                   &parsed_params, executable_constant,
                   "Appends a uint32_t executable constant value.\n");
| 107 | |
Ben Vanik | 9bbc926 | 2024-08-20 22:16:16 -0700 | [diff] [blame] | 108 | static iree_status_t parse_constant(iree_string_view_t flag_name, void* storage, |
| 109 | iree_string_view_t value) { |
| 110 | IREE_ASSERT_LE(parsed_params.constant_count + 1, |
| 111 | IREE_ARRAYSIZE(parsed_params.constants), |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 112 | "too many push constants"); |
| 113 | uint32_t value_ui32 = 0; |
| 114 | if (!iree_string_view_atoi_uint32(value, &value_ui32)) { |
| 115 | return iree_make_status( |
| 116 | IREE_STATUS_INVALID_ARGUMENT, |
| 117 | "invalid push constant value `%.*s`; expects uint32_t", (int)value.size, |
| 118 | value.data); |
| 119 | } |
Ben Vanik | 9bbc926 | 2024-08-20 22:16:16 -0700 | [diff] [blame] | 120 | parsed_params.constants[parsed_params.constant_count++].ui32 = value_ui32; |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 121 | return iree_ok_status(); |
| 122 | } |
Ben Vanik | 9bbc926 | 2024-08-20 22:16:16 -0700 | [diff] [blame] | 123 | static void print_constant(iree_string_view_t flag_name, void* storage, |
| 124 | FILE* file) { |
| 125 | if (parsed_params.constant_count == 0) { |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 126 | fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size, |
| 127 | flag_name.data); |
| 128 | return; |
| 129 | } |
Ben Vanik | 9bbc926 | 2024-08-20 22:16:16 -0700 | [diff] [blame] | 130 | for (int32_t i = 0; i < parsed_params.constant_count; ++i) { |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 131 | fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data, |
Ben Vanik | 9bbc926 | 2024-08-20 22:16:16 -0700 | [diff] [blame] | 132 | parsed_params.constants[i].ui32); |
| 133 | if (i < parsed_params.constant_count - 1) { |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 134 | fprintf(file, "\n"); |
| 135 | } |
| 136 | } |
| 137 | } |
// Registers the `constant` flag with parse_constant and print_constant as its
// callbacks. Each occurrence of the flag appends another value.
IREE_FLAG_CALLBACK(parse_constant, print_constant, &parsed_params, constant,
                   "Appends a uint32_t constant value.\n");
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 140 | |
| 141 | static iree_status_t parse_binding(iree_string_view_t flag_name, void* storage, |
| 142 | iree_string_view_t value) { |
| 143 | IREE_ASSERT_LE(parsed_params.binding_count + 1, |
| 144 | IREE_ARRAYSIZE(parsed_params.binding_specs), |
| 145 | "too many bindings"); |
| 146 | int32_t i = parsed_params.binding_count++; |
| 147 | parsed_params.binding_specs[i] = value; |
| 148 | parsed_params.binding_cconv[i] = 'r'; |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 149 | return iree_ok_status(); |
| 150 | } |
| 151 | static void print_binding(iree_string_view_t flag_name, void* storage, |
| 152 | FILE* file) { |
| 153 | if (parsed_params.binding_count == 0) { |
| 154 | fprintf(file, "# --%.*s=\"shapextype[=values]\"\n", (int)flag_name.size, |
| 155 | flag_name.data); |
| 156 | return; |
| 157 | } |
| 158 | for (int32_t i = 0; i < parsed_params.binding_count; ++i) { |
| 159 | const iree_string_view_t binding_spec = parsed_params.binding_specs[i]; |
| 160 | fprintf(file, "--%.*s=\"%.*s\"\n", (int)flag_name.size, flag_name.data, |
| 161 | (int)binding_spec.size, binding_spec.data); |
| 162 | } |
| 163 | } |
// Registers the `binding` flag with parse_binding and print_binding as its
// callbacks. Each occurrence of the flag appends another binding spec.
IREE_FLAG_CALLBACK(
    parse_binding, print_binding, &parsed_params, binding,
    "Appends a binding to the dispatch parameters.\n"
    "Bindings are defined by their shape, element type, and their data.\n"
    "There must be one binding for every declared layout binding.\n"
    "Examples:\n"
    " # 16 4-byte elements zero-initialized:\n"
    " --binding=2x8xi32\n"
    " # 10000 bytes all initialized to 123:\n"
    " --binding=10000xi8=123\n"
    " # 2 4-byte floating-point values with contents [[1.4], [2.1]]:\n"
    " --binding=2x1xf32=1.4,2.1\n"
    " # First array from a numpy file followed by the second:\n"
    " --binding=@file.npy\n"
    " --binding=+file.npy\n"
    " # All arrays from a numpy file\n"
    " --binding=*file.npy\n"
    " # Binary tensor<2x2xf32> and tensor<4xf32> read from a single file\n"
    " --binding=2x2xf32=@file.ext\n"
    " --binding=4xf32=+file.ext");
| 184 | |
// Per-benchmark arguments handed to iree_benchmark_executable_run through
// iree_benchmark_def_t::user_data. The pointers are plain copies of values
// owned by the code that registers the benchmarks.
typedef struct iree_benchmark_executable_args_t {
  // Device used to create, submit, and profile the benchmark work.
  iree_hal_device_t* device;
  // Loaded executable containing the entry point to dispatch.
  iree_hal_executable_t* executable;
  // Buffer references used as the dispatch bindings; the run function reads
  // parsed_params.binding_count entries from it.
  const iree_hal_buffer_ref_t* bindings;
  // Workgroup count passed to every dispatch of this benchmark.
  uint32_t workgroup_count[3];
} iree_benchmark_executable_args_t;
| 191 | |
| 192 | // NOTE: error handling is here just for better diagnostics: it is not tracking |
| 193 | // allocations correctly and will leak. Don't use this as an example for how to |
| 194 | // write robust code. |
| 195 | static iree_status_t iree_benchmark_executable_run( |
| 196 | const iree_benchmark_def_t* benchmark_def, |
| 197 | iree_benchmark_state_t* benchmark_state) { |
| 198 | iree_benchmark_executable_args_t* args = |
| 199 | (iree_benchmark_executable_args_t*)benchmark_def->user_data; |
| 200 | |
| 201 | iree_hal_semaphore_t* fence_semaphore = NULL; |
| 202 | uint64_t fence_value = 0ull; |
Ben Vanik | a28f76f | 2024-08-06 15:04:15 -0700 | [diff] [blame] | 203 | IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(args->device, fence_value, |
| 204 | IREE_HAL_SEMAPHORE_FLAG_NONE, |
| 205 | &fence_semaphore)); |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 206 | iree_hal_semaphore_list_t wait_semaphore_list = |
| 207 | iree_hal_semaphore_list_empty(); |
| 208 | iree_hal_semaphore_list_t signal_semaphore_list = { |
| 209 | .count = 1, |
| 210 | .semaphores = &fence_semaphore, |
| 211 | .payload_values = &fence_value, |
| 212 | }; |
| 213 | |
Ben Vanik | 894dfbe | 2024-08-13 11:41:10 -0700 | [diff] [blame] | 214 | // Record a command buffer with the dispatches. |
| 215 | // The same command buffer recording is reused on each benchmark step. |
| 216 | iree_hal_command_buffer_t* command_buffer = NULL; |
| 217 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create( |
| 218 | args->device, IREE_HAL_COMMAND_BUFFER_MODE_DEFAULT, |
| 219 | IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, |
| 220 | /*binding_capacity=*/0, &command_buffer)); |
| 221 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer)); |
Ben Vanik | 9bbc926 | 2024-08-20 22:16:16 -0700 | [diff] [blame] | 222 | iree_const_byte_span_t constants = iree_make_const_byte_span( |
| 223 | &parsed_params.constants[0].ui32, |
| 224 | parsed_params.constant_count * sizeof(parsed_params.constants[0])); |
Ben Vanik | 894dfbe | 2024-08-13 11:41:10 -0700 | [diff] [blame] | 225 | iree_hal_buffer_ref_list_t bindings = { |
| 226 | .count = parsed_params.binding_count, |
| 227 | .values = args->bindings, |
| 228 | }; |
| 229 | for (int32_t i = 0; i < FLAG_batch_size; ++i) { |
Ben Vanik | 7dc8c26 | 2024-08-22 14:43:33 -0700 | [diff] [blame] | 230 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch( |
Ben Vanik | 894dfbe | 2024-08-13 11:41:10 -0700 | [diff] [blame] | 231 | command_buffer, args->executable, FLAG_entry_point, |
| 232 | args->workgroup_count, constants, bindings, |
| 233 | IREE_HAL_DISPATCH_FLAG_NONE)); |
| 234 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier( |
| 235 | command_buffer, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE, |
| 236 | IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE, |
| 237 | IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, 0, NULL, 0, NULL)); |
| 238 | } |
| 239 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer)); |
| 240 | |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 241 | // Start profiling now - all subsequent device operations will be what the |
| 242 | // user wants to measure. |
| 243 | IREE_RETURN_IF_ERROR(iree_hal_begin_profiling_from_flags(args->device)); |
| 244 | |
| 245 | // Submit the command buffer and wait for it to complete. |
| 246 | // Note that each iteration runs through the whole grid as it's important that |
| 247 | // we are testing the memory access patterns: if we just ran the same single |
| 248 | // workgroup processing the same exact region of memory over and over we are |
| 249 | // not testing cache effects. This means we need to account for the total |
| 250 | // number of workgroups executed. |
| 251 | int64_t dispatch_count = 0; |
| 252 | while (iree_benchmark_keep_running(benchmark_state, FLAG_batch_size)) { |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 253 | // Submit the command buffer; if the device could not start executing while |
| 254 | // we were recording then this will kick off the execution. |
| 255 | ++fence_value; |
| 256 | IREE_RETURN_IF_ERROR(iree_hal_device_queue_execute( |
| 257 | args->device, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list, |
Ben Vanik | 13e6b7e | 2024-07-08 09:40:30 -0700 | [diff] [blame] | 258 | signal_semaphore_list, 1, &command_buffer, /*binding_tables=*/NULL)); |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 259 | |
| 260 | // Block and wait for the submission to complete. |
| 261 | // Note that this will include round-trip overhead and if the dispatch or |
| 262 | // batch size is small then the final time may end up being mostly overhead. |
| 263 | IREE_RETURN_IF_ERROR(iree_hal_semaphore_wait(fence_semaphore, fence_value, |
| 264 | iree_infinite_timeout())); |
| 265 | |
| 266 | iree_benchmark_pause_timing(benchmark_state); |
| 267 | |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 268 | // Accumulate the total number of dispatches executed. |
| 269 | dispatch_count += FLAG_batch_size; |
| 270 | |
| 271 | // Flush profiling if recording. Note that we don't want to include the |
| 272 | // profiling time in the benchmark result. |
| 273 | IREE_RETURN_IF_ERROR(iree_hal_device_profiling_flush(args->device)); |
| 274 | |
| 275 | iree_benchmark_resume_timing(benchmark_state); |
| 276 | } |
| 277 | |
| 278 | // End profiling before cleaning up so tooling doesn't capture it. |
| 279 | IREE_RETURN_IF_ERROR(iree_hal_end_profiling_from_flags(args->device)); |
| 280 | |
| 281 | // To get a total time per invocation we set the item count to the total |
| 282 | // invocations dispatched. That gives us both total dispatch and single |
| 283 | // invocation times in the reporter output. |
| 284 | int64_t total_invocations = dispatch_count * args->workgroup_count[0] * |
| 285 | args->workgroup_count[1] * |
| 286 | args->workgroup_count[2]; |
| 287 | iree_benchmark_set_items_processed(benchmark_state, total_invocations); |
| 288 | |
Ben Vanik | 894dfbe | 2024-08-13 11:41:10 -0700 | [diff] [blame] | 289 | iree_hal_command_buffer_release(command_buffer); |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 290 | iree_hal_semaphore_release(fence_semaphore); |
| 291 | |
| 292 | return iree_ok_status(); |
| 293 | } |
| 294 | |
| 295 | // Parses an `x,y,z` workgroup count. |
| 296 | static iree_status_t iree_parse_workgroup_count( |
| 297 | iree_string_view_t workgroup_count_str, uint32_t* out_workgroup_count) { |
| 298 | iree_string_view_t str = workgroup_count_str; |
| 299 | iree_string_view_t str_x; |
| 300 | iree_string_view_split(str, ',', &str_x, &str); |
| 301 | iree_string_view_t str_y; |
| 302 | iree_string_view_split(str, ',', &str_y, &str); |
| 303 | iree_string_view_t str_z = str; |
| 304 | if (!iree_string_view_atoi_uint32(str_x, &out_workgroup_count[0]) || |
| 305 | !iree_string_view_atoi_uint32(str_y, &out_workgroup_count[1]) || |
| 306 | !iree_string_view_atoi_uint32(str_z, &out_workgroup_count[2])) { |
| 307 | return iree_make_status( |
| 308 | IREE_STATUS_INVALID_ARGUMENT, |
| 309 | "invalid workgroup count string `%.*s`; expects `X,Y,Z`", |
| 310 | (int)workgroup_count_str.size, workgroup_count_str.data); |
| 311 | } |
| 312 | return iree_ok_status(); |
| 313 | } |
| 314 | |
| 315 | // Runs one benchmark per workgroup count specified using the same device |
| 316 | // and input/output buffers. |
| 317 | static iree_status_t iree_benchmark_executable_from_flags( |
| 318 | iree_allocator_t host_allocator) { |
| 319 | iree_vm_instance_t* instance = NULL; |
| 320 | IREE_RETURN_IF_ERROR(iree_vm_instance_create(IREE_VM_TYPE_CAPACITY_DEFAULT, |
| 321 | host_allocator, &instance)); |
| 322 | IREE_RETURN_IF_ERROR(iree_hal_module_register_inline_types(instance)); |
| 323 | |
| 324 | // Create the HAL device we'll be using during execution. |
| 325 | // Devices can be very expensive to create and we want to avoid doing it |
| 326 | // multiple times throughout the benchmark execution. |
| 327 | iree_hal_device_t* device = NULL; |
| 328 | IREE_RETURN_IF_ERROR(iree_hal_create_device_from_flags( |
| 329 | iree_hal_available_driver_registry(), iree_hal_default_device_uri(), |
| 330 | host_allocator, &device)); |
| 331 | |
| 332 | // We'll reuse the same executable cache so that once we load the executable |
| 333 | // we'll be able to reuse any driver-side optimizations. |
| 334 | iree_hal_executable_cache_t* executable_cache = NULL; |
| 335 | iree_status_t loop_status = iree_ok_status(); |
| 336 | IREE_RETURN_IF_ERROR(iree_hal_executable_cache_create( |
| 337 | device, iree_make_cstring_view("cache"), iree_loop_inline(&loop_status), |
| 338 | &executable_cache)); |
| 339 | IREE_RETURN_IF_ERROR(loop_status); |
| 340 | |
| 341 | // Allocate storage for buffers and populate them. |
| 342 | // They only need to remain valid for the duration of the invocation and all |
| 343 | // memory accessed by the invocation will come from here. |
| 344 | // Note that we do this parsing first so that we can reflect on the I/O to |
| 345 | // infer the pipeline layout. |
| 346 | iree_hal_allocator_t* device_allocator = iree_hal_device_allocator(device); |
| 347 | iree_vm_list_t* binding_list = NULL; |
| 348 | IREE_RETURN_IF_ERROR(iree_tooling_parse_variants( |
| 349 | iree_make_string_view(parsed_params.binding_cconv, |
| 350 | parsed_params.binding_count), |
| 351 | (iree_string_view_list_t){parsed_params.binding_count, |
| 352 | parsed_params.binding_specs}, |
| 353 | device, device_allocator, host_allocator, &binding_list)); |
Ben Vanik | 9bbc926 | 2024-08-20 22:16:16 -0700 | [diff] [blame] | 354 | iree_hal_buffer_ref_t bindings[IREE_HAL_MAX_BINDING_COUNT]; |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 355 | for (iree_host_size_t i = 0; i < parsed_params.binding_count; ++i) { |
| 356 | iree_vm_ref_t value = iree_vm_ref_null(); |
| 357 | IREE_RETURN_IF_ERROR(iree_vm_list_get_ref_assign(binding_list, i, &value)); |
| 358 | iree_hal_buffer_t* buffer = NULL; |
| 359 | if (iree_hal_buffer_isa(value)) { |
| 360 | buffer = iree_hal_buffer_deref(value); |
| 361 | } else if (iree_hal_buffer_view_isa(value)) { |
| 362 | buffer = iree_hal_buffer_view_buffer(iree_hal_buffer_view_deref(value)); |
| 363 | } else { |
| 364 | return iree_make_status( |
| 365 | IREE_STATUS_INVALID_ARGUMENT, |
| 366 | "bindings must be shaped types (4xf32, etc), binding %" PRIhsz |
| 367 | " is not", |
| 368 | i); |
| 369 | } |
Ben Vanik | 9ffe473 | 2024-07-08 17:10:45 -0700 | [diff] [blame] | 370 | bindings[i] = iree_hal_make_buffer_ref(buffer, 0, IREE_WHOLE_BUFFER); |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 371 | } |
| 372 | |
| 373 | // Setup the specification used to perform the executable load. |
| 374 | // This information is normally used to select the appropriate loader but in |
| 375 | // this benchmark we only have a single one. |
| 376 | // TODO(benvanik): expose the flags once they are implemented anywhere. |
| 377 | iree_hal_executable_params_t executable_params; |
| 378 | iree_hal_executable_params_initialize(&executable_params); |
| 379 | executable_params.caching_mode = |
| 380 | IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION | |
| 381 | IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA; |
| 382 | |
| 383 | // Load the executable data into memory. |
| 384 | // In normal usage this would be mapped from the containing module file (which |
| 385 | // itself may be mapped from disk). |
| 386 | iree_file_contents_t* file_contents = NULL; |
| 387 | if (strcmp(FLAG_executable_file, "-") == 0) { |
| 388 | IREE_RETURN_IF_ERROR( |
| 389 | iree_stdin_read_contents(host_allocator, &file_contents)); |
| 390 | } else { |
| 391 | IREE_RETURN_IF_ERROR(iree_file_read_contents( |
| 392 | FLAG_executable_file, IREE_FILE_READ_FLAG_DEFAULT, host_allocator, |
| 393 | &file_contents)); |
| 394 | } |
| 395 | executable_params.executable_format = |
| 396 | iree_make_cstring_view(FLAG_executable_format); |
| 397 | executable_params.executable_data = file_contents->const_buffer; |
| 398 | |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 399 | // Executable-level constants allow us to perform some basic load-time value |
| 400 | // propagation - usually dependent on device features or tuning parameters. |
| 401 | executable_params.constant_count = parsed_params.executable_constant_count; |
| 402 | executable_params.constants = &parsed_params.executable_constants[0].ui32; |
| 403 | |
| 404 | // Perform the load, which will fail if the executable cannot be loaded or |
| 405 | // there was an issue with the layouts. |
| 406 | iree_hal_executable_t* executable = NULL; |
| 407 | IREE_RETURN_IF_ERROR(iree_hal_executable_cache_prepare_executable( |
| 408 | executable_cache, &executable_params, &executable)); |
| 409 | |
| 410 | // Register one benchmark per workgroup count specified. |
| 411 | iree_benchmark_executable_args_t* args = NULL; |
| 412 | IREE_RETURN_IF_ERROR(iree_allocator_malloc( |
| 413 | host_allocator, sizeof(*args) * FLAG_workgroup_count_list().count, |
| 414 | (void**)&args)); |
| 415 | for (iree_host_size_t i = 0; i < FLAG_workgroup_count_list().count; ++i) { |
| 416 | args[i] = (iree_benchmark_executable_args_t){ |
| 417 | .device = device, |
| 418 | .executable = executable, |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 419 | .bindings = bindings, |
| 420 | .workgroup_count = {1, 1, 1}, |
| 421 | }; |
| 422 | IREE_RETURN_IF_ERROR(iree_parse_workgroup_count( |
| 423 | FLAG_workgroup_count_list().values[i], args[i].workgroup_count)); |
| 424 | iree_benchmark_def_t benchmark_def = { |
| 425 | .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME | |
| 426 | IREE_BENCHMARK_FLAG_USE_REAL_TIME, |
| 427 | .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND, |
| 428 | .minimum_duration_ns = 0, |
| 429 | .iteration_count = 0, |
| 430 | .run = iree_benchmark_executable_run, |
| 431 | .user_data = &args[i], |
| 432 | }; |
| 433 | char benchmark_name[512]; |
| 434 | snprintf(benchmark_name, sizeof(benchmark_name) - 1, "dispatch_%ux%ux%u", |
| 435 | args[i].workgroup_count[0], args[i].workgroup_count[1], |
| 436 | args[i].workgroup_count[2]); |
| 437 | iree_benchmark_register(iree_make_cstring_view(benchmark_name), |
| 438 | &benchmark_def); |
| 439 | } |
| 440 | iree_benchmark_run_specified(); |
| 441 | iree_allocator_free(host_allocator, args); |
| 442 | |
| 443 | iree_vm_list_release(binding_list); |
| 444 | iree_hal_executable_release(executable); |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 445 | iree_file_contents_free(file_contents); |
| 446 | iree_hal_executable_cache_release(executable_cache); |
| 447 | iree_hal_device_release(device); |
| 448 | iree_vm_instance_release(instance); |
| 449 | |
| 450 | return iree_ok_status(); |
| 451 | } |
| 452 | |
| 453 | int main(int argc, char** argv) { |
| 454 | IREE_TRACE_APP_ENTER(); |
| 455 | IREE_TRACE_ZONE_BEGIN(z0); |
| 456 | |
| 457 | iree_allocator_t host_allocator = iree_allocator_system(); |
| 458 | int exit_code = EXIT_SUCCESS; |
| 459 | |
| 460 | iree_flags_set_usage( |
| 461 | "iree-benchmark-executable", |
| 462 | "Benchmarks a single entry point within an executable library.\n" |
| 463 | "The parameters used can be inferred from the entry point " |
| 464 | "`hal.interface` and dispatches to it in the source program.\n" |
| 465 | "\n" |
| 466 | "Executables can be extracted from VMFB files using `unzip` or dumped\n" |
| 467 | "during compilation using --iree-hal-dump-executable-binaries-to=path/.\n" |
| 468 | "\n" |
| 469 | "The compiler can directly compile `hal.executable.source` and\n" |
| 470 | "`hal.executable` ops to the appropriate binaries by using the\n" |
| 471 | "`iree-compile --compile-mode=hal-executable` mode.\n" |
| 472 | "\n" |
| 473 | "Example flags for various compilation backends:\n" |
| 474 | " --iree-hal-target-backends=vmvx\n" |
| 475 | " --device=local-sync or --device=local-task\n" |
| 476 | " --executable_format=vmvx-bytecode-fb\n" |
| 477 | " --iree-hal-target-backends=llvm-cpu\n" |
| 478 | " --device=local-sync or --device=local-task\n" |
| 479 | " --executable_format=embedded-elf-x86_64\n" |
| 480 | " --executable_format=system-dll-x86_64\n" |
Ben Vanik | 1489584 | 2024-02-24 09:10:03 -0800 | [diff] [blame] | 481 | " --iree-hal-target-backends=cuda\n" |
| 482 | " --device=cuda\n" |
| 483 | " --executable_format=cuda-nvptx-fb\n" |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 484 | " --iree-hal-target-backends=vulkan-spirv\n" |
| 485 | " --device=vulkan\n" |
| 486 | " --executable_format=vulkan-spirv-fb\n" |
| 487 | "\n" |
| 488 | "Note that this tool is intentionally low level: you must specify all\n" |
| 489 | "of the push constant/binding parameters precisely as they are expected\n" |
| 490 | "by the executable. `iree-benchmark-module` is the user-friendly\n" |
| 491 | "benchmarking tool while this one favors direct access to the\n" |
| 492 | "executables (bypassing all of the IREE VM, HAL APIs, task system,\n" |
| 493 | "etc).\n" |
| 494 | "\n" |
| 495 | "Example --flagfile:\n" |
| 496 | " --device=local-sync\n" |
| 497 | " --executable_format=embedded-elf-x86_64\n" |
| 498 | " --executable_file=runtime/src/iree/hal/local/elf/testdata/" |
| 499 | "elementwise_mul_x86_64.so\n" |
| 500 | " --entry_point=0\n" |
| 501 | " --binding=4xf32=1,2,3,4\n" |
| 502 | " --binding=4xf32=100,200,300,400\n" |
| 503 | " --binding=4xf32=0,0,0,0\n" |
| 504 | " --workgroup_count=1,1,1\n" |
| 505 | "\n"); |
| 506 | |
| 507 | iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv); |
| 508 | iree_benchmark_initialize(&argc, argv); |
| 509 | |
| 510 | iree_status_t status = iree_benchmark_executable_from_flags(host_allocator); |
| 511 | if (!iree_status_is_ok(status)) { |
| 512 | iree_status_fprint(stderr, status); |
| 513 | iree_status_free(status); |
| 514 | exit_code = EXIT_FAILURE; |
| 515 | } |
| 516 | fflush(stderr); |
| 517 | |
| 518 | IREE_TRACE_ZONE_END(z0); |
| 519 | IREE_TRACE_APP_EXIT(exit_code); |
| 520 | return exit_code; |
| 521 | } |