Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 1 | // Copyright 2024 The IREE Authors |
| 2 | // |
| 3 | // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | |
| 7 | #include <stdint.h> |
| 8 | #include <stdio.h> |
| 9 | #include <stdlib.h> |
| 10 | #include <string.h> |
| 11 | |
| 12 | #include "iree/base/api.h" |
| 13 | #include "iree/base/internal/file_io.h" |
| 14 | #include "iree/base/internal/flags.h" |
| 15 | #include "iree/hal/api.h" |
| 16 | #include "iree/modules/hal/types.h" |
| 17 | #include "iree/testing/benchmark.h" |
| 18 | #include "iree/tooling/device_util.h" |
| 19 | #include "iree/tooling/function_io.h" |
| 20 | #include "iree/vm/api.h" |
| 21 | |
| 22 | IREE_FLAG( |
| 23 | int32_t, batch_size, 64, |
| 24 | "Number of dispatches to perform per command buffer submission.\n" |
| 25 | "Higher numbers will reduce the effect of submission overheads on the\n" |
| 26 | "final timings but too high a value may result in hangs."); |
| 27 | |
| 28 | IREE_FLAG(string, executable_format, "", |
| 29 | "Format of the executable file being loaded."); |
| 30 | IREE_FLAG(string, executable_file, "", "Path to the executable file to load."); |
| 31 | |
| 32 | IREE_FLAG(int32_t, entry_point, 0, "Entry point ordinal to run."); |
| 33 | |
| 34 | IREE_FLAG_LIST( |
| 35 | string, workgroup_count, |
| 36 | "`x,y,z` dimensions of the workgroup count defining the number of\n" |
| 37 | "workgroup invocations that will be run per benchmark iteration.\n" |
| 38 | "Each occurrence of the flag will run a benchmark with that set of\n" |
| 39 | "workgroup count values."); |
| 40 | |
// Upper bound on executable-level constants accepted from flags. This only
// sizes the static storage below and could be made dynamically growable.
#define IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT 512
// Upper bound on push constants accepted from flags for any executable.
#define IREE_HAL_MAX_PUSH_CONSTANT_COUNT 64
// Maximum number of descriptor sets in a pipeline layout.
#define IREE_HAL_MAX_DESCRIPTOR_SET_COUNT 2
// Upper bound on bindings accepted from flags across all descriptor sets
// (32 per descriptor set).
#define IREE_HAL_MAX_TOTAL_BINDING_COUNT \
  (IREE_HAL_MAX_DESCRIPTOR_SET_COUNT * 32)
| 52 | |
| 53 | // Parsed dispatch parameters from flags. |
| 54 | // Used to construct the dispatch parameters for the benchmark invocation. |
| 55 | struct { |
| 56 | int32_t set_count; |
| 57 | struct { |
| 58 | // For now we only track the binding counts and assume they are all storage |
| 59 | // buffers. When we support more types we'll need an encoding. |
| 60 | int32_t binding_count; |
| 61 | } sets[IREE_HAL_MAX_DESCRIPTOR_SET_COUNT]; |
| 62 | |
| 63 | int32_t executable_constant_count; |
| 64 | union { |
| 65 | uint32_t ui32; |
| 66 | } executable_constants[IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT]; |
| 67 | |
| 68 | int32_t push_constant_count; |
| 69 | union { |
| 70 | uint32_t ui32; |
| 71 | } push_constants[IREE_HAL_MAX_PUSH_CONSTANT_COUNT]; |
| 72 | |
| 73 | int32_t binding_count; |
| 74 | iree_string_view_t binding_specs[IREE_HAL_MAX_TOTAL_BINDING_COUNT]; |
| 75 | char binding_cconv[IREE_HAL_MAX_TOTAL_BINDING_COUNT]; |
| 76 | iree_hal_descriptor_set_layout_binding_t |
| 77 | binding_layouts[IREE_HAL_MAX_TOTAL_BINDING_COUNT]; |
| 78 | } parsed_params = { |
| 79 | .executable_constant_count = 0, |
| 80 | .push_constant_count = 0, |
| 81 | .binding_count = 0, |
| 82 | }; |
| 83 | |
| 84 | static iree_status_t parse_executable_constant(iree_string_view_t flag_name, |
| 85 | void* storage, |
| 86 | iree_string_view_t value) { |
| 87 | IREE_ASSERT_LE(parsed_params.executable_constant_count + 1, |
| 88 | IREE_ARRAYSIZE(parsed_params.executable_constants), |
| 89 | "too many executable constants"); |
| 90 | uint32_t value_ui32 = 0; |
| 91 | if (!iree_string_view_atoi_uint32(value, &value_ui32)) { |
| 92 | return iree_make_status( |
| 93 | IREE_STATUS_INVALID_ARGUMENT, |
| 94 | "invalid executable constant value `%.*s`; expects uint32_t", |
| 95 | (int)value.size, value.data); |
| 96 | } |
| 97 | parsed_params.executable_constants[parsed_params.executable_constant_count++] |
| 98 | .ui32 = value_ui32; |
| 99 | return iree_ok_status(); |
| 100 | } |
| 101 | static void print_executable_constant(iree_string_view_t flag_name, |
| 102 | void* storage, FILE* file) { |
| 103 | if (parsed_params.executable_constant_count == 0) { |
| 104 | fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size, |
| 105 | flag_name.data); |
| 106 | return; |
| 107 | } |
| 108 | for (int32_t i = 0; i < parsed_params.executable_constant_count; ++i) { |
| 109 | fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data, |
| 110 | parsed_params.executable_constants[i].ui32); |
| 111 | if (i < parsed_params.executable_constant_count - 1) { |
| 112 | fprintf(file, "\n"); |
| 113 | } |
| 114 | } |
| 115 | } |
| 116 | IREE_FLAG_CALLBACK(parse_executable_constant, print_executable_constant, |
| 117 | &parsed_params, executable_constant, |
| 118 | "Appends a uint32_t executable constant value.\n"); |
| 119 | |
| 120 | static iree_status_t parse_push_constant(iree_string_view_t flag_name, |
| 121 | void* storage, |
| 122 | iree_string_view_t value) { |
| 123 | IREE_ASSERT_LE(parsed_params.push_constant_count + 1, |
| 124 | IREE_ARRAYSIZE(parsed_params.push_constants), |
| 125 | "too many push constants"); |
| 126 | uint32_t value_ui32 = 0; |
| 127 | if (!iree_string_view_atoi_uint32(value, &value_ui32)) { |
| 128 | return iree_make_status( |
| 129 | IREE_STATUS_INVALID_ARGUMENT, |
| 130 | "invalid push constant value `%.*s`; expects uint32_t", (int)value.size, |
| 131 | value.data); |
| 132 | } |
| 133 | parsed_params.push_constants[parsed_params.push_constant_count++].ui32 = |
| 134 | value_ui32; |
| 135 | return iree_ok_status(); |
| 136 | } |
| 137 | static void print_push_constant(iree_string_view_t flag_name, void* storage, |
| 138 | FILE* file) { |
| 139 | if (parsed_params.push_constant_count == 0) { |
| 140 | fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size, |
| 141 | flag_name.data); |
| 142 | return; |
| 143 | } |
| 144 | for (int32_t i = 0; i < parsed_params.push_constant_count; ++i) { |
| 145 | fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data, |
| 146 | parsed_params.push_constants[i].ui32); |
| 147 | if (i < parsed_params.push_constant_count - 1) { |
| 148 | fprintf(file, "\n"); |
| 149 | } |
| 150 | } |
| 151 | } |
| 152 | IREE_FLAG_CALLBACK(parse_push_constant, print_push_constant, &parsed_params, |
| 153 | push_constant, "Appends a uint32_t push constant value.\n"); |
| 154 | |
| 155 | static iree_status_t parse_binding(iree_string_view_t flag_name, void* storage, |
| 156 | iree_string_view_t value) { |
| 157 | IREE_ASSERT_LE(parsed_params.binding_count + 1, |
| 158 | IREE_ARRAYSIZE(parsed_params.binding_specs), |
| 159 | "too many bindings"); |
| 160 | int32_t i = parsed_params.binding_count++; |
| 161 | parsed_params.binding_specs[i] = value; |
| 162 | parsed_params.binding_cconv[i] = 'r'; |
| 163 | // TODO(benvanik): allow for a specification of type/immutability. |
| 164 | parsed_params.binding_layouts[i] = (iree_hal_descriptor_set_layout_binding_t){ |
| 165 | .binding = (uint32_t)i, |
| 166 | .type = IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER, |
| 167 | .flags = IREE_HAL_DESCRIPTOR_FLAG_NONE, |
| 168 | }; |
| 169 | return iree_ok_status(); |
| 170 | } |
| 171 | static void print_binding(iree_string_view_t flag_name, void* storage, |
| 172 | FILE* file) { |
| 173 | if (parsed_params.binding_count == 0) { |
| 174 | fprintf(file, "# --%.*s=\"shapextype[=values]\"\n", (int)flag_name.size, |
| 175 | flag_name.data); |
| 176 | return; |
| 177 | } |
| 178 | for (int32_t i = 0; i < parsed_params.binding_count; ++i) { |
| 179 | const iree_string_view_t binding_spec = parsed_params.binding_specs[i]; |
| 180 | fprintf(file, "--%.*s=\"%.*s\"\n", (int)flag_name.size, flag_name.data, |
| 181 | (int)binding_spec.size, binding_spec.data); |
| 182 | } |
| 183 | } |
| 184 | IREE_FLAG_CALLBACK( |
| 185 | parse_binding, print_binding, &parsed_params, binding, |
| 186 | "Appends a binding to the dispatch parameters.\n" |
| 187 | "Bindings are defined by their shape, element type, and their data.\n" |
| 188 | "There must be one binding for every declared layout binding.\n" |
| 189 | "Examples:\n" |
| 190 | " # 16 4-byte elements zero-initialized:\n" |
| 191 | " --binding=2x8xi32\n" |
| 192 | " # 10000 bytes all initialized to 123:\n" |
| 193 | " --binding=10000xi8=123\n" |
| 194 | " # 2 4-byte floating-point values with contents [[1.4], [2.1]]:\n" |
| 195 | " --binding=2x1xf32=1.4,2.1\n" |
| 196 | " # First array from a numpy file followed by the second:\n" |
| 197 | " --binding=@file.npy\n" |
| 198 | " --binding=+file.npy\n" |
| 199 | " # All arrays from a numpy file\n" |
| 200 | " --binding=*file.npy\n" |
| 201 | " # Binary tensor<2x2xf32> and tensor<4xf32> read from a single file\n" |
| 202 | " --binding=2x2xf32=@file.ext\n" |
| 203 | " --binding=4xf32=+file.ext"); |
| 204 | |
| 205 | typedef struct iree_benchmark_executable_args_t { |
| 206 | iree_hal_device_t* device; |
| 207 | iree_hal_executable_t* executable; |
| 208 | iree_hal_pipeline_layout_t* pipeline_layout; |
| 209 | const iree_hal_descriptor_set_binding_t* bindings; |
| 210 | uint32_t workgroup_count[3]; |
| 211 | } iree_benchmark_executable_args_t; |
| 212 | |
| 213 | // NOTE: error handling is here just for better diagnostics: it is not tracking |
| 214 | // allocations correctly and will leak. Don't use this as an example for how to |
| 215 | // write robust code. |
| 216 | static iree_status_t iree_benchmark_executable_run( |
| 217 | const iree_benchmark_def_t* benchmark_def, |
| 218 | iree_benchmark_state_t* benchmark_state) { |
| 219 | iree_benchmark_executable_args_t* args = |
| 220 | (iree_benchmark_executable_args_t*)benchmark_def->user_data; |
| 221 | |
| 222 | iree_hal_semaphore_t* fence_semaphore = NULL; |
| 223 | uint64_t fence_value = 0ull; |
| 224 | IREE_RETURN_IF_ERROR( |
| 225 | iree_hal_semaphore_create(args->device, fence_value, &fence_semaphore)); |
| 226 | iree_hal_semaphore_list_t wait_semaphore_list = |
| 227 | iree_hal_semaphore_list_empty(); |
| 228 | iree_hal_semaphore_list_t signal_semaphore_list = { |
| 229 | .count = 1, |
| 230 | .semaphores = &fence_semaphore, |
| 231 | .payload_values = &fence_value, |
| 232 | }; |
| 233 | |
| 234 | // Start profiling now - all subsequent device operations will be what the |
| 235 | // user wants to measure. |
| 236 | IREE_RETURN_IF_ERROR(iree_hal_begin_profiling_from_flags(args->device)); |
| 237 | |
| 238 | // Submit the command buffer and wait for it to complete. |
| 239 | // Note that each iteration runs through the whole grid as it's important that |
| 240 | // we are testing the memory access patterns: if we just ran the same single |
| 241 | // workgroup processing the same exact region of memory over and over we are |
| 242 | // not testing cache effects. This means we need to account for the total |
| 243 | // number of workgroups executed. |
| 244 | int64_t dispatch_count = 0; |
| 245 | while (iree_benchmark_keep_running(benchmark_state, FLAG_batch_size)) { |
| 246 | // TODO(benvanik): record a secondary command buffer and just replay it |
| 247 | // here. This should fix the overhead at just primary command buffer |
| 248 | // creation. Most backends don't support reusable command buffers, yet, and |
| 249 | // some only support inline execution so we are conservatively doing that. |
| 250 | // In the future we should have an option (possibly based on device query) |
| 251 | // as to which path to use. |
| 252 | |
| 253 | // Record a command buffer with the dispatches. |
| 254 | // Note that today we are doing this inside of the benchmark loop so that |
| 255 | // we can use inline execution. This is a boost to devices that support it |
| 256 | // like CUDA streams and synchronous CPU executors but a pessimization to |
| 257 | // devices that benefit from reusable command buffers like CUDA graphs. |
| 258 | // In the future we can add a flag that switches the mode between |
| 259 | // reusable and one-shot. |
| 260 | iree_hal_command_buffer_t* command_buffer = NULL; |
| 261 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create( |
| 262 | args->device, |
| 263 | IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT | |
| 264 | IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION, |
| 265 | IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, |
| 266 | /*binding_capacity=*/0, &command_buffer)); |
| 267 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer)); |
| 268 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_constants( |
| 269 | command_buffer, args->pipeline_layout, /*offset=*/0, |
| 270 | &parsed_params.push_constants[0].ui32, |
| 271 | parsed_params.push_constant_count * |
| 272 | sizeof(parsed_params.push_constants[0]))); |
| 273 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_descriptor_set( |
| 274 | command_buffer, args->pipeline_layout, /*set=*/0, |
| 275 | parsed_params.binding_count, args->bindings)); |
| 276 | for (int32_t i = 0; i < FLAG_batch_size; ++i) { |
| 277 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch( |
| 278 | command_buffer, args->executable, FLAG_entry_point, |
| 279 | args->workgroup_count[0], args->workgroup_count[1], |
| 280 | args->workgroup_count[2])); |
| 281 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier( |
| 282 | command_buffer, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE, |
| 283 | IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE, |
| 284 | IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, 0, NULL, 0, NULL)); |
| 285 | } |
| 286 | IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer)); |
| 287 | |
| 288 | // Submit the command buffer; if the device could not start executing while |
| 289 | // we were recording then this will kick off the execution. |
| 290 | ++fence_value; |
| 291 | IREE_RETURN_IF_ERROR(iree_hal_device_queue_execute( |
| 292 | args->device, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list, |
| 293 | signal_semaphore_list, 1, &command_buffer)); |
| 294 | |
| 295 | // Block and wait for the submission to complete. |
| 296 | // Note that this will include round-trip overhead and if the dispatch or |
| 297 | // batch size is small then the final time may end up being mostly overhead. |
| 298 | IREE_RETURN_IF_ERROR(iree_hal_semaphore_wait(fence_semaphore, fence_value, |
| 299 | iree_infinite_timeout())); |
| 300 | |
| 301 | iree_benchmark_pause_timing(benchmark_state); |
| 302 | |
| 303 | // Don't count cleanup time in the benchmark. |
| 304 | iree_hal_command_buffer_release(command_buffer); |
| 305 | |
| 306 | // Accumulate the total number of dispatches executed. |
| 307 | dispatch_count += FLAG_batch_size; |
| 308 | |
| 309 | // Flush profiling if recording. Note that we don't want to include the |
| 310 | // profiling time in the benchmark result. |
| 311 | IREE_RETURN_IF_ERROR(iree_hal_device_profiling_flush(args->device)); |
| 312 | |
| 313 | iree_benchmark_resume_timing(benchmark_state); |
| 314 | } |
| 315 | |
| 316 | // End profiling before cleaning up so tooling doesn't capture it. |
| 317 | IREE_RETURN_IF_ERROR(iree_hal_end_profiling_from_flags(args->device)); |
| 318 | |
| 319 | // To get a total time per invocation we set the item count to the total |
| 320 | // invocations dispatched. That gives us both total dispatch and single |
| 321 | // invocation times in the reporter output. |
| 322 | int64_t total_invocations = dispatch_count * args->workgroup_count[0] * |
| 323 | args->workgroup_count[1] * |
| 324 | args->workgroup_count[2]; |
| 325 | iree_benchmark_set_items_processed(benchmark_state, total_invocations); |
| 326 | |
| 327 | iree_hal_semaphore_release(fence_semaphore); |
| 328 | |
| 329 | return iree_ok_status(); |
| 330 | } |
| 331 | |
| 332 | // Parses an `x,y,z` workgroup count. |
| 333 | static iree_status_t iree_parse_workgroup_count( |
| 334 | iree_string_view_t workgroup_count_str, uint32_t* out_workgroup_count) { |
| 335 | iree_string_view_t str = workgroup_count_str; |
| 336 | iree_string_view_t str_x; |
| 337 | iree_string_view_split(str, ',', &str_x, &str); |
| 338 | iree_string_view_t str_y; |
| 339 | iree_string_view_split(str, ',', &str_y, &str); |
| 340 | iree_string_view_t str_z = str; |
| 341 | if (!iree_string_view_atoi_uint32(str_x, &out_workgroup_count[0]) || |
| 342 | !iree_string_view_atoi_uint32(str_y, &out_workgroup_count[1]) || |
| 343 | !iree_string_view_atoi_uint32(str_z, &out_workgroup_count[2])) { |
| 344 | return iree_make_status( |
| 345 | IREE_STATUS_INVALID_ARGUMENT, |
| 346 | "invalid workgroup count string `%.*s`; expects `X,Y,Z`", |
| 347 | (int)workgroup_count_str.size, workgroup_count_str.data); |
| 348 | } |
| 349 | return iree_ok_status(); |
| 350 | } |
| 351 | |
| 352 | // Runs one benchmark per workgroup count specified using the same device |
| 353 | // and input/output buffers. |
| 354 | static iree_status_t iree_benchmark_executable_from_flags( |
| 355 | iree_allocator_t host_allocator) { |
| 356 | iree_vm_instance_t* instance = NULL; |
| 357 | IREE_RETURN_IF_ERROR(iree_vm_instance_create(IREE_VM_TYPE_CAPACITY_DEFAULT, |
| 358 | host_allocator, &instance)); |
| 359 | IREE_RETURN_IF_ERROR(iree_hal_module_register_inline_types(instance)); |
| 360 | |
| 361 | // Create the HAL device we'll be using during execution. |
| 362 | // Devices can be very expensive to create and we want to avoid doing it |
| 363 | // multiple times throughout the benchmark execution. |
| 364 | iree_hal_device_t* device = NULL; |
| 365 | IREE_RETURN_IF_ERROR(iree_hal_create_device_from_flags( |
| 366 | iree_hal_available_driver_registry(), iree_hal_default_device_uri(), |
| 367 | host_allocator, &device)); |
| 368 | |
| 369 | // We'll reuse the same executable cache so that once we load the executable |
| 370 | // we'll be able to reuse any driver-side optimizations. |
| 371 | iree_hal_executable_cache_t* executable_cache = NULL; |
| 372 | iree_status_t loop_status = iree_ok_status(); |
| 373 | IREE_RETURN_IF_ERROR(iree_hal_executable_cache_create( |
| 374 | device, iree_make_cstring_view("cache"), iree_loop_inline(&loop_status), |
| 375 | &executable_cache)); |
| 376 | IREE_RETURN_IF_ERROR(loop_status); |
| 377 | |
| 378 | // Allocate storage for buffers and populate them. |
| 379 | // They only need to remain valid for the duration of the invocation and all |
| 380 | // memory accessed by the invocation will come from here. |
| 381 | // Note that we do this parsing first so that we can reflect on the I/O to |
| 382 | // infer the pipeline layout. |
| 383 | iree_hal_allocator_t* device_allocator = iree_hal_device_allocator(device); |
| 384 | iree_vm_list_t* binding_list = NULL; |
| 385 | IREE_RETURN_IF_ERROR(iree_tooling_parse_variants( |
| 386 | iree_make_string_view(parsed_params.binding_cconv, |
| 387 | parsed_params.binding_count), |
| 388 | (iree_string_view_list_t){parsed_params.binding_count, |
| 389 | parsed_params.binding_specs}, |
| 390 | device, device_allocator, host_allocator, &binding_list)); |
| 391 | iree_hal_descriptor_set_binding_t bindings[IREE_HAL_MAX_TOTAL_BINDING_COUNT]; |
| 392 | for (iree_host_size_t i = 0; i < parsed_params.binding_count; ++i) { |
| 393 | iree_vm_ref_t value = iree_vm_ref_null(); |
| 394 | IREE_RETURN_IF_ERROR(iree_vm_list_get_ref_assign(binding_list, i, &value)); |
| 395 | iree_hal_buffer_t* buffer = NULL; |
| 396 | if (iree_hal_buffer_isa(value)) { |
| 397 | buffer = iree_hal_buffer_deref(value); |
| 398 | } else if (iree_hal_buffer_view_isa(value)) { |
| 399 | buffer = iree_hal_buffer_view_buffer(iree_hal_buffer_view_deref(value)); |
| 400 | } else { |
| 401 | return iree_make_status( |
| 402 | IREE_STATUS_INVALID_ARGUMENT, |
| 403 | "bindings must be shaped types (4xf32, etc), binding %" PRIhsz |
| 404 | " is not", |
| 405 | i); |
| 406 | } |
| 407 | bindings[i] = (iree_hal_descriptor_set_binding_t){ |
| 408 | .binding = i, |
| 409 | .buffer_slot = 0, |
| 410 | .buffer = buffer, |
| 411 | .offset = 0, |
| 412 | .length = IREE_WHOLE_BUFFER, |
| 413 | }; |
| 414 | } |
| 415 | |
| 416 | // Setup the specification used to perform the executable load. |
| 417 | // This information is normally used to select the appropriate loader but in |
| 418 | // this benchmark we only have a single one. |
| 419 | // TODO(benvanik): expose the flags once they are implemented anywhere. |
| 420 | iree_hal_executable_params_t executable_params; |
| 421 | iree_hal_executable_params_initialize(&executable_params); |
| 422 | executable_params.caching_mode = |
| 423 | IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION | |
| 424 | IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA; |
| 425 | |
| 426 | // Load the executable data into memory. |
| 427 | // In normal usage this would be mapped from the containing module file (which |
| 428 | // itself may be mapped from disk). |
| 429 | iree_file_contents_t* file_contents = NULL; |
| 430 | if (strcmp(FLAG_executable_file, "-") == 0) { |
| 431 | IREE_RETURN_IF_ERROR( |
| 432 | iree_stdin_read_contents(host_allocator, &file_contents)); |
| 433 | } else { |
| 434 | IREE_RETURN_IF_ERROR(iree_file_read_contents( |
| 435 | FLAG_executable_file, IREE_FILE_READ_FLAG_DEFAULT, host_allocator, |
| 436 | &file_contents)); |
| 437 | } |
| 438 | executable_params.executable_format = |
| 439 | iree_make_cstring_view(FLAG_executable_format); |
| 440 | executable_params.executable_data = file_contents->const_buffer; |
| 441 | |
| 442 | // Setup the layouts defining how each entry point is interpreted. |
| 443 | iree_hal_pipeline_layout_t* pipeline_layout = NULL; |
| 444 | iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL; |
| 445 | IREE_RETURN_IF_ERROR(iree_hal_descriptor_set_layout_create( |
| 446 | device, IREE_HAL_DESCRIPTOR_SET_LAYOUT_FLAG_NONE, |
| 447 | parsed_params.binding_count, parsed_params.binding_layouts, |
| 448 | &descriptor_set_layout)); |
| 449 | IREE_RETURN_IF_ERROR(iree_hal_pipeline_layout_create( |
| 450 | device, parsed_params.push_constant_count, |
| 451 | /*set_layout_count=*/1, &descriptor_set_layout, &pipeline_layout)); |
| 452 | executable_params.pipeline_layout_count = 1; |
| 453 | executable_params.pipeline_layouts = &pipeline_layout; |
| 454 | |
| 455 | // Executable-level constants allow us to perform some basic load-time value |
| 456 | // propagation - usually dependent on device features or tuning parameters. |
| 457 | executable_params.constant_count = parsed_params.executable_constant_count; |
| 458 | executable_params.constants = &parsed_params.executable_constants[0].ui32; |
| 459 | |
| 460 | // Perform the load, which will fail if the executable cannot be loaded or |
| 461 | // there was an issue with the layouts. |
| 462 | iree_hal_executable_t* executable = NULL; |
| 463 | IREE_RETURN_IF_ERROR(iree_hal_executable_cache_prepare_executable( |
| 464 | executable_cache, &executable_params, &executable)); |
| 465 | |
| 466 | // Register one benchmark per workgroup count specified. |
| 467 | iree_benchmark_executable_args_t* args = NULL; |
| 468 | IREE_RETURN_IF_ERROR(iree_allocator_malloc( |
| 469 | host_allocator, sizeof(*args) * FLAG_workgroup_count_list().count, |
| 470 | (void**)&args)); |
| 471 | for (iree_host_size_t i = 0; i < FLAG_workgroup_count_list().count; ++i) { |
| 472 | args[i] = (iree_benchmark_executable_args_t){ |
| 473 | .device = device, |
| 474 | .executable = executable, |
| 475 | .pipeline_layout = pipeline_layout, |
| 476 | .bindings = bindings, |
| 477 | .workgroup_count = {1, 1, 1}, |
| 478 | }; |
| 479 | IREE_RETURN_IF_ERROR(iree_parse_workgroup_count( |
| 480 | FLAG_workgroup_count_list().values[i], args[i].workgroup_count)); |
| 481 | iree_benchmark_def_t benchmark_def = { |
| 482 | .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME | |
| 483 | IREE_BENCHMARK_FLAG_USE_REAL_TIME, |
| 484 | .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND, |
| 485 | .minimum_duration_ns = 0, |
| 486 | .iteration_count = 0, |
| 487 | .run = iree_benchmark_executable_run, |
| 488 | .user_data = &args[i], |
| 489 | }; |
| 490 | char benchmark_name[512]; |
| 491 | snprintf(benchmark_name, sizeof(benchmark_name) - 1, "dispatch_%ux%ux%u", |
| 492 | args[i].workgroup_count[0], args[i].workgroup_count[1], |
| 493 | args[i].workgroup_count[2]); |
| 494 | iree_benchmark_register(iree_make_cstring_view(benchmark_name), |
| 495 | &benchmark_def); |
| 496 | } |
| 497 | iree_benchmark_run_specified(); |
| 498 | iree_allocator_free(host_allocator, args); |
| 499 | |
| 500 | iree_vm_list_release(binding_list); |
| 501 | iree_hal_executable_release(executable); |
| 502 | iree_hal_descriptor_set_layout_release(descriptor_set_layout); |
| 503 | iree_hal_pipeline_layout_release(pipeline_layout); |
| 504 | iree_file_contents_free(file_contents); |
| 505 | iree_hal_executable_cache_release(executable_cache); |
| 506 | iree_hal_device_release(device); |
| 507 | iree_vm_instance_release(instance); |
| 508 | |
| 509 | return iree_ok_status(); |
| 510 | } |
| 511 | |
| 512 | int main(int argc, char** argv) { |
| 513 | IREE_TRACE_APP_ENTER(); |
| 514 | IREE_TRACE_ZONE_BEGIN(z0); |
| 515 | |
| 516 | iree_allocator_t host_allocator = iree_allocator_system(); |
| 517 | int exit_code = EXIT_SUCCESS; |
| 518 | |
| 519 | iree_flags_set_usage( |
| 520 | "iree-benchmark-executable", |
| 521 | "Benchmarks a single entry point within an executable library.\n" |
| 522 | "The parameters used can be inferred from the entry point " |
| 523 | "`hal.interface` and dispatches to it in the source program.\n" |
| 524 | "\n" |
| 525 | "Executables can be extracted from VMFB files using `unzip` or dumped\n" |
| 526 | "during compilation using --iree-hal-dump-executable-binaries-to=path/.\n" |
| 527 | "\n" |
| 528 | "The compiler can directly compile `hal.executable.source` and\n" |
| 529 | "`hal.executable` ops to the appropriate binaries by using the\n" |
| 530 | "`iree-compile --compile-mode=hal-executable` mode.\n" |
| 531 | "\n" |
| 532 | "Example flags for various compilation backends:\n" |
| 533 | " --iree-hal-target-backends=vmvx\n" |
| 534 | " --device=local-sync or --device=local-task\n" |
| 535 | " --executable_format=vmvx-bytecode-fb\n" |
| 536 | " --iree-hal-target-backends=llvm-cpu\n" |
| 537 | " --device=local-sync or --device=local-task\n" |
| 538 | " --executable_format=embedded-elf-x86_64\n" |
| 539 | " --executable_format=system-dll-x86_64\n" |
Ben Vanik | 1489584 | 2024-02-24 09:10:03 -0800 | [diff] [blame^] | 540 | " --iree-hal-target-backends=cuda\n" |
| 541 | " --device=cuda\n" |
| 542 | " --executable_format=cuda-nvptx-fb\n" |
Ben Vanik | 23f2828 | 2024-02-23 11:14:25 -0800 | [diff] [blame] | 543 | " --iree-hal-target-backends=vulkan-spirv\n" |
| 544 | " --device=vulkan\n" |
| 545 | " --executable_format=vulkan-spirv-fb\n" |
| 546 | "\n" |
| 547 | "Note that this tool is intentionally low level: you must specify all\n" |
| 548 | "of the push constant/binding parameters precisely as they are expected\n" |
| 549 | "by the executable. `iree-benchmark-module` is the user-friendly\n" |
| 550 | "benchmarking tool while this one favors direct access to the\n" |
| 551 | "executables (bypassing all of the IREE VM, HAL APIs, task system,\n" |
| 552 | "etc).\n" |
| 553 | "\n" |
| 554 | "Example --flagfile:\n" |
| 555 | " --device=local-sync\n" |
| 556 | " --executable_format=embedded-elf-x86_64\n" |
| 557 | " --executable_file=runtime/src/iree/hal/local/elf/testdata/" |
| 558 | "elementwise_mul_x86_64.so\n" |
| 559 | " --entry_point=0\n" |
| 560 | " --binding=4xf32=1,2,3,4\n" |
| 561 | " --binding=4xf32=100,200,300,400\n" |
| 562 | " --binding=4xf32=0,0,0,0\n" |
| 563 | " --workgroup_count=1,1,1\n" |
| 564 | "\n"); |
| 565 | |
| 566 | iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv); |
| 567 | iree_benchmark_initialize(&argc, argv); |
| 568 | |
| 569 | iree_status_t status = iree_benchmark_executable_from_flags(host_allocator); |
| 570 | if (!iree_status_is_ok(status)) { |
| 571 | iree_status_fprint(stderr, status); |
| 572 | iree_status_free(status); |
| 573 | exit_code = EXIT_FAILURE; |
| 574 | } |
| 575 | fflush(stderr); |
| 576 | |
| 577 | IREE_TRACE_ZONE_END(z0); |
| 578 | IREE_TRACE_APP_EXIT(exit_code); |
| 579 | return exit_code; |
| 580 | } |