// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
7#include <stdint.h>
8#include <stdio.h>
9#include <stdlib.h>
10#include <string.h>
11
12#include "iree/base/api.h"
13#include "iree/base/internal/file_io.h"
14#include "iree/base/internal/flags.h"
15#include "iree/hal/api.h"
16#include "iree/modules/hal/types.h"
17#include "iree/testing/benchmark.h"
18#include "iree/tooling/device_util.h"
19#include "iree/tooling/function_io.h"
20#include "iree/vm/api.h"
21
22IREE_FLAG(
23 int32_t, batch_size, 64,
24 "Number of dispatches to perform per command buffer submission.\n"
25 "Higher numbers will reduce the effect of submission overheads on the\n"
26 "final timings but too high a value may result in hangs.");
27
28IREE_FLAG(string, executable_format, "",
29 "Format of the executable file being loaded.");
30IREE_FLAG(string, executable_file, "", "Path to the executable file to load.");
31
32IREE_FLAG(int32_t, entry_point, 0, "Entry point ordinal to run.");
33
34IREE_FLAG_LIST(
35 string, workgroup_count,
36 "`x,y,z` dimensions of the workgroup count defining the number of\n"
37 "workgroup invocations that will be run per benchmark iteration.\n"
38 "Each occurrence of the flag will run a benchmark with that set of\n"
39 "workgroup count values.");
40
// Capacity of the executable-level constant table. This only bounds how much
// storage is reserved for parsed flag values; it could be made dynamically
// growable if the limit is ever hit.
#define IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT 512
// Capacity of the push constant table (per executable, for now).
#define IREE_HAL_MAX_PUSH_CONSTANT_COUNT 64
// Maximum number of descriptor sets in a pipeline layout.
#define IREE_HAL_MAX_DESCRIPTOR_SET_COUNT 2
// Capacity of the binding table across all descriptor sets (32 per set).
#define IREE_HAL_MAX_TOTAL_BINDING_COUNT \
  (IREE_HAL_MAX_DESCRIPTOR_SET_COUNT * 32)
52
53// Parsed dispatch parameters from flags.
54// Used to construct the dispatch parameters for the benchmark invocation.
55struct {
56 int32_t set_count;
57 struct {
58 // For now we only track the binding counts and assume they are all storage
59 // buffers. When we support more types we'll need an encoding.
60 int32_t binding_count;
61 } sets[IREE_HAL_MAX_DESCRIPTOR_SET_COUNT];
62
63 int32_t executable_constant_count;
64 union {
65 uint32_t ui32;
66 } executable_constants[IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT];
67
68 int32_t push_constant_count;
69 union {
70 uint32_t ui32;
71 } push_constants[IREE_HAL_MAX_PUSH_CONSTANT_COUNT];
72
73 int32_t binding_count;
74 iree_string_view_t binding_specs[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
75 char binding_cconv[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
76 iree_hal_descriptor_set_layout_binding_t
77 binding_layouts[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
78} parsed_params = {
79 .executable_constant_count = 0,
80 .push_constant_count = 0,
81 .binding_count = 0,
82};
83
84static iree_status_t parse_executable_constant(iree_string_view_t flag_name,
85 void* storage,
86 iree_string_view_t value) {
87 IREE_ASSERT_LE(parsed_params.executable_constant_count + 1,
88 IREE_ARRAYSIZE(parsed_params.executable_constants),
89 "too many executable constants");
90 uint32_t value_ui32 = 0;
91 if (!iree_string_view_atoi_uint32(value, &value_ui32)) {
92 return iree_make_status(
93 IREE_STATUS_INVALID_ARGUMENT,
94 "invalid executable constant value `%.*s`; expects uint32_t",
95 (int)value.size, value.data);
96 }
97 parsed_params.executable_constants[parsed_params.executable_constant_count++]
98 .ui32 = value_ui32;
99 return iree_ok_status();
100}
101static void print_executable_constant(iree_string_view_t flag_name,
102 void* storage, FILE* file) {
103 if (parsed_params.executable_constant_count == 0) {
104 fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size,
105 flag_name.data);
106 return;
107 }
108 for (int32_t i = 0; i < parsed_params.executable_constant_count; ++i) {
109 fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data,
110 parsed_params.executable_constants[i].ui32);
111 if (i < parsed_params.executable_constant_count - 1) {
112 fprintf(file, "\n");
113 }
114 }
115}
116IREE_FLAG_CALLBACK(parse_executable_constant, print_executable_constant,
117 &parsed_params, executable_constant,
118 "Appends a uint32_t executable constant value.\n");
119
120static iree_status_t parse_push_constant(iree_string_view_t flag_name,
121 void* storage,
122 iree_string_view_t value) {
123 IREE_ASSERT_LE(parsed_params.push_constant_count + 1,
124 IREE_ARRAYSIZE(parsed_params.push_constants),
125 "too many push constants");
126 uint32_t value_ui32 = 0;
127 if (!iree_string_view_atoi_uint32(value, &value_ui32)) {
128 return iree_make_status(
129 IREE_STATUS_INVALID_ARGUMENT,
130 "invalid push constant value `%.*s`; expects uint32_t", (int)value.size,
131 value.data);
132 }
133 parsed_params.push_constants[parsed_params.push_constant_count++].ui32 =
134 value_ui32;
135 return iree_ok_status();
136}
137static void print_push_constant(iree_string_view_t flag_name, void* storage,
138 FILE* file) {
139 if (parsed_params.push_constant_count == 0) {
140 fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size,
141 flag_name.data);
142 return;
143 }
144 for (int32_t i = 0; i < parsed_params.push_constant_count; ++i) {
145 fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data,
146 parsed_params.push_constants[i].ui32);
147 if (i < parsed_params.push_constant_count - 1) {
148 fprintf(file, "\n");
149 }
150 }
151}
152IREE_FLAG_CALLBACK(parse_push_constant, print_push_constant, &parsed_params,
153 push_constant, "Appends a uint32_t push constant value.\n");
154
155static iree_status_t parse_binding(iree_string_view_t flag_name, void* storage,
156 iree_string_view_t value) {
157 IREE_ASSERT_LE(parsed_params.binding_count + 1,
158 IREE_ARRAYSIZE(parsed_params.binding_specs),
159 "too many bindings");
160 int32_t i = parsed_params.binding_count++;
161 parsed_params.binding_specs[i] = value;
162 parsed_params.binding_cconv[i] = 'r';
163 // TODO(benvanik): allow for a specification of type/immutability.
164 parsed_params.binding_layouts[i] = (iree_hal_descriptor_set_layout_binding_t){
165 .binding = (uint32_t)i,
166 .type = IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER,
167 .flags = IREE_HAL_DESCRIPTOR_FLAG_NONE,
168 };
169 return iree_ok_status();
170}
171static void print_binding(iree_string_view_t flag_name, void* storage,
172 FILE* file) {
173 if (parsed_params.binding_count == 0) {
174 fprintf(file, "# --%.*s=\"shapextype[=values]\"\n", (int)flag_name.size,
175 flag_name.data);
176 return;
177 }
178 for (int32_t i = 0; i < parsed_params.binding_count; ++i) {
179 const iree_string_view_t binding_spec = parsed_params.binding_specs[i];
180 fprintf(file, "--%.*s=\"%.*s\"\n", (int)flag_name.size, flag_name.data,
181 (int)binding_spec.size, binding_spec.data);
182 }
183}
184IREE_FLAG_CALLBACK(
185 parse_binding, print_binding, &parsed_params, binding,
186 "Appends a binding to the dispatch parameters.\n"
187 "Bindings are defined by their shape, element type, and their data.\n"
188 "There must be one binding for every declared layout binding.\n"
189 "Examples:\n"
190 " # 16 4-byte elements zero-initialized:\n"
191 " --binding=2x8xi32\n"
192 " # 10000 bytes all initialized to 123:\n"
193 " --binding=10000xi8=123\n"
194 " # 2 4-byte floating-point values with contents [[1.4], [2.1]]:\n"
195 " --binding=2x1xf32=1.4,2.1\n"
196 " # First array from a numpy file followed by the second:\n"
197 " --binding=@file.npy\n"
198 " --binding=+file.npy\n"
199 " # All arrays from a numpy file\n"
200 " --binding=*file.npy\n"
201 " # Binary tensor<2x2xf32> and tensor<4xf32> read from a single file\n"
202 " --binding=2x2xf32=@file.ext\n"
203 " --binding=4xf32=+file.ext");
204
205typedef struct iree_benchmark_executable_args_t {
206 iree_hal_device_t* device;
207 iree_hal_executable_t* executable;
208 iree_hal_pipeline_layout_t* pipeline_layout;
209 const iree_hal_descriptor_set_binding_t* bindings;
210 uint32_t workgroup_count[3];
211} iree_benchmark_executable_args_t;
212
213// NOTE: error handling is here just for better diagnostics: it is not tracking
214// allocations correctly and will leak. Don't use this as an example for how to
215// write robust code.
216static iree_status_t iree_benchmark_executable_run(
217 const iree_benchmark_def_t* benchmark_def,
218 iree_benchmark_state_t* benchmark_state) {
219 iree_benchmark_executable_args_t* args =
220 (iree_benchmark_executable_args_t*)benchmark_def->user_data;
221
222 iree_hal_semaphore_t* fence_semaphore = NULL;
223 uint64_t fence_value = 0ull;
224 IREE_RETURN_IF_ERROR(
225 iree_hal_semaphore_create(args->device, fence_value, &fence_semaphore));
226 iree_hal_semaphore_list_t wait_semaphore_list =
227 iree_hal_semaphore_list_empty();
228 iree_hal_semaphore_list_t signal_semaphore_list = {
229 .count = 1,
230 .semaphores = &fence_semaphore,
231 .payload_values = &fence_value,
232 };
233
234 // Start profiling now - all subsequent device operations will be what the
235 // user wants to measure.
236 IREE_RETURN_IF_ERROR(iree_hal_begin_profiling_from_flags(args->device));
237
238 // Submit the command buffer and wait for it to complete.
239 // Note that each iteration runs through the whole grid as it's important that
240 // we are testing the memory access patterns: if we just ran the same single
241 // workgroup processing the same exact region of memory over and over we are
242 // not testing cache effects. This means we need to account for the total
243 // number of workgroups executed.
244 int64_t dispatch_count = 0;
245 while (iree_benchmark_keep_running(benchmark_state, FLAG_batch_size)) {
246 // TODO(benvanik): record a secondary command buffer and just replay it
247 // here. This should fix the overhead at just primary command buffer
248 // creation. Most backends don't support reusable command buffers, yet, and
249 // some only support inline execution so we are conservatively doing that.
250 // In the future we should have an option (possibly based on device query)
251 // as to which path to use.
252
253 // Record a command buffer with the dispatches.
254 // Note that today we are doing this inside of the benchmark loop so that
255 // we can use inline execution. This is a boost to devices that support it
256 // like CUDA streams and synchronous CPU executors but a pessimization to
257 // devices that benefit from reusable command buffers like CUDA graphs.
258 // In the future we can add a flag that switches the mode between
259 // reusable and one-shot.
260 iree_hal_command_buffer_t* command_buffer = NULL;
261 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create(
262 args->device,
263 IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
264 IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION,
265 IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
266 /*binding_capacity=*/0, &command_buffer));
267 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer));
268 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_constants(
269 command_buffer, args->pipeline_layout, /*offset=*/0,
270 &parsed_params.push_constants[0].ui32,
271 parsed_params.push_constant_count *
272 sizeof(parsed_params.push_constants[0])));
273 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_descriptor_set(
274 command_buffer, args->pipeline_layout, /*set=*/0,
275 parsed_params.binding_count, args->bindings));
276 for (int32_t i = 0; i < FLAG_batch_size; ++i) {
277 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch(
278 command_buffer, args->executable, FLAG_entry_point,
279 args->workgroup_count[0], args->workgroup_count[1],
280 args->workgroup_count[2]));
281 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier(
282 command_buffer, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
283 IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE,
284 IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, 0, NULL, 0, NULL));
285 }
286 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer));
287
288 // Submit the command buffer; if the device could not start executing while
289 // we were recording then this will kick off the execution.
290 ++fence_value;
291 IREE_RETURN_IF_ERROR(iree_hal_device_queue_execute(
292 args->device, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
293 signal_semaphore_list, 1, &command_buffer));
294
295 // Block and wait for the submission to complete.
296 // Note that this will include round-trip overhead and if the dispatch or
297 // batch size is small then the final time may end up being mostly overhead.
298 IREE_RETURN_IF_ERROR(iree_hal_semaphore_wait(fence_semaphore, fence_value,
299 iree_infinite_timeout()));
300
301 iree_benchmark_pause_timing(benchmark_state);
302
303 // Don't count cleanup time in the benchmark.
304 iree_hal_command_buffer_release(command_buffer);
305
306 // Accumulate the total number of dispatches executed.
307 dispatch_count += FLAG_batch_size;
308
309 // Flush profiling if recording. Note that we don't want to include the
310 // profiling time in the benchmark result.
311 IREE_RETURN_IF_ERROR(iree_hal_device_profiling_flush(args->device));
312
313 iree_benchmark_resume_timing(benchmark_state);
314 }
315
316 // End profiling before cleaning up so tooling doesn't capture it.
317 IREE_RETURN_IF_ERROR(iree_hal_end_profiling_from_flags(args->device));
318
319 // To get a total time per invocation we set the item count to the total
320 // invocations dispatched. That gives us both total dispatch and single
321 // invocation times in the reporter output.
322 int64_t total_invocations = dispatch_count * args->workgroup_count[0] *
323 args->workgroup_count[1] *
324 args->workgroup_count[2];
325 iree_benchmark_set_items_processed(benchmark_state, total_invocations);
326
327 iree_hal_semaphore_release(fence_semaphore);
328
329 return iree_ok_status();
330}
331
332// Parses an `x,y,z` workgroup count.
333static iree_status_t iree_parse_workgroup_count(
334 iree_string_view_t workgroup_count_str, uint32_t* out_workgroup_count) {
335 iree_string_view_t str = workgroup_count_str;
336 iree_string_view_t str_x;
337 iree_string_view_split(str, ',', &str_x, &str);
338 iree_string_view_t str_y;
339 iree_string_view_split(str, ',', &str_y, &str);
340 iree_string_view_t str_z = str;
341 if (!iree_string_view_atoi_uint32(str_x, &out_workgroup_count[0]) ||
342 !iree_string_view_atoi_uint32(str_y, &out_workgroup_count[1]) ||
343 !iree_string_view_atoi_uint32(str_z, &out_workgroup_count[2])) {
344 return iree_make_status(
345 IREE_STATUS_INVALID_ARGUMENT,
346 "invalid workgroup count string `%.*s`; expects `X,Y,Z`",
347 (int)workgroup_count_str.size, workgroup_count_str.data);
348 }
349 return iree_ok_status();
350}
351
352// Runs one benchmark per workgroup count specified using the same device
353// and input/output buffers.
354static iree_status_t iree_benchmark_executable_from_flags(
355 iree_allocator_t host_allocator) {
356 iree_vm_instance_t* instance = NULL;
357 IREE_RETURN_IF_ERROR(iree_vm_instance_create(IREE_VM_TYPE_CAPACITY_DEFAULT,
358 host_allocator, &instance));
359 IREE_RETURN_IF_ERROR(iree_hal_module_register_inline_types(instance));
360
361 // Create the HAL device we'll be using during execution.
362 // Devices can be very expensive to create and we want to avoid doing it
363 // multiple times throughout the benchmark execution.
364 iree_hal_device_t* device = NULL;
365 IREE_RETURN_IF_ERROR(iree_hal_create_device_from_flags(
366 iree_hal_available_driver_registry(), iree_hal_default_device_uri(),
367 host_allocator, &device));
368
369 // We'll reuse the same executable cache so that once we load the executable
370 // we'll be able to reuse any driver-side optimizations.
371 iree_hal_executable_cache_t* executable_cache = NULL;
372 iree_status_t loop_status = iree_ok_status();
373 IREE_RETURN_IF_ERROR(iree_hal_executable_cache_create(
374 device, iree_make_cstring_view("cache"), iree_loop_inline(&loop_status),
375 &executable_cache));
376 IREE_RETURN_IF_ERROR(loop_status);
377
378 // Allocate storage for buffers and populate them.
379 // They only need to remain valid for the duration of the invocation and all
380 // memory accessed by the invocation will come from here.
381 // Note that we do this parsing first so that we can reflect on the I/O to
382 // infer the pipeline layout.
383 iree_hal_allocator_t* device_allocator = iree_hal_device_allocator(device);
384 iree_vm_list_t* binding_list = NULL;
385 IREE_RETURN_IF_ERROR(iree_tooling_parse_variants(
386 iree_make_string_view(parsed_params.binding_cconv,
387 parsed_params.binding_count),
388 (iree_string_view_list_t){parsed_params.binding_count,
389 parsed_params.binding_specs},
390 device, device_allocator, host_allocator, &binding_list));
391 iree_hal_descriptor_set_binding_t bindings[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
392 for (iree_host_size_t i = 0; i < parsed_params.binding_count; ++i) {
393 iree_vm_ref_t value = iree_vm_ref_null();
394 IREE_RETURN_IF_ERROR(iree_vm_list_get_ref_assign(binding_list, i, &value));
395 iree_hal_buffer_t* buffer = NULL;
396 if (iree_hal_buffer_isa(value)) {
397 buffer = iree_hal_buffer_deref(value);
398 } else if (iree_hal_buffer_view_isa(value)) {
399 buffer = iree_hal_buffer_view_buffer(iree_hal_buffer_view_deref(value));
400 } else {
401 return iree_make_status(
402 IREE_STATUS_INVALID_ARGUMENT,
403 "bindings must be shaped types (4xf32, etc), binding %" PRIhsz
404 " is not",
405 i);
406 }
407 bindings[i] = (iree_hal_descriptor_set_binding_t){
408 .binding = i,
409 .buffer_slot = 0,
410 .buffer = buffer,
411 .offset = 0,
412 .length = IREE_WHOLE_BUFFER,
413 };
414 }
415
416 // Setup the specification used to perform the executable load.
417 // This information is normally used to select the appropriate loader but in
418 // this benchmark we only have a single one.
419 // TODO(benvanik): expose the flags once they are implemented anywhere.
420 iree_hal_executable_params_t executable_params;
421 iree_hal_executable_params_initialize(&executable_params);
422 executable_params.caching_mode =
423 IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION |
424 IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA;
425
426 // Load the executable data into memory.
427 // In normal usage this would be mapped from the containing module file (which
428 // itself may be mapped from disk).
429 iree_file_contents_t* file_contents = NULL;
430 if (strcmp(FLAG_executable_file, "-") == 0) {
431 IREE_RETURN_IF_ERROR(
432 iree_stdin_read_contents(host_allocator, &file_contents));
433 } else {
434 IREE_RETURN_IF_ERROR(iree_file_read_contents(
435 FLAG_executable_file, IREE_FILE_READ_FLAG_DEFAULT, host_allocator,
436 &file_contents));
437 }
438 executable_params.executable_format =
439 iree_make_cstring_view(FLAG_executable_format);
440 executable_params.executable_data = file_contents->const_buffer;
441
442 // Setup the layouts defining how each entry point is interpreted.
443 iree_hal_pipeline_layout_t* pipeline_layout = NULL;
444 iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
445 IREE_RETURN_IF_ERROR(iree_hal_descriptor_set_layout_create(
446 device, IREE_HAL_DESCRIPTOR_SET_LAYOUT_FLAG_NONE,
447 parsed_params.binding_count, parsed_params.binding_layouts,
448 &descriptor_set_layout));
449 IREE_RETURN_IF_ERROR(iree_hal_pipeline_layout_create(
450 device, parsed_params.push_constant_count,
451 /*set_layout_count=*/1, &descriptor_set_layout, &pipeline_layout));
452 executable_params.pipeline_layout_count = 1;
453 executable_params.pipeline_layouts = &pipeline_layout;
454
455 // Executable-level constants allow us to perform some basic load-time value
456 // propagation - usually dependent on device features or tuning parameters.
457 executable_params.constant_count = parsed_params.executable_constant_count;
458 executable_params.constants = &parsed_params.executable_constants[0].ui32;
459
460 // Perform the load, which will fail if the executable cannot be loaded or
461 // there was an issue with the layouts.
462 iree_hal_executable_t* executable = NULL;
463 IREE_RETURN_IF_ERROR(iree_hal_executable_cache_prepare_executable(
464 executable_cache, &executable_params, &executable));
465
466 // Register one benchmark per workgroup count specified.
467 iree_benchmark_executable_args_t* args = NULL;
468 IREE_RETURN_IF_ERROR(iree_allocator_malloc(
469 host_allocator, sizeof(*args) * FLAG_workgroup_count_list().count,
470 (void**)&args));
471 for (iree_host_size_t i = 0; i < FLAG_workgroup_count_list().count; ++i) {
472 args[i] = (iree_benchmark_executable_args_t){
473 .device = device,
474 .executable = executable,
475 .pipeline_layout = pipeline_layout,
476 .bindings = bindings,
477 .workgroup_count = {1, 1, 1},
478 };
479 IREE_RETURN_IF_ERROR(iree_parse_workgroup_count(
480 FLAG_workgroup_count_list().values[i], args[i].workgroup_count));
481 iree_benchmark_def_t benchmark_def = {
482 .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
483 IREE_BENCHMARK_FLAG_USE_REAL_TIME,
484 .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
485 .minimum_duration_ns = 0,
486 .iteration_count = 0,
487 .run = iree_benchmark_executable_run,
488 .user_data = &args[i],
489 };
490 char benchmark_name[512];
491 snprintf(benchmark_name, sizeof(benchmark_name) - 1, "dispatch_%ux%ux%u",
492 args[i].workgroup_count[0], args[i].workgroup_count[1],
493 args[i].workgroup_count[2]);
494 iree_benchmark_register(iree_make_cstring_view(benchmark_name),
495 &benchmark_def);
496 }
497 iree_benchmark_run_specified();
498 iree_allocator_free(host_allocator, args);
499
500 iree_vm_list_release(binding_list);
501 iree_hal_executable_release(executable);
502 iree_hal_descriptor_set_layout_release(descriptor_set_layout);
503 iree_hal_pipeline_layout_release(pipeline_layout);
504 iree_file_contents_free(file_contents);
505 iree_hal_executable_cache_release(executable_cache);
506 iree_hal_device_release(device);
507 iree_vm_instance_release(instance);
508
509 return iree_ok_status();
510}
511
512int main(int argc, char** argv) {
513 IREE_TRACE_APP_ENTER();
514 IREE_TRACE_ZONE_BEGIN(z0);
515
516 iree_allocator_t host_allocator = iree_allocator_system();
517 int exit_code = EXIT_SUCCESS;
518
519 iree_flags_set_usage(
520 "iree-benchmark-executable",
521 "Benchmarks a single entry point within an executable library.\n"
522 "The parameters used can be inferred from the entry point "
523 "`hal.interface` and dispatches to it in the source program.\n"
524 "\n"
525 "Executables can be extracted from VMFB files using `unzip` or dumped\n"
526 "during compilation using --iree-hal-dump-executable-binaries-to=path/.\n"
527 "\n"
528 "The compiler can directly compile `hal.executable.source` and\n"
529 "`hal.executable` ops to the appropriate binaries by using the\n"
530 "`iree-compile --compile-mode=hal-executable` mode.\n"
531 "\n"
532 "Example flags for various compilation backends:\n"
533 " --iree-hal-target-backends=vmvx\n"
534 " --device=local-sync or --device=local-task\n"
535 " --executable_format=vmvx-bytecode-fb\n"
536 " --iree-hal-target-backends=llvm-cpu\n"
537 " --device=local-sync or --device=local-task\n"
538 " --executable_format=embedded-elf-x86_64\n"
539 " --executable_format=system-dll-x86_64\n"
Ben Vanik14895842024-02-24 09:10:03 -0800540 " --iree-hal-target-backends=cuda\n"
541 " --device=cuda\n"
542 " --executable_format=cuda-nvptx-fb\n"
Ben Vanik23f28282024-02-23 11:14:25 -0800543 " --iree-hal-target-backends=vulkan-spirv\n"
544 " --device=vulkan\n"
545 " --executable_format=vulkan-spirv-fb\n"
546 "\n"
547 "Note that this tool is intentionally low level: you must specify all\n"
548 "of the push constant/binding parameters precisely as they are expected\n"
549 "by the executable. `iree-benchmark-module` is the user-friendly\n"
550 "benchmarking tool while this one favors direct access to the\n"
551 "executables (bypassing all of the IREE VM, HAL APIs, task system,\n"
552 "etc).\n"
553 "\n"
554 "Example --flagfile:\n"
555 " --device=local-sync\n"
556 " --executable_format=embedded-elf-x86_64\n"
557 " --executable_file=runtime/src/iree/hal/local/elf/testdata/"
558 "elementwise_mul_x86_64.so\n"
559 " --entry_point=0\n"
560 " --binding=4xf32=1,2,3,4\n"
561 " --binding=4xf32=100,200,300,400\n"
562 " --binding=4xf32=0,0,0,0\n"
563 " --workgroup_count=1,1,1\n"
564 "\n");
565
566 iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv);
567 iree_benchmark_initialize(&argc, argv);
568
569 iree_status_t status = iree_benchmark_executable_from_flags(host_allocator);
570 if (!iree_status_is_ok(status)) {
571 iree_status_fprint(stderr, status);
572 iree_status_free(status);
573 exit_code = EXIT_FAILURE;
574 }
575 fflush(stderr);
576
577 IREE_TRACE_ZONE_END(z0);
578 IREE_TRACE_APP_EXIT(exit_code);
579 return exit_code;
580}