blob: f5cfb4a260fe508c943a140ec8908db30f8a96f0 [file] [log] [blame]
Ben Vanik23f28282024-02-23 11:14:25 -08001// Copyright 2024 The IREE Authors
2//
3// Licensed under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
7#include <stdint.h>
8#include <stdio.h>
9#include <stdlib.h>
10#include <string.h>
11
12#include "iree/base/api.h"
13#include "iree/base/internal/file_io.h"
14#include "iree/base/internal/flags.h"
15#include "iree/hal/api.h"
16#include "iree/modules/hal/types.h"
17#include "iree/testing/benchmark.h"
18#include "iree/tooling/device_util.h"
19#include "iree/tooling/function_io.h"
20#include "iree/vm/api.h"
21
22IREE_FLAG(
23 int32_t, batch_size, 64,
24 "Number of dispatches to perform per command buffer submission.\n"
25 "Higher numbers will reduce the effect of submission overheads on the\n"
26 "final timings but too high a value may result in hangs.");
27
28IREE_FLAG(string, executable_format, "",
29 "Format of the executable file being loaded.");
30IREE_FLAG(string, executable_file, "", "Path to the executable file to load.");
31
32IREE_FLAG(int32_t, entry_point, 0, "Entry point ordinal to run.");
33
34IREE_FLAG_LIST(
35 string, workgroup_count,
36 "`x,y,z` dimensions of the workgroup count defining the number of\n"
37 "workgroup invocations that will be run per benchmark iteration.\n"
38 "Each occurrence of the flag will run a benchmark with that set of\n"
39 "workgroup count values.");
40
41// Total number of executable-level constants we (currently) allow; this is only
42// a limitation of how much memory we allocate and we could make this
43// dynamically growable.
44#define IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT 512
45// Total number of push constants we (currently) allow any executable to have.
Ben Vanik9bbc9262024-08-20 22:16:16 -070046#define IREE_HAL_MAX_CONSTANT_COUNT 64
Ben Vanik23f28282024-02-23 11:14:25 -080047// Total number of bindings we (currently) allow any executable to have.
Ben Vanik9bbc9262024-08-20 22:16:16 -070048#define IREE_HAL_MAX_BINDING_COUNT 64
Ben Vanik23f28282024-02-23 11:14:25 -080049
50// Parsed dispatch parameters from flags.
51// Used to construct the dispatch parameters for the benchmark invocation.
52struct {
Ben Vanik23f28282024-02-23 11:14:25 -080053 int32_t executable_constant_count;
54 union {
55 uint32_t ui32;
56 } executable_constants[IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT];
57
Ben Vanik9bbc9262024-08-20 22:16:16 -070058 int32_t constant_count;
Ben Vanik23f28282024-02-23 11:14:25 -080059 union {
60 uint32_t ui32;
Ben Vanik9bbc9262024-08-20 22:16:16 -070061 } constants[IREE_HAL_MAX_CONSTANT_COUNT];
Ben Vanik23f28282024-02-23 11:14:25 -080062
63 int32_t binding_count;
Ben Vanik9bbc9262024-08-20 22:16:16 -070064 iree_string_view_t binding_specs[IREE_HAL_MAX_BINDING_COUNT];
65 char binding_cconv[IREE_HAL_MAX_BINDING_COUNT];
Ben Vanik23f28282024-02-23 11:14:25 -080066} parsed_params = {
67 .executable_constant_count = 0,
Ben Vanik9bbc9262024-08-20 22:16:16 -070068 .constant_count = 0,
Ben Vanik23f28282024-02-23 11:14:25 -080069 .binding_count = 0,
70};
71
72static iree_status_t parse_executable_constant(iree_string_view_t flag_name,
73 void* storage,
74 iree_string_view_t value) {
75 IREE_ASSERT_LE(parsed_params.executable_constant_count + 1,
76 IREE_ARRAYSIZE(parsed_params.executable_constants),
77 "too many executable constants");
78 uint32_t value_ui32 = 0;
79 if (!iree_string_view_atoi_uint32(value, &value_ui32)) {
80 return iree_make_status(
81 IREE_STATUS_INVALID_ARGUMENT,
82 "invalid executable constant value `%.*s`; expects uint32_t",
83 (int)value.size, value.data);
84 }
85 parsed_params.executable_constants[parsed_params.executable_constant_count++]
86 .ui32 = value_ui32;
87 return iree_ok_status();
88}
89static void print_executable_constant(iree_string_view_t flag_name,
90 void* storage, FILE* file) {
91 if (parsed_params.executable_constant_count == 0) {
92 fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size,
93 flag_name.data);
94 return;
95 }
96 for (int32_t i = 0; i < parsed_params.executable_constant_count; ++i) {
97 fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data,
98 parsed_params.executable_constants[i].ui32);
99 if (i < parsed_params.executable_constant_count - 1) {
100 fprintf(file, "\n");
101 }
102 }
103}
104IREE_FLAG_CALLBACK(parse_executable_constant, print_executable_constant,
105 &parsed_params, executable_constant,
106 "Appends a uint32_t executable constant value.\n");
107
Ben Vanik9bbc9262024-08-20 22:16:16 -0700108static iree_status_t parse_constant(iree_string_view_t flag_name, void* storage,
109 iree_string_view_t value) {
110 IREE_ASSERT_LE(parsed_params.constant_count + 1,
111 IREE_ARRAYSIZE(parsed_params.constants),
Ben Vanik23f28282024-02-23 11:14:25 -0800112 "too many push constants");
113 uint32_t value_ui32 = 0;
114 if (!iree_string_view_atoi_uint32(value, &value_ui32)) {
115 return iree_make_status(
116 IREE_STATUS_INVALID_ARGUMENT,
117 "invalid push constant value `%.*s`; expects uint32_t", (int)value.size,
118 value.data);
119 }
Ben Vanik9bbc9262024-08-20 22:16:16 -0700120 parsed_params.constants[parsed_params.constant_count++].ui32 = value_ui32;
Ben Vanik23f28282024-02-23 11:14:25 -0800121 return iree_ok_status();
122}
Ben Vanik9bbc9262024-08-20 22:16:16 -0700123static void print_constant(iree_string_view_t flag_name, void* storage,
124 FILE* file) {
125 if (parsed_params.constant_count == 0) {
Ben Vanik23f28282024-02-23 11:14:25 -0800126 fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size,
127 flag_name.data);
128 return;
129 }
Ben Vanik9bbc9262024-08-20 22:16:16 -0700130 for (int32_t i = 0; i < parsed_params.constant_count; ++i) {
Ben Vanik23f28282024-02-23 11:14:25 -0800131 fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data,
Ben Vanik9bbc9262024-08-20 22:16:16 -0700132 parsed_params.constants[i].ui32);
133 if (i < parsed_params.constant_count - 1) {
Ben Vanik23f28282024-02-23 11:14:25 -0800134 fprintf(file, "\n");
135 }
136 }
137}
Ben Vanik9bbc9262024-08-20 22:16:16 -0700138IREE_FLAG_CALLBACK(parse_constant, print_constant, &parsed_params, constant,
139 "Appends a uint32_t constant value.\n");
Ben Vanik23f28282024-02-23 11:14:25 -0800140
141static iree_status_t parse_binding(iree_string_view_t flag_name, void* storage,
142 iree_string_view_t value) {
143 IREE_ASSERT_LE(parsed_params.binding_count + 1,
144 IREE_ARRAYSIZE(parsed_params.binding_specs),
145 "too many bindings");
146 int32_t i = parsed_params.binding_count++;
147 parsed_params.binding_specs[i] = value;
148 parsed_params.binding_cconv[i] = 'r';
Ben Vanik23f28282024-02-23 11:14:25 -0800149 return iree_ok_status();
150}
151static void print_binding(iree_string_view_t flag_name, void* storage,
152 FILE* file) {
153 if (parsed_params.binding_count == 0) {
154 fprintf(file, "# --%.*s=\"shapextype[=values]\"\n", (int)flag_name.size,
155 flag_name.data);
156 return;
157 }
158 for (int32_t i = 0; i < parsed_params.binding_count; ++i) {
159 const iree_string_view_t binding_spec = parsed_params.binding_specs[i];
160 fprintf(file, "--%.*s=\"%.*s\"\n", (int)flag_name.size, flag_name.data,
161 (int)binding_spec.size, binding_spec.data);
162 }
163}
164IREE_FLAG_CALLBACK(
165 parse_binding, print_binding, &parsed_params, binding,
166 "Appends a binding to the dispatch parameters.\n"
167 "Bindings are defined by their shape, element type, and their data.\n"
168 "There must be one binding for every declared layout binding.\n"
169 "Examples:\n"
170 " # 16 4-byte elements zero-initialized:\n"
171 " --binding=2x8xi32\n"
172 " # 10000 bytes all initialized to 123:\n"
173 " --binding=10000xi8=123\n"
174 " # 2 4-byte floating-point values with contents [[1.4], [2.1]]:\n"
175 " --binding=2x1xf32=1.4,2.1\n"
176 " # First array from a numpy file followed by the second:\n"
177 " --binding=@file.npy\n"
178 " --binding=+file.npy\n"
179 " # All arrays from a numpy file\n"
180 " --binding=*file.npy\n"
181 " # Binary tensor<2x2xf32> and tensor<4xf32> read from a single file\n"
182 " --binding=2x2xf32=@file.ext\n"
183 " --binding=4xf32=+file.ext");
184
185typedef struct iree_benchmark_executable_args_t {
186 iree_hal_device_t* device;
187 iree_hal_executable_t* executable;
Ben Vanik9ffe4732024-07-08 17:10:45 -0700188 const iree_hal_buffer_ref_t* bindings;
Ben Vanik23f28282024-02-23 11:14:25 -0800189 uint32_t workgroup_count[3];
190} iree_benchmark_executable_args_t;
191
192// NOTE: error handling is here just for better diagnostics: it is not tracking
193// allocations correctly and will leak. Don't use this as an example for how to
194// write robust code.
195static iree_status_t iree_benchmark_executable_run(
196 const iree_benchmark_def_t* benchmark_def,
197 iree_benchmark_state_t* benchmark_state) {
198 iree_benchmark_executable_args_t* args =
199 (iree_benchmark_executable_args_t*)benchmark_def->user_data;
200
201 iree_hal_semaphore_t* fence_semaphore = NULL;
202 uint64_t fence_value = 0ull;
Ben Vanika28f76f2024-08-06 15:04:15 -0700203 IREE_RETURN_IF_ERROR(iree_hal_semaphore_create(args->device, fence_value,
204 IREE_HAL_SEMAPHORE_FLAG_NONE,
205 &fence_semaphore));
Ben Vanik23f28282024-02-23 11:14:25 -0800206 iree_hal_semaphore_list_t wait_semaphore_list =
207 iree_hal_semaphore_list_empty();
208 iree_hal_semaphore_list_t signal_semaphore_list = {
209 .count = 1,
210 .semaphores = &fence_semaphore,
211 .payload_values = &fence_value,
212 };
213
Ben Vanik894dfbe2024-08-13 11:41:10 -0700214 // Record a command buffer with the dispatches.
215 // The same command buffer recording is reused on each benchmark step.
216 iree_hal_command_buffer_t* command_buffer = NULL;
217 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create(
218 args->device, IREE_HAL_COMMAND_BUFFER_MODE_DEFAULT,
219 IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
220 /*binding_capacity=*/0, &command_buffer));
221 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer));
Ben Vanik9bbc9262024-08-20 22:16:16 -0700222 iree_const_byte_span_t constants = iree_make_const_byte_span(
223 &parsed_params.constants[0].ui32,
224 parsed_params.constant_count * sizeof(parsed_params.constants[0]));
Ben Vanik894dfbe2024-08-13 11:41:10 -0700225 iree_hal_buffer_ref_list_t bindings = {
226 .count = parsed_params.binding_count,
227 .values = args->bindings,
228 };
229 for (int32_t i = 0; i < FLAG_batch_size; ++i) {
Ben Vanik7dc8c262024-08-22 14:43:33 -0700230 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch(
Ben Vanik894dfbe2024-08-13 11:41:10 -0700231 command_buffer, args->executable, FLAG_entry_point,
232 args->workgroup_count, constants, bindings,
233 IREE_HAL_DISPATCH_FLAG_NONE));
234 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier(
235 command_buffer, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
236 IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE,
237 IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, 0, NULL, 0, NULL));
238 }
239 IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer));
240
Ben Vanik23f28282024-02-23 11:14:25 -0800241 // Start profiling now - all subsequent device operations will be what the
242 // user wants to measure.
243 IREE_RETURN_IF_ERROR(iree_hal_begin_profiling_from_flags(args->device));
244
245 // Submit the command buffer and wait for it to complete.
246 // Note that each iteration runs through the whole grid as it's important that
247 // we are testing the memory access patterns: if we just ran the same single
248 // workgroup processing the same exact region of memory over and over we are
249 // not testing cache effects. This means we need to account for the total
250 // number of workgroups executed.
251 int64_t dispatch_count = 0;
252 while (iree_benchmark_keep_running(benchmark_state, FLAG_batch_size)) {
Ben Vanik23f28282024-02-23 11:14:25 -0800253 // Submit the command buffer; if the device could not start executing while
254 // we were recording then this will kick off the execution.
255 ++fence_value;
256 IREE_RETURN_IF_ERROR(iree_hal_device_queue_execute(
257 args->device, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
Ben Vanik13e6b7e2024-07-08 09:40:30 -0700258 signal_semaphore_list, 1, &command_buffer, /*binding_tables=*/NULL));
Ben Vanik23f28282024-02-23 11:14:25 -0800259
260 // Block and wait for the submission to complete.
261 // Note that this will include round-trip overhead and if the dispatch or
262 // batch size is small then the final time may end up being mostly overhead.
263 IREE_RETURN_IF_ERROR(iree_hal_semaphore_wait(fence_semaphore, fence_value,
264 iree_infinite_timeout()));
265
266 iree_benchmark_pause_timing(benchmark_state);
267
Ben Vanik23f28282024-02-23 11:14:25 -0800268 // Accumulate the total number of dispatches executed.
269 dispatch_count += FLAG_batch_size;
270
271 // Flush profiling if recording. Note that we don't want to include the
272 // profiling time in the benchmark result.
273 IREE_RETURN_IF_ERROR(iree_hal_device_profiling_flush(args->device));
274
275 iree_benchmark_resume_timing(benchmark_state);
276 }
277
278 // End profiling before cleaning up so tooling doesn't capture it.
279 IREE_RETURN_IF_ERROR(iree_hal_end_profiling_from_flags(args->device));
280
281 // To get a total time per invocation we set the item count to the total
282 // invocations dispatched. That gives us both total dispatch and single
283 // invocation times in the reporter output.
284 int64_t total_invocations = dispatch_count * args->workgroup_count[0] *
285 args->workgroup_count[1] *
286 args->workgroup_count[2];
287 iree_benchmark_set_items_processed(benchmark_state, total_invocations);
288
Ben Vanik894dfbe2024-08-13 11:41:10 -0700289 iree_hal_command_buffer_release(command_buffer);
Ben Vanik23f28282024-02-23 11:14:25 -0800290 iree_hal_semaphore_release(fence_semaphore);
291
292 return iree_ok_status();
293}
294
295// Parses an `x,y,z` workgroup count.
296static iree_status_t iree_parse_workgroup_count(
297 iree_string_view_t workgroup_count_str, uint32_t* out_workgroup_count) {
298 iree_string_view_t str = workgroup_count_str;
299 iree_string_view_t str_x;
300 iree_string_view_split(str, ',', &str_x, &str);
301 iree_string_view_t str_y;
302 iree_string_view_split(str, ',', &str_y, &str);
303 iree_string_view_t str_z = str;
304 if (!iree_string_view_atoi_uint32(str_x, &out_workgroup_count[0]) ||
305 !iree_string_view_atoi_uint32(str_y, &out_workgroup_count[1]) ||
306 !iree_string_view_atoi_uint32(str_z, &out_workgroup_count[2])) {
307 return iree_make_status(
308 IREE_STATUS_INVALID_ARGUMENT,
309 "invalid workgroup count string `%.*s`; expects `X,Y,Z`",
310 (int)workgroup_count_str.size, workgroup_count_str.data);
311 }
312 return iree_ok_status();
313}
314
315// Runs one benchmark per workgroup count specified using the same device
316// and input/output buffers.
317static iree_status_t iree_benchmark_executable_from_flags(
318 iree_allocator_t host_allocator) {
319 iree_vm_instance_t* instance = NULL;
320 IREE_RETURN_IF_ERROR(iree_vm_instance_create(IREE_VM_TYPE_CAPACITY_DEFAULT,
321 host_allocator, &instance));
322 IREE_RETURN_IF_ERROR(iree_hal_module_register_inline_types(instance));
323
324 // Create the HAL device we'll be using during execution.
325 // Devices can be very expensive to create and we want to avoid doing it
326 // multiple times throughout the benchmark execution.
327 iree_hal_device_t* device = NULL;
328 IREE_RETURN_IF_ERROR(iree_hal_create_device_from_flags(
329 iree_hal_available_driver_registry(), iree_hal_default_device_uri(),
330 host_allocator, &device));
331
332 // We'll reuse the same executable cache so that once we load the executable
333 // we'll be able to reuse any driver-side optimizations.
334 iree_hal_executable_cache_t* executable_cache = NULL;
335 iree_status_t loop_status = iree_ok_status();
336 IREE_RETURN_IF_ERROR(iree_hal_executable_cache_create(
337 device, iree_make_cstring_view("cache"), iree_loop_inline(&loop_status),
338 &executable_cache));
339 IREE_RETURN_IF_ERROR(loop_status);
340
341 // Allocate storage for buffers and populate them.
342 // They only need to remain valid for the duration of the invocation and all
343 // memory accessed by the invocation will come from here.
344 // Note that we do this parsing first so that we can reflect on the I/O to
345 // infer the pipeline layout.
346 iree_hal_allocator_t* device_allocator = iree_hal_device_allocator(device);
347 iree_vm_list_t* binding_list = NULL;
348 IREE_RETURN_IF_ERROR(iree_tooling_parse_variants(
349 iree_make_string_view(parsed_params.binding_cconv,
350 parsed_params.binding_count),
351 (iree_string_view_list_t){parsed_params.binding_count,
352 parsed_params.binding_specs},
353 device, device_allocator, host_allocator, &binding_list));
Ben Vanik9bbc9262024-08-20 22:16:16 -0700354 iree_hal_buffer_ref_t bindings[IREE_HAL_MAX_BINDING_COUNT];
Ben Vanik23f28282024-02-23 11:14:25 -0800355 for (iree_host_size_t i = 0; i < parsed_params.binding_count; ++i) {
356 iree_vm_ref_t value = iree_vm_ref_null();
357 IREE_RETURN_IF_ERROR(iree_vm_list_get_ref_assign(binding_list, i, &value));
358 iree_hal_buffer_t* buffer = NULL;
359 if (iree_hal_buffer_isa(value)) {
360 buffer = iree_hal_buffer_deref(value);
361 } else if (iree_hal_buffer_view_isa(value)) {
362 buffer = iree_hal_buffer_view_buffer(iree_hal_buffer_view_deref(value));
363 } else {
364 return iree_make_status(
365 IREE_STATUS_INVALID_ARGUMENT,
366 "bindings must be shaped types (4xf32, etc), binding %" PRIhsz
367 " is not",
368 i);
369 }
Ben Vanik9ffe4732024-07-08 17:10:45 -0700370 bindings[i] = iree_hal_make_buffer_ref(buffer, 0, IREE_WHOLE_BUFFER);
Ben Vanik23f28282024-02-23 11:14:25 -0800371 }
372
373 // Setup the specification used to perform the executable load.
374 // This information is normally used to select the appropriate loader but in
375 // this benchmark we only have a single one.
376 // TODO(benvanik): expose the flags once they are implemented anywhere.
377 iree_hal_executable_params_t executable_params;
378 iree_hal_executable_params_initialize(&executable_params);
379 executable_params.caching_mode =
380 IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION |
381 IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA;
382
383 // Load the executable data into memory.
384 // In normal usage this would be mapped from the containing module file (which
385 // itself may be mapped from disk).
386 iree_file_contents_t* file_contents = NULL;
387 if (strcmp(FLAG_executable_file, "-") == 0) {
388 IREE_RETURN_IF_ERROR(
389 iree_stdin_read_contents(host_allocator, &file_contents));
390 } else {
391 IREE_RETURN_IF_ERROR(iree_file_read_contents(
392 FLAG_executable_file, IREE_FILE_READ_FLAG_DEFAULT, host_allocator,
393 &file_contents));
394 }
395 executable_params.executable_format =
396 iree_make_cstring_view(FLAG_executable_format);
397 executable_params.executable_data = file_contents->const_buffer;
398
Ben Vanik23f28282024-02-23 11:14:25 -0800399 // Executable-level constants allow us to perform some basic load-time value
400 // propagation - usually dependent on device features or tuning parameters.
401 executable_params.constant_count = parsed_params.executable_constant_count;
402 executable_params.constants = &parsed_params.executable_constants[0].ui32;
403
404 // Perform the load, which will fail if the executable cannot be loaded or
405 // there was an issue with the layouts.
406 iree_hal_executable_t* executable = NULL;
407 IREE_RETURN_IF_ERROR(iree_hal_executable_cache_prepare_executable(
408 executable_cache, &executable_params, &executable));
409
410 // Register one benchmark per workgroup count specified.
411 iree_benchmark_executable_args_t* args = NULL;
412 IREE_RETURN_IF_ERROR(iree_allocator_malloc(
413 host_allocator, sizeof(*args) * FLAG_workgroup_count_list().count,
414 (void**)&args));
415 for (iree_host_size_t i = 0; i < FLAG_workgroup_count_list().count; ++i) {
416 args[i] = (iree_benchmark_executable_args_t){
417 .device = device,
418 .executable = executable,
Ben Vanik23f28282024-02-23 11:14:25 -0800419 .bindings = bindings,
420 .workgroup_count = {1, 1, 1},
421 };
422 IREE_RETURN_IF_ERROR(iree_parse_workgroup_count(
423 FLAG_workgroup_count_list().values[i], args[i].workgroup_count));
424 iree_benchmark_def_t benchmark_def = {
425 .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
426 IREE_BENCHMARK_FLAG_USE_REAL_TIME,
427 .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
428 .minimum_duration_ns = 0,
429 .iteration_count = 0,
430 .run = iree_benchmark_executable_run,
431 .user_data = &args[i],
432 };
433 char benchmark_name[512];
434 snprintf(benchmark_name, sizeof(benchmark_name) - 1, "dispatch_%ux%ux%u",
435 args[i].workgroup_count[0], args[i].workgroup_count[1],
436 args[i].workgroup_count[2]);
437 iree_benchmark_register(iree_make_cstring_view(benchmark_name),
438 &benchmark_def);
439 }
440 iree_benchmark_run_specified();
441 iree_allocator_free(host_allocator, args);
442
443 iree_vm_list_release(binding_list);
444 iree_hal_executable_release(executable);
Ben Vanik23f28282024-02-23 11:14:25 -0800445 iree_file_contents_free(file_contents);
446 iree_hal_executable_cache_release(executable_cache);
447 iree_hal_device_release(device);
448 iree_vm_instance_release(instance);
449
450 return iree_ok_status();
451}
452
453int main(int argc, char** argv) {
454 IREE_TRACE_APP_ENTER();
455 IREE_TRACE_ZONE_BEGIN(z0);
456
457 iree_allocator_t host_allocator = iree_allocator_system();
458 int exit_code = EXIT_SUCCESS;
459
460 iree_flags_set_usage(
461 "iree-benchmark-executable",
462 "Benchmarks a single entry point within an executable library.\n"
463 "The parameters used can be inferred from the entry point "
464 "`hal.interface` and dispatches to it in the source program.\n"
465 "\n"
466 "Executables can be extracted from VMFB files using `unzip` or dumped\n"
467 "during compilation using --iree-hal-dump-executable-binaries-to=path/.\n"
468 "\n"
469 "The compiler can directly compile `hal.executable.source` and\n"
470 "`hal.executable` ops to the appropriate binaries by using the\n"
471 "`iree-compile --compile-mode=hal-executable` mode.\n"
472 "\n"
473 "Example flags for various compilation backends:\n"
474 " --iree-hal-target-backends=vmvx\n"
475 " --device=local-sync or --device=local-task\n"
476 " --executable_format=vmvx-bytecode-fb\n"
477 " --iree-hal-target-backends=llvm-cpu\n"
478 " --device=local-sync or --device=local-task\n"
479 " --executable_format=embedded-elf-x86_64\n"
480 " --executable_format=system-dll-x86_64\n"
Ben Vanik14895842024-02-24 09:10:03 -0800481 " --iree-hal-target-backends=cuda\n"
482 " --device=cuda\n"
483 " --executable_format=cuda-nvptx-fb\n"
Ben Vanik23f28282024-02-23 11:14:25 -0800484 " --iree-hal-target-backends=vulkan-spirv\n"
485 " --device=vulkan\n"
486 " --executable_format=vulkan-spirv-fb\n"
487 "\n"
488 "Note that this tool is intentionally low level: you must specify all\n"
489 "of the push constant/binding parameters precisely as they are expected\n"
490 "by the executable. `iree-benchmark-module` is the user-friendly\n"
491 "benchmarking tool while this one favors direct access to the\n"
492 "executables (bypassing all of the IREE VM, HAL APIs, task system,\n"
493 "etc).\n"
494 "\n"
495 "Example --flagfile:\n"
496 " --device=local-sync\n"
497 " --executable_format=embedded-elf-x86_64\n"
498 " --executable_file=runtime/src/iree/hal/local/elf/testdata/"
499 "elementwise_mul_x86_64.so\n"
500 " --entry_point=0\n"
501 " --binding=4xf32=1,2,3,4\n"
502 " --binding=4xf32=100,200,300,400\n"
503 " --binding=4xf32=0,0,0,0\n"
504 " --workgroup_count=1,1,1\n"
505 "\n");
506
507 iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv);
508 iree_benchmark_initialize(&argc, argv);
509
510 iree_status_t status = iree_benchmark_executable_from_flags(host_allocator);
511 if (!iree_status_is_ok(status)) {
512 iree_status_fprint(stderr, status);
513 iree_status_free(status);
514 exit_code = EXIT_FAILURE;
515 }
516 fflush(stderr);
517
518 IREE_TRACE_ZONE_END(z0);
519 IREE_TRACE_APP_EXIT(exit_code);
520 return exit_code;
521}