blob: 02dc927f7185560119d5c1b1f0ce66d0294d7890 [file] [log] [blame]
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <errno.h>
#include "iree/base/api.h"
#include "iree/base/internal/flags.h"
#include "iree/base/tracing.h"
#include "iree/hal/local/executable_library.h"
#include "iree/hal/local/executable_loader.h"
#include "iree/hal/local/local_descriptor_set_layout.h"
#include "iree/hal/local/local_executable.h"
#include "iree/testing/benchmark.h"
IREE_FLAG(string, executable_format, "",
"Format of the executable file being loaded.");
IREE_FLAG(string, executable_file, "",
"Path to the executable library file to load.");
IREE_FLAG(int32_t, entry_point, 0, "Entry point ordinal to run.");
IREE_FLAG(int32_t, workgroup_count_x, 1,
"X dimension of the workgroup count defining the number of\n"
"workgroup invocations that will be run per benchmark iteration.\n"
"This is the fastest-changing dimension.");
IREE_FLAG(int32_t, workgroup_count_y, 1,
"Y dimension of the workgroup count defining the number of\n"
"workgroup invocations that will be run per benchmark iteration.");
IREE_FLAG(int32_t, workgroup_count_z, 1,
"Z dimension of the workgroup count defining the number of\n"
"workgroup invocations that will be run per benchmark iteration.\n"
"This is the slowest-changing dimension.");
IREE_FLAG(int32_t, workgroup_size_x, 1,
"X dimension of the workgroup size passed to the executable.");
IREE_FLAG(int32_t, workgroup_size_y, 1,
"Y dimension of the workgroup size passed to the executable.");
IREE_FLAG(int32_t, workgroup_size_z, 1,
"Z dimension of the workgroup size passed to the executable.");
// Total number of bindings we (currently) allow any executable to have.
#define IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT \
(IREE_HAL_LOCAL_MAX_DESCRIPTOR_SET_COUNT * \
IREE_HAL_LOCAL_MAX_DESCRIPTOR_BINDING_COUNT)
// Parsed parameters from flags.
// Used to construct the dispatch parameters for the benchmark invocation.
struct {
int32_t push_constant_count;
union {
uint32_t ui32;
} push_constants[IREE_HAL_LOCAL_MAX_PUSH_CONSTANT_COUNT];
int32_t binding_count;
iree_string_view_t bindings[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT];
} dispatch_params = {
.push_constant_count = 0,
.binding_count = 0,
};
static iree_status_t parse_push_constant(iree_string_view_t flag_name,
void* storage,
iree_string_view_t value) {
IREE_ASSERT_LE(dispatch_params.push_constant_count + 1,
IREE_ARRAYSIZE(dispatch_params.push_constants),
"too many push constants");
dispatch_params.push_constants[dispatch_params.push_constant_count++].ui32 =
atoi(value.data);
return iree_ok_status();
}
static void print_push_constant(iree_string_view_t flag_name, void* storage,
FILE* file) {
if (dispatch_params.push_constant_count == 0) {
fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size,
flag_name.data);
return;
}
for (int32_t i = 0; i < dispatch_params.push_constant_count; ++i) {
fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data,
dispatch_params.push_constants[i].ui32);
if (i < dispatch_params.push_constant_count - 1) {
fprintf(file, "\n");
}
}
}
IREE_FLAG_CALLBACK(parse_push_constant, print_push_constant, &dispatch_params,
push_constant_callback,
"Appends a uint32_t push constant value.\n");
static iree_status_t parse_binding(iree_string_view_t flag_name, void* storage,
iree_string_view_t value) {
IREE_ASSERT_LE(dispatch_params.binding_count + 1,
IREE_ARRAYSIZE(dispatch_params.bindings), "too many bindings");
dispatch_params.bindings[dispatch_params.binding_count++] = value;
return iree_ok_status();
}
static void print_binding(iree_string_view_t flag_name, void* storage,
FILE* file) {
if (dispatch_params.binding_count == 0) {
fprintf(file, "# --%.*s=\"shapextype[=values]\"\n", (int)flag_name.size,
flag_name.data);
return;
}
for (int32_t i = 0; i < dispatch_params.binding_count; ++i) {
const iree_string_view_t binding_str = dispatch_params.bindings[i];
fprintf(file, "--%.*s=\"%.*s\"\n", (int)flag_name.size, flag_name.data,
(int)binding_str.size, binding_str.data);
}
}
IREE_FLAG_CALLBACK(
parse_binding, print_binding, &dispatch_params, binding,
"Appends a binding to the dispatch parameters.\n"
"Bindings are defined by their shape, element type, and their data.\n"
"Examples:\n"
" # 16 4-byte elements zero-initialized:\n"
" --binding=2x8xi32\n"
" # 10000 bytes all initialized to 123:\n"
" --binding=10000xi8=123\n"
" # 2 4-byte floating-point values with contents [[1.4], [2.1]]:\n"
" --binding=2x1xf32=1.4,2.1");
#if defined(IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER)
#include "iree/hal/local/loaders/embedded_library_loader.h"
#endif // IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER
// Creates an executable loader based on the given format flag.
static iree_status_t iree_hal_executable_library_create_loader(
iree_allocator_t host_allocator,
iree_hal_executable_loader_t** out_executable_loader) {
#if defined(IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER)
if (strcmp(FLAG_executable_format, "EX_ELF") == 0) {
return iree_hal_embedded_library_loader_create(host_allocator,
out_executable_loader);
}
#endif // IREE_HAL_HAVE_EMBEDDED_LIBRARY_LOADER
return iree_make_status(
IREE_STATUS_UNAVAILABLE,
"no loader available that can handle --executable_format=%s",
FLAG_executable_format);
}
// TODO(benvanik): use this to replace file_io.cc.
static iree_status_t iree_file_read_contents(const char* path,
iree_allocator_t allocator,
iree_byte_span_t* out_contents) {
IREE_TRACE_ZONE_BEGIN(z0);
*out_contents = iree_make_byte_span(NULL, 0);
FILE* file = fopen(path, "rb");
if (file == NULL) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(iree_status_code_from_errno(errno),
"failed to open file '%s'", path);
}
iree_status_t status = iree_ok_status();
if (fseek(file, 0, SEEK_END) == -1) {
status = iree_make_status(iree_status_code_from_errno(errno), "seek (end)");
}
size_t file_size = 0;
if (iree_status_is_ok(status)) {
file_size = ftell(file);
if (file_size == -1L) {
status =
iree_make_status(iree_status_code_from_errno(errno), "size query");
}
}
if (iree_status_is_ok(status)) {
if (fseek(file, 0, SEEK_SET) == -1) {
status =
iree_make_status(iree_status_code_from_errno(errno), "seek (beg)");
}
}
// Allocate +1 to force a trailing \0 in case this is a string.
char* contents = NULL;
if (iree_status_is_ok(status)) {
status = iree_allocator_malloc(allocator, file_size + 1, (void**)&contents);
}
if (iree_status_is_ok(status)) {
if (fread(contents, file_size, 1, file) != 1) {
status =
iree_make_status(iree_status_code_from_errno(errno),
"unable to read entire file contents of '%s'", path);
}
}
if (iree_status_is_ok(status)) {
contents[file_size] = 0; // NUL
*out_contents = iree_make_byte_span(contents, file_size);
} else {
iree_allocator_free(allocator, contents);
}
fclose(file);
IREE_TRACE_ZONE_END(z0);
return status;
}
// NOTE: error handling is here just for better diagnostics: it is not tracking
// allocations correctly and will leak. Don't use this as an example for how to
// write robust code.
static iree_status_t iree_hal_executable_library_run(
iree_benchmark_state_t* benchmark_state) {
iree_allocator_t host_allocator = benchmark_state->host_allocator;
// Register the loader used to load (or find) the executable.
iree_hal_executable_loader_t* executable_loader = NULL;
IREE_RETURN_IF_ERROR(iree_hal_executable_library_create_loader(
host_allocator, &executable_loader));
// Setup the specification used to perform the executable load.
// This information is normally used to select the appropriate loader but in
// this benchmark we only have a single one.
iree_hal_executable_spec_t executable_spec;
iree_hal_executable_spec_initialize(&executable_spec);
executable_spec.caching_mode =
IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION |
IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA |
IREE_HAL_EXECUTABLE_CACHING_MODE_DISABLE_VERIFICATION;
executable_spec.executable_format =
iree_make_cstring_view(FLAG_executable_format);
// Load the executable data.
IREE_RETURN_IF_ERROR(iree_file_read_contents(
FLAG_executable_file, host_allocator,
(iree_byte_span_t*)&executable_spec.executable_data));
// Setup the layouts defining how each entry point is interpreted.
// NOTE: we know for the embedded library loader that this is not required.
// Other loaders may need it in which case it'll have to be provided.
executable_spec.executable_layout_count = 0;
executable_spec.executable_layouts = NULL;
// Perform the load, which will fail if the executable cannot be loaded or
// there was an issue with the layouts.
iree_hal_executable_t* executable = NULL;
IREE_RETURN_IF_ERROR(iree_hal_executable_loader_try_load(
executable_loader, &executable_spec, &executable));
// Allocate storage for buffers and populate them.
// They only need to remain valid for the duration of the invocation and all
// memory accessed by the invocation will come from here.
iree_hal_allocator_t* heap_allocator = NULL;
IREE_RETURN_IF_ERROR(iree_hal_allocator_create_heap(
iree_make_cstring_view("benchmark"), host_allocator, &heap_allocator));
iree_hal_buffer_view_t* buffer_views[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT];
void* binding_ptrs[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT];
size_t binding_lengths[IREE_HAL_LOCAL_MAX_TOTAL_BINDING_COUNT];
for (iree_host_size_t i = 0; i < dispatch_params.binding_count; ++i) {
IREE_RETURN_IF_ERROR(iree_hal_buffer_view_parse(
dispatch_params.bindings[i], heap_allocator, &buffer_views[i]));
iree_hal_buffer_t* buffer = iree_hal_buffer_view_buffer(buffer_views[i]);
iree_device_size_t buffer_length =
iree_hal_buffer_view_byte_length(buffer_views[i]);
iree_hal_buffer_mapping_t buffer_mapping;
IREE_RETURN_IF_ERROR(iree_hal_buffer_map_range(
buffer, IREE_HAL_MEMORY_ACCESS_ALL, 0, buffer_length, &buffer_mapping));
binding_ptrs[i] = buffer_mapping.contents.data;
binding_lengths[i] = (size_t)buffer_mapping.contents.data_length;
}
// Setup dispatch state.
iree_hal_executable_dispatch_state_v0_t dispatch_state = {
.workgroup_count = {{
.x = FLAG_workgroup_count_x,
.y = FLAG_workgroup_count_y,
.z = FLAG_workgroup_count_z,
}},
.workgroup_size = {{
.x = FLAG_workgroup_size_x,
.y = FLAG_workgroup_size_y,
.z = FLAG_workgroup_size_z,
}},
.push_constant_count = dispatch_params.push_constant_count,
.push_constants = &dispatch_params.push_constants[0].ui32,
.binding_count = dispatch_params.binding_count,
.binding_ptrs = binding_ptrs,
.binding_lengths = binding_lengths,
.imports = NULL, // not yet implemented
};
// Execute benchmark the workgroup invocation.
// Note that each iteration runs through the whole grid as it's important that
// we are testing the memory access patterns: if we just ran the same single
// tile processing the same exact region of memory over and over we are not
// testing cache effects.
IREE_TRACE_ZONE_BEGIN(z1);
int64_t dispatch_count = 0;
while (iree_benchmark_keep_running(benchmark_state, /*batch_count=*/1)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z1, iree_hal_local_executable_issue_dispatch_inline(
iree_hal_local_executable_cast(executable), FLAG_entry_point,
&dispatch_state));
++dispatch_count;
}
IREE_TRACE_ZONE_END(z1);
// To get a total time per invocation we set the item count to the total
// invocations dispatched. That gives us both total dispatch and single
// invocation times in the reporter output.
int64_t total_invocations =
dispatch_count * dispatch_state.workgroup_count.x *
dispatch_state.workgroup_count.y * dispatch_state.workgroup_count.z;
iree_benchmark_set_items_processed(benchmark_state, total_invocations);
// Deallocate buffers.
for (iree_host_size_t i = 0; i < dispatch_params.binding_count; ++i) {
iree_hal_buffer_view_release(buffer_views[i]);
}
iree_hal_allocator_release(heap_allocator);
// Unload.
iree_allocator_free(host_allocator,
(void*)executable_spec.executable_data.data);
iree_hal_executable_release(executable);
iree_hal_executable_loader_release(executable_loader);
return iree_ok_status();
}
int main(int argc, char** argv) {
iree_flags_set_usage(
"executable_library_benchmark",
"Benchmarks a single entry point within an executable library.\n"
"Executable libraries can be found in your temp path when compiling\n"
"with `-iree-llvm-keep-linker-artifacts`. The parameters used can be\n"
"inferred from the entry point `hal.interface` and dispatches to it.\n"
"\n"
"Note that this tool is intentionally low level: you must specify all\n"
"of the push constant/binding parameters precisely as they are expected\n"
"by the executable. `iree-benchmark-module` is the user-friendly\n"
"benchmarking tool while this one favors direct access to the\n"
"executables (bypassing all of the IREE VM, HAL APIs, task system,\n"
"etc).\n"
"\n"
"Example --flagfile:\n"
" --executable_format=EX_ELF\n"
" --executable_file=iree/hal/local/elf/testdata/"
"simple_mul_dispatch_x86_64.so\n"
" --entry_point=0\n"
" --workgroup_count_x=1\n"
" --workgroup_count_y=1\n"
" --workgroup_count_z=1\n"
" --workgroup_size_x=1\n"
" --workgroup_size_y=1\n"
" --workgroup_size_z=1\n"
" --binding=4xf32=1,2,3,4\n"
" --binding=4xf32=100,200,300,400\n"
" --binding=4xf32=0,0,0,0);\n"
"\n");
iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv);
iree_benchmark_initialize(&argc, argv);
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
// clang-format off
fprintf(stderr,
"\x1b[31m"
"===----------------------------------------------------------------------===\n"
"\n"
" ██ ██ █████ ██████ ███ ██ ██ ███ ██ ██████\n"
" ██ ██ ██ ██ ██ ██ ████ ██ ██ ████ ██ ██\n"
" ██ █ ██ ███████ ██████ ██ ██ ██ ██ ██ ██ ██ ██ ███\n"
" ██ ███ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██\n"
" ███ ███ ██ ██ ██ ██ ██ ████ ██ ██ ████ ██████\n"
"\n"
"===----------------------------------------------------------------------===\n"
"\n"
"Tracing is enabled and will skew your results!\n"
"The timings involved here can an order of magnitude off due to the tracing\n"
"time sampling, recording, and instrumentation overhead. Disable tracing with\n"
"IREE_ENABLE_RUNTIME_TRACING=OFF and rebuild.\n"
"\x1b[0m"
"\n"
);
fflush(stderr);
// clang-format on
#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
// TODO(benvanik): override these with our own flags.
iree_benchmark_def_t benchmark_def = {
.flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
IREE_BENCHMARK_FLAG_USE_REAL_TIME,
.time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
.minimum_duration_ns = 0,
.iteration_count = 0,
.run = iree_hal_executable_library_run,
};
iree_benchmark_register(iree_make_cstring_view("dispatch"), &benchmark_def);
iree_benchmark_run_specified();
return 0;
}