blob: ec031fd0e8853f22bb061d145d497d48f609815b [file] [log] [blame]
// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===----------------------------------------------------------------------===//
// iree-benchmark-module: benchmarks public functions in an IREE VM module
//===----------------------------------------------------------------------===//
//
// This runs exported functions using flags specified on the command line.
// Each function is measured independently and the numbers reported will be for
// the full end-to-end CPU and wall times.
//
// From an ML perspective this is an integration benchmark for measuring total
// user-visible latency of model entry points. It is *not* a microbenchmarking
// tool for individual device-side dispatch functions (aka ops aka kernels).
// If interested in the precise time of a particular dispatch then tracy,
// executable_library_benchmark, and platform/vendor tooling (nsight, perf, etc)
// are to be used instead and attaching them to this tool is often useful in
// order to get a large sample set.
//
// By default all functions taking no inputs will be benchmarked. If a function
// takes inputs then the user will need to specify them using --input=
// flags. Depending on the input program the -iree-flow-export-benchmark-funcs
// flag can be passed to the compiler to attempt to wrap each function with
// dummy inputs however this will fail in programs with dynamically shaped
// inputs. The workaround for avoiding the need for flags is to provide the
// input program in a form with no inputs from the start.
//
// It's important to remember that IREE is not a BLAS library and is meant to
// run entire programs. It's not generally appropriate to benchmark a model with
// a single matmul, for example, as that's just treating IREE as a BLAS library.
// Note also that user-level ops in a frontend environment don't map to the
// dispatches that IREE executes: IREE is a compiler like any other and does not
// guarantee a source line of code translates into an atomically divisible and
// independently measurable execution command. In other words don't expect to be
// able to benchmark the cost of a broadcasting elementwise tf.add op within a
// model: by the time we are running the program that's fused itself into a
// single machine instruction operating as part of some other ops.
//
// For coarse dispatch testing and triaging it can still be useful to remove
// some of the overheads introduced by whole-program execution and the compiler
// flag --iree-hal-benchmark-dispatch-repeat-count=N is provided to enable
// batching. Whatever N is chosen must then be passed to this tool via
// --batch_size=N so that the benchmark reporting properly reflects the
// batching. As an example --iree-hal-benchmark-dispatch-repeat-count=32 +
// --batch_size=32 will reduce the overheads by 32x. Think of this as a way to
// control the p value in Amdahl's law representing the amount of time spent in
// dispatches relative to the rest of the program. This isn't representative of
// how the full program will run, though, and YMMV. Always verify timings with
// an appropriate device-specific tool before trusting the more generic and
// higher-level numbers from this tool.
#include <array>
#include <cstdio>
#include <iterator>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include "benchmark/benchmark.h"
#include "iree/base/api.h"
#include "iree/base/internal/flags.h"
#include "iree/hal/api.h"
#include "iree/modules/hal/types.h"
#include "iree/tooling/context_util.h"
#include "iree/tooling/device_util.h"
#include "iree/tooling/function_io.h"
#include "iree/vm/api.h"
constexpr char kNanosecondsUnitString[] = "ns";
constexpr char kMicrosecondsUnitString[] = "us";
constexpr char kMillisecondsUnitString[] = "ms";
// TODO(hanchung): Extract the batch size using
// iree_vm_function_lookup_attr_by_name.
IREE_FLAG(int32_t, batch_size, 1,
"Number of invocations per iteration, which for dispatch benchmarks "
"must match the --iree-hal-benchmark-dispatch-repeat-count value "
"used during compilation.");
IREE_FLAG(int32_t, batch_concurrency, 1,
"Number of invocations within a batch that should run concurrently.");
IREE_FLAG(string, function, "",
"Name of a function contained in the module specified by --module= "
"to run. If this is not set, all the exported functions will be "
"benchmarked and they are expected to not have input arguments.");
IREE_FLAG(bool, print_statistics, false,
"Prints runtime statistics to stderr on exit.");
IREE_FLAG_LIST(
string, input,
"An input value or buffer of the format:\n"
" [shape]xtype=[value]\n"
" 2x2xi32=1 2 3 4\n"
"Optionally, brackets may be used to separate the element values:\n"
" 2x2xi32=[[1 2][3 4]]\n"
"Raw binary files can be read to provide buffer contents:\n"
" 2x2xi32=@some/file.bin\n"
"numpy npy files (from numpy.save) can be read to provide 1+ values:\n"
" @some.npy\n"
"Each occurrence of the flag indicates an input in the order they were\n"
"specified on the command line.");
static iree_status_t parse_time_unit(iree_string_view_t flag_name,
void* storage, iree_string_view_t value) {
auto* unit = (std::pair<bool, benchmark::TimeUnit>*)storage;
auto unit_string = std::string(value.data, value.size);
if (unit_string == kMillisecondsUnitString) {
*unit = {true, benchmark::kMillisecond};
return iree_ok_status();
} else if (unit_string == kMicrosecondsUnitString) {
*unit = {true, benchmark::kMicrosecond};
return iree_ok_status();
} else if (unit_string == kNanosecondsUnitString) {
*unit = {true, benchmark::kNanosecond};
return iree_ok_status();
}
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"unsupported time unit");
}
static void print_time_unit(iree_string_view_t flag_name, void* storage,
FILE* file) {
auto* unit = (std::pair<bool, benchmark::TimeUnit>*)storage;
if (!unit->first) {
return;
}
std::string unit_string;
switch (unit->second) {
case benchmark::kMillisecond:
unit_string = kMillisecondsUnitString;
break;
case benchmark::kMicrosecond:
unit_string = kMicrosecondsUnitString;
break;
case benchmark::kNanosecond:
unit_string = kNanosecondsUnitString;
break;
default:
assert(false && "Unexpected time unit.");
}
fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data,
unit_string.c_str());
}
// Time unit to be printed. If the first field is false, each place will use its
// default time unit.
static std::pair<bool, benchmark::TimeUnit> FLAG_time_unit = {
false, benchmark::kNanosecond};
IREE_FLAG_CALLBACK(
parse_time_unit, print_time_unit, &FLAG_time_unit, time_unit,
"The time unit to be printed in the results. Can be 'ms', 'us', or 'ns'.");
namespace iree {
namespace {
static void BenchmarkGenericFunction(const std::string& benchmark_name,
int32_t batch_size,
iree_hal_device_t* device,
iree_vm_context_t* context,
iree_vm_function_t function,
iree_vm_list_t* inputs,
benchmark::State& state) {
IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(z0, benchmark_name.data(),
benchmark_name.size());
IREE_TRACE_FRAME_MARK();
vm::ref<iree_vm_list_t> outputs;
IREE_CHECK_OK(iree_vm_list_create(iree_vm_make_undefined_type_def(), 16,
iree_allocator_system(), &outputs));
// Benchmarking loop.
while (state.KeepRunningBatch(batch_size)) {
IREE_TRACE_ZONE_BEGIN_NAMED(z1, "BenchmarkIteration");
IREE_TRACE_FRAME_MARK_NAMED("Iteration");
IREE_CHECK_OK(iree_vm_invoke(
context, function, IREE_VM_INVOCATION_FLAG_NONE, /*policy=*/nullptr,
inputs, outputs.get(), iree_allocator_system()));
IREE_CHECK_OK(iree_vm_list_resize(outputs.get(), 0));
IREE_TRACE_ZONE_END(z1);
if (device) {
state.PauseTiming();
IREE_CHECK_OK(iree_hal_device_profiling_flush(device));
state.ResumeTiming();
}
}
state.SetItemsProcessed(state.iterations());
IREE_TRACE_ZONE_END(z0);
}
void RegisterGenericBenchmark(const std::string& function_name,
iree_hal_device_t* device,
iree_vm_context_t* context,
iree_vm_function_t function,
iree_vm_list_t* inputs) {
auto benchmark_name = "BM_" + function_name;
int32_t batch_size = FLAG_batch_size;
benchmark::RegisterBenchmark(benchmark_name.c_str(),
[=](benchmark::State& state) -> void {
BenchmarkGenericFunction(
benchmark_name, batch_size, device,
context, function, inputs, state);
})
// By default only the main thread is included in CPU time. Include all
// the threads instead.
->MeasureProcessCPUTime()
// To make single and multi-threaded benchmarks more comparable, use the
// wall time to determine how many iterations to run. See
// https://github.com/google/benchmark#cpu-timers,
->UseRealTime()
->Unit(FLAG_time_unit.first ? FLAG_time_unit.second
: benchmark::kMillisecond);
}
// Runs up to |batch_size| pipelined invocations in sequence along with
// concurrency. Example:
// batch_size=1, concurrency=1:
// [invocation 0]
// batch_size=2, concurrency=1:
// [invocation 0] -> [invocation 1]
// batch_size=2, concurrency=2:
// [invocation 0]
// [invocation 1]
// batch_size=4, concurrency=2:
// [invocation 0] -> [invocation 2]
// [invocation 1] -> [invocation 3]
static void BenchmarkAsyncFunction(
const std::string& benchmark_name, int32_t batch_size,
int32_t batch_concurrency, iree_hal_device_t* device,
iree_vm_context_t* context, iree_vm_function_t function,
iree_vm_list_t* common_inputs, benchmark::State& state) {
IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(z0, benchmark_name.data(),
benchmark_name.size());
IREE_TRACE_FRAME_MARK();
iree_allocator_t host_allocator = iree_allocator_system();
// Round up batch size to some multiple of concurrency.
batch_size = (int32_t)iree_host_align(batch_size, batch_concurrency);
// Benchmarking loop.
while (state.KeepRunningBatch(batch_size)) {
state.PauseTiming();
IREE_TRACE_ZONE_BEGIN_NAMED(z1, "BenchmarkIteration");
IREE_TRACE_FRAME_MARK_NAMED("Iteration");
IREE_TRACE_ZONE_BEGIN_NAMED(z_begin, "PrepareBatch");
// Each concurrent track of execution gets its own semaphore.
std::vector<vm::ref<iree_hal_semaphore_t>> timeline_semaphores;
for (int32_t i = 0; i < batch_concurrency; ++i) {
vm::ref<iree_hal_semaphore_t> timeline_semaphore;
IREE_CHECK_OK(iree_hal_semaphore_create(
device, 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &timeline_semaphore));
timeline_semaphores.push_back(std::move(timeline_semaphore));
}
// Preallocate fences and I/O for each invocation.
// The same inputs are used for each but we need a unique list to hold the
// unique fences. Each fence represents when the invocation has completed.
std::vector<vm::ref<iree_hal_fence_t>> invocation_fences;
std::vector<vm::ref<iree_vm_list_t>> invocation_inputs;
std::vector<vm::ref<iree_vm_list_t>> invocation_outputs;
vm::ref<iree_hal_fence_t> completion_fence;
IREE_CHECK_OK(iree_hal_fence_create(batch_concurrency, host_allocator,
&completion_fence));
for (int32_t i = 0; i < batch_size / batch_concurrency; ++i) {
for (int32_t j = 0; j < batch_concurrency; ++j) {
// Chain each concurrent minibatch to the previous. Note that to start
// we wait on nothing and begin executing immediately.
vm::ref<iree_hal_fence_t> wait_fence;
if (i > 0) {
wait_fence = vm::retain_ref(
invocation_fences[(i - 1) * batch_concurrency + j]);
}
uint64_t signal_value = i + 1;
vm::ref<iree_hal_fence_t> signal_fence;
IREE_CHECK_OK(iree_hal_fence_create_at(timeline_semaphores[j].get(),
signal_value, host_allocator,
&signal_fence));
invocation_fences.push_back(vm::retain_ref(signal_fence));
// Join the final minibatch on the completion fence.
if (i == batch_size / batch_concurrency - 1) {
IREE_CHECK_OK(iree_hal_fence_insert(completion_fence.get(),
timeline_semaphores[j].get(),
signal_value));
}
// Clone common inputs and add the invocation-specific fences.
vm::ref<iree_vm_list_t> inputs;
IREE_CHECK_OK(
iree_vm_list_clone(common_inputs, host_allocator, &inputs));
IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), wait_fence));
IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), signal_fence));
invocation_inputs.push_back(std::move(inputs));
// Setup empty outputs.
vm::ref<iree_vm_list_t> outputs;
IREE_CHECK_OK(iree_vm_list_create(iree_vm_make_undefined_type_def(), 16,
host_allocator, &outputs));
invocation_outputs.push_back(std::move(outputs));
}
}
IREE_TRACE_ZONE_END(z_begin);
state.ResumeTiming();
{
// TODO(benvanik): replace with async invocations. Today if the invocation
// performs any waits this will block on the initial invoke instead of
// actually overlapping things.
for (int32_t i = 0; i < batch_size; ++i) {
IREE_CHECK_OK(
iree_vm_invoke(context, function, IREE_VM_INVOCATION_FLAG_NONE,
/*policy=*/nullptr, invocation_inputs[i].get(),
invocation_outputs[i].get(), host_allocator));
}
IREE_CHECK_OK(
iree_hal_fence_wait(completion_fence.get(), iree_infinite_timeout()));
}
state.PauseTiming();
IREE_TRACE_ZONE_BEGIN_NAMED(z_end, "CleanupBatch");
for (int32_t i = 0; i < batch_size; ++i) {
iree_vm_list_clear(invocation_outputs[i].get());
}
invocation_fences.clear();
invocation_inputs.clear();
invocation_outputs.clear();
completion_fence.reset();
timeline_semaphores.clear();
IREE_TRACE_ZONE_END(z_end);
IREE_TRACE_ZONE_END(z1);
if (device) {
IREE_CHECK_OK(iree_hal_device_profiling_flush(device));
}
state.ResumeTiming();
}
state.SetItemsProcessed(state.iterations());
IREE_TRACE_ZONE_END(z0);
}
void RegisterAsyncBenchmark(const std::string& function_name,
iree_hal_device_t* device,
iree_vm_context_t* context,
iree_vm_function_t function,
iree_vm_list_t* inputs) {
auto benchmark_name = "BM_" + function_name;
int32_t batch_size = FLAG_batch_size;
int32_t batch_concurrency = FLAG_batch_concurrency;
benchmark::RegisterBenchmark(
benchmark_name.c_str(),
[=](benchmark::State& state) -> void {
BenchmarkAsyncFunction(benchmark_name, batch_size, batch_concurrency,
device, context, function, inputs, state);
})
// By default only the main thread is included in CPU time. Include all
// the threads instead.
->MeasureProcessCPUTime()
// To make single and multi-threaded benchmarks more comparable, use the
// wall time to determine how many iterations to run. See
// https://github.com/google/benchmark#cpu-timers,
->UseRealTime()
->Unit(FLAG_time_unit.first ? FLAG_time_unit.second
: benchmark::kMillisecond);
}
static void BenchmarkDispatchFunction(const std::string& benchmark_name,
iree_vm_context_t* context,
iree_vm_function_t function,
benchmark::State& state) {
IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(z0, benchmark_name.data(),
benchmark_name.size());
IREE_TRACE_FRAME_MARK();
vm::ref<iree_vm_list_t> inputs;
IREE_CHECK_OK(iree_vm_list_create(iree_vm_make_undefined_type_def(), 16,
iree_allocator_system(), &inputs));
iree_vm_value_t batch_size = iree_vm_value_make_i32(FLAG_batch_size);
IREE_CHECK_OK(iree_vm_list_push_value(inputs.get(), &batch_size));
vm::ref<iree_vm_list_t> outputs;
IREE_CHECK_OK(iree_vm_list_create(iree_vm_make_undefined_type_def(), 16,
iree_allocator_system(), &outputs));
// Benchmarking loop.
while (state.KeepRunningBatch(FLAG_batch_size)) {
IREE_TRACE_ZONE_BEGIN_NAMED(z1, "BenchmarkIteration");
IREE_TRACE_FRAME_MARK_NAMED("Iteration");
IREE_CHECK_OK(iree_vm_invoke(
context, function, IREE_VM_INVOCATION_FLAG_NONE, /*policy=*/nullptr,
inputs.get(), outputs.get(), iree_allocator_system()));
IREE_CHECK_OK(iree_vm_list_resize(outputs.get(), 0));
IREE_TRACE_ZONE_END(z1);
}
state.SetItemsProcessed(state.iterations());
IREE_TRACE_ZONE_END(z0);
}
void RegisterDispatchBenchmark(const std::string& function_name,
iree_vm_context_t* context,
iree_vm_function_t function) {
auto benchmark_name = "BM_" + function_name;
benchmark::RegisterBenchmark(
benchmark_name.c_str(),
[benchmark_name, context, function](benchmark::State& state) -> void {
BenchmarkDispatchFunction(benchmark_name, context, function, state);
})
// By default only the main thread is included in CPU time. Include all
// the threads instead.
->MeasureProcessCPUTime()
// To make single and multi-threaded benchmarks more comparable, use the
// wall time to determine how many iterations to run. See
// https://github.com/google/benchmark#cpu-timers,
->UseRealTime()
->Unit(FLAG_time_unit.first ? FLAG_time_unit.second
: benchmark::kMicrosecond);
}
// The lifetime of IREEBenchmark should be as long as
// ::benchmark::RunSpecifiedBenchmarks() where the resources are used during
// benchmarking.
class IREEBenchmark {
public:
IREEBenchmark() { iree_tooling_module_list_initialize(&module_list_); }
~IREEBenchmark() {
IREE_TRACE_SCOPE_NAMED("IREEBenchmark::dtor");
// Order matters. Tear down modules first to release resources.
inputs_.reset();
context_.reset();
iree_tooling_module_list_reset(&module_list_);
instance_.reset();
// Tear down device last in order to get accurate statistics.
if (device_allocator_ && FLAG_print_statistics) {
IREE_IGNORE_ERROR(iree_hal_allocator_statistics_fprint(
stderr, device_allocator_.get()));
}
device_allocator_.reset();
device_.reset();
};
iree_hal_device_t* device() const { return device_.get(); }
iree_status_t Register() {
IREE_TRACE_SCOPE_NAMED("IREEBenchmark::Register");
if (!instance_ || !device_allocator_ || !context_ || !module_list_.count) {
IREE_RETURN_IF_ERROR(Init());
}
auto function_name = std::string(FLAG_function);
if (!function_name.empty()) {
IREE_RETURN_IF_ERROR(RegisterSpecificFunction(function_name));
} else {
IREE_RETURN_IF_ERROR(RegisterAllExportedFunctions());
}
return iree_ok_status();
}
private:
iree_status_t Init() {
IREE_TRACE_SCOPE_NAMED("IREEBenchmark::Init");
IREE_TRACE_FRAME_MARK_BEGIN_NAMED("init");
iree_allocator_t host_allocator = iree_allocator_system();
IREE_RETURN_IF_ERROR(
iree_tooling_create_instance(host_allocator, &instance_));
IREE_RETURN_IF_ERROR(iree_tooling_load_modules_from_flags(
instance_.get(), host_allocator, &module_list_));
IREE_RETURN_IF_ERROR(iree_tooling_create_context_from_flags(
instance_.get(), module_list_.count, module_list_.values,
/*default_device_uri=*/iree_string_view_empty(), host_allocator,
&context_, &device_, &device_allocator_));
IREE_TRACE_FRAME_MARK_END_NAMED("init");
return iree_ok_status();
}
iree_status_t RegisterSpecificFunction(const std::string& function_name) {
IREE_TRACE_SCOPE_NAMED("IREEBenchmark::RegisterSpecificFunction");
iree_vm_module_t* main_module =
iree_tooling_module_list_back(&module_list_);
iree_vm_function_t function;
IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_name(
main_module, IREE_VM_FUNCTION_LINKAGE_EXPORT,
iree_string_view_t{function_name.data(),
(iree_host_size_t)function_name.size()},
&function));
iree_vm_function_signature_t signature =
iree_vm_function_signature(&function);
iree_string_view_t arguments_cconv, results_cconv;
IREE_RETURN_IF_ERROR(iree_vm_function_call_get_cconv_fragments(
&signature, &arguments_cconv, &results_cconv));
IREE_CHECK_OK(iree_tooling_parse_variants(
arguments_cconv, FLAG_input_list(), device_.get(),
device_allocator_.get(), iree_vm_instance_allocator(instance_.get()),
&inputs_));
iree_string_view_t invocation_model = iree_vm_function_lookup_attr_by_name(
&function, IREE_SV("iree.abi.model"));
if (iree_string_view_equal(invocation_model, IREE_SV("coarse-fences"))) {
// Asynchronous invocation.
iree::RegisterAsyncBenchmark(function_name, device_.get(), context_.get(),
function, inputs_.get());
} else {
// Synchronous invocation.
iree::RegisterGenericBenchmark(function_name, device_.get(),
context_.get(), function, inputs_.get());
}
return iree_ok_status();
}
iree_status_t RegisterAllExportedFunctions() {
IREE_TRACE_SCOPE_NAMED("IREEBenchmark::RegisterAllExportedFunctions");
iree_vm_module_t* main_module =
iree_tooling_module_list_back(&module_list_);
iree_vm_module_signature_t signature =
iree_vm_module_signature(main_module);
for (iree_host_size_t i = 0; i < signature.export_function_count; ++i) {
iree_vm_function_t function;
IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_ordinal(
main_module, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
iree_string_view_t function_name = iree_vm_function_name(&function);
// We run anything with the 'benchmark' attribute.
// If the attribute is not present we'll run anything that looks runnable.
iree_string_view_t benchmark_type = iree_vm_function_lookup_attr_by_name(
&function, IREE_SV("iree.benchmark"));
if (iree_string_view_equal(benchmark_type, IREE_SV("dispatch"))) {
iree::RegisterDispatchBenchmark(
std::string(function_name.data, function_name.size), context_.get(),
function);
} else if (iree_string_view_equal(benchmark_type, IREE_SV("entry"))) {
iree::RegisterGenericBenchmark(
std::string(function_name.data, function_name.size), device_.get(),
context_.get(), function,
/*inputs=*/nullptr);
} else {
// Pick up generic () -> () functions.
if (iree_string_view_starts_with(function_name,
iree_make_cstring_view("__")) ||
iree_string_view_find_char(function_name, '$', 0) !=
IREE_STRING_VIEW_NPOS) {
// Skip internal or special functions.
continue;
}
// Query function information to determine how to run it.
iree_vm_function_signature_t signature =
iree_vm_function_signature(&function);
iree_host_size_t argument_count = 0;
iree_host_size_t result_count = 0;
IREE_RETURN_IF_ERROR(iree_vm_function_call_count_arguments_and_results(
&signature, &argument_count, &result_count));
iree_string_view_t invocation_model =
iree_vm_function_lookup_attr_by_name(&function,
IREE_SV("iree.abi.model"));
if (iree_string_view_equal(invocation_model,
IREE_SV("coarse-fences"))) {
// Asynchronous invocation with coarse fences. Expect just those.
if (argument_count == 2) {
// Only functions taking a (wait, signal) fence pair are run.
iree::RegisterAsyncBenchmark(
std::string(function_name.data, function_name.size),
device_.get(), context_.get(), function,
/*inputs=*/nullptr);
}
} else {
// Basic synchronous invocation.
if (argument_count == 0) {
// Only functions with no inputs are run (because we can't pass
// anything).
iree::RegisterGenericBenchmark(
std::string(function_name.data, function_name.size),
device_.get(), context_.get(), function,
/*inputs=*/nullptr);
}
}
}
}
return iree_ok_status();
}
iree::vm::ref<iree_vm_instance_t> instance_;
iree::vm::ref<iree_vm_context_t> context_;
iree::vm::ref<iree_hal_device_t> device_;
iree::vm::ref<iree_hal_allocator_t> device_allocator_;
iree_tooling_module_list_t module_list_;
iree::vm::ref<iree_vm_list_t> inputs_;
};
} // namespace
} // namespace iree
int main(int argc, char** argv) {
IREE_TRACE_APP_ENTER();
IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree-benchmark-module");
// Pass through flags to benchmark (allowing --help to fall through).
iree_flags_set_usage(
"iree-benchmark-module",
"Benchmarks a function within a compiled IREE module and handles I/O\n"
"parsing. Modules can be provided by file path (`--module=file.vmfb`)\n"
"or read from stdin (`--module=-`) and the function to execute\n"
"matches the original name provided to the compiler\n"
"(`--function=foo` for `func.func @foo`).\n");
iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK |
IREE_FLAGS_PARSE_MODE_CONTINUE_AFTER_HELP,
&argc, &argv);
::benchmark::Initialize(&argc, argv);
iree::IREEBenchmark iree_benchmark;
iree_status_t status = iree_benchmark.Register();
if (!iree_status_is_ok(status)) {
int exit_code = static_cast<int>(iree_status_code(status));
printf("%s\n", iree::Status(std::move(status)).ToString().c_str());
IREE_TRACE_ZONE_END(z0);
IREE_TRACE_APP_EXIT(exit_code);
return exit_code;
}
IREE_CHECK_OK(iree_hal_begin_profiling_from_flags(iree_benchmark.device()));
::benchmark::RunSpecifiedBenchmarks();
IREE_CHECK_OK(iree_hal_end_profiling_from_flags(iree_benchmark.device()));
IREE_TRACE_ZONE_END(z0);
IREE_TRACE_APP_EXIT(EXIT_SUCCESS);
return EXIT_SUCCESS;
}