// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===----------------------------------------------------------------------===//
// iree-benchmark-module: benchmarks public functions in an IREE VM module
//===----------------------------------------------------------------------===//
//
// This runs exported functions using flags specified on the command line.
// Each function is measured independently and the numbers reported will be for
// the full end-to-end CPU and wall times.
//
// From an ML perspective this is an integration benchmark for measuring the
// total user-visible latency of model entry points. It is *not* a
// microbenchmarking tool for individual device-side dispatch functions (aka
// ops aka kernels). To measure the precise time of a particular dispatch, use
// tracy, executable_library_benchmark, or platform/vendor tooling (nsight,
// perf, etc) instead; attaching those tools to this one is often useful for
// gathering a large sample set.
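//
// As a hedged sketch (the build option and exact workflow are assumptions
// that vary by IREE version), a tracy-instrumented run might look like:
//   $ cmake -DIREE_ENABLE_RUNTIME_TRACING=ON ... && cmake --build ...
//   $ iree-benchmark-module --module_file=module.vmfb ...
// with the tracy profiler attached to the running process to capture
// per-dispatch timings.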
//
// By default all functions taking no inputs will be benchmarked. If a function
// takes inputs then the user will need to specify them using --function_input=
// flags. Depending on the input program, the -iree-flow-export-benchmark-funcs
// flag can be passed to the compiler to attempt to wrap each function with
// dummy inputs; however, this will fail for programs with dynamically shaped
// inputs. To avoid needing input flags entirely, provide the input program in
// a form that takes no inputs from the start. See the sketch below.
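//
// For example, a minimal invocation with hypothetical module and function
// names (the --function_input= syntax is documented on the flag below):
//   $ iree-benchmark-module --module_file=model.vmfb \
//       --entry_function=predict \
//       --function_input="2x2xf32=1 2 3 4"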
//
// It's important to remember that IREE is not a BLAS library and is meant to
// run entire programs. It's not generally appropriate to benchmark a model with
// a single matmul, for example, as that's just treating IREE as a BLAS library.
// Note also that user-level ops in a frontend environment don't map to the
// dispatches that IREE executes: IREE is a compiler like any other and does not
// guarantee a source line of code translates into an atomically divisible and
// independently measurable execution command. In other words, don't expect to
// be able to benchmark the cost of a broadcasting elementwise tf.add op within
// a model: by the time the program runs, that op has been fused into machine
// code executing as part of some other ops.
//
// For coarse dispatch testing and triaging it can still be useful to remove
// some of the overheads introduced by whole-program execution; the compiler
// flag --iree-hal-benchmark-dispatch-repeat-count=N is provided to enable
// batching. Whatever N is chosen must then be passed to this tool via
// --batch_size=N so that the benchmark reporting properly accounts for the
// batching. As an example, --iree-hal-benchmark-dispatch-repeat-count=32 +
// --batch_size=32 will reduce the overheads by 32x. Think of this as a way to
// control the p value in Amdahl's law: the fraction of time spent in
// dispatches relative to the rest of the program. This isn't representative of
// how the full program will run, though, and YMMV. Always verify timings with
// an appropriate device-specific tool before trusting the more generic and
// higher-level numbers from this tool.
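//
// For example, a hedged sketch of the compile/run pairing (the compiler
// binary name and elided flags are assumptions that vary by IREE version):
//   $ iree-compile ... --iree-hal-benchmark-dispatch-repeat-count=32 \
//       -o module.vmfb
//   $ iree-benchmark-module --module_file=module.vmfb --batch_size=32
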
#include <array>
#include <cstdio>
#include <iterator>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include "benchmark/benchmark.h"
#include "iree/base/api.h"
#include "iree/base/internal/flags.h"
#include "iree/base/status_cc.h"
#include "iree/base/tracing.h"
#include "iree/hal/api.h"
#include "iree/tooling/context_util.h"
#include "iree/tooling/vm_util_cc.h"
#include "iree/vm/api.h"
#include "iree/vm/ref_cc.h"

constexpr char kNanosecondsUnitString[] = "ns";
constexpr char kMicrosecondsUnitString[] = "us";
constexpr char kMillisecondsUnitString[] = "ms";
// TODO(hanchung): Extract the batch size using
// iree_vm_function_lookup_attr_by_name.
IREE_FLAG(
    int32_t, batch_size, 1,
    "The number of runs each invocation covers, which is expected to match "
    "iree-hal-benchmark-dispatch-repeat-count when translating the module");
IREE_FLAG(string, entry_function, "",
          "Name of a function contained in the module specified by "
          "module_file to run. If this is not set, all exported functions "
          "will be benchmarked and each is expected to take no input "
          "arguments.");
IREE_FLAG(bool, print_statistics, false,
"Prints runtime statistics to stderr on exit.");
// TODO(benvanik): move --function_input= flag into a util.
static iree_status_t parse_function_input(iree_string_view_t flag_name,
void* storage,
iree_string_view_t value) {
auto* list = (std::vector<std::string>*)storage;
list->push_back(std::string(value.data, value.size));
return iree_ok_status();
}
static void print_function_input(iree_string_view_t flag_name, void* storage,
FILE* file) {
auto* list = (std::vector<std::string>*)storage;
if (list->empty()) {
fprintf(file, "# --%.*s=\n", (int)flag_name.size, flag_name.data);
} else {
for (size_t i = 0; i < list->size(); ++i) {
fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data,
list->at(i).c_str());
}
}
}
static std::vector<std::string> FLAG_function_inputs;
IREE_FLAG_CALLBACK(
parse_function_input, print_function_input, &FLAG_function_inputs,
function_input,
"An input value or buffer of the format:\n"
" [shape]xtype=[value]\n"
" 2x2xi32=1 2 3 4\n"
"Optionally, brackets may be used to separate the element values:\n"
" 2x2xi32=[[1 2][3 4]]\n"
"Raw binary files can be read to provide buffer contents:\n"
" 2x2xi32=@some/file.bin\n"
"numpy npy files (from numpy.save) can be read to provide 1+ values:\n"
" @some.npy\n"
"Each occurrence of the flag indicates an input in the order they were\n"
"specified on the command line.");
static iree_status_t parse_time_unit(iree_string_view_t flag_name,
void* storage, iree_string_view_t value) {
auto* unit = (std::pair<bool, benchmark::TimeUnit>*)storage;
auto unit_string = std::string(value.data, value.size);
if (unit_string == kMillisecondsUnitString) {
*unit = {true, benchmark::kMillisecond};
return iree_ok_status();
} else if (unit_string == kMicrosecondsUnitString) {
*unit = {true, benchmark::kMicrosecond};
return iree_ok_status();
} else if (unit_string == kNanosecondsUnitString) {
*unit = {true, benchmark::kNanosecond};
return iree_ok_status();
}
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"unsupported time unit");
}
static void print_time_unit(iree_string_view_t flag_name, void* storage,
FILE* file) {
auto* unit = (std::pair<bool, benchmark::TimeUnit>*)storage;
if (!unit->first) {
return;
}
std::string unit_string;
switch (unit->second) {
case benchmark::kMillisecond:
unit_string = kMillisecondsUnitString;
break;
case benchmark::kMicrosecond:
unit_string = kMicrosecondsUnitString;
break;
case benchmark::kNanosecond:
unit_string = kNanosecondsUnitString;
break;
default:
assert(false && "Unexpected time unit.");
}
fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data,
unit_string.c_str());
}
// Time unit to be printed. If the first field is false, each benchmark
// registration falls back to its default time unit.
static std::pair<bool, benchmark::TimeUnit> FLAG_time_unit = {
false, benchmark::kNanosecond};
IREE_FLAG_CALLBACK(
parse_time_unit, print_time_unit, &FLAG_time_unit, time_unit,
"The time unit to be printed in the results. Can be 'ms', 'us', or 'ns'.");

namespace iree {
namespace {
static void BenchmarkGenericFunction(const std::string& benchmark_name,
int batch_size, iree_vm_context_t* context,
iree_vm_function_t function,
iree_vm_list_t* inputs,
benchmark::State& state) {
IREE_TRACE_SCOPE_DYNAMIC(benchmark_name.c_str());
IREE_TRACE_FRAME_MARK();
vm::ref<iree_vm_list_t> outputs;
IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 16,
iree_allocator_system(), &outputs));
// Benchmarking loop.
while (state.KeepRunningBatch(batch_size)) {
IREE_TRACE_SCOPE0("BenchmarkIteration");
IREE_TRACE_FRAME_MARK_NAMED("Iteration");
IREE_CHECK_OK(iree_vm_invoke(
context, function, IREE_VM_INVOCATION_FLAG_NONE, /*policy=*/nullptr,
inputs, outputs.get(), iree_allocator_system()));
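    // Reset the output list to length zero, releasing any retained results so
    // the next iteration starts from an empty list.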
IREE_CHECK_OK(iree_vm_list_resize(outputs.get(), 0));
}
state.SetItemsProcessed(state.iterations());
}
void RegisterGenericBenchmark(const std::string& function_name,
iree_vm_context_t* context,
iree_vm_function_t function,
iree_vm_list_t* inputs) {
auto benchmark_name = "BM_" + function_name;
int batch_size = FLAG_batch_size;
benchmark::RegisterBenchmark(benchmark_name.c_str(),
[benchmark_name, batch_size, context, function,
inputs](benchmark::State& state) -> void {
BenchmarkGenericFunction(
benchmark_name, batch_size, context,
function, inputs, state);
})
// By default only the main thread is included in CPU time. Include all
// the threads instead.
->MeasureProcessCPUTime()
// To make single and multi-threaded benchmarks more comparable, use the
// wall time to determine how many iterations to run. See
      // https://github.com/google/benchmark#cpu-timers.
->UseRealTime()
->Unit(FLAG_time_unit.first ? FLAG_time_unit.second
: benchmark::kMillisecond);
}
static void BenchmarkDispatchFunction(const std::string& benchmark_name,
iree_vm_context_t* context,
iree_vm_function_t function,
benchmark::State& state) {
IREE_TRACE_SCOPE_DYNAMIC(benchmark_name.c_str());
IREE_TRACE_FRAME_MARK();
vm::ref<iree_vm_list_t> inputs;
IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 16,
iree_allocator_system(), &inputs));
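  // Compiled dispatch benchmark functions take a single i32 argument: the
  // number of times to repeat the dispatch (see --batch_size above).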
iree_vm_value_t batch_size = iree_vm_value_make_i32(FLAG_batch_size);
IREE_CHECK_OK(iree_vm_list_push_value(inputs.get(), &batch_size));
vm::ref<iree_vm_list_t> outputs;
IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 16,
iree_allocator_system(), &outputs));
// Benchmarking loop.
while (state.KeepRunningBatch(FLAG_batch_size)) {
IREE_TRACE_SCOPE0("BenchmarkIteration");
IREE_TRACE_FRAME_MARK_NAMED("Iteration");
IREE_CHECK_OK(iree_vm_invoke(
context, function, IREE_VM_INVOCATION_FLAG_NONE, /*policy=*/nullptr,
inputs.get(), outputs.get(), iree_allocator_system()));
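    // Reset the output list before the next iteration (mirrors the generic
    // benchmark loop above).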
IREE_CHECK_OK(iree_vm_list_resize(outputs.get(), 0));
}
state.SetItemsProcessed(state.iterations());
}
void RegisterDispatchBenchmark(const std::string& function_name,
iree_vm_context_t* context,
iree_vm_function_t function) {
auto benchmark_name = "BM_" + function_name;
benchmark::RegisterBenchmark(
benchmark_name.c_str(),
[benchmark_name, context, function](benchmark::State& state) -> void {
BenchmarkDispatchFunction(benchmark_name, context, function, state);
})
// By default only the main thread is included in CPU time. Include all
// the threads instead.
->MeasureProcessCPUTime()
// To make single and multi-threaded benchmarks more comparable, use the
// wall time to determine how many iterations to run. See
      // https://github.com/google/benchmark#cpu-timers.
->UseRealTime()
->Unit(FLAG_time_unit.first ? FLAG_time_unit.second
: benchmark::kMicrosecond);
}

// An IREEBenchmark instance must live at least as long as the call to
// ::benchmark::RunSpecifiedBenchmarks(), since its resources are used while
// the benchmarks run.
class IREEBenchmark {
public:
IREEBenchmark() = default;
~IREEBenchmark() {
IREE_TRACE_SCOPE0("IREEBenchmark::dtor");
// Order matters. Tear down modules first to release resources.
inputs_.reset();
iree_vm_context_release(context_);
iree_vm_module_release(main_module_);
iree_vm_instance_release(instance_);
// Tear down device last in order to get accurate statistics.
if (device_allocator_ && FLAG_print_statistics) {
IREE_IGNORE_ERROR(
iree_hal_allocator_statistics_fprint(stderr, device_allocator_));
}
iree_hal_allocator_release(device_allocator_);
iree_hal_device_release(device_);
  }
iree_status_t Register() {
IREE_TRACE_SCOPE0("IREEBenchmark::Register");
if (!instance_ || !device_allocator_ || !context_ || !main_module_) {
IREE_RETURN_IF_ERROR(Init());
}
auto function_name = std::string(FLAG_entry_function);
if (!function_name.empty()) {
IREE_RETURN_IF_ERROR(RegisterSpecificFunction(function_name));
} else {
IREE_RETURN_IF_ERROR(RegisterAllExportedFunctions());
}
return iree_ok_status();
}
private:
iree_status_t Init() {
IREE_TRACE_SCOPE0("IREEBenchmark::Init");
IREE_TRACE_FRAME_MARK_BEGIN_NAMED("init");
iree_allocator_t host_allocator = iree_allocator_system();
IREE_RETURN_IF_ERROR(
iree_tooling_create_instance(host_allocator, &instance_));
IREE_RETURN_IF_ERROR(iree_tooling_load_module_from_flags(
instance_, host_allocator, &main_module_));
IREE_RETURN_IF_ERROR(iree_tooling_create_context_from_flags(
instance_, /*user_module_count=*/1, /*user_modules=*/&main_module_,
/*default_device_uri=*/iree_string_view_empty(), host_allocator,
&context_, &device_, &device_allocator_));
IREE_TRACE_FRAME_MARK_END_NAMED("init");
return iree_ok_status();
}
iree_status_t RegisterSpecificFunction(const std::string& function_name) {
IREE_TRACE_SCOPE0("IREEBenchmark::RegisterSpecificFunction");
iree_vm_function_t function;
IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_name(
main_module_, IREE_VM_FUNCTION_LINKAGE_EXPORT,
iree_string_view_t{function_name.data(), function_name.size()},
&function));
IREE_CHECK_OK(ParseToVariantList(
device_allocator_,
iree::span<const std::string>{FLAG_function_inputs.data(),
FLAG_function_inputs.size()},
iree_vm_instance_allocator(instance_), &inputs_));
RegisterGenericBenchmark(function_name, context_, function, inputs_.get());
return iree_ok_status();
}
iree_status_t RegisterAllExportedFunctions() {
IREE_TRACE_SCOPE0("IREEBenchmark::RegisterAllExportedFunctions");
iree_vm_module_signature_t signature =
iree_vm_module_signature(main_module_);
for (iree_host_size_t i = 0; i < signature.export_function_count; ++i) {
iree_vm_function_t function;
IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_ordinal(
main_module_, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function));
iree_string_view_t function_name = iree_vm_function_name(&function);
      // We run anything tagged with the 'iree.benchmark' attribute.
      // If the attribute is not present we'll run anything that looks
      // runnable.
iree_string_view_t benchmark_type = iree_vm_function_lookup_attr_by_name(
&function, IREE_SV("iree.benchmark"));
if (iree_string_view_equal(benchmark_type, IREE_SV("dispatch"))) {
iree::RegisterDispatchBenchmark(
std::string(function_name.data, function_name.size), context_,
function);
} else if (iree_string_view_equal(benchmark_type, IREE_SV("entry"))) {
iree::RegisterGenericBenchmark(
std::string(function_name.data, function_name.size), context_,
function,
/*inputs=*/nullptr);
} else {
// Pick up generic () -> () functions.
if (iree_string_view_starts_with(function_name,
iree_make_cstring_view("__")) ||
iree_string_view_find_char(function_name, '$', 0) !=
IREE_STRING_VIEW_NPOS) {
// Skip internal or special functions.
continue;
}
iree_vm_function_signature_t signature =
iree_vm_function_signature(&function);
iree_host_size_t argument_count = 0;
iree_host_size_t result_count = 0;
IREE_RETURN_IF_ERROR(iree_vm_function_call_count_arguments_and_results(
&signature, &argument_count, &result_count));
if (argument_count) {
// Only functions with no inputs are run (because we can't pass
// anything).
continue;
}
iree::RegisterGenericBenchmark(
std::string(function_name.data, function_name.size), context_,
function,
/*inputs=*/nullptr);
}
}
return iree_ok_status();
}
iree_vm_instance_t* instance_ = nullptr;
iree_vm_context_t* context_ = nullptr;
iree_hal_device_t* device_ = nullptr;
iree_hal_allocator_t* device_allocator_ = nullptr;
iree_vm_module_t* main_module_ = nullptr;
iree::vm::ref<iree_vm_list_t> inputs_;
};
} // namespace
} // namespace iree

int main(int argc, char** argv) {
IREE_TRACE_SCOPE0("main");
// Pass through flags to benchmark (allowing --help to fall through).
iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK |
IREE_FLAGS_PARSE_MODE_CONTINUE_AFTER_HELP,
&argc, &argv);
::benchmark::Initialize(&argc, argv);
iree::IREEBenchmark iree_benchmark;
iree_status_t status = iree_benchmark.Register();
if (!iree_status_is_ok(status)) {
int ret = static_cast<int>(iree_status_code(status));
printf("%s\n", iree::Status(std::move(status)).ToString().c_str());
return ret;
}
::benchmark::RunSpecifiedBenchmarks();
return 0;
}