| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| //===----------------------------------------------------------------------===// |
| // iree-benchmark-module: benchmarks public functions in an IREE VM module |
| //===----------------------------------------------------------------------===// |
| // |
| // This runs exported functions using flags specified on the command line. |
| // Each function is measured independently and the numbers reported will be for |
| // the full end-to-end CPU and wall times. |
| // |
| // From an ML perspective this is an integration benchmark for measuring total |
| // user-visible latency of model entry points. It is *not* a microbenchmarking |
| // tool for individual device-side dispatch functions (aka ops aka kernels). |
| // If you are interested in the precise time of a particular dispatch then |
| // tracy, executable_library_benchmark, and platform/vendor tooling (nsight, |
| // perf, etc) should be used instead; attaching those tools to this one is |
| // often useful for getting a large sample set. |
| // |
| // By default all functions taking no inputs will be benchmarked. If a function |
| // takes inputs then they must be specified using --function_input= flags. |
| // Depending on the input program, the -iree-flow-export-benchmark-funcs flag |
| // can be passed to the compiler to attempt to wrap each function with dummy |
| // inputs; however, this will fail for programs with dynamically shaped inputs. |
| // The way to avoid needing input flags entirely is to provide the input |
| // program in a form that takes no inputs from the start. |
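| // |
| // For example, a hypothetical invocation benchmarking a single entry point |
| // with one input (the module path, function name, and input value here are |
| // placeholders, not defaults): |
| //   iree-benchmark-module --module_file=/path/to/module.vmfb \ |
| //       --entry_function=main --function_input="2x2xi32=1 2 3 4" |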
| // |
| // It's important to remember that IREE is not a BLAS library and is meant to |
| // run entire programs. It's not generally appropriate to benchmark a model with |
| // a single matmul, for example, as that's just treating IREE as a BLAS library. |
| // Note also that user-level ops in a frontend environment don't map to the |
| // dispatches that IREE executes: IREE is a compiler like any other and does not |
| // guarantee a source line of code translates into an atomically divisible and |
| // independently measurable execution command. In other words, don't expect to |
| // be able to benchmark the cost of a broadcasting elementwise tf.add op within |
| // a model: by the time the program is running, that op has been fused into a |
| // single machine instruction operating as part of some other ops. |
| // |
| // For coarse dispatch testing and triaging it can still be useful to remove |
| // some of the overheads introduced by whole-program execution; the compiler |
| // flag --iree-hal-benchmark-dispatch-repeat-count=N is provided to enable |
| // batching. Whatever N is chosen must then be passed to this tool via |
| // --batch_size=N so that the benchmark reporting properly reflects the |
| // batching. As an example, --iree-hal-benchmark-dispatch-repeat-count=32 plus |
| // --batch_size=32 will reduce the overheads by 32x. Think of this as a way to |
| // control the p value in Amdahl's law representing the amount of time spent in |
| // dispatches relative to the rest of the program. This isn't representative of |
| // how the full program will run, though, and YMMV. Always verify timings with |
| // an appropriate device-specific tool before trusting the more generic and |
| // higher-level numbers from this tool. |
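| // |
| // For example (the compiler invocation is shown schematically, as the exact |
| // command depends on your build and target): |
| //   <compiler> ... --iree-hal-benchmark-dispatch-repeat-count=32 ... |
| //   iree-benchmark-module --module_file=/path/to/module.vmfb --batch_size=32 |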
| |
| #include <array> |
| #include <cassert> |
| #include <cstdio> |
| #include <iostream> |
| #include <iterator> |
| #include <string> |
| #include <type_traits> |
| #include <utility> |
| #include <vector> |
| |
| #include "benchmark/benchmark.h" |
| #include "iree/base/api.h" |
| #include "iree/base/internal/file_io.h" |
| #include "iree/base/internal/flags.h" |
| #include "iree/base/status_cc.h" |
| #include "iree/base/tracing.h" |
| #include "iree/hal/api.h" |
| #include "iree/modules/hal/module.h" |
| #include "iree/tooling/device_util.h" |
| #include "iree/tooling/vm_util.h" |
| #include "iree/vm/api.h" |
| #include "iree/vm/bytecode_module.h" |
| #include "iree/vm/ref_cc.h" |
| |
| constexpr char kNanosecondsUnitString[] = "ns"; |
| constexpr char kMicrosecondsUnitString[] = "us"; |
| constexpr char kMillisecondsUnitString[] = "ms"; |
| |
| IREE_FLAG(string, module_file, "-", |
| "File containing the module with the entry function to benchmark. " |
| "Defaults to stdin ('-')."); |
| |
| // TODO(hanchung): Extract the batch size using |
| // iree_vm_function_lookup_attr_by_name. |
| IREE_FLAG( |
| int32_t, batch_size, 1, |
| "The batch size, which is expected to match " |
| "iree-hal-benchmark-dispatch-repeat-count used when translating the " |
| "module."); |
| |
| IREE_FLAG(string, entry_function, "", |
| "Name of a function contained in the module specified by module_file " |
| "to run. If this is not set, all the exported functions will be " |
| "benchmarked and they are expected to not have input arguments."); |
| |
| IREE_FLAG(bool, print_statistics, false, |
| "Prints runtime statistics to stderr on exit."); |
| |
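| // Appends one --function_input= flag value to the list of raw input strings; |
| // these are parsed into VM values later by ParseToVariantList. |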
| static iree_status_t parse_function_input(iree_string_view_t flag_name, |
| void* storage, |
| iree_string_view_t value) { |
| auto* list = (std::vector<std::string>*)storage; |
| list->push_back(std::string(value.data, value.size)); |
| return iree_ok_status(); |
| } |
| static void print_function_input(iree_string_view_t flag_name, void* storage, |
| FILE* file) { |
| auto* list = (std::vector<std::string>*)storage; |
| if (list->empty()) { |
| fprintf(file, "# --%.*s=\n", (int)flag_name.size, flag_name.data); |
| } else { |
| for (size_t i = 0; i < list->size(); ++i) { |
| fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data, |
| list->at(i).c_str()); |
| } |
| } |
| } |
| static std::vector<std::string> FLAG_function_inputs; |
| IREE_FLAG_CALLBACK( |
| parse_function_input, print_function_input, &FLAG_function_inputs, |
| function_input, |
| "An input value or buffer of the format:\n" |
| " [shape]xtype=[value]\n" |
| " 2x2xi32=1 2 3 4\n" |
| "Optionally, brackets may be used to separate the element values:\n" |
| " 2x2xi32=[[1 2][3 4]]\n" |
| "Raw binary files can be read to provide buffer contents:\n" |
| " 2x2xi32=@some/file.bin\n" |
| "numpy npy files (from numpy.save) can be read to provide 1+ values:\n" |
| " @some.npy\n" |
| "Each occurrence of the flag indicates an input in the order they were\n" |
| "specified on the command line."); |
| |
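| // Parses a --time_unit= flag value ('ms', 'us', or 'ns'), recording both the |
| // chosen unit and the fact that a unit was explicitly set. |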
| static iree_status_t parse_time_unit(iree_string_view_t flag_name, |
| void* storage, iree_string_view_t value) { |
| auto* unit = (std::pair<bool, benchmark::TimeUnit>*)storage; |
| auto unit_string = std::string(value.data, value.size); |
| if (unit_string == kMillisecondsUnitString) { |
| *unit = {true, benchmark::kMillisecond}; |
| return iree_ok_status(); |
| } else if (unit_string == kMicrosecondsUnitString) { |
| *unit = {true, benchmark::kMicrosecond}; |
| return iree_ok_status(); |
| } else if (unit_string == kNanosecondsUnitString) { |
| *unit = {true, benchmark::kNanosecond}; |
| return iree_ok_status(); |
| } |
| return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "unsupported time unit"); |
| } |
| static void print_time_unit(iree_string_view_t flag_name, void* storage, |
| FILE* file) { |
| auto* unit = (std::pair<bool, benchmark::TimeUnit>*)storage; |
| if (!unit->first) { |
| return; |
| } |
| std::string unit_string; |
| switch (unit->second) { |
| case benchmark::kMillisecond: |
| unit_string = kMillisecondsUnitString; |
| break; |
| case benchmark::kMicrosecond: |
| unit_string = kMicrosecondsUnitString; |
| break; |
| case benchmark::kNanosecond: |
| unit_string = kNanosecondsUnitString; |
| break; |
| default: |
| assert(false && "Unexpected time unit."); |
| } |
| fprintf(file, "--%.*s=\"%s\"\n", (int)flag_name.size, flag_name.data, |
| unit_string.c_str()); |
| } |
| // Time unit used for printed results. If the first field is false, each |
| // benchmark type falls back to its default time unit. |
| static std::pair<bool, benchmark::TimeUnit> FLAG_time_unit = { |
| false, benchmark::kNanosecond}; |
| IREE_FLAG_CALLBACK( |
| parse_time_unit, print_time_unit, &FLAG_time_unit, time_unit, |
| "The time unit to be printed in the results. Can be 'ms', 'us', or 'ns'."); |
| |
| namespace iree { |
| namespace { |
| |
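| // Runs the benchmark loop for an exported |function|, invoking it with the |
| // same |inputs| on every iteration and discarding the outputs. |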
| static void BenchmarkGenericFunction(const std::string& benchmark_name, |
| int batch_size, iree_vm_context_t* context, |
| iree_vm_function_t function, |
| iree_vm_list_t* inputs, |
| iree_hal_device_t* device, |
| benchmark::State& state) { |
| IREE_TRACE_SCOPE_DYNAMIC(benchmark_name.c_str()); |
| IREE_TRACE_FRAME_MARK(); |
| |
| vm::ref<iree_vm_list_t> outputs; |
| IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 16, |
| iree_allocator_system(), &outputs)); |
| |
| // Benchmarking loop. |
| while (state.KeepRunningBatch(batch_size)) { |
| IREE_TRACE_SCOPE0("BenchmarkIteration"); |
| IREE_TRACE_FRAME_MARK_NAMED("Iteration"); |
| IREE_CHECK_OK(iree_vm_invoke( |
| context, function, IREE_VM_INVOCATION_FLAG_NONE, /*policy=*/nullptr, |
| inputs, outputs.get(), iree_allocator_system())); |
| IREE_CHECK_OK(iree_vm_list_resize(outputs.get(), 0)); |
| } |
| |
| // Force a full flush and get the device back to an idle state. |
| IREE_CHECK_OK(iree_hal_device_wait_idle(device, iree_infinite_timeout())); |
| } |
| |
| void RegisterGenericBenchmark(const std::string& function_name, |
| iree_vm_context_t* context, |
| iree_vm_function_t function, |
| iree_vm_list_t* inputs, |
| iree_hal_device_t* device) { |
| auto benchmark_name = "BM_" + function_name; |
| int batch_size = FLAG_batch_size; |
| benchmark::RegisterBenchmark( |
| benchmark_name.c_str(), |
| [benchmark_name, batch_size, context, function, inputs, |
| device](benchmark::State& state) -> void { |
| BenchmarkGenericFunction(benchmark_name, batch_size, context, function, |
| inputs, device, state); |
| }) |
| // By default only the main thread is included in CPU time. Include all |
| // the threads instead. |
| ->MeasureProcessCPUTime() |
| // To make single and multi-threaded benchmarks more comparable, use the |
| // wall time to determine how many iterations to run. See |
| // https://github.com/google/benchmark#cpu-timers. |
| ->UseRealTime() |
| ->Unit(FLAG_time_unit.first ? FLAG_time_unit.second |
| : benchmark::kMillisecond); |
| } |
| |
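| // Runs the benchmark loop for a compiler-generated dispatch benchmark |
| // function, which takes the batch size as its only argument. |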
| static void BenchmarkDispatchFunction(const std::string& benchmark_name, |
| iree_vm_context_t* context, |
| iree_vm_function_t function, |
| iree_hal_device_t* device, |
| benchmark::State& state) { |
| IREE_TRACE_SCOPE_DYNAMIC(benchmark_name.c_str()); |
| IREE_TRACE_FRAME_MARK(); |
| |
| vm::ref<iree_vm_list_t> inputs; |
| IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 16, |
| iree_allocator_system(), &inputs)); |
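| // Dispatch benchmark functions are expected to take a single i32 argument: |
| // the number of times the dispatch is repeated per invocation (matching |
| // --iree-hal-benchmark-dispatch-repeat-count used at compile time). |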
| iree_vm_value_t batch_size = iree_vm_value_make_i32(FLAG_batch_size); |
| IREE_CHECK_OK(iree_vm_list_push_value(inputs.get(), &batch_size)); |
| |
| vm::ref<iree_vm_list_t> outputs; |
| IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 16, |
| iree_allocator_system(), &outputs)); |
| |
| // Benchmarking loop. |
| while (state.KeepRunningBatch(FLAG_batch_size)) { |
| IREE_TRACE_SCOPE0("BenchmarkIteration"); |
| IREE_TRACE_FRAME_MARK_NAMED("Iteration"); |
| IREE_CHECK_OK(iree_vm_invoke( |
| context, function, IREE_VM_INVOCATION_FLAG_NONE, /*policy=*/nullptr, |
| inputs.get(), outputs.get(), iree_allocator_system())); |
| IREE_CHECK_OK(iree_vm_list_resize(outputs.get(), 0)); |
| } |
| |
| // Force a full flush and get the device back to an idle state. |
| IREE_CHECK_OK(iree_hal_device_wait_idle(device, iree_infinite_timeout())); |
| } |
| |
| void RegisterDispatchBenchmark(const std::string& function_name, |
| iree_vm_context_t* context, |
| iree_vm_function_t function, |
| iree_hal_device_t* device) { |
| auto benchmark_name = "BM_" + function_name; |
| benchmark::RegisterBenchmark(benchmark_name.c_str(), |
| [benchmark_name, context, function, |
| device](benchmark::State& state) -> void { |
| BenchmarkDispatchFunction(benchmark_name, |
| context, function, |
| device, state); |
| }) |
| // By default only the main thread is included in CPU time. Include all |
| // the threads instead. |
| ->MeasureProcessCPUTime() |
| // To make single and multi-threaded benchmarks more comparable, use the |
| // wall time to determine how many iterations to run. See |
| // https://github.com/google/benchmark#cpu-timers. |
| ->UseRealTime() |
| ->Unit(FLAG_time_unit.first ? FLAG_time_unit.second |
| : benchmark::kMicrosecond); |
| } |
| |
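| // Reads the module contents from the file specified by --module_file, or from |
| // stdin when the flag value is '-' (the default). |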
| iree_status_t GetModuleContentsFromFlags(iree_file_contents_t** out_contents) { |
| IREE_TRACE_SCOPE0("GetModuleContentsFromFlags"); |
| auto module_file = std::string(FLAG_module_file); |
| if (module_file == "-") { |
| std::cout << "Reading module contents from stdin...\n"; |
| return iree_stdin_read_contents(iree_allocator_system(), out_contents); |
| } else { |
| return iree_file_read_contents(module_file.c_str(), iree_allocator_system(), |
| out_contents); |
| } |
| } |
| |
| // TODO(hanchung): Consider refactoring this out for reuse in iree-run-module. |
| // This class organizes the resources required by IREE. Both the order of |
| // construction/destruction and the lifetime of the resources matter: an |
| // IREEBenchmark instance must live at least as long as |
| // ::benchmark::RunSpecifiedBenchmarks(), which uses the resources while |
| // benchmarking. |
| class IREEBenchmark { |
| public: |
| IREEBenchmark() = default; |
| |
| ~IREEBenchmark() { |
| IREE_TRACE_SCOPE0("IREEBenchmark::dtor"); |
| |
| // Order matters. |
| inputs_.reset(); |
| iree_vm_context_release(context_); |
| iree_vm_module_release(hal_module_); |
| iree_vm_module_release(input_module_); |
| if (FLAG_print_statistics) { |
| IREE_IGNORE_ERROR(iree_hal_allocator_statistics_fprint( |
| stderr, iree_hal_device_allocator(device_))); |
| } |
| iree_hal_device_release(device_); |
| iree_vm_instance_release(instance_); |
| } |
| |
| iree_status_t Register() { |
| IREE_TRACE_SCOPE0("IREEBenchmark::Register"); |
| |
| if (!instance_ || !device_ || !hal_module_ || !context_ || !input_module_) { |
| IREE_RETURN_IF_ERROR(Init()); |
| } |
| |
| auto function_name = std::string(FLAG_entry_function); |
| if (!function_name.empty()) { |
| IREE_RETURN_IF_ERROR(RegisterSpecificFunction(function_name)); |
| } else { |
| IREE_RETURN_IF_ERROR(RegisterAllExportedFunctions()); |
| } |
| return iree_ok_status(); |
| } |
| |
| private: |
| iree_status_t Init() { |
| IREE_TRACE_SCOPE0("IREEBenchmark::Init"); |
| IREE_TRACE_FRAME_MARK_BEGIN_NAMED("init"); |
| |
| iree_file_contents_t* flatbuffer_contents = NULL; |
| IREE_RETURN_IF_ERROR( |
| iree::GetModuleContentsFromFlags(&flatbuffer_contents)); |
| |
| IREE_RETURN_IF_ERROR(iree_hal_module_register_all_types()); |
| IREE_RETURN_IF_ERROR( |
| iree_vm_instance_create(iree_allocator_system(), &instance_)); |
| |
| // Create IREE's device and module. |
| IREE_RETURN_IF_ERROR(iree_hal_create_device_from_flags( |
| iree_hal_default_device_uri(), iree_allocator_system(), &device_)); |
| IREE_RETURN_IF_ERROR( |
| iree_hal_module_create(device_, IREE_HAL_MODULE_FLAG_NONE, |
| iree_allocator_system(), &hal_module_)); |
| IREE_RETURN_IF_ERROR(iree_vm_bytecode_module_create( |
| flatbuffer_contents->const_buffer, |
| iree_file_contents_deallocator(flatbuffer_contents), |
| iree_allocator_system(), &input_module_)); |
| |
| // Order matters. The input module will likely be dependent on the hal |
| // module. |
| std::array<iree_vm_module_t*, 2> modules = {hal_module_, input_module_}; |
| IREE_RETURN_IF_ERROR(iree_vm_context_create_with_modules( |
| instance_, IREE_VM_CONTEXT_FLAG_NONE, modules.size(), modules.data(), |
| iree_allocator_system(), &context_)); |
| |
| IREE_TRACE_FRAME_MARK_END_NAMED("init"); |
| return iree_ok_status(); |
| } |
| |
| iree_status_t RegisterSpecificFunction(const std::string& function_name) { |
| IREE_TRACE_SCOPE0("IREEBenchmark::RegisterSpecificFunction"); |
| |
| iree_vm_function_t function; |
| IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_name( |
| input_module_, IREE_VM_FUNCTION_LINKAGE_EXPORT, |
| iree_string_view_t{function_name.data(), function_name.size()}, |
| &function)); |
| |
| IREE_CHECK_OK(ParseToVariantList( |
| iree_hal_device_allocator(device_), |
| iree::span<const std::string>{FLAG_function_inputs.data(), |
| FLAG_function_inputs.size()}, |
| &inputs_)); |
| RegisterGenericBenchmark(function_name, context_, function, inputs_.get(), |
| device_); |
| return iree_ok_status(); |
| } |
| |
| iree_status_t RegisterAllExportedFunctions() { |
| IREE_TRACE_SCOPE0("IREEBenchmark::RegisterAllExportedFunctions"); |
| iree_vm_module_signature_t signature = |
| input_module_->signature(input_module_->self); |
| for (iree_host_size_t i = 0; i < signature.export_function_count; ++i) { |
| iree_vm_function_t function; |
| IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_ordinal( |
| input_module_, IREE_VM_FUNCTION_LINKAGE_EXPORT, i, &function)); |
| iree_string_view_t function_name = iree_vm_function_name(&function); |
| |
| // We run anything with the 'iree.benchmark' attribute. |
| // If the attribute is not present we'll run anything that looks runnable. |
| iree_string_view_t benchmark_type = iree_vm_function_lookup_attr_by_name( |
| &function, IREE_SV("iree.benchmark")); |
| if (iree_string_view_equal(benchmark_type, IREE_SV("dispatch"))) { |
| iree::RegisterDispatchBenchmark( |
| std::string(function_name.data, function_name.size), context_, |
| function, device_); |
| } else if (iree_string_view_equal(benchmark_type, IREE_SV("entry"))) { |
| iree::RegisterGenericBenchmark( |
| std::string(function_name.data, function_name.size), context_, |
| function, |
| /*inputs=*/nullptr, device_); |
| } else { |
| // Pick up generic () -> () functions. |
| if (iree_string_view_starts_with(function_name, |
| iree_make_cstring_view("__")) || |
| iree_string_view_find_char(function_name, '$', 0) != |
| IREE_STRING_VIEW_NPOS) { |
| // Skip internal or special functions. |
| continue; |
| } |
| |
| iree_vm_function_signature_t signature = |
| iree_vm_function_signature(&function); |
| iree_host_size_t argument_count = 0; |
| iree_host_size_t result_count = 0; |
| IREE_RETURN_IF_ERROR(iree_vm_function_call_count_arguments_and_results( |
| &signature, &argument_count, &result_count)); |
| if (argument_count) { |
| // Only functions with no inputs are run (because we can't pass |
| // anything). |
| continue; |
| } |
| |
| iree::RegisterGenericBenchmark( |
| std::string(function_name.data, function_name.size), context_, |
| function, |
| /*inputs=*/nullptr, device_); |
| } |
| } |
| return iree_ok_status(); |
| } |
| |
| iree_vm_instance_t* instance_ = nullptr; |
| iree_hal_device_t* device_ = nullptr; |
| iree_vm_module_t* hal_module_ = nullptr; |
| iree_vm_context_t* context_ = nullptr; |
| iree_vm_module_t* input_module_ = nullptr; |
| iree::vm::ref<iree_vm_list_t> inputs_; |
| }; |
| } // namespace |
| } // namespace iree |
| |
| int main(int argc, char** argv) { |
| IREE_TRACE_SCOPE0("main"); |
| |
| // Pass through flags to benchmark (allowing --help to fall through). |
| iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK | |
| IREE_FLAGS_PARSE_MODE_CONTINUE_AFTER_HELP, |
| &argc, &argv); |
| ::benchmark::Initialize(&argc, argv); |
| |
| iree::IREEBenchmark iree_benchmark; |
| iree_status_t status = iree_benchmark.Register(); |
| if (!iree_status_is_ok(status)) { |
| int ret = static_cast<int>(iree_status_code(status)); |
| std::cout << iree::Status(std::move(status)) << std::endl; |
| return ret; |
| } |
| ::benchmark::RunSpecifiedBenchmarks(); |
| return 0; |
| } |