/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <memory>
#include "crt/kelvin.h"
#include "crt/log.h"
#include "benchmarks/benchmark.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/micro/micro_log.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
#include "tensorflow/lite/schema/schema_generated.h"
#define STRINGIZE(x) #x
#define STR(x) STRINGIZE(x)
// To include the model data generated by Bazel, include the header using the
// name passed in as a macro.
#define MODEL_HEADER_DIRECTORY benchmarks/
#define MODEL_HEADER_TYPE _model.h
#define MODEL_HEADER STR(MODEL_HEADER_DIRECTORY BENCHMARK_NAME MODEL_HEADER_TYPE)
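// For example, with a hypothetical BENCHMARK_NAME of mobilenet, MODEL_HEADER
// resolves to "benchmarks/mobilenet_model.h", and the #include below pulls in
// that generated header.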
#include MODEL_HEADER
namespace {
constexpr int kTensorArenaSize = 1024 * 1024;
uint8_t g_tensor_arena[kTensorArenaSize] __attribute__((aligned(64)));
__attribute__((section(".model_output_header"))) BenchmarkOutputHeader output_header = {
.return_code = 0, // Set by kelvin_start based on return value in main.
.iterations = 0,
.cycles = 0,
};
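// Reads the full 64-bit cycle counter on RV32. The `mcycle` and `mcycleh`
// CSRs can only be read one at a time, so the loop below samples the high
// half, then the low half, then the high half again, retrying if the high
// half changed in between. This is the standard RISC-V sequence for avoiding
// a torn read when `mcycle` overflows into `mcycleh` mid-read.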
inline uint64_t mcycle_read(void) {
uint32_t cycle_low = 0;
uint32_t cycle_high = 0;
uint32_t cycle_high_2 = 0;
asm volatile(
"1:"
" csrr %0, mcycleh;" // Read `mcycleh`.
" csrr %1, mcycle;" // Read `mcycle`.
" csrr %2, mcycleh;" // Read `mcycleh` again.
" bne %0, %2, 1b;"
: "=r"(cycle_high), "=r"(cycle_low), "=r"(cycle_high_2)
:);
return static_cast<uint64_t>(cycle_high) << 32 | cycle_low;
}
// This includes all ops currently used in the Kelvin model suite. More can be added.
constexpr int kAllOpsNum = 22;
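// Note: the template parameter of MicroMutableOpResolver fixes its
// registration capacity, so kAllOpsNum must be at least the number of Add*()
// calls below; each Add*() call fails once the capacity is exhausted.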
std::unique_ptr<tflite::MicroMutableOpResolver<kAllOpsNum>> GetAllOpsResolver() {
tflite::MicroMutableOpResolver<kAllOpsNum> resolver;
resolver.AddAveragePool2D();
resolver.AddMaxPool2D();
resolver.AddConv2D();
resolver.AddConcatenation();
resolver.AddDepthwiseConv2D();
resolver.AddDequantize();
resolver.AddQuantize();
resolver.AddReshape();
resolver.AddSoftmax();
resolver.AddCallOnce();
resolver.AddVarHandle();
resolver.AddReadVariable();
resolver.AddAssignVariable();
resolver.AddLogistic();
resolver.AddStridedSlice();
resolver.AddFullyConnected();
resolver.AddPad();
resolver.AddLeakyRelu();
resolver.AddSplit();
resolver.AddTransposeConv();
resolver.AddAdd();
resolver.AddSub();
return std::make_unique<tflite::MicroMutableOpResolver<kAllOpsNum>>(resolver);
}
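// Prints a 64-bit value as two 32-bit halves, presumably because the printf
// implementation backing LOG_INFO on this 32-bit target may not support
// 64-bit format specifiers such as %llx.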
void _print64(const char* header, uint64_t number) {
uint32_t number_low = number & 0xFFFFFFFF;
uint32_t number_hi = number >> 32;
LOG_INFO("%s: 0x%08lx%08lx", header, number_hi, number_low);
}
constexpr int kSuccess = 0;
constexpr int kAllocationFailed = -1;
constexpr int kInvokeFailed = -2;
} // namespace
int main(int argc, char **argv) {
std::unique_ptr<tflite::MicroMutableOpResolver<kAllOpsNum>> resolver = GetAllOpsResolver();
const auto* model = tflite::GetModel(g_benchmark_model_data);
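// The model's resource-variable ops (VAR_HANDLE / READ_VARIABLE /
// ASSIGN_VARIABLE) are backed by a small dedicated arena and allocator,
// separate from the main tensor arena.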
uint8_t variable_arena[2048];
tflite::MicroAllocator *variable_allocator =
tflite::MicroAllocator::Create(variable_arena, sizeof(variable_arena));
tflite::MicroResourceVariables *resource_variables =
tflite::MicroResourceVariables::Create(variable_allocator, 20);
std::unique_ptr<tflite::MicroInterpreter> interpreter = std::make_unique<tflite::MicroInterpreter>(
model, *resolver, g_tensor_arena, kTensorArenaSize, resource_variables);
// Run inference once outside the benchmark loop to initialize the model.
if (interpreter->AllocateTensors() != kTfLiteOk) {
return kAllocationFailed;
}
TfLiteTensor* input = interpreter->input(0);
// Zero the input tensor for the first inference; subsequent runs consume the
// previous output tensor data (since the arena memory is shared).
memset(tflite::GetTensorData<uint8_t>(input), 0, input->bytes);
if (interpreter->Invoke() != kTfLiteOk) {
return kInvokeFailed;
}
LOG_INFO("========== Begin Benchmark (%s) ==========", STR(BENCHMARK_NAME));
uint64_t begin = mcycle_read();
// TODO(michaelbrooks): Possibly set/verify test data?
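// The Invoke() status is not checked inside the timed loop; the warm-up
// Invoke() above already verified the model runs, and skipping the check
// keeps the measured region minimal.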
for (int i = 0; i < ITERATIONS; ++i) {
interpreter->Invoke();
}
uint64_t end = mcycle_read();
uint64_t num_cycles = end - begin;
// Stores benchmark information in output header for other cores to access.
output_header.iterations = ITERATIONS;
output_header.cycles = num_cycles;
// Print cycle information (mainly useful when running on a simulator).
uint64_t average_cycles = num_cycles / ITERATIONS;
LOG_INFO("Iterations: %ld", output_header.iterations);
_print64("Total Cycles: ", output_header.cycles);
_print64("Average Cycles per Iteration: ", average_cycles);
LOG_INFO("========== End Benchmark ==========");
return kSuccess;
}