Merge "Add Support for Profiling TFLM Models in Benchmarks"
diff --git a/benchmarks/BUILD b/benchmarks/BUILD
index 5a14073..7e1f4c0 100644
--- a/benchmarks/BUILD
+++ b/benchmarks/BUILD
@@ -20,6 +20,12 @@
visibility = ["//visibility:public"],
)
+cc_library(
+    name = "cycle_count",
+    hdrs = ["cycle_count.h"],  # Header-only: defines the inline mcycle_read().
+    visibility = ["//visibility:public"],
+)
+
exports_files(
srcs = glob(["*.c", "*.cc", "*.h"]),
)
diff --git a/benchmarks/benchmark_kelvin.cc b/benchmarks/benchmark_kelvin.cc
index 766d3d6..6a84038 100644
--- a/benchmarks/benchmark_kelvin.cc
+++ b/benchmarks/benchmark_kelvin.cc
@@ -19,11 +19,16 @@
#include "crt/kelvin.h"
#include "crt/log.h"
#include "benchmarks/benchmark.h"
+#include "benchmarks/cycle_count.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/micro/micro_log.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
#include "tensorflow/lite/schema/schema_generated.h"
+#if (PROFILE == 1)
+#include "tensorflow/lite/micro/micro_profiler.h"
+#endif
+
#define STRINGIZE(x) #x
#define STR(x) STRINGIZE(x)
@@ -44,21 +49,6 @@
.cycles = 0,
};
-inline uint64_t mcycle_read(void) {
- uint32_t cycle_low = 0;
- uint32_t cycle_high = 0;
- uint32_t cycle_high_2 = 0;
- asm volatile(
- "1:"
- " csrr %0, mcycleh;" // Read `mcycleh`.
- " csrr %1, mcycle;" // Read `mcycle`.
- " csrr %2, mcycleh;" // Read `mcycleh` again.
- " bne %0, %2, 1b;"
- : "=r"(cycle_high), "=r"(cycle_low), "=r"(cycle_high_2)
- :);
- return static_cast<uint64_t>(cycle_high) << 32 | cycle_low;
-}
-
// This includes all ops currently used in the Kelvin model suite. More can be added.
constexpr int kAllOpsNum = 22;
std::unique_ptr<tflite::MicroMutableOpResolver<kAllOpsNum>> GetAllOpsResolver() {
@@ -110,8 +100,17 @@
tflite::MicroAllocator::Create(variable_arena, 1024);
tflite::MicroResourceVariables *resource_variables =
tflite::MicroResourceVariables::Create(variable_allocator, 20);
+#if (PROFILE == 1)
+ tflite::MicroProfiler profiler;
+ std::unique_ptr<tflite::MicroInterpreter> interpreter = std::make_unique<tflite::MicroInterpreter>(
+ model, *resolver.get(), g_tensor_arena, kTensorArenaSize, resource_variables, &profiler);
+ // For a profiled model, just run a single iteration
+ const int iterations = 1;
+#else
std::unique_ptr<tflite::MicroInterpreter> interpreter = std::make_unique<tflite::MicroInterpreter>(
model, *resolver.get(), g_tensor_arena, kTensorArenaSize, resource_variables);
+ const int iterations = ITERATIONS;
+#endif
// Run inference outside of benchmark to intialize model.
if (interpreter->AllocateTensors() != kTfLiteOk) {
@@ -130,17 +129,22 @@
uint64_t begin = mcycle_read();
// TODO(michaelbrooks): Possibly set/verify test data?
- for (int i = 0; i < ITERATIONS; ++i) {
+ for (int i = 0; i < iterations; ++i) {
interpreter->Invoke();
}
uint64_t end = mcycle_read();
uint64_t num_cycles = end - begin;
+
+#if (PROFILE == 1)
+ profiler.LogCsv();
+#endif
+
// Stores benchmark information in output header for other cores to access.
- output_header.iterations = ITERATIONS;
+ output_header.iterations = iterations;
output_header.cycles = num_cycles;
// If running on a simulator, print cycle information.
- uint64_t average_cycles = num_cycles / ITERATIONS;
+ uint64_t average_cycles = num_cycles / iterations;
LOG_INFO("Iterations: %ld", output_header.iterations);
_print64("Total Cycles: ", output_header.cycles);
_print64("Average Cycles per Iteration: ", average_cycles);
diff --git a/benchmarks/benchmarks.bzl b/benchmarks/benchmarks.bzl
index 3a269e1..318269d 100644
--- a/benchmarks/benchmarks.bzl
+++ b/benchmarks/benchmarks.bzl
@@ -21,6 +21,7 @@
name,
model,
iterations,
+ profile = False,
hw_test_size = "medium",
hw_test_tags = [],
iss_test_size = "small",
@@ -38,10 +39,15 @@
name = "{}".format(name),
srcs = ["@kelvin_sw//benchmarks:benchmark_kelvin.cc"],
hdrs = ["@kelvin_sw//benchmarks:benchmark.h", "{}_model.h".format(name)],
- copts = ["-DITERATIONS={}".format(iterations), "-DBENCHMARK_NAME={}".format(name)],
+ copts = [
+ "-DITERATIONS={}".format(iterations),
+ "-DBENCHMARK_NAME={}".format(name),
+ "-DPROFILE={}".format(1 if profile else 0),
+ ],
deps = [
"@kelvin_sw//crt",
"@kelvin_sw//benchmarks:benchmark_header",
+ "@kelvin_sw//benchmarks:cycle_count",
"@tflite-micro//tensorflow/lite/micro:micro_framework",
"@tflite-micro//tensorflow/lite/micro:system_setup",
],
@@ -55,12 +61,14 @@
name,
model,
iterations,
+ profile = False,
**kwargs):
_kelvin_benchmark_device(
name = name,
model = model,
device_type = "fpga_nexus",
iterations = iterations,
+ profile = profile,
**kwargs,
)
@@ -68,6 +76,7 @@
name,
model,
iterations,
+ profile = False,
**kwargs):
_kelvin_benchmark_device(
@@ -75,6 +84,7 @@
model = model,
device_type = "asic",
iterations = iterations,
+ profile = profile,
**kwargs,
)
@@ -82,12 +92,14 @@
name,
model,
iterations,
+ profile = False,
**kwargs):
kelvin_benchmark_asic(
name = "{}_asic".format(name),
model = model,
iterations = iterations,
+ profile = profile,
**kwargs,
)
@@ -95,6 +107,7 @@
name = "{}_fpga".format(name),
model = model,
iterations = iterations,
+ profile = profile,
**kwargs,
)
@@ -113,6 +126,7 @@
model,
device_type,
iterations,
+ profile = False,
**kwargs):
bin_to_c_file(
@@ -138,6 +152,7 @@
"@matcha//sw/device/lib/dif:i2s",
"@matcha//sw/device/lib/dif:tlul_mailbox",
"@kelvin_sw//benchmarks:benchmark_header",
+ "@kelvin_sw//benchmarks:cycle_count",
"@lowrisc_opentitan//sw/device/lib/dif:rv_plic",
],
)
@@ -165,6 +180,7 @@
"@matcha//sw/device/lib/dif:smc_ctrl",
"@matcha//sw/device/lib/dif:tlul_mailbox",
"@kelvin_sw//benchmarks:benchmark_header",
+ "@kelvin_sw//benchmarks:cycle_count",
"@lowrisc_opentitan//sw/device/lib/dif:rv_plic",
],
)
@@ -174,13 +190,18 @@
srcs = [
"@kelvin_sw//benchmarks:benchmark_kelvin.cc",
],
- copts = ["-DITERATIONS={}".format(iterations), "-DBENCHMARK_NAME={}".format(name)],
+ copts = [
+ "-DITERATIONS={}".format(iterations),
+ "-DBENCHMARK_NAME={}".format(name),
+ "-DPROFILE={}".format(1 if profile else 0),
+ ],
hdrs = [
"@kelvin_sw//benchmarks:benchmark.h",
"{}_model.h".format(name),
],
deps = [
"@kelvin_sw//benchmarks:benchmark_header",
+ "@kelvin_sw//benchmarks:cycle_count",
"@tflite-micro//tensorflow/lite/micro:micro_framework",
"@tflite-micro//tensorflow/lite/micro:system_setup",
],
diff --git a/benchmarks/cycle_count.h b/benchmarks/cycle_count.h
new file mode 100644
index 0000000..949e2bd
--- /dev/null
+++ b/benchmarks/cycle_count.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BENCHMARKS_CYCLE_COUNT_H_
+#define BENCHMARKS_CYCLE_COUNT_H_
+
+#include <stdint.h>
+
+// Returns the 64-bit cycle count assembled from the `mcycleh` (upper 32 bits)
+// and `mcycle` (lower 32 bits) CSRs.
+//
+// `mcycleh` is sampled both before and after `mcycle`; if the two samples
+// differ, the whole sequence is retried so the halves are mutually consistent.
+inline uint64_t mcycle_read(void) {
+  uint32_t cycle_low = 0;
+  uint32_t cycle_high = 0;
+  uint32_t cycle_high_2 = 0;
+  asm volatile(
+      "1:"
+      " csrr %0, mcycleh;"  // Read `mcycleh`.
+      " csrr %1, mcycle;"   // Read `mcycle`.
+      " csrr %2, mcycleh;"  // Read `mcycleh` again.
+      " bne %0, %2, 1b;"    // Retry from label 1 if the two reads differ.
+      : "=r"(cycle_high), "=r"(cycle_low), "=r"(cycle_high_2)
+      :);
+  return static_cast<uint64_t>(cycle_high) << 32 | cycle_low;
+}
+
+#endif  // BENCHMARKS_CYCLE_COUNT_H_