Merge "Add Support for Profiling TFLM Models in Benchmarks"
diff --git a/benchmarks/BUILD b/benchmarks/BUILD
index 5a14073..7e1f4c0 100644
--- a/benchmarks/BUILD
+++ b/benchmarks/BUILD
@@ -20,6 +20,12 @@
     visibility = ["//visibility:public"],
 )
 
+cc_library(
+    name = "cycle_count",
+    hdrs = ["cycle_count.h"],
+    visibility = ["//visibility:public"],
+)
+
 exports_files(
     srcs = glob(["*.c", "*.cc", "*.h"]),
 )
diff --git a/benchmarks/benchmark_kelvin.cc b/benchmarks/benchmark_kelvin.cc
index 766d3d6..6a84038 100644
--- a/benchmarks/benchmark_kelvin.cc
+++ b/benchmarks/benchmark_kelvin.cc
@@ -19,11 +19,16 @@
 #include "crt/kelvin.h"
 #include "crt/log.h"
 #include "benchmarks/benchmark.h"
+#include "benchmarks/cycle_count.h"
 #include "tensorflow/lite/micro/micro_interpreter.h"
 #include "tensorflow/lite/micro/micro_log.h"
 #include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 
+#if (PROFILE == 1)
+#include "tensorflow/lite/micro/micro_profiler.h"
+#endif
+
 #define STRINGIZE(x) #x
 #define STR(x) STRINGIZE(x)
 
@@ -44,21 +49,6 @@
     .cycles = 0,
 };
 
-inline uint64_t mcycle_read(void) {
-  uint32_t cycle_low = 0;
-  uint32_t cycle_high = 0;
-  uint32_t cycle_high_2 = 0;
-  asm volatile(
-      "1:"
-      "  csrr %0, mcycleh;"  // Read `mcycleh`.
-      "  csrr %1, mcycle;"   // Read `mcycle`.
-      "  csrr %2, mcycleh;"  // Read `mcycleh` again.
-      "  bne  %0, %2, 1b;"
-      : "=r"(cycle_high), "=r"(cycle_low), "=r"(cycle_high_2)
-      :);
-  return static_cast<uint64_t>(cycle_high) << 32 | cycle_low;
-}
-
 // This includes all ops currently used in the Kelvin model suite. More can be added.
 constexpr int kAllOpsNum = 22;
 std::unique_ptr<tflite::MicroMutableOpResolver<kAllOpsNum>> GetAllOpsResolver() {
@@ -110,8 +100,17 @@
       tflite::MicroAllocator::Create(variable_arena, 1024);
   tflite::MicroResourceVariables *resource_variables =
       tflite::MicroResourceVariables::Create(variable_allocator, 20);
+#if (PROFILE == 1)
+  tflite::MicroProfiler profiler;
+  std::unique_ptr<tflite::MicroInterpreter> interpreter = std::make_unique<tflite::MicroInterpreter>(
+      model, *resolver.get(), g_tensor_arena, kTensorArenaSize, resource_variables, &profiler);
+  // For a profiled model, just run a single iteration
+  const int iterations = 1;
+#else
   std::unique_ptr<tflite::MicroInterpreter> interpreter = std::make_unique<tflite::MicroInterpreter>(
       model, *resolver.get(), g_tensor_arena, kTensorArenaSize, resource_variables);
+  const int iterations = ITERATIONS;
+#endif
 
   // Run inference outside of benchmark to intialize model.
   if (interpreter->AllocateTensors() != kTfLiteOk) {
@@ -130,17 +129,22 @@
   uint64_t begin = mcycle_read();
 
   // TODO(michaelbrooks): Possibly set/verify test data?
-  for (int i = 0; i < ITERATIONS; ++i) {
+  for (int i = 0; i < iterations; ++i) {
     interpreter->Invoke();
   }
   uint64_t end = mcycle_read();
   uint64_t num_cycles = end - begin;
+
+#if (PROFILE == 1)
+  profiler.LogCsv();
+#endif
+
   // Stores benchmark information in output header for other cores to access.
-  output_header.iterations = ITERATIONS;
+  output_header.iterations = iterations;
   output_header.cycles = num_cycles;
 
   // If running on a simulator, print cycle information.
-  uint64_t average_cycles = num_cycles / ITERATIONS;
+  uint64_t average_cycles = num_cycles / iterations;
   LOG_INFO("Iterations: %ld", output_header.iterations);
   _print64("Total Cycles: ", output_header.cycles);
   _print64("Average Cycles per Iteration: ", average_cycles);
diff --git a/benchmarks/benchmarks.bzl b/benchmarks/benchmarks.bzl
index 3a269e1..318269d 100644
--- a/benchmarks/benchmarks.bzl
+++ b/benchmarks/benchmarks.bzl
@@ -21,6 +21,7 @@
         name,
         model,
         iterations,
+        profile = False,
         hw_test_size = "medium",
         hw_test_tags = [],
         iss_test_size = "small",
@@ -38,10 +39,15 @@
             name = "{}".format(name),
             srcs = ["@kelvin_sw//benchmarks:benchmark_kelvin.cc"],
             hdrs = ["@kelvin_sw//benchmarks:benchmark.h", "{}_model.h".format(name)],
-            copts = ["-DITERATIONS={}".format(iterations), "-DBENCHMARK_NAME={}".format(name)],
+            copts = [
+                "-DITERATIONS={}".format(iterations),
+                "-DBENCHMARK_NAME={}".format(name),
+                "-DPROFILE={}".format(1 if profile else 0),
+            ],
             deps = [
                 "@kelvin_sw//crt",
                 "@kelvin_sw//benchmarks:benchmark_header",
+                "@kelvin_sw//benchmarks:cycle_count",
                 "@tflite-micro//tensorflow/lite/micro:micro_framework",
                 "@tflite-micro//tensorflow/lite/micro:system_setup",
             ],
@@ -55,12 +61,14 @@
         name,
         model,
         iterations,
+        profile = False,
         **kwargs):
         _kelvin_benchmark_device(
             name = name,
             model = model,
             device_type = "fpga_nexus",
             iterations = iterations,
+            profile = profile,
             **kwargs,
         )
 
@@ -68,6 +76,7 @@
         name,
         model,
         iterations,
+        profile = False,
         **kwargs):
 
         _kelvin_benchmark_device(
@@ -75,6 +84,7 @@
             model = model,
             device_type = "asic",
             iterations = iterations,
+            profile = profile,
             **kwargs,
         )
 
@@ -82,12 +92,14 @@
         name,
         model,
         iterations,
+        profile = False,
         **kwargs):
 
         kelvin_benchmark_asic(
             name = "{}_asic".format(name),
             model = model,
             iterations = iterations,
+            profile = profile,
             **kwargs,
         )
 
@@ -95,6 +107,7 @@
             name = "{}_fpga".format(name),
             model = model,
             iterations = iterations,
+            profile = profile,
             **kwargs,
         )
 
@@ -113,6 +126,7 @@
         model,
         device_type,
         iterations,
+        profile = False,
         **kwargs):
 
         bin_to_c_file(
@@ -138,6 +152,7 @@
                 "@matcha//sw/device/lib/dif:i2s",
                 "@matcha//sw/device/lib/dif:tlul_mailbox",
                 "@kelvin_sw//benchmarks:benchmark_header",
+                "@kelvin_sw//benchmarks:cycle_count",
                 "@lowrisc_opentitan//sw/device/lib/dif:rv_plic",
             ],
         )
@@ -165,6 +180,7 @@
                 "@matcha//sw/device/lib/dif:smc_ctrl",
                 "@matcha//sw/device/lib/dif:tlul_mailbox",
                 "@kelvin_sw//benchmarks:benchmark_header",
+                "@kelvin_sw//benchmarks:cycle_count",
                 "@lowrisc_opentitan//sw/device/lib/dif:rv_plic",
             ],
         )
@@ -174,13 +190,18 @@
             srcs = [
                 "@kelvin_sw//benchmarks:benchmark_kelvin.cc",
             ],
-            copts = ["-DITERATIONS={}".format(iterations), "-DBENCHMARK_NAME={}".format(name)],
+            copts = [
+                "-DITERATIONS={}".format(iterations),
+                "-DBENCHMARK_NAME={}".format(name),
+                "-DPROFILE={}".format(1 if profile else 0),
+            ],
             hdrs = [
                 "@kelvin_sw//benchmarks:benchmark.h",
                 "{}_model.h".format(name),
             ],
             deps = [
                 "@kelvin_sw//benchmarks:benchmark_header",
+                "@kelvin_sw//benchmarks:cycle_count",
                 "@tflite-micro//tensorflow/lite/micro:micro_framework",
                 "@tflite-micro//tensorflow/lite/micro:system_setup",
             ],
diff --git a/benchmarks/cycle_count.h b/benchmarks/cycle_count.h
new file mode 100644
index 0000000..949e2bd
--- /dev/null
+++ b/benchmarks/cycle_count.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BENCHMARKS_CYCLE_COUNT_H_
+#define BENCHMARKS_CYCLE_COUNT_H_
+
+#include <stdint.h>  // uint32_t / uint64_t; keeps this header self-contained.
+
+// Reads the 64-bit cycle counter exposed through the `mcycle` (low word) and
+// `mcycleh` (high word) CSRs.
+//
+// The two words are fetched with separate `csrr` instructions, so the high
+// word is sampled both before and after the low word. If the two samples
+// differ, the whole sequence is retried until they agree.
+//
+// Returns the combined value (mcycleh << 32) | mcycle.
+inline uint64_t mcycle_read(void) {
+  uint32_t cycle_low = 0;
+  uint32_t cycle_high = 0;
+  uint32_t cycle_high_2 = 0;
+  asm volatile(
+      "1:"
+      "  csrr %0, mcycleh;"  // Read `mcycleh`.
+      "  csrr %1, mcycle;"   // Read `mcycle`.
+      "  csrr %2, mcycleh;"  // Read `mcycleh` again.
+      "  bne  %0, %2, 1b;"   // High word changed mid-read: retry.
+      : "=r"(cycle_high), "=r"(cycle_low), "=r"(cycle_high_2)
+      :);
+  return (static_cast<uint64_t>(cycle_high) << 32) | cycle_low;
+}
+
+#endif  // BENCHMARKS_CYCLE_COUNT_H_