Add Support for Profiling TFLM Models in Benchmarks

If the "profile" flag is set to True (default False), the benchmark profiles the cycles per op (for just one iteration) instead of running the standard benchmark. Outputs the cycles in CSV format.

Tested: soundstream benchmark with profile=True/False

Change-Id: I6fc7e23161d090aca03fb60741dcfe99aa3edc6b

diff --git a/benchmarks/BUILD b/benchmarks/BUILD
index 5a14073..7e1f4c0 100644
--- a/benchmarks/BUILD
+++ b/benchmarks/BUILD

@@ -20,6 +20,12 @@
     visibility = ["//visibility:public"],
 )
 
+cc_library(
+    name = "cycle_count",
+    hdrs = ["cycle_count.h"],
+    visibility = ["//visibility:public"],
+)
+
 exports_files(
     srcs = glob(["*.c", "*.cc", "*.h"]),
 )

diff --git a/benchmarks/benchmark_kelvin.cc b/benchmarks/benchmark_kelvin.cc
index 766d3d6..6a84038 100644
--- a/benchmarks/benchmark_kelvin.cc
+++ b/benchmarks/benchmark_kelvin.cc

@@ -19,11 +19,16 @@
 #include "crt/kelvin.h"
 #include "crt/log.h"
 #include "benchmarks/benchmark.h"
+#include "benchmarks/cycle_count.h"
 #include "tensorflow/lite/micro/micro_interpreter.h"
 #include "tensorflow/lite/micro/micro_log.h"
 #include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 
+#if (PROFILE == 1)
+#include "tensorflow/lite/micro/micro_profiler.h"
+#endif
+
 #define STRINGIZE(x) #x
 #define STR(x) STRINGIZE(x)
 
@@ -44,21 +49,6 @@
     .cycles = 0,
 };
 
-inline uint64_t mcycle_read(void) {
-  uint32_t cycle_low = 0;
-  uint32_t cycle_high = 0;
-  uint32_t cycle_high_2 = 0;
-  asm volatile(
-      "1:"
-      "  csrr %0, mcycleh;"  // Read `mcycleh`.
-      "  csrr %1, mcycle;"   // Read `mcycle`.
-      "  csrr %2, mcycleh;"  // Read `mcycleh` again.
-      "  bne  %0, %2, 1b;"
-      : "=r"(cycle_high), "=r"(cycle_low), "=r"(cycle_high_2)
-      :);
-  return static_cast<uint64_t>(cycle_high) << 32 | cycle_low;
-}
-
 // This includes all ops currently used in the Kelvin model suite. More can be added.
 constexpr int kAllOpsNum = 22;
 std::unique_ptr<tflite::MicroMutableOpResolver<kAllOpsNum>> GetAllOpsResolver() {
@@ -110,8 +100,17 @@
       tflite::MicroAllocator::Create(variable_arena, 1024);
   tflite::MicroResourceVariables *resource_variables =
       tflite::MicroResourceVariables::Create(variable_allocator, 20);
+#if (PROFILE == 1)
+  tflite::MicroProfiler profiler;
+  std::unique_ptr<tflite::MicroInterpreter> interpreter = std::make_unique<tflite::MicroInterpreter>(
+      model, *resolver.get(), g_tensor_arena, kTensorArenaSize, resource_variables, &profiler);
+  // For a profiled model, just run a single iteration
+  const int iterations = 1;
+#else
   std::unique_ptr<tflite::MicroInterpreter> interpreter = std::make_unique<tflite::MicroInterpreter>(
       model, *resolver.get(), g_tensor_arena, kTensorArenaSize, resource_variables);
+  const int iterations = ITERATIONS;
+#endif
 
   // Run inference outside of benchmark to intialize model.
   if (interpreter->AllocateTensors() != kTfLiteOk) {
@@ -130,17 +129,22 @@
   uint64_t begin = mcycle_read();
 
   // TODO(michaelbrooks): Possibly set/verify test data?
-  for (int i = 0; i < ITERATIONS; ++i) {
+  for (int i = 0; i < iterations; ++i) {
     interpreter->Invoke();
   }
   uint64_t end = mcycle_read();
   uint64_t num_cycles = end - begin;
+
+#if (PROFILE == 1)
+  profiler.LogCsv();
+#endif
+
   // Stores benchmark information in output header for other cores to access.
-  output_header.iterations = ITERATIONS;
+  output_header.iterations = iterations;
   output_header.cycles = num_cycles;
 
   // If running on a simulator, print cycle information.
-  uint64_t average_cycles = num_cycles / ITERATIONS;
+  uint64_t average_cycles = num_cycles / iterations;
   LOG_INFO("Iterations: %ld", output_header.iterations);
   _print64("Total Cycles: ", output_header.cycles);
   _print64("Average Cycles per Iteration: ", average_cycles);

diff --git a/benchmarks/benchmarks.bzl b/benchmarks/benchmarks.bzl
index 3a269e1..318269d 100644
--- a/benchmarks/benchmarks.bzl
+++ b/benchmarks/benchmarks.bzl

@@ -21,6 +21,7 @@
         name,
         model,
         iterations,
+        profile = False,
         hw_test_size = "medium",
         hw_test_tags = [],
         iss_test_size = "small",
@@ -38,10 +39,15 @@
             name = "{}".format(name),
             srcs = ["@kelvin_sw//benchmarks:benchmark_kelvin.cc"],
             hdrs = ["@kelvin_sw//benchmarks:benchmark.h", "{}_model.h".format(name)],
-            copts = ["-DITERATIONS={}".format(iterations), "-DBENCHMARK_NAME={}".format(name)],
+            copts = [
+                "-DITERATIONS={}".format(iterations),
+                "-DBENCHMARK_NAME={}".format(name),
+                "-DPROFILE={}".format(1 if profile else 0),
+            ],
             deps = [
                 "@kelvin_sw//crt",
                 "@kelvin_sw//benchmarks:benchmark_header",
+                "@kelvin_sw//benchmarks:cycle_count",
                 "@tflite-micro//tensorflow/lite/micro:micro_framework",
                 "@tflite-micro//tensorflow/lite/micro:system_setup",
             ],
@@ -55,12 +61,14 @@
         name,
         model,
         iterations,
+        profile = False,
         **kwargs):
         _kelvin_benchmark_device(
             name = name,
             model = model,
             device_type = "fpga_nexus",
             iterations = iterations,
+            profile = profile,
             **kwargs,
         )
 
@@ -68,6 +76,7 @@
         name,
         model,
         iterations,
+        profile = False,
         **kwargs):
 
         _kelvin_benchmark_device(
@@ -75,6 +84,7 @@
             model = model,
             device_type = "asic",
             iterations = iterations,
+            profile = profile,
             **kwargs,
         )
 
@@ -82,12 +92,14 @@
         name,
         model,
         iterations,
+        profile = False,
         **kwargs):
 
         kelvin_benchmark_asic(
             name = "{}_asic".format(name),
             model = model,
             iterations = iterations,
+            profile = profile,
             **kwargs,
         )
 
@@ -95,6 +107,7 @@
             name = "{}_fpga".format(name),
             model = model,
             iterations = iterations,
+            profile = profile,
             **kwargs,
         )
 
@@ -113,6 +126,7 @@
         model,
         device_type,
         iterations,
+        profile = False,
         **kwargs):
 
         bin_to_c_file(
@@ -138,6 +152,7 @@
                 "@matcha//sw/device/lib/dif:i2s",
                 "@matcha//sw/device/lib/dif:tlul_mailbox",
                 "@kelvin_sw//benchmarks:benchmark_header",
+                "@kelvin_sw//benchmarks:cycle_count",
                 "@lowrisc_opentitan//sw/device/lib/dif:rv_plic",
             ],
         )
@@ -165,6 +180,7 @@
                 "@matcha//sw/device/lib/dif:smc_ctrl",
                 "@matcha//sw/device/lib/dif:tlul_mailbox",
                 "@kelvin_sw//benchmarks:benchmark_header",
+                "@kelvin_sw//benchmarks:cycle_count",
                 "@lowrisc_opentitan//sw/device/lib/dif:rv_plic",
             ],
         )
@@ -174,13 +190,18 @@
             srcs = [
                 "@kelvin_sw//benchmarks:benchmark_kelvin.cc",
             ],
-            copts = ["-DITERATIONS={}".format(iterations), "-DBENCHMARK_NAME={}".format(name)],
+            copts = [
+                "-DITERATIONS={}".format(iterations),
+                "-DBENCHMARK_NAME={}".format(name),
+                "-DPROFILE={}".format(1 if profile else 0),
+            ],
             hdrs = [
                 "@kelvin_sw//benchmarks:benchmark.h",
                 "{}_model.h".format(name),
             ],
             deps = [
                 "@kelvin_sw//benchmarks:benchmark_header",
+                "@kelvin_sw//benchmarks:cycle_count",
                 "@tflite-micro//tensorflow/lite/micro:micro_framework",
                 "@tflite-micro//tensorflow/lite/micro:system_setup",
             ],

diff --git a/benchmarks/cycle_count.h b/benchmarks/cycle_count.h
new file mode 100644
index 0000000..949e2bd
--- /dev/null
+++ b/benchmarks/cycle_count.h

@@ -0,0 +1,43 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BENCHMARKS_CYCLE_COUNT_H_
+#define BENCHMARKS_CYCLE_COUNT_H_
+
+#include <stdint.h>
+
+// Returns the current 64-bit cycle count, assembled from the `mcycle` (low
+// word) and `mcycleh` (high word) CSRs.
+//
+// The two halves are read with separate instructions, so `mcycleh` is sampled
+// both before and after `mcycle`. If the two samples differ, the high word
+// changed mid-read and the whole sequence is retried.
+inline uint64_t mcycle_read(void) {
+  uint32_t cycle_low = 0;
+  uint32_t cycle_high = 0;
+  uint32_t cycle_high_2 = 0;
+  asm volatile(
+      "1:"
+      "  csrr %0, mcycleh;"  // Read `mcycleh`.
+      "  csrr %1, mcycle;"   // Read `mcycle`.
+      "  csrr %2, mcycleh;"  // Read `mcycleh` again.
+      "  bne  %0, %2, 1b;"   // High word changed between reads: retry.
+      : "=r"(cycle_high), "=r"(cycle_low), "=r"(cycle_high_2)
+      :);
+  return (static_cast<uint64_t>(cycle_high) << 32) | cycle_low;
+}
+
+#endif  // BENCHMARKS_CYCLE_COUNT_H_
\ No newline at end of file