Update examples and documentation for Arm(R) Corstone(TM)-300 FVP (#2503)

* Updates benchmark documentation.
* Updates benchmarks and network tester example.
* Select only needed ops for memory measurements.
* Only pip install if needed.
* The gen output folder name now also depends on the toolchain, the optimized kernel type and the co-processor.

BUG=documentation for the benchmark application is not correct
diff --git a/tensorflow/lite/micro/benchmarks/README.md b/tensorflow/lite/micro/benchmarks/README.md
index 390b27a..1ac5d25 100644
--- a/tensorflow/lite/micro/benchmarks/README.md
+++ b/tensorflow/lite/micro/benchmarks/README.md
@@ -70,26 +70,17 @@
 For more info about the Corstone-300 software see:
 [tensorflow/lite/micro/cortex_m_corstone_300/README.md](../cortex_m_corstone_300/README.md).
 
-Disclaimer: Executing the benchmark test on the Corstone-300 software will
-provide a general metric of instructions executed. The estimates are not cycle
-accurate, however it aligns to instruction per cycle, and is a consistent
-environment. This means it can detect if code changes changed performance.
+Disclaimer: The FVP cannot be used to measure CPU performance.
+The results are not reliable, not even for relative measurements.
+The FVP may, however, be used for performance measurements when running on the NPU, in which case only the NPU PMU numbers can be used. The NPU model is cycle accurate within approximately +-10%.
 
-The person detection benchmark can also run with Ethos-U enabled, as the
-downloaded model will be optimized for Ethos-U. For more info see:
+As an example, the downloaded person detection model will be optimized for Ethos-U. For more info see:
 [tensorflow/lite/micro/kernels/ethos_u/README.md](../kernels/ethos_u/README.md).
+Since it only makes sense to measure performance on the NPU, only the person detection benchmark should be run, and only with Ethos-U enabled.
+See also the network tester example, where the person detection model is used in the same way when Ethos-U is enabled:
+[tensorflow/lite/micro/examples/network_tester/README.md](../examples/network_tester/README.md).
 
-To run the keyword benchmark on FVP:
-
-```
-make -f tensorflow/lite/micro/tools/make/Makefile TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m55 run_keyword_benchmark
-```
-
-To run the person detection benchmark on FVP:
-
-```
-make -f tensorflow/lite/micro/tools/make/Makefile TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m55 run_person_detection_benchmark
-```
+The person detection model is not an optimal model for Ethos-U since it is quite small. Also note that only the NPU PMU cycles are logged even though the CPU is setting up the Ethos-U driver in each iteration.
 
 To run the person detection benchmark on FVP with Ethos-U:
 
diff --git a/tensorflow/lite/micro/benchmarks/person_detection_benchmark.cc b/tensorflow/lite/micro/benchmarks/person_detection_benchmark.cc
index e21789b..29d30ee 100644
--- a/tensorflow/lite/micro/benchmarks/person_detection_benchmark.cc
+++ b/tensorflow/lite/micro/benchmarks/person_detection_benchmark.cc
@@ -1,4 +1,4 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -37,7 +37,11 @@
 
 namespace tflite {
 
+#ifdef ETHOS_U
+using PersonDetectionOpResolver = MicroMutableOpResolver<1>;
+#else
 using PersonDetectionOpResolver = MicroMutableOpResolver<6>;
+#endif
 using PersonDetectionBenchmarkRunner = MicroBenchmarkRunner<int8_t>;
 
 // Create an area of memory to use for input, output, and intermediate arrays.
@@ -57,12 +61,16 @@
   // PersonDetectionBenchmarkRunner object.
   PersonDetectionOpResolver* op_resolver =
       new (op_resolver_buffer) PersonDetectionOpResolver();
+#ifdef ETHOS_U
+  op_resolver->AddEthosU();
+#else
   op_resolver->AddFullyConnected(tflite::Register_FULLY_CONNECTED_INT8());
   op_resolver->AddConv2D(tflite::Register_CONV_2D_INT8REF());
   op_resolver->AddDepthwiseConv2D();
   op_resolver->AddSoftmax();
   op_resolver->AddAveragePool2D(tflite::Register_AVERAGE_POOL_2D_INT8());
   op_resolver->AddReshape();
+#endif
   return new (benchmark_runner_buffer)
       PersonDetectionBenchmarkRunner(g_person_detect_model_data, op_resolver,
                                      tensor_arena, kTensorArenaSize, profiler);
diff --git a/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc b/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc
index 95a11b2..6473340 100644
--- a/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc
+++ b/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc
@@ -1,4 +1,4 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,33 +14,90 @@
 ==============================================================================*/
 
 #ifdef ETHOS_U
+#include <inttypes.h>
+
+#include <algorithm>
+
 #include "ethosu_driver.h"
+#include "pmu_ethosu.h"
 #endif
 
 // This is set in micro/tools/make/targets/cortex_m_corstone_300_makefile.inc.
-// It is needed for the calls to NVIC_SetVector()/NVIC_EnableIR() and for the
-// DWT and PMU counters.
+// It is needed for the calls to NVIC_SetVector()/NVIC_EnableIRQ().
 #include CMSIS_DEVICE_ARM_CORTEX_M_XX_HEADER_FILE
 
 #include "tensorflow/lite/micro/micro_log.h"
 #include "tensorflow/lite/micro/micro_time.h"
 #include "tensorflow/lite/micro/system_setup.h"
 
+#ifdef ETHOS_U
+
+bool npuPmuCycleCounterIsSet;
+uint64_t npuPmuCycleCounter;
+
+extern "C" {
+void ethosu_inference_begin(struct ethosu_driver* drv, void* userArg) {
+  // Enable PMU
+  ETHOSU_PMU_Enable(drv);
+
+  // Enable cycle counter
+  ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(drv, ETHOSU_PMU_NPU_IDLE);
+  ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE);
+  ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk);
+  ETHOSU_PMU_CYCCNT_Reset(drv);
+
+  // Reset all counters
+  ETHOSU_PMU_EVCNTR_ALL_Reset(drv);
+}
+
+void ethosu_inference_end(struct ethosu_driver* drv, void* userArg) {
+  // Accumulate the NPU cycle count; GetCurrentTimeTicks() reads and clears it.
+  npuPmuCycleCounter += ETHOSU_PMU_Get_CCNTR(drv);
+  npuPmuCycleCounterIsSet = true;
+
+  // Disable PMU
+  ETHOSU_PMU_Disable(drv);
+}
+}
+#endif
+
 namespace tflite {
 
 namespace {
+#ifdef ETHOS_U
+constexpr uint32_t kClocksPerSecond = 200e6;
+#else
 constexpr uint32_t kClocksPerSecond = 25e6;
+#endif
 }  // namespace
 
 uint32_t ticks_per_second() { return kClocksPerSecond; }
 
 uint32_t GetCurrentTimeTicks() {
-#if (!defined(TF_LITE_STRIP_ERROR_STRINGS) && !defined(ARMCM0))
+#if (!defined(TF_LITE_STRIP_ERROR_STRINGS))
+#ifdef ETHOS_U
+  uint32_t ticks = static_cast<uint32_t>(npuPmuCycleCounter);
+
+  // Note cycle counter will be reset here for next iteration
+  if (npuPmuCycleCounterIsSet) {
+    npuPmuCycleCounter = 0;
+    npuPmuCycleCounterIsSet = false;
+  }
+
+  return ticks;
+#else
+
+#if defined(ARMCM0)
+  return 0;
+#else
 #ifdef ARMCM55
   return ARM_PMU_Get_CCNTR();
 #else
   return DWT->CYCCNT;
 #endif
+#endif
+
+#endif
 #else
   return 0;
 #endif
@@ -88,14 +145,17 @@
 #ifdef ETHOS_U
   constexpr int ethosu_base_address = 0x48102000;
   constexpr int ethosu_irq = 56;
+  constexpr int ethosu_irq_priority = 5;
 
   // Initialize Ethos-U NPU driver.
   if (ethosu_init(&ethosu0_driver, reinterpret_cast<void*>(ethosu_base_address),
                   ethosu0_scratch, ETHOSU_FAST_MEMORY_SIZE, 1, 1)) {
     MicroPrintf("Failed to initialize Ethos-U driver");
+    return;
   }
   NVIC_SetVector(static_cast<IRQn_Type>(ethosu_irq),
                  (uint32_t)&ethosuIrqHandler0);
+  NVIC_SetPriority(static_cast<IRQn_Type>(ethosu_irq), ethosu_irq_priority);
   NVIC_EnableIRQ(static_cast<IRQn_Type>(ethosu_irq));
 #endif
 }
diff --git a/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc b/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc
index e62e0c4..23e50c9 100644
--- a/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc
+++ b/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@
 #define NUM_INFERENCES 1
 #endif
 
-uint8_t tensor_arena[TENSOR_ARENA_SIZE];
+alignas(16) uint8_t tensor_arena[TENSOR_ARENA_SIZE];
 
 #ifdef NUM_BYTES_TO_PRINT
 inline void print_output_data(TfLiteTensor* output) {
@@ -92,15 +92,19 @@
         model->version(), TFLITE_SCHEMA_VERSION);
     return kTfLiteError;
   }
+#ifdef ETHOS_U
+  tflite::MicroMutableOpResolver<1> resolver;
+  resolver.AddEthosU();
 
-  tflite::MicroMutableOpResolver<6> resolver;
+#else
+  tflite::MicroMutableOpResolver<5> resolver;
   resolver.AddAveragePool2D(tflite::Register_AVERAGE_POOL_2D_INT8());
   resolver.AddConv2D(tflite::Register_CONV_2D_INT8());
   resolver.AddDepthwiseConv2D(tflite::Register_DEPTHWISE_CONV_2D_INT8());
-  resolver.AddEthosU();
   resolver.AddReshape();
   resolver.AddSoftmax(tflite::Register_SOFTMAX_INT8());
 
+#endif
   tflite::MicroInterpreter interpreter(model, resolver, tensor_arena,
                                        TENSOR_ARENA_SIZE);
 
@@ -152,7 +156,8 @@
     }
 #endif
   }
-  MicroPrintf("Ran successfully\n");
+
+  MicroPrintf("~~~ALL TESTS PASSED~~~\n");
 }
 
 TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile
index bb4a983..213ac0e 100644
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -272,7 +272,15 @@
 
 # Where compiled objects are stored.
 BASE_GENDIR := gen
-GENDIR := $(BASE_GENDIR)/$(TARGET)_$(TARGET_ARCH)_$(BUILD_TYPE)/
+GENDIR := $(BASE_GENDIR)/$(TARGET)_$(TARGET_ARCH)_$(BUILD_TYPE)
+ifneq ($(OPTIMIZED_KERNEL_DIR),)
+  GENDIR := $(GENDIR)_$(OPTIMIZED_KERNEL_DIR)
+endif
+ifneq ($(CO_PROCESSOR),)
+  GENDIR := $(GENDIR)_$(CO_PROCESSOR)
+endif
+GENDIR := $(GENDIR)_$(TOOLCHAIN)/
+
 CORE_OBJDIR := $(GENDIR)obj/core/
 KERNEL_OBJDIR := $(GENDIR)obj/kernels/
 THIRD_PARTY_KERNEL_OBJDIR := $(GENDIR)obj/third_party_kernels/
diff --git a/tensorflow/lite/micro/tools/make/ext_libs/person_detection_int8_vela_convert.sh b/tensorflow/lite/micro/tools/make/ext_libs/person_detection_int8_vela_convert.sh
index 4b0ee89..0d9df87 100755
--- a/tensorflow/lite/micro/tools/make/ext_libs/person_detection_int8_vela_convert.sh
+++ b/tensorflow/lite/micro/tools/make/ext_libs/person_detection_int8_vela_convert.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -50,16 +50,28 @@
 PERSON_MODEL_HEADER=${MODEL_DIR}/person_detect_model_data.h
 
 if [ ! -f ${CONVERTED_PERSON_MODEL_INT8} ]; then
+
+  # Install ethos-u-vela if not already installed.
+  set +e
+  pip show ethos-u-vela >&2
+  retVal=$?
+  set -e
+  if [ $retVal -ne 0 ]; then
+    TEMPFILE=$(mktemp -d)/
+    python3 -m venv $TEMPFILE
+    source $TEMPFILE/bin/activate
+    python3 -m pip install --upgrade pip >&2
+    pip install --upgrade cython >&2
+    pip install --prefer-binary ethos-u-vela >&2
+  fi
+
   # Compile an optimized .tflite version for Ethos-U.
-  TEMPFILE=$(mktemp -d)/
-  python3 -m venv $TEMPFILE
-  source $TEMPFILE/bin/activate
-  python3 -m pip install --upgrade pip >&2
-  pip install --upgrade cython >&2
-  pip install --prefer-binary ethos-u-vela >&2
   vela --accelerator-config=ethos-u55-256 ${DOWNLOADS_DIR}/../../../models/person_detect.tflite \
        --output-dir ${MODEL_DIR} >&2
-  deactivate
+
+  if [ $retVal -ne 0 ]; then
+    deactivate
+  fi
 
   # Convert .tflite back to C array.
   echo "// This file is generated by $0." > ${CONVERTED_PERSON_MODEL_INT8}
@@ -67,7 +79,7 @@
        ${CONVERTED_PERSON_MODEL_INT8}
   echo -n "const " >> ${CONVERTED_PERSON_MODEL_INT8}
   xxd -i ${MODEL_DIR}/person_detect_vela.tflite >> ${CONVERTED_PERSON_MODEL_INT8}
-  sed -i 's/gen_cortex_m_corstone_300_cortex_m55_default_genfiles_tensorflow_lite_micro_models_person_detect_vela_tflite/g_person_detect_model_data/' \
+  sed  -i 's/gen_cortex_m_corstone_300_cortex_m55_.*genfiles_tensorflow_lite_micro_models_person_detect_vela_tflite/g_person_detect_model_data/' \
       ${CONVERTED_PERSON_MODEL_INT8}
   sed -i 's/^const unsigned char g_person_detect_model_data/alignas\(16\) &/' ${CONVERTED_PERSON_MODEL_INT8}
   SIZE=$(sed -E -n -e 's/^.*g_person_detect_model_data_len = ([0-9]+);/\1/p' ${CONVERTED_PERSON_MODEL_INT8})
diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc
index c9bb8ea..653afad 100644
--- a/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc
+++ b/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc
@@ -204,6 +204,11 @@
 EXCLUDED_TESTS := \
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/memory_arena_threshold_test.cc  \
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/recording_micro_allocator_test.cc
+ifeq ($(CO_PROCESSOR), ethos_u)
+# This test does not work with Ethos-U enabled, since the NPU PMU cycle counter is then used as the time source for the sake of the benchmark example.
+EXCLUDED_TESTS += \
+  $(TENSORFLOW_ROOT)tensorflow/lite/micro/micro_time_test.cc
+endif
 MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
 
 # TODO(#2449) Examine why this test fails here.