Add kernel build flag for prioritizing speed or size (#2408)

Adds a build flag that can be used by any kernel to provide a different implementation depending on use case.
Adds a first use case for cmsis-nn transpose conv.

The background for this PR is in https://github.com/tensorflow/tflite-micro/pull/2345

BUG=none 
diff --git a/tensorflow/lite/micro/docs/optimized_kernel_implementations.md b/tensorflow/lite/micro/docs/optimized_kernel_implementations.md
index 4a5c81a..8eefb55 100644
--- a/tensorflow/lite/micro/docs/optimized_kernel_implementations.md
+++ b/tensorflow/lite/micro/docs/optimized_kernel_implementations.md
@@ -169,6 +169,12 @@
     *   Build a static libtensorflow-microlite.a using the TFLM makefile with:
         `make -f tensorflow/lite/micro/tools/make/Makefile TARGET=<target>
         OPTIMIZED_KERNEL_DIR=<optimize_dir> microlite`
+    *   Optionally build for size or speed. Translated to a valid make command, it will be either of these two:
+        `make -f tensorflow/lite/micro/tools/make/Makefile TARGET=<target>
+        OPTIMIZED_KERNEL_DIR=<optimize_dir> OPTIMIZE_KERNELS_FOR=KERNELS_OPTIMIZED_FOR_SIZE microlite`
+        `make -f tensorflow/lite/micro/tools/make/Makefile TARGET=<target>
+        OPTIMIZED_KERNEL_DIR=<optimize_dir> OPTIMIZE_KERNELS_FOR=KERNELS_OPTIMIZED_FOR_SPEED microlite`
+        Check the relevant README for the given optimization library to see whether this is applicable.
     *   Use the static library and any TFLM headers as part of the overall
         application (with its own build system).
 
diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/README.md b/tensorflow/lite/micro/kernels/cmsis_nn/README.md
index e4a4de3..dc531b7 100644
--- a/tensorflow/lite/micro/kernels/cmsis_nn/README.md
+++ b/tensorflow/lite/micro/kernels/cmsis_nn/README.md
@@ -1,12 +1,14 @@
 <!-- mdformat off(b/169948621#comment2) -->
 
-# Info
+# General Info
 CMSIS-NN is a library containing kernel optimizations for Arm(R) Cortex(R)-M
 processors. To use CMSIS-NN optimized kernels instead of reference kernels, add
 `OPTIMIZED_KERNEL_DIR=cmsis_nn` to the make command line. See examples below.
 
 For more information about the optimizations, check out
-[CMSIS-NN documentation](https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/NN/README.md).
+[CMSIS-NN documentation](https://github.com/ARM-software/CMSIS-NN/blob/main/README.md).
+
+# Specifying path to CMSIS-NN
 
 By default CMSIS-NN is built by code that is downloaded to the TFLM tree.
 It also possible to build CMSIS-NN code from an external path by specifying
@@ -14,7 +16,7 @@
 since CMSIS-NN has a dependency to CMSIS-Core. As a third option CMSIS-NN can be provided manually as an external library.
 The examples below will illustrate this.
 
-# Example - FVP based on Arm Corstone-300 software.
+## Example - FVP based on Arm Corstone-300 software.
 In this example, the kernel conv unit test is built. For more information about
 this specific target, check out the [Corstone-300 readme](https://github.com/tensorflow/tflite-micro/tree/main/tensorflow/lite/micro/cortex_m_corstone_300/README.md).
 
@@ -39,3 +41,22 @@
 Also note that if specifying CMSIS_NN_LIBS but not CMSIS_PATH and or CMSIS_NN_PATH, headers and
 system/startup code from the default downloaded path of CMSIS would be used.
 So CMSIS_NN_LIBS, CMSIS_NN_PATH and CMSIS_PATH should have the same base path and if not there will be a build error.
+
+# Build for speed or size
+It is possible to build for speed or size. The size option may be required for a large model on an embedded system with limited memory. Where applicable, building for size would result in higher latency paired with a smaller scratch buffer, whereas building for speed would result in lower latency with a larger scratch buffer. Currently only transpose conv supports this. See the examples below.
+
+## Example - building a static library with CMSIS-NN optimized kernels
+More info on the target used in this example: https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/cortex_m_generic/README.md
+
+Building for speed (default):
+Note that speed is the default, so leaving out OPTIMIZE_KERNELS_FOR completely gives the same result.
+```
+make -f tensorflow/lite/micro/tools/make/Makefile TARGET=cortex_m_generic TARGET_ARCH=cortex-m55 OPTIMIZED_KERNEL_DIR=cmsis_nn OPTIMIZE_KERNELS_FOR=KERNELS_OPTIMIZED_FOR_SPEED microlite
+
+```
+
+Building for size:
+```
+make -f tensorflow/lite/micro/tools/make/Makefile TARGET=cortex_m_generic TARGET_ARCH=cortex-m55 OPTIMIZED_KERNEL_DIR=cmsis_nn OPTIMIZE_KERNELS_FOR=KERNELS_OPTIMIZED_FOR_SIZE microlite
+
+```
diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/transpose_conv.cc b/tensorflow/lite/micro/kernels/cmsis_nn/transpose_conv.cc
index 953f6de..06305bc 100644
--- a/tensorflow/lite/micro/kernels/cmsis_nn/transpose_conv.cc
+++ b/tensorflow/lite/micro/kernels/cmsis_nn/transpose_conv.cc
@@ -1,4 +1,4 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -198,14 +198,22 @@
   if (input->type == kTfLiteInt8) {
     TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
 
-    RuntimeShape filter_shape = GetTensorShape(filter);
     RuntimeShape input_shape = GetTensorShape(input);
     RuntimeShape output_shape = GetTensorShape(output);
+    RuntimeShape filter_shape = GetTensorShape(filter);
 
     const int batch_size = MatchingDim(input_shape, 0, output_shape, 0);
-    const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
     const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
 
+    cmsis_nn_dims output_dims;
+    output_dims.n = batch_size;
+    output_dims.h = output_shape.Dims(1);
+    output_dims.w = output_shape.Dims(2);
+    output_dims.c = output_depth;
+
+#if defined(KERNELS_OPTIMIZED_FOR_SPEED)
+    const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+
     cmsis_nn_dims input_dims;
     input_dims.n = batch_size;
     input_dims.h = input_shape.Dims(1);
@@ -218,17 +226,12 @@
     filter_dims.w = filter_shape.Dims(2);
     filter_dims.c = input_depth;
 
-    cmsis_nn_dims output_dims;
-    output_dims.n = batch_size;
-    output_dims.h = output_shape.Dims(1);
-    output_dims.w = output_shape.Dims(2);
-    output_dims.c = output_depth;
-
     const size_t buf_size = arm_transpose_conv_s8_get_buffer_size(
         &input_dims, &filter_dims, &output_dims);
     TFLITE_DCHECK(context->RequestScratchBufferInArena(
                       context, buf_size, &(data->scratch_buffer_index)) ==
                   kTfLiteOk);
+#endif
 
     // Quantized 8-bit kernels use an int32 scratch buffer.
     TFLITE_DCHECK(
@@ -285,6 +288,7 @@
   return kTfLiteOk;
 }
 
+#if defined(KERNELS_OPTIMIZED_FOR_SPEED)
 TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                                      const TfLiteConvParams& params,
                                      const OpData& data,
@@ -376,6 +380,7 @@
 
   return kTfLiteOk;
 }
+#endif
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteEvalTensor* input =
@@ -416,8 +421,29 @@
       break;
     }
     case kTfLiteInt8: {
+#if defined(KERNELS_OPTIMIZED_FOR_SIZE)
+      int32_t* scratch_buffer = static_cast<int32_t*>(
+          context->GetScratchBuffer(context, data.scratch_buffer_index));
+      reference_integer_ops::TransposeConv(
+          data.params, data.per_channel_output_multiplier,
+          data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<int8_t>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<int8_t>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetOptionalTensorData<int32_t>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output),
+          tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
+#elif defined(KERNELS_OPTIMIZED_FOR_SPEED)
       return EvalQuantizedPerChannel(context, node, params, data, input, filter,
                                      bias, output);
+#else
+      MicroPrintf(
+          "Either KERNELS_OPTIMIZED_FOR_SIZE or KERNELS_OPTIMIZED_FOR_SPEED "
+          "must be defined");
+      return kTfLiteError;
+#endif
       break;
     }
     case kTfLiteInt16: {
@@ -481,12 +507,33 @@
   TFLITE_DCHECK(node->user_data != nullptr);
   const OpData& data = *(static_cast<const OpData*>(node->user_data));
 
-  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+#if defined(KERNELS_OPTIMIZED_FOR_SIZE)
+  int32_t* scratch_buffer = static_cast<int32_t*>(
+      context->GetScratchBuffer(context, data.scratch_buffer_index));
+  reference_integer_ops::TransposeConv(
+      data.params, data.per_channel_output_multiplier,
+      data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
+      tflite::micro::GetTensorData<int8_t>(input),
+      tflite::micro::GetTensorShape(filter),
+      tflite::micro::GetTensorData<int8_t>(filter),
+      tflite::micro::GetTensorShape(bias),
+      tflite::micro::GetOptionalTensorData<int32_t>(bias),
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<int8_t>(output),
+      tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
+#elif defined(KERNELS_OPTIMIZED_FOR_SPEED)
   const auto& params =
       *(reinterpret_cast<TfLiteConvParams*>(node->builtin_data));
 
   return EvalQuantizedPerChannel(context, node, params, data, input, filter,
                                  bias, output);
+#else
+  MicroPrintf(
+      "Either KERNELS_OPTIMIZED_FOR_SIZE or KERNELS_OPTIMIZED_FOR_SPEED must "
+      "be defined");
+  return kTfLiteError;
+#endif
+  return kTfLiteOk;
 }
 
 }  // namespace
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile
index 8f6c002..bb4a983 100644
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -1,4 +1,4 @@
-# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -60,6 +60,17 @@
 # Specify which specialized kernel implementation should be pulled in.
 OPTIMIZED_KERNEL_DIR :=
 
+# Optimize kernels for speed or memory. This is similar but not the same as KERNEL_OPTIMIZATION_LEVEL and
+# CORE_OPTIMIZATION_LEVEL, which specify compiler optimization level.
+# Instead this enables a kernel to provide multiple implementations that are configured at build time.
+# An example could be a kernel requiring a bigger scratch buffer for certain use cases.
+# The example kernel would have a smaller scratch buffer usage when building for size.
+# Vice versa it would use more scratch buffer when building for speed and would be more performant.
+# Note that this is optional. If a kernel has only one implementation, nothing needs to be done.
+# OPTIMIZE_KERNELS_FOR has only two valid values, KERNELS_OPTIMIZED_FOR_SIZE and KERNELS_OPTIMIZED_FOR_SPEED, where the
+# latter is the default.
+OPTIMIZE_KERNELS_FOR := KERNELS_OPTIMIZED_FOR_SPEED
+
 # Override this variable from the command line in case the optimized kernels are
 # in a different directory.
 OPTIMIZED_KERNEL_DIR_PREFIX := $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels
@@ -99,7 +110,7 @@
 
 MICROLITE_LIBS := -lm
 
-# For the optimized_kernel_dir, and co-processor as specified on the
+# For the optimized_kernel_dir, co-processor and optimize_kernels_for as specified on the
 # command line we add -D<tag> to the cflags to allow for #idefs in the code.
 #
 # We apply the following transformations (via the tr command):
@@ -113,6 +124,10 @@
   ADDITIONAL_DEFINES += -D$(shell echo $(CO_PROCESSOR) | tr [a-z] [A-Z])
 endif
 
+ifneq ($(OPTIMIZE_KERNELS_FOR),)
+  ADDITIONAL_DEFINES += -D$(shell echo $(OPTIMIZE_KERNELS_FOR) | tr [a-z] [A-Z])
+endif
+
 ifeq ($(TOOLCHAIN), armclang)
   CORE_OPTIMIZATION_LEVEL := -Oz
 else
@@ -483,11 +498,11 @@
 
 ifneq ($(BUILD_TYPE), no_tf_lite_static_memory)
   EXCLUDED_TFL_CC_SRCS := \
-  	$(TENSORFLOW_ROOT)tensorflow/lite/array.cc
+	$(TENSORFLOW_ROOT)tensorflow/lite/array.cc
   TFL_CC_SRCS := $(filter-out $(EXCLUDED_TFL_CC_SRCS), $(TFL_CC_SRCS))
 
   EXCLUDED_TFL_CC_HDRS := \
-  	$(TENSORFLOW_ROOT)tensorflow/lite/array.h
+	$(TENSORFLOW_ROOT)tensorflow/lite/array.h
   TFL_CC_HDRS := $(filter-out $(EXCLUDED_TFL_CC_HDRS), $(TFL_CC_HDRS))
 endif
 
@@ -614,6 +629,11 @@
   include $(MAKEFILE_DIR)/targets/$(TARGET)_makefile.inc
 endif
 
+# Validate valid options.
+ifeq (,$(filter $(OPTIMIZE_KERNELS_FOR),KERNELS_OPTIMIZED_FOR_SPEED KERNELS_OPTIMIZED_FOR_SIZE))
+    $(error Incorrect OPTIMIZE_KERNELS_FOR: $(OPTIMIZE_KERNELS_FOR))
+endif
+
 ifneq ($(OPTIMIZED_KERNEL_DIR),)
   PATH_TO_OPTIMIZED_KERNELS := $(OPTIMIZED_KERNEL_DIR_PREFIX)/$(OPTIMIZED_KERNEL_DIR)
   PATH_TO_SIGNAL_OPTIMIZED_KERNELS := $(OPTIMIZED_SIGNAL_KERNEL_DIR_PREFIX)/$(OPTIMIZED_KERNEL_DIR)