Add kernel build flag for prioritizing speed or size (#2408) Adds a build flag that can be used by any kernel to provide a different implementation depending on use case. Adds a first use case for cmsis-nn transpose conv. The background for this PR is in https://github.com/tensorflow/tflite-micro/pull/2345 BUG=none
diff --git a/tensorflow/lite/micro/docs/optimized_kernel_implementations.md b/tensorflow/lite/micro/docs/optimized_kernel_implementations.md index 4a5c81a..8eefb55 100644 --- a/tensorflow/lite/micro/docs/optimized_kernel_implementations.md +++ b/tensorflow/lite/micro/docs/optimized_kernel_implementations.md
@@ -169,6 +169,12 @@ * Build a static libtensorflow-microlite.a using the TFLM makefile with: `make -f tensorflow/lite/micro/tools/make/Makefile TARGET=<target> OPTIMIZED_KERNEL_DIR=<optimize_dir> microlite` + * Optionally build for size or speed. Translated to a valid make command it will be any of these two: + `make -f tensorflow/lite/micro/tools/make/Makefile TARGET=<target> + OPTIMIZED_KERNEL_DIR=<optimize_dir> OPTIMIZE_KERNELS_FOR=KERNELS_OPTIMIZED_FOR_SIZE microlite` + `make -f tensorflow/lite/micro/tools/make/Makefile TARGET=<target> + OPTIMIZED_KERNEL_DIR=<optimize_dir> OPTIMIZE_KERNELS_FOR=KERNELS_OPTIMIZED_FOR_SPEED microlite` + Check relevant README for given optimization library if this is applicable. * Use the static library and any TFLM headers as part of the overall application (with its own build system).
diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/README.md b/tensorflow/lite/micro/kernels/cmsis_nn/README.md index e4a4de3..dc531b7 100644 --- a/tensorflow/lite/micro/kernels/cmsis_nn/README.md +++ b/tensorflow/lite/micro/kernels/cmsis_nn/README.md
@@ -1,12 +1,14 @@ <!-- mdformat off(b/169948621#comment2) --> -# Info +# General Info CMSIS-NN is a library containing kernel optimizations for Arm(R) Cortex(R)-M processors. To use CMSIS-NN optimized kernels instead of reference kernels, add `OPTIMIZED_KERNEL_DIR=cmsis_nn` to the make command line. See examples below. For more information about the optimizations, check out -[CMSIS-NN documentation](https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/NN/README.md). +[CMSIS-NN documentation](https://github.com/ARM-software/CMSIS-NN/blob/main/README.md), + +# Specifying path to CMSIS-NN By default CMSIS-NN is built by code that is downloaded to the TFLM tree. It also possible to build CMSIS-NN code from an external path by specifying @@ -14,7 +16,7 @@ since CMSIS-NN has a dependency to CMSIS-Core. As a third option CMSIS-NN can be provided manually as an external library. The examples below will illustrate this. -# Example - FVP based on Arm Corstone-300 software. +## Example - FVP based on Arm Corstone-300 software. In this example, the kernel conv unit test is built. For more information about this specific target, check out the [Corstone-300 readme](https://github.com/tensorflow/tflite-micro/tree/main/tensorflow/lite/micro/cortex_m_corstone_300/README.md). @@ -39,3 +41,22 @@ Also note that if specifying CMSIS_NN_LIBS but not CMSIS_PATH and or CMSIS_NN_PATH, headers and system/startup code from the default downloaded path of CMSIS would be used. So CMSIS_NN_LIBS, CMSIS_NN_PATH and CMSIS_PATH should have the same base path and if not there will be a build error. + +# Build for speed or size +It is possible to build for speed or size. The size option may be required for a large model on an embedded system with limited memory. Where applicable, building for size would result in higher latency paired with a smaller scratch buffer, whereas building for speed would result in lower latency with a larger scratch buffer. Currently only transpose conv supports this. 
See examples below. + +## Example - building a static library with CMSIS-NN optimized kernels +More info on the target used in this example: https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/cortex_m_generic/README.md + +Building for speed (default): +Note that speed is the default, so if leaving out OPTIMIZE_KERNELS_FOR completely that will be the default. +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=cortex_m_generic TARGET_ARCH=cortex-m55 OPTIMIZED_KERNEL_DIR=cmsis_nn OPTIMIZE_KERNELS_FOR=KERNELS_OPTIMIZED_FOR_SPEED microlite + +``` + +Building for size: +``` +make -f tensorflow/lite/micro/tools/make/Makefile TARGET=cortex_m_generic TARGET_ARCH=cortex-m55 OPTIMIZED_KERNEL_DIR=cmsis_nn OPTIMIZE_KERNELS_FOR=KERNELS_OPTIMIZED_FOR_SIZE microlite + +```
diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/transpose_conv.cc b/tensorflow/lite/micro/kernels/cmsis_nn/transpose_conv.cc index 953f6de..06305bc 100644 --- a/tensorflow/lite/micro/kernels/cmsis_nn/transpose_conv.cc +++ b/tensorflow/lite/micro/kernels/cmsis_nn/transpose_conv.cc
@@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -198,14 +198,22 @@ if (input->type == kTfLiteInt8) { TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr); - RuntimeShape filter_shape = GetTensorShape(filter); RuntimeShape input_shape = GetTensorShape(input); RuntimeShape output_shape = GetTensorShape(output); + RuntimeShape filter_shape = GetTensorShape(filter); const int batch_size = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + +#if defined(KERNELS_OPTIMIZED_FOR_SPEED) + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + cmsis_nn_dims input_dims; input_dims.n = batch_size; input_dims.h = input_shape.Dims(1); @@ -218,17 +226,12 @@ filter_dims.w = filter_shape.Dims(2); filter_dims.c = input_depth; - cmsis_nn_dims output_dims; - output_dims.n = batch_size; - output_dims.h = output_shape.Dims(1); - output_dims.w = output_shape.Dims(2); - output_dims.c = output_depth; - const size_t buf_size = arm_transpose_conv_s8_get_buffer_size( &input_dims, &filter_dims, &output_dims); TFLITE_DCHECK(context->RequestScratchBufferInArena( context, buf_size, &(data->scratch_buffer_index)) == kTfLiteOk); +#endif // Quantized 8-bit kernels use an int32 scratch buffer. 
TFLITE_DCHECK( @@ -285,6 +288,7 @@ return kTfLiteOk; } +#if defined(KERNELS_OPTIMIZED_FOR_SPEED) TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const TfLiteConvParams& params, const OpData& data, @@ -376,6 +380,7 @@ return kTfLiteOk; } +#endif TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteEvalTensor* input = @@ -416,8 +421,29 @@ break; } case kTfLiteInt8: { +#if defined(KERNELS_OPTIMIZED_FOR_SIZE) + int32_t* scratch_buffer = static_cast<int32_t*>( + context->GetScratchBuffer(context, data.scratch_buffer_index)); + reference_integer_ops::TransposeConv( + data.params, data.per_channel_output_multiplier, + data.per_channel_output_shift, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData<int8_t>(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData<int8_t>(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData<int32_t>(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<int8_t>(output), + tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer); +#elif defined(KERNELS_OPTIMIZED_FOR_SPEED) return EvalQuantizedPerChannel(context, node, params, data, input, filter, bias, output); +#else + MicroPrintf( + "Either KERNELS_OPTIMIZED_FOR_SIZE or KERNELS_OPTIMIZED_FOR_SPEED " + "must be defined"); + return kTfLiteError; +#endif break; } case kTfLiteInt16: { @@ -481,12 +507,33 @@ TFLITE_DCHECK(node->user_data != nullptr); const OpData& data = *(static_cast<const OpData*>(node->user_data)); - TF_LITE_ENSURE_EQ(context, input->type, output->type); +#if defined(KERNELS_OPTIMIZED_FOR_SIZE) + int32_t* scratch_buffer = static_cast<int32_t*>( + context->GetScratchBuffer(context, data.scratch_buffer_index)); + reference_integer_ops::TransposeConv( + data.params, data.per_channel_output_multiplier, + data.per_channel_output_shift, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData<int8_t>(input), + 
tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData<int8_t>(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData<int32_t>(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData<int8_t>(output), + tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer); +#elif defined(KERNELS_OPTIMIZED_FOR_SPEED) const auto& params = *(reinterpret_cast<TfLiteConvParams*>(node->builtin_data)); return EvalQuantizedPerChannel(context, node, params, data, input, filter, bias, output); +#else + MicroPrintf( + "Either KERNELS_OPTIMIZED_FOR_SIZE or KERNELS_OPTIMIZED_FOR_SPEED must " + "be defined"); + return kTfLiteError; +#endif + return kTfLiteOk; } } // namespace
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 8f6c002..bb4a983 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -1,4 +1,4 @@ -# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -60,6 +60,17 @@ # Specify which specialized kernel implementation should be pulled in. OPTIMIZED_KERNEL_DIR := +# Optimize kernels for speed or memory. This is similar but not the same as KERNEL_OPTIMIZATION_LEVEL and +# CORE_OPTIMIZATION_LEVEL, which specify compiler optimization level. +# Instead this enables a kernel to provide multiple implementations that are configured at build time. +# An example could be a kernel requiring a bigger scratch buffer for certain use cases. +# The example kernel would have a smaller scratch buffer usage when building for size. +# Vice versa it would use more scratch buffer when building for speed and would be more performant. +# Note that this is optional. If having one implementation, nothing needs to be done. +# OPTIMIZE_KERNELS_FOR has only two valid values, KERNELS_OPTIMIZED_FOR_SIZE and KERNELS_OPTIMIZED_FOR_SPEED where the +# latter is the default. +OPTIMIZE_KERNELS_FOR := KERNELS_OPTIMIZED_FOR_SPEED + # Override this variable from the command line in case the optimized kernels are # in a different directory. OPTIMIZED_KERNEL_DIR_PREFIX := $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels @@ -99,7 +110,7 @@ MICROLITE_LIBS := -lm -# For the optimized_kernel_dir, and co-processor as specified on the +# For the optimized_kernel_dir, co-processor and optimize_kernels_for as specified on the # command line we add -D<tag> to the cflags to allow for #idefs in the code. 
# # We apply the following transformations (via the tr command): @@ -113,6 +124,10 @@ ADDITIONAL_DEFINES += -D$(shell echo $(CO_PROCESSOR) | tr [a-z] [A-Z]) endif +ifneq ($(OPTIMIZE_KERNELS_FOR),) + ADDITIONAL_DEFINES += -D$(shell echo $(OPTIMIZE_KERNELS_FOR) | tr [a-z] [A-Z]) +endif + ifeq ($(TOOLCHAIN), armclang) CORE_OPTIMIZATION_LEVEL := -Oz else @@ -483,11 +498,11 @@ ifneq ($(BUILD_TYPE), no_tf_lite_static_memory) EXCLUDED_TFL_CC_SRCS := \ - $(TENSORFLOW_ROOT)tensorflow/lite/array.cc + $(TENSORFLOW_ROOT)tensorflow/lite/array.cc TFL_CC_SRCS := $(filter-out $(EXCLUDED_TFL_CC_SRCS), $(TFL_CC_SRCS)) EXCLUDED_TFL_CC_HDRS := \ - $(TENSORFLOW_ROOT)tensorflow/lite/array.h + $(TENSORFLOW_ROOT)tensorflow/lite/array.h TFL_CC_HDRS := $(filter-out $(EXCLUDED_TFL_CC_HDRS), $(TFL_CC_HDRS)) endif @@ -614,6 +629,11 @@ include $(MAKEFILE_DIR)/targets/$(TARGET)_makefile.inc endif +# Validate valid options. +ifeq (,$(filter $(OPTIMIZE_KERNELS_FOR),KERNELS_OPTIMIZED_FOR_SPEED KERNELS_OPTIMIZED_FOR_SIZE)) + $(error Incorrect OPTIMIZE_KERNELS_FOR: $(OPTIMIZE_KERNELS_FOR)) +endif + ifneq ($(OPTIMIZED_KERNEL_DIR),) PATH_TO_OPTIMIZED_KERNELS := $(OPTIMIZED_KERNEL_DIR_PREFIX)/$(OPTIMIZED_KERNEL_DIR) PATH_TO_SIGNAL_OPTIMIZED_KERNELS := $(OPTIMIZED_SIGNAL_KERNEL_DIR_PREFIX)/$(OPTIMIZED_KERNEL_DIR)