Merge "Support running a different Kelvin benchmark binary"
diff --git a/docs/tflm_ops.md b/docs/tflm_ops.md
new file mode 100644
index 0000000..1154ebf
--- /dev/null
+++ b/docs/tflm_ops.md
@@ -0,0 +1,25 @@
+# Optimized ops in Kelvin TFLM
+
+The following tables list the ops currently optimized in Kelvin TFLM. The
+relevant source code can be found [here](https://opensecura.googlesource.com/sw/kelvin/+/refs/heads/master/tflm/opt).
+
+## Non-Convolutional Ops
+
+| Op | Supported Data Type | Comments |
+| :-------------- | :-----------------: | :---------------------------------------- |
+| Elementwise Add | s8, s16, s32 | Rescaling with offset and shift, clamping |
+| Leaky ReLU | s8, s16 | |
+| Max Pooling | s8 | |
+
+## Convolutional Ops
+
+| Op | Weights | Activation | Bias | Comments |
+| :--------------- | :-----: | :--------: | :--: | :-------------------------------------- |
+| Depthwise Conv2d | s8 | s16 | s64 | filter size 3x1 |
+| Depthwise Conv2d | s8 | s8 | s64 | output depth % 32 == 0 |
+| Conv2d | s8 | s16 | s32 | |
+| Conv2d | s8 | s16 | s64 | filter size 1x1, filter depth % 32 == 0 |
+| Conv2d           | s8      | s16        | s64  | filter size 1xn, grouped or ungrouped   |
+| Conv2d | s8 | s8 | s32 | filter size 1x1, output depth % 8 == 0 |
+| Conv2d | s8 | s8 | s32 | filter depth % 32 == 0 |
+| Conv2d | s8 | s8 | s32 | filter shape == (48x3x1x48) |
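+
+All of these kernels requantize the raw integer accumulator back to the
+output type using TFLM's per-channel scheme before writeback. A simplified
+sketch of the pattern (the scalar paths call
+`tflite::MultiplyByQuantizedMultiplier` directly; the vector paths use the
+equivalent `vdmulh`/`vsha` instruction sequence):
+
+```c++
+// acc: raw s32/s64 accumulator for output channel `oc`, bias already added.
+int32_t out = tflite::MultiplyByQuantizedMultiplier(
+    acc, output_multiplier[oc], output_shift[oc]);  // rescale
+out += output_offset;  // add the output zero-point
+out = std::clamp(out, output_activation_min, output_activation_max);
+```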
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
index e4d533b..28dde26 100644
--- a/tflm/opt/BUILD
+++ b/tflm/opt/BUILD
@@ -17,7 +17,13 @@
cc_library(
name = "opt",
srcs = [
- "conv.cc",
+ "conv_s16_b32.cc",
+ "conv_s16_b64.cc",
+ "conv_s8.cc",
+ "conv_s8_1x1.cc",
+ "conv_s8_3x1_d48.cc",
+ "conv_s8_d4.cc",
+ "conv_s8_d32.cc",
"depthwise_conv_s16.cc",
"depthwise_conv_s8.cc",
"elementwise_add_s16.cc",
@@ -25,10 +31,12 @@
"elementwise_add_s8.cc",
"leaky_relu_s16.cc",
"leaky_relu_s8.cc",
+ "max_pool_s8.cc",
"memcpy.cc",
- "max_pool_s8.cc"
],
hdrs = [
+ "conv_s8.h",
+ "conv_util.h",
"opt.h",
"util.h",
],
diff --git a/tflm/opt/conv.cc b/tflm/opt/conv.cc
deleted file mode 100644
index 8d33848..0000000
--- a/tflm/opt/conv.cc
+++ /dev/null
@@ -1,731 +0,0 @@
-/*
- * Copyright 2023 Google LLC
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cassert>
-#include <memory>
-
-#include "crt/kelvin.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/runtime_shape.h"
-#include "tensorflow/lite/kernels/internal/types.h"
-#include "tflm/opt/opt.h"
-#include "tflm/opt/util.h"
-
-namespace kelvin::opt {
-namespace {
-/* clang-format off */
-constexpr const int swizzle[16] = {
- 0, 4, 8, 12,
- 2, 6, 10, 14,
- 1, 5, 9, 13,
- 3, 7, 11, 15,
-};
-/* clang-format on */
-
-constexpr int kFilterHeightIndex = 1;
-constexpr int kFilterWidthIndex = 2;
-constexpr int kFilterInputChannelIndex = 3;
-constexpr int kInputChannelIndex = 3;
-constexpr int kOutputChannelIndex = 3;
-} // namespace
-
-void conv_per_channel_b32(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data) {
- const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
- const auto stride_width = params.stride_width;
- const auto stride_height = params.stride_height;
- const auto dilation_width_factor = params.dilation_width_factor;
- const auto dilation_height_factor = params.dilation_height_factor;
- const auto pad_width = params.padding_values.width;
- const auto pad_height = params.padding_values.height;
- const auto input_height = input_shape.Dims(1);
- const auto input_width = input_shape.Dims(2);
- const auto input_depth = input_shape.Dims(3);
- const auto input_offset = params.input_offset;
- const auto filter_height = filter_shape.Dims(1);
- const auto filter_width = filter_shape.Dims(2);
- const auto filter_depth = filter_shape.Dims(3);
- const auto output_height = output_shape.Dims(1);
- const auto output_width = output_shape.Dims(2);
- const auto output_depth = output_shape.Dims(3);
- const auto output_offset = params.output_offset;
- const auto output_activation_min = params.quantized_activation_min;
- const auto output_activation_max = params.quantized_activation_max;
- const auto groups = input_depth / filter_depth;
- const auto filters_per_group = output_depth / groups;
-
- for (int batch = 0; batch < batches; ++batch) {
- for (int out_y = 0; out_y < output_height; ++out_y) {
- const int in_y_origin = out_y * stride_height - pad_height;
- for (int out_x = 0; out_x < output_width; ++out_x) {
- const int in_x_origin = out_x * stride_width - pad_width;
- for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
- auto group = out_channel / filters_per_group;
- int32_t acc32 = 0;
- for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
- const int in_y = in_y_origin + dilation_height_factor * filter_y;
- for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
- const int in_x = in_x_origin + dilation_width_factor * filter_x;
- const bool inside = (in_x >= 0) && (in_x < input_width) &&
- (in_y >= 0) && (in_y < input_height);
- if (!inside) {
- continue;
- }
- int in_channel = 0;
- do {
- int load_count = std::min(filter_depth - in_channel, 16L);
- int32_t input_swizzled[16];
- const int16_t* p_input = &input_data[tflite::Offset(
- input_shape, batch, in_y, in_x,
- in_channel + group * filter_depth)];
- for (int i = 0; i < 16; ++i) {
- int swizzle_idx = swizzle[i];
- if (swizzle_idx < load_count)
- input_swizzled[i] = *(p_input + swizzle_idx) + input_offset;
- else
- input_swizzled[i] = 0;
- }
- vld_w_l_xx(v0, input_swizzled, 4);
- vld_w_l_xx(v1, input_swizzled + 4, 4);
- vld_w_l_xx(v2, input_swizzled + 8, 4);
- vld_w_l_xx(v3, input_swizzled + 12, 4);
- vld_b_l_xx(v4,
- &filter_data[tflite::Offset(filter_shape,
- out_channel, filter_y,
- filter_x, in_channel)],
- load_count);
- vaddw_h_vx(v4, v4, 0);
- vaddw_w_vx(v6, v5, 0);
- vaddw_w_vx(v4, v4, 0);
-
- vmul_w_vv_m(vm0, vm0, vm1);
- vadd_w_vv(v0, v0, v1);
- vadd_w_vv(v0, v0, v2);
- vadd_w_vv(v0, v0, v3);
- int32_t acc_spill[4];
- vst_w_l_xx(v0, acc_spill, 4);
- for (int i = 0; i < 4; ++i) {
- acc32 += acc_spill[i];
- }
- in_channel += 16;
- } while (in_channel + 16 <= filter_depth);
- }
- }
- if (bias_data) {
- acc32 = acc32 + bias_data[out_channel];
- }
- int32_t acc = tflite::MultiplyByQuantizedMultiplier(
- acc32, output_multiplier[out_channel], output_shift[out_channel]);
- acc += output_offset;
- acc = std::clamp(acc, output_activation_min, output_activation_max);
- output_data[tflite::Offset(output_shape, batch, out_y, out_x,
- out_channel)] = static_cast<int16_t>(acc);
- }
- }
- }
- }
-}
-
-// Accumulates in v0-v7. [v0-v3], [v4-v7] are sub accumulators for two outputs.
-// Load/swizzle filters use [v52-v63].
-// Input activations use [v32-v33].
-// No clobbers.
-void ukernel_s8_s16(const int16_t* input_data0,
- const int8_t* filter_data0,
- const int8_t* filter_data1,
- size_t n) {
- n = n >> 5;
- while (n > 0) {
- // Load filters 0 to v58, v59
- vld_b_p_x(v52, filter_data0);
- vaddw_h_vx(v56, v52, 0);
- vzip_h_vv(v58, v56, v57);
-
- // Load activations
- vld_h_p_x(v32, input_data0);
- vld_h_p_x(v33, input_data0);
-
- // Multiply filters0 * activations
- vmulw_w_vv(v16, v58, v32);
- vmulw_w_vv(v18, v59, v33);
-
- // Accumulate v0
- vadd_w_vv_m(v0, v0, v16);
-
- // Load filters 1 to v62, v63
- vld_b_p_x(v53, filter_data1);
- vaddw_h_vx(v60, v53, 0);
- vzip_h_vv(v62, v60, v61);
-
- // Multiply filters1 * activations
- vmulw_w_vv(v20, v62, v32);
- vmulw_w_vv(v22, v63, v33);
-
- // Accumulate v4
- vadd_w_vv_m(v4, v4, v20);
- n--;
- }
-}
-
-void conv_per_channel_b64_1x1(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data) {
- const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
- const auto input_height = input_shape.Dims(1);
- const auto input_width = input_shape.Dims(2);
- const auto input_depth = input_shape.Dims(3);
- const auto input_offset = params.input_offset;
- const auto filter_input_depth = filter_shape.Dims(3);
- const auto output_depth = output_shape.Dims(3);
- const auto output_offset = params.output_offset;
- const auto output_activation_min = params.quantized_activation_min;
- const auto output_activation_max = params.quantized_activation_max;
- const auto groups = input_depth / filter_input_depth;
- const auto output_filters_per_group = output_depth / groups;
-
- int32_t accumulators[8];
- for (int bhw = 0; bhw < batches * input_height * input_width; bhw++) {
- const int16_t* local_input = input_data + (bhw * input_depth);
- int16_t* local_output = output_data + (bhw * output_depth);
- for (int g = 0; g < groups; g++) {
- const int16_t* group_input = local_input + (g * filter_input_depth);
- for (int gc = 0; gc + 2 <= output_filters_per_group; gc += 2) {
- int oc = (g * output_filters_per_group) + gc;
- const int8_t* local_filters0 = filter_data + (oc * filter_input_depth);
- const int8_t* local_filters1 = local_filters0 + filter_input_depth;
-
- vdup_w_x_m(v0, 0);
- vdup_w_x_m(v4, 0);
- ukernel_s8_s16(group_input, local_filters0, local_filters1,
- filter_input_depth);
- // sum accumulators
- vadd_w_vv(v0, v0, v1);
- vadd_w_vv(v2, v2, v3);
- vadd_w_vv(v0, v0, v2);
- vadd_w_vv(v4, v4, v5);
- vadd_w_vv(v6, v6, v7);
- vadd_w_vv(v4, v4, v6);
-
- {
- vst_w_x(v0, accumulators);
- int64_t acc64 = bias_data[oc];
- for (int i = 0; i < 8; i++) {
- acc64 += accumulators[i];
- }
- int32_t acc = tflite::MultiplyByQuantizedMultiplier(
- acc64, output_multiplier[oc], output_shift[oc]);
- acc += output_offset;
- acc = std::clamp(acc, output_activation_min, output_activation_max);
- local_output[oc] = static_cast<int16_t>(acc);
- }
-
- {
- vst_w_x(v4, accumulators);
- int64_t acc64 = bias_data[oc + 1];
- for (int i = 0; i < 8; i++) {
- acc64 += accumulators[i];
- }
- int32_t acc = tflite::MultiplyByQuantizedMultiplier(
- acc64, output_multiplier[oc + 1], output_shift[oc + 1]);
- acc += output_offset;
- acc = std::clamp(acc, output_activation_min, output_activation_max);
- local_output[oc + 1] = static_cast<int16_t>(acc);
- }
- }
- }
- }
-}
-
-// Optimized for grouped convolutions, no dilation, 1xn filter
-void conv_per_channel_b64_filter1xn_group(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data) {
- const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
- const auto stride_width = params.stride_width;
- const auto pad_width = params.padding_values.width;
- const auto input_width = input_shape.Dims(2);
- const auto input_depth = input_shape.Dims(3);
- const auto input_offset = params.input_offset;
- const auto filter_width = filter_shape.Dims(2);
- const auto filter_depth = filter_shape.Dims(3);
- const auto output_width = output_shape.Dims(2);
- const auto output_depth = output_shape.Dims(3);
- const auto output_offset = params.output_offset;
- const auto output_activation_min = params.quantized_activation_min;
- const auto output_activation_max = params.quantized_activation_max;
-
- const auto groups = input_depth / filter_depth;
- const auto output_filters_per_group = output_depth / groups;
-
- int32_t accumulators[8];
- for (int g = 0; g < groups; g++) {
- for (int gc = 0; gc + 2 <= output_filters_per_group; gc += 2) {
- int oc = (g * output_filters_per_group) + gc;
- for (int b = 0; b < batches; ++b) {
- for (int out_x = 0; out_x < output_width; ++out_x) {
- const int in_x_origin = out_x * stride_width - pad_width;
- const int8_t* local_filters0 =
- filter_data + (oc * filter_width * filter_depth);
- const int8_t* local_filters1 =
- local_filters0 + (filter_width * filter_depth);
- const int16_t* local_input = input_data +
- (b * input_width * input_depth) +
- (in_x_origin * input_depth) +
- (g * filter_depth);
- int16_t* local_output = output_data +
- (b * output_width * output_depth) +
- (out_x * output_depth);
-
- int64_t acc64_0 = 0;
- int64_t acc64_1 = 0;
- vdup_w_x_m(v0, 0);
- vdup_w_x_m(v4, 0);
- for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
- const int8_t* local_filters0x =
- local_filters0 + (filter_x * filter_depth);
- const int8_t* local_filters1x =
- local_filters1 + (filter_x * filter_depth);
- const int16_t* local_inputx =
- local_input + (filter_x * input_depth);
-
- ukernel_s8_s16(local_inputx, local_filters0x, local_filters1x,
- filter_depth);
- }
-
- // sum accumulators
- vadd_w_vv(v0, v0, v1);
- vadd_w_vv(v2, v2, v3);
- vadd_w_vv(v0, v0, v2);
- vadd_w_vv(v4, v4, v5);
- vadd_w_vv(v6, v6, v7);
- vadd_w_vv(v4, v4, v6);
-
- {
- vst_w_x(v0, accumulators);
- for (int i = 0; i < 8; i++) {
- acc64_0 += accumulators[i];
- }
- acc64_0 += bias_data[oc];
- int32_t acc = tflite::MultiplyByQuantizedMultiplier(
- acc64_0, output_multiplier[oc], output_shift[oc]);
- acc += output_offset;
- acc = std::clamp(acc, output_activation_min, output_activation_max);
- local_output[oc] = static_cast<int16_t>(acc);
- }
-
- {
- vst_w_x(v4, accumulators);
- for (int i = 0; i < 8; i++) {
- acc64_1 += accumulators[i];
- }
- acc64_1 += bias_data[oc + 1];
- int32_t acc = tflite::MultiplyByQuantizedMultiplier(
- acc64_1, output_multiplier[oc + 1], output_shift[oc + 1]);
- acc += output_offset;
- acc = std::clamp(acc, output_activation_min, output_activation_max);
- local_output[oc + 1] = static_cast<int16_t>(acc);
- }
- }
- }
- }
- }
-}
-
-// Optimized for no group, no dilation, 1xn filter.
-void conv_per_channel_b64_filter1xn_non_group(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data) {
- const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
- const auto stride_width = params.stride_width;
- const auto pad_width = params.padding_values.width;
- const auto input_width = input_shape.Dims(2);
- const auto input_depth = input_shape.Dims(3);
- const auto input_offset = params.input_offset;
- const auto filter_width = filter_shape.Dims(2);
- const auto filter_depth = filter_shape.Dims(3);
- const auto output_width = output_shape.Dims(2);
- const auto output_depth = output_shape.Dims(3);
- const auto output_offset = params.output_offset;
- const auto output_activation_min = params.quantized_activation_min;
- const auto output_activation_max = params.quantized_activation_max;
- int32_t accumulators[8];
- for (int oc = 0; oc + 2 <= output_depth; oc += 2) {
- for (int batch = 0; batch < batches; ++batch) {
- for (int out_x = 0; out_x < output_width; ++out_x) {
- const int in_x_origin = out_x * stride_width - pad_width;
-
- const int8_t* local_filters0 =
- filter_data + (oc * filter_width * filter_depth);
- const int8_t* local_filters1 =
- local_filters0 + (filter_width * filter_depth);
- const int16_t* local_input = input_data +
- (batch * input_width * input_depth) +
- (in_x_origin * input_depth);
- int16_t* local_output = output_data +
- (batch * output_width * output_depth) +
- (out_x * output_depth);
-
- vdup_w_x_m(v0, 0);
- vdup_w_x_m(v4, 0);
- ukernel_s8_s16(local_input, local_filters0, local_filters1,
- filter_width * filter_depth);
- // sum accumulators
- vadd_w_vv(v0, v0, v1);
- vadd_w_vv(v2, v2, v3);
- vadd_w_vv(v0, v0, v2);
- vadd_w_vv(v4, v4, v5);
- vadd_w_vv(v6, v6, v7);
- vadd_w_vv(v4, v4, v6);
- {
- vst_w_x(v0, accumulators);
- int64_t acc64 = bias_data[oc];
- for (int i = 0; i < 8; i++) {
- acc64 += accumulators[i];
- }
- int32_t acc = tflite::MultiplyByQuantizedMultiplier(
- acc64, output_multiplier[oc], output_shift[oc]);
- acc += output_offset;
- acc = std::clamp(acc, output_activation_min, output_activation_max);
- local_output[oc] = static_cast<int16_t>(acc);
- }
-
- {
- vst_w_x(v4, accumulators);
- int64_t acc64 = bias_data[oc + 1];
- for (int i = 0; i < 8; i++) {
- acc64 += accumulators[i];
- }
- int32_t acc = tflite::MultiplyByQuantizedMultiplier(
- acc64, output_multiplier[oc + 1], output_shift[oc + 1]);
- acc += output_offset;
- acc = std::clamp(acc, output_activation_min, output_activation_max);
- local_output[oc + 1] = static_cast<int16_t>(acc);
- }
- }
- }
- }
-}
-
-void conv_per_channel_b64_generic(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data) {
- const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
- const auto stride_width = params.stride_width;
- const auto stride_height = params.stride_height;
- const auto dilation_width_factor = params.dilation_width_factor;
- const auto dilation_height_factor = params.dilation_height_factor;
- const auto pad_width = params.padding_values.width;
- const auto pad_height = params.padding_values.height;
- const auto input_height = input_shape.Dims(1);
- const auto input_width = input_shape.Dims(2);
- const auto input_depth = input_shape.Dims(3);
- const auto input_offset = params.input_offset;
- const auto filter_height = filter_shape.Dims(1);
- const auto filter_width = filter_shape.Dims(2);
- const auto filter_depth = filter_shape.Dims(3);
- const auto output_height = output_shape.Dims(1);
- const auto output_width = output_shape.Dims(2);
- const auto output_depth = output_shape.Dims(3);
- const auto output_offset = params.output_offset;
- const auto output_activation_min = params.quantized_activation_min;
- const auto output_activation_max = params.quantized_activation_max;
- const auto groups = input_depth / filter_depth;
- const auto filters_per_group = output_depth / groups;
- for (int batch = 0; batch < batches; ++batch) {
- for (int out_y = 0; out_y < output_height; ++out_y) {
- const int in_y_origin = out_y * stride_height - pad_height;
- for (int out_x = 0; out_x < output_width; ++out_x) {
- const int in_x_origin = out_x * stride_width - pad_width;
- for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
- auto group = out_channel / filters_per_group;
- int64_t acc64 = 0;
- for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
- const int in_y = in_y_origin + dilation_height_factor * filter_y;
- for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
- const int in_x = in_x_origin + dilation_width_factor * filter_x;
- const bool inside = (in_x >= 0) && (in_x < input_width) &&
- (in_y >= 0) && (in_y < input_height);
- if (!inside) {
- continue;
- }
-
- int in_channel = 0;
- do {
- int load_count = std::min(filter_depth - in_channel, 16L);
- int32_t input_swizzled[16];
- const int16_t* p_input = &input_data[tflite::Offset(
- input_shape, batch, in_y, in_x,
- in_channel + group * filter_depth)];
- for (int i = 0; i < 16; ++i) {
- int swizzle_idx = swizzle[i];
- if (swizzle_idx < load_count)
- input_swizzled[i] = *(p_input + swizzle_idx) + input_offset;
- else
- input_swizzled[i] = 0;
- }
- vld_w_l_xx(v0, input_swizzled, 4);
- vld_w_l_xx(v1, input_swizzled + 4, 4);
- vld_w_l_xx(v2, input_swizzled + 8, 4);
- vld_w_l_xx(v3, input_swizzled + 12, 4);
- vld_b_l_xx(v4,
- &filter_data[tflite::Offset(filter_shape,
- out_channel, filter_y,
- filter_x, in_channel)],
- load_count);
- vaddw_h_vx(v4, v4, 0);
- vaddw_w_vx(v6, v5, 0);
- vaddw_w_vx(v4, v4, 0);
-
- vmul_w_vv_m(vm0, vm0, vm1);
- vadd_w_vv(v0, v0, v1);
- vadd_w_vv(v0, v0, v2);
- vadd_w_vv(v0, v0, v3);
- int32_t acc32[4];
- vst_w_l_xx(v0, acc32, 4);
- for (int i = 0; i < 4; ++i) {
- acc64 += acc32[i];
- }
- in_channel += 16;
- } while (in_channel + 16 <= filter_depth);
- }
- }
- if (bias_data) {
- acc64 = acc64 + bias_data[out_channel];
- }
- int32_t acc = tflite::MultiplyByQuantizedMultiplier(
- acc64, output_multiplier[out_channel], output_shift[out_channel]);
- acc += output_offset;
- acc = std::clamp(acc, output_activation_min, output_activation_max);
- output_data[tflite::Offset(output_shape, batch, out_y, out_x,
- out_channel)] = static_cast<int16_t>(acc);
- }
- }
- }
- }
-}
-
-void conv_per_channel_b64(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data) {
- if (filter_shape.Dims(kFilterHeightIndex) == 1 &&
- output_shape.Dims(kOutputChannelIndex) % 2 == 0) {
- if (filter_shape.Dims(kFilterWidthIndex) == 1 &&
- filter_shape.Dims(kFilterInputChannelIndex) % 32 == 0) {
- kelvin::opt::conv_per_channel_b64_1x1(
- params, output_multiplier, output_shift, input_shape, input_data,
- filter_shape, filter_data, bias_shape, bias_data, output_shape,
- output_data);
- return;
- }
-
- // TODO(derekjchow): Check for valid padding
- bool group_conv = !(input_shape.Dims(kInputChannelIndex) ==
- filter_shape.Dims(kFilterInputChannelIndex));
- int32_t fan_in = filter_shape.Dims(kFilterWidthIndex) *
- filter_shape.Dims(kFilterInputChannelIndex);
- if (!group_conv && fan_in % 32 == 0) {
- kelvin::opt::conv_per_channel_b64_filter1xn_non_group(
- params, output_multiplier, output_shift, input_shape, input_data,
- filter_shape, filter_data, bias_shape, bias_data, output_shape,
- output_data);
- return;
- }
-
- if (fan_in % 32 == 0) {
- kelvin::opt::conv_per_channel_b64_filter1xn_group(
- params, output_multiplier, output_shift, input_shape, input_data,
- filter_shape, filter_data, bias_shape, bias_data, output_shape,
- output_data);
- return;
- }
- }
-
- kelvin::opt::conv_per_channel_b64_generic(
- params, output_multiplier, output_shift, input_shape, input_data,
- filter_shape, filter_data, bias_shape, bias_data, output_shape,
- output_data);
-}
-
-#define INA0 v0
-#define FLTA0 v8
-#define FLTA1 v9
-#define FLTA2 v10
-#define FLTA3 v11
-#define FLTA4 v12
-#define FLTA5 v13
-#define FLTA6 v14
-#define FLTA7 v15
-#define ACC v48
-#define ACC0 v48
-#define OUT0 v56
-void conv_per_channel_b8(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
- int8_t* output_data) {
- const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
- const auto stride_width = params.stride_width;
- const auto stride_height = params.stride_height;
- const auto dilation_width_factor = params.dilation_width_factor;
- const auto dilation_height_factor = params.dilation_height_factor;
- const auto pad_width = params.padding_values.width;
- const auto pad_height = params.padding_values.height;
- const auto input_height = input_shape.Dims(1);
- const auto input_width = input_shape.Dims(2);
- const auto input_depth = input_shape.Dims(3);
- const auto input_offset = params.input_offset;
- const auto filter_height = filter_shape.Dims(1);
- const auto filter_width = filter_shape.Dims(2);
- const auto filter_depth = filter_shape.Dims(3);
- const auto output_height = output_shape.Dims(1);
- const auto output_width = output_shape.Dims(2);
- const auto output_depth = output_shape.Dims(3);
- const auto output_offset = params.output_offset;
- const auto output_activation_min = params.quantized_activation_min;
- const auto output_activation_max = params.quantized_activation_max;
- const auto groups = input_depth / filter_depth;
- const auto filters_per_group = output_depth / groups;
- union {
- vconv_u8_t conv;
- uint32_t raw;
- } cmds;
- cmds.conv.mode = 0;
- cmds.conv.start = 0;
- cmds.conv.stop = 7;
- cmds.conv.sbias1 = input_offset;
- cmds.conv.sdata1 = true;
- cmds.conv.sbias2 = 0;
- cmds.conv.sdata2 = true;
-
- // Zero out accumulators.
- vdup_b_x(v0, 0);
- acset_v(ACC, v0);
- vdup_b_x_m(ACC0, 0);
- for (int batch = 0; batch < batches; ++batch) {
- for (int out_y = 0; out_y < output_height; ++out_y) {
- const int in_y_origin = (out_y * stride_height) - pad_height;
- for (int out_x = 0; out_x < output_width; /*out_x += 32*/ ++out_x) {
- const int in_x_origin = (out_x * stride_width) - pad_width;
- for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
- auto group = out_channel / filters_per_group;
-
- for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
- const int in_y = in_y_origin + dilation_height_factor * filter_y;
- const int in_x = in_x_origin + dilation_width_factor * 0;
-
- // Zero padding by omitting the areas outside the image.
- const bool is_point_inside_image =
- (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
- (in_y < input_height);
- if (!is_point_inside_image) {
- continue;
- }
-
- int q = filter_width * filter_depth;
- for (int i = 0; i < q; i += 32) {
- int count = std::min(q - i, 32);
- count = std::min(
- count, static_cast<int>((input_width - in_x) * filter_depth));
- int input_offset = tflite::Offset(input_shape, batch, in_y, in_x,
- group * filter_depth) +
- i;
- vdup_w_x_m(vm0, 0);
- vdup_w_x_m(vm1, 0);
- vld_b_l_xx(INA0, &input_data[input_offset], count);
- int filter_offset =
- tflite::Offset(filter_shape, out_channel, filter_y, 0, 0) + i;
- vdup_w_x_m(FLTA0, 0);
- vdup_w_x_m(FLTA4, 0);
- if (count > 0) {
- vld_b_l_xx(FLTA0, &filter_data[filter_offset],
- std::min(count, 4));
- }
- if (count > 4) {
- vld_b_l_xx(FLTA1, &filter_data[filter_offset + 4],
- std::min(count - 4, 4));
- }
- if (count > 8) {
- vld_b_l_xx(FLTA2, &filter_data[filter_offset + 8],
- std::min(count - 8, 4));
- }
- if (count > 12) {
- vld_b_l_xx(FLTA3, &filter_data[filter_offset + 12],
- std::min(count - 12, 4));
- }
- if (count > 16) {
- vld_b_l_xx(FLTA4, &filter_data[filter_offset + 16],
- std::min(count - 16, 4));
- }
- if (count > 20) {
- vld_b_l_xx(FLTA5, &filter_data[filter_offset + 20],
- std::min(count - 20, 4));
- }
- if (count > 24) {
- vld_b_l_xx(FLTA6, &filter_data[filter_offset + 24],
- std::min(count - 24, 4));
- }
- if (count > 28) {
- vld_b_l_xx(FLTA7, &filter_data[filter_offset + 28],
- std::min(count - 28, 4));
- }
- aconv_vxv(ACC, INA0, cmds, FLTA0);
- }
- }
- vcget(ACC);
- vadd_w_vx_m(ACC0, ACC0, bias_data[out_channel]);
- vsll_w_vx_m(ACC0, ACC0, LEFT_SHIFT(output_shift[out_channel]));
- vdmulh_w_r_vx_m(ACC0, ACC0, output_multiplier[out_channel]);
- vsha_w_r_vx_m(ACC0, ACC0, RIGHT_SHIFT(output_shift[out_channel]));
- vadd_w_vx_m(ACC0, ACC0, output_offset);
- vmin_w_vx_m(ACC0, ACC0, output_activation_max);
- vmax_w_vx_m(ACC0, ACC0, output_activation_min);
- vsraqs_b_vx(OUT0, ACC0, 0);
- size_t output_offset =
- tflite::Offset(output_shape, batch, out_y, out_x, out_channel);
- vst_b_l_xx(OUT0, &output_data[output_offset], 1);
- }
- }
- }
- }
-}
-} // namespace kelvin::opt
diff --git a/tflm/opt/conv_s16_b32.cc b/tflm/opt/conv_s16_b32.cc
new file mode 100644
index 0000000..07625d0
--- /dev/null
+++ b/tflm/opt/conv_s16_b32.cc
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s16, filter: s8, bias: s32
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+namespace {
+void ConvS16B32Generic(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const auto stride_width = params.stride_width;
+ const auto stride_height = params.stride_height;
+ const auto dilation_width_factor = params.dilation_width_factor;
+ const auto dilation_height_factor = params.dilation_height_factor;
+ const auto pad_width = params.padding_values.width;
+ const auto pad_height = params.padding_values.height;
+ const auto input_height = input_shape.Dims(1);
+ const auto input_width = input_shape.Dims(2);
+ const auto input_depth = input_shape.Dims(3);
+ const auto input_offset = params.input_offset;
+ const auto filter_height = filter_shape.Dims(1);
+ const auto filter_width = filter_shape.Dims(2);
+ const auto filter_depth = filter_shape.Dims(3);
+ const auto output_height = output_shape.Dims(1);
+ const auto output_width = output_shape.Dims(2);
+ const auto output_depth = output_shape.Dims(3);
+ const auto output_offset = params.output_offset;
+ const auto output_activation_min = params.quantized_activation_min;
+ const auto output_activation_max = params.quantized_activation_max;
+ const auto groups = input_depth / filter_depth;
+ const auto filters_per_group = output_depth / groups;
+
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ const int in_y_origin = out_y * stride_height - pad_height;
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = out_x * stride_width - pad_width;
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+ auto group = out_channel / filters_per_group;
+ int32_t acc32 = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const bool inside = (in_x >= 0) && (in_x < input_width) &&
+ (in_y >= 0) && (in_y < input_height);
+ if (!inside) {
+ continue;
+ }
+ int in_channel = 0;
+ do {
+ int load_count = std::min(filter_depth - in_channel, 16L);
+ int32_t input_swizzled[16];
+ const int16_t* p_input = &input_data[tflite::Offset(
+ input_shape, batch, in_y, in_x,
+ in_channel + group * filter_depth)];
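+                // Swizzle the inputs so their lane order matches the filter
+                // lanes produced by the widening ops below.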
+ for (int i = 0; i < 16; ++i) {
+ int swizzle_idx = swizzle[i];
+ if (swizzle_idx < load_count)
+ input_swizzled[i] = *(p_input + swizzle_idx) + input_offset;
+ else
+ input_swizzled[i] = 0;
+ }
+ vld_w_l_xx(v0, input_swizzled, 4);
+ vld_w_l_xx(v1, input_swizzled + 4, 4);
+ vld_w_l_xx(v2, input_swizzled + 8, 4);
+ vld_w_l_xx(v3, input_swizzled + 12, 4);
+ vld_b_l_xx(v4,
+ &filter_data[tflite::Offset(filter_shape,
+ out_channel, filter_y,
+ filter_x, in_channel)],
+ load_count);
+ vaddw_h_vx(v4, v4, 0);
+ vaddw_w_vx(v6, v5, 0);
+ vaddw_w_vx(v4, v4, 0);
+
+ vmul_w_vv_m(vm0, vm0, vm1);
+ vadd_w_vv(v0, v0, v1);
+ vadd_w_vv(v0, v0, v2);
+ vadd_w_vv(v0, v0, v3);
+ int32_t acc_spill[4];
+ vst_w_l_xx(v0, acc_spill, 4);
+ for (int i = 0; i < 4; ++i) {
+ acc32 += acc_spill[i];
+ }
+ in_channel += 16;
+              } while (in_channel < filter_depth);
+ }
+ }
+ if (bias_data) {
+ acc32 = acc32 + bias_data[out_channel];
+ }
+ int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+ acc32, output_multiplier[out_channel], output_shift[out_channel]);
+ acc += output_offset;
+ acc = std::clamp(acc, output_activation_min, output_activation_max);
+ output_data[tflite::Offset(output_shape, batch, out_y, out_x,
+ out_channel)] = static_cast<int16_t>(acc);
+ }
+ }
+ }
+ }
+}
+} // namespace
+
+void ConvS16B32(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ // generic implementation by default
+ auto fn = ConvS16B32Generic;
+
+  // special cases can be added here
+
+ fn(params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data);
+}
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/conv_s16_b64.cc b/tflm/opt/conv_s16_b64.cc
new file mode 100644
index 0000000..48823dd
--- /dev/null
+++ b/tflm/opt/conv_s16_b64.cc
@@ -0,0 +1,454 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s16, filter: s8, bias: s64
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+namespace {
+// Accumulates in v0-v7. [v0-v3], [v4-v7] are sub accumulators for two outputs.
+// Load/swizzle filters use [v52-v63].
+// Input activations use [v32-v33].
+// No clobbers.
+void ConvUkernelS8S16(const int16_t* input_data0, const int8_t* filter_data0,
+ const int8_t* filter_data1, size_t n) {
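+  // Processes n in chunks of 32 channels: per chunk, one 32-byte load from
+  // each filter and two activation vector loads. Any n % 32 remainder is
+  // ignored.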
+ n = n >> 5;
+ while (n > 0) {
+ // Load filters 0 to v58, v59
+ vld_b_p_x(v52, filter_data0);
+ vaddw_h_vx(v56, v52, 0);
+ vzip_h_vv(v58, v56, v57);
+
+ // Load activations
+ vld_h_p_x(v32, input_data0);
+ vld_h_p_x(v33, input_data0);
+
+ // Multiply filters0 * activations
+ vmulw_w_vv(v16, v58, v32);
+ vmulw_w_vv(v18, v59, v33);
+
+ // Accumulate v0
+ vadd_w_vv_m(v0, v0, v16);
+
+ // Load filters 1 to v62, v63
+ vld_b_p_x(v53, filter_data1);
+ vaddw_h_vx(v60, v53, 0);
+ vzip_h_vv(v62, v60, v61);
+
+ // Multiply filters1 * activations
+ vmulw_w_vv(v20, v62, v32);
+ vmulw_w_vv(v22, v63, v33);
+
+ // Accumulate v4
+ vadd_w_vv_m(v4, v4, v20);
+ n--;
+ }
+}
+
+void ConvS16B64K1x1(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const auto input_height = input_shape.Dims(1);
+ const auto input_width = input_shape.Dims(2);
+ const auto input_depth = input_shape.Dims(3);
+ const auto filter_input_depth = filter_shape.Dims(3);
+ const auto output_depth = output_shape.Dims(3);
+ const auto output_offset = params.output_offset;
+ const auto output_activation_min = params.quantized_activation_min;
+ const auto output_activation_max = params.quantized_activation_max;
+ const auto groups = input_depth / filter_input_depth;
+ const auto output_filters_per_group = output_depth / groups;
+
+ int32_t accumulators[8];
+ for (int bhw = 0; bhw < batches * input_height * input_width; bhw++) {
+ const int16_t* local_input = input_data + (bhw * input_depth);
+ int16_t* local_output = output_data + (bhw * output_depth);
+ for (int g = 0; g < groups; g++) {
+ const int16_t* group_input = local_input + (g * filter_input_depth);
+ for (int gc = 0; gc + 2 <= output_filters_per_group; gc += 2) {
+ int oc = (g * output_filters_per_group) + gc;
+ const int8_t* local_filters0 = filter_data + (oc * filter_input_depth);
+ const int8_t* local_filters1 = local_filters0 + filter_input_depth;
+
+ vdup_w_x_m(v0, 0);
+ vdup_w_x_m(v4, 0);
+ ConvUkernelS8S16(group_input, local_filters0, local_filters1,
+ filter_input_depth);
+ // sum accumulators
+ vadd_w_vv(v0, v0, v1);
+ vadd_w_vv(v2, v2, v3);
+ vadd_w_vv(v0, v0, v2);
+ vadd_w_vv(v4, v4, v5);
+ vadd_w_vv(v6, v6, v7);
+ vadd_w_vv(v4, v4, v6);
+
+ {
+ vst_w_x(v0, accumulators);
+ int64_t acc64 = bias_data[oc];
+ for (int i = 0; i < 8; i++) {
+ acc64 += accumulators[i];
+ }
+ int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+ acc64, output_multiplier[oc], output_shift[oc]);
+ acc += output_offset;
+ acc = std::clamp(acc, output_activation_min, output_activation_max);
+ local_output[oc] = static_cast<int16_t>(acc);
+ }
+
+ {
+ vst_w_x(v4, accumulators);
+ int64_t acc64 = bias_data[oc + 1];
+ for (int i = 0; i < 8; i++) {
+ acc64 += accumulators[i];
+ }
+ int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+ acc64, output_multiplier[oc + 1], output_shift[oc + 1]);
+ acc += output_offset;
+ acc = std::clamp(acc, output_activation_min, output_activation_max);
+ local_output[oc + 1] = static_cast<int16_t>(acc);
+ }
+ }
+ }
+ }
+}
+
+// Optimized for grouped convolutions, no dilation, 1xn filter.
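+// Each group is a contiguous slice of filter_depth input channels feeding
+// output_depth / groups output filters.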
+void ConvS16B64K1xnGroup(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const auto stride_width = params.stride_width;
+ const auto pad_width = params.padding_values.width;
+ const auto input_width = input_shape.Dims(2);
+ const auto input_depth = input_shape.Dims(3);
+ const auto filter_width = filter_shape.Dims(2);
+ const auto filter_depth = filter_shape.Dims(3);
+ const auto output_width = output_shape.Dims(2);
+ const auto output_depth = output_shape.Dims(3);
+ const auto output_offset = params.output_offset;
+ const auto output_activation_min = params.quantized_activation_min;
+ const auto output_activation_max = params.quantized_activation_max;
+
+ const auto groups = input_depth / filter_depth;
+ const auto output_filters_per_group = output_depth / groups;
+
+ int32_t accumulators[8];
+ for (int g = 0; g < groups; g++) {
+ for (int gc = 0; gc + 2 <= output_filters_per_group; gc += 2) {
+ int oc = (g * output_filters_per_group) + gc;
+ for (int b = 0; b < batches; ++b) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = out_x * stride_width - pad_width;
+ const int8_t* local_filters0 =
+ filter_data + (oc * filter_width * filter_depth);
+ const int8_t* local_filters1 =
+ local_filters0 + (filter_width * filter_depth);
+ const int16_t* local_input =
+ input_data + (b * input_width * input_depth) +
+ (in_x_origin * input_depth) + (g * filter_depth);
+ int16_t* local_output = output_data +
+ (b * output_width * output_depth) +
+ (out_x * output_depth);
+
+ int64_t acc64_0 = 0;
+ int64_t acc64_1 = 0;
+ vdup_w_x_m(v0, 0);
+ vdup_w_x_m(v4, 0);
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int8_t* local_filters0x =
+ local_filters0 + (filter_x * filter_depth);
+ const int8_t* local_filters1x =
+ local_filters1 + (filter_x * filter_depth);
+ const int16_t* local_inputx =
+ local_input + (filter_x * input_depth);
+
+ ConvUkernelS8S16(local_inputx, local_filters0x, local_filters1x,
+ filter_depth);
+ }
+
+ // sum accumulators
+ vadd_w_vv(v0, v0, v1);
+ vadd_w_vv(v2, v2, v3);
+ vadd_w_vv(v0, v0, v2);
+ vadd_w_vv(v4, v4, v5);
+ vadd_w_vv(v6, v6, v7);
+ vadd_w_vv(v4, v4, v6);
+
+ {
+ vst_w_x(v0, accumulators);
+ for (int i = 0; i < 8; i++) {
+ acc64_0 += accumulators[i];
+ }
+ acc64_0 += bias_data[oc];
+ int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+ acc64_0, output_multiplier[oc], output_shift[oc]);
+ acc += output_offset;
+ acc = std::clamp(acc, output_activation_min, output_activation_max);
+ local_output[oc] = static_cast<int16_t>(acc);
+ }
+
+ {
+ vst_w_x(v4, accumulators);
+ for (int i = 0; i < 8; i++) {
+ acc64_1 += accumulators[i];
+ }
+ acc64_1 += bias_data[oc + 1];
+ int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+ acc64_1, output_multiplier[oc + 1], output_shift[oc + 1]);
+ acc += output_offset;
+ acc = std::clamp(acc, output_activation_min, output_activation_max);
+ local_output[oc + 1] = static_cast<int16_t>(acc);
+ }
+ }
+ }
+ }
+ }
+}
+
+// Optimized for no group, no dilation, 1xn filter.
+void ConvS16B64K1xnNonGroup(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const auto stride_width = params.stride_width;
+ const auto pad_width = params.padding_values.width;
+ const auto input_width = input_shape.Dims(2);
+ const auto input_depth = input_shape.Dims(3);
+ const auto filter_width = filter_shape.Dims(2);
+ const auto filter_depth = filter_shape.Dims(3);
+ const auto output_width = output_shape.Dims(2);
+ const auto output_depth = output_shape.Dims(3);
+ const auto output_offset = params.output_offset;
+ const auto output_activation_min = params.quantized_activation_min;
+ const auto output_activation_max = params.quantized_activation_max;
+ int32_t accumulators[8];
+ for (int oc = 0; oc + 2 <= output_depth; oc += 2) {
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = out_x * stride_width - pad_width;
+
+ const int8_t* local_filters0 =
+ filter_data + (oc * filter_width * filter_depth);
+ const int8_t* local_filters1 =
+ local_filters0 + (filter_width * filter_depth);
+ const int16_t* local_input = input_data +
+ (batch * input_width * input_depth) +
+ (in_x_origin * input_depth);
+ int16_t* local_output = output_data +
+ (batch * output_width * output_depth) +
+ (out_x * output_depth);
+
+ vdup_w_x_m(v0, 0);
+ vdup_w_x_m(v4, 0);
+ ConvUkernelS8S16(local_input, local_filters0, local_filters1,
+ filter_width * filter_depth);
+ // sum accumulators
+ vadd_w_vv(v0, v0, v1);
+ vadd_w_vv(v2, v2, v3);
+ vadd_w_vv(v0, v0, v2);
+ vadd_w_vv(v4, v4, v5);
+ vadd_w_vv(v6, v6, v7);
+ vadd_w_vv(v4, v4, v6);
+ {
+ vst_w_x(v0, accumulators);
+ int64_t acc64 = bias_data[oc];
+ for (int i = 0; i < 8; i++) {
+ acc64 += accumulators[i];
+ }
+ int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+ acc64, output_multiplier[oc], output_shift[oc]);
+ acc += output_offset;
+ acc = std::clamp(acc, output_activation_min, output_activation_max);
+ local_output[oc] = static_cast<int16_t>(acc);
+ }
+
+ {
+ vst_w_x(v4, accumulators);
+ int64_t acc64 = bias_data[oc + 1];
+ for (int i = 0; i < 8; i++) {
+ acc64 += accumulators[i];
+ }
+ int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+ acc64, output_multiplier[oc + 1], output_shift[oc + 1]);
+ acc += output_offset;
+ acc = std::clamp(acc, output_activation_min, output_activation_max);
+ local_output[oc + 1] = static_cast<int16_t>(acc);
+ }
+ }
+ }
+ }
+}
+
+void ConvS16B64Generic(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const auto stride_width = params.stride_width;
+ const auto stride_height = params.stride_height;
+ const auto dilation_width_factor = params.dilation_width_factor;
+ const auto dilation_height_factor = params.dilation_height_factor;
+ const auto pad_width = params.padding_values.width;
+ const auto pad_height = params.padding_values.height;
+ const auto input_height = input_shape.Dims(1);
+ const auto input_width = input_shape.Dims(2);
+ const auto input_depth = input_shape.Dims(3);
+ const auto input_offset = params.input_offset;
+ const auto filter_height = filter_shape.Dims(1);
+ const auto filter_width = filter_shape.Dims(2);
+ const auto filter_depth = filter_shape.Dims(3);
+ const auto output_height = output_shape.Dims(1);
+ const auto output_width = output_shape.Dims(2);
+ const auto output_depth = output_shape.Dims(3);
+ const auto output_offset = params.output_offset;
+ const auto output_activation_min = params.quantized_activation_min;
+ const auto output_activation_max = params.quantized_activation_max;
+ const auto groups = input_depth / filter_depth;
+ const auto filters_per_group = output_depth / groups;
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ const int in_y_origin = out_y * stride_height - pad_height;
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = out_x * stride_width - pad_width;
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+ auto group = out_channel / filters_per_group;
+ int64_t acc64 = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const bool inside = (in_x >= 0) && (in_x < input_width) &&
+ (in_y >= 0) && (in_y < input_height);
+ if (!inside) {
+ continue;
+ }
+
+ int in_channel = 0;
+ do {
+ int load_count = std::min(filter_depth - in_channel, 16L);
+ int32_t input_swizzled[16];
+ const int16_t* p_input = &input_data[tflite::Offset(
+ input_shape, batch, in_y, in_x,
+ in_channel + group * filter_depth)];
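+                // Swizzle the inputs so their lane order matches the filter
+                // lanes produced by the widening ops below.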
+ for (int i = 0; i < 16; ++i) {
+ int swizzle_idx = swizzle[i];
+ if (swizzle_idx < load_count)
+ input_swizzled[i] = *(p_input + swizzle_idx) + input_offset;
+ else
+ input_swizzled[i] = 0;
+ }
+ vld_w_l_xx(v0, input_swizzled, 4);
+ vld_w_l_xx(v1, input_swizzled + 4, 4);
+ vld_w_l_xx(v2, input_swizzled + 8, 4);
+ vld_w_l_xx(v3, input_swizzled + 12, 4);
+ vld_b_l_xx(v4,
+ &filter_data[tflite::Offset(filter_shape,
+ out_channel, filter_y,
+ filter_x, in_channel)],
+ load_count);
+ vaddw_h_vx(v4, v4, 0);
+ vaddw_w_vx(v6, v5, 0);
+ vaddw_w_vx(v4, v4, 0);
+
+ vmul_w_vv_m(vm0, vm0, vm1);
+ vadd_w_vv(v0, v0, v1);
+ vadd_w_vv(v0, v0, v2);
+ vadd_w_vv(v0, v0, v3);
+ int32_t acc32[4];
+ vst_w_l_xx(v0, acc32, 4);
+ for (int i = 0; i < 4; ++i) {
+ acc64 += acc32[i];
+ }
+ in_channel += 16;
+              } while (in_channel < filter_depth);
+ }
+ }
+ if (bias_data) {
+ acc64 = acc64 + bias_data[out_channel];
+ }
+ int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+ acc64, output_multiplier[out_channel], output_shift[out_channel]);
+ acc += output_offset;
+ acc = std::clamp(acc, output_activation_min, output_activation_max);
+ output_data[tflite::Offset(output_shape, batch, out_y, out_x,
+ out_channel)] = static_cast<int16_t>(acc);
+ }
+ }
+ }
+ }
+}
+} // namespace
+
+void ConvS16B64(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ const auto input_depth = input_shape.Dims(3);
+ const auto filter_height = filter_shape.Dims(1);
+ const auto filter_width = filter_shape.Dims(2);
+ const auto filter_depth = filter_shape.Dims(3);
+ const auto output_depth = output_shape.Dims(3);
+
+ // generic implementation by default
+ auto fn = ConvS16B64Generic;
+
+  // special cases (ordered from most to least specific; at most one applies)
+  if (filter_height == 1 && output_depth % 2 == 0) {
+    bool group_conv = !(input_depth == filter_depth);
+    int32_t fan_in = filter_width * filter_depth;
+    if (filter_width == 1 && filter_depth % 32 == 0) {
+      // 1x1 filter, filter depth = 32n
+      fn = ConvS16B64K1x1;
+    } else if (!group_conv && fan_in % 32 == 0) {
+      // 1xn filter, ungrouped
+      fn = ConvS16B64K1xnNonGroup;
+    } else if (fan_in % 32 == 0) {
+      // 1xn filter, grouped
+      fn = ConvS16B64K1xnGroup;
+    }
+  }
+
+ fn(params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data);
+}
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8.cc b/tflm/opt/conv_s8.cc
new file mode 100644
index 0000000..7d7d0ba
--- /dev/null
+++ b/tflm/opt/conv_s8.cc
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+
+#include "tflm/opt/conv_s8.h"
+
+#include <algorithm>
+
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+namespace {
+void ConvS8Generic(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int8_t* output_data) {
+ const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const auto stride_width = params.stride_width;
+ const auto stride_height = params.stride_height;
+ const auto dilation_width_factor = params.dilation_width_factor;
+ const auto dilation_height_factor = params.dilation_height_factor;
+ const auto pad_width = params.padding_values.width;
+ const auto pad_height = params.padding_values.height;
+ const auto input_height = input_shape.Dims(1);
+ const auto input_width = input_shape.Dims(2);
+ const auto input_depth = input_shape.Dims(3);
+ const auto input_offset = params.input_offset;
+ const auto filter_height = filter_shape.Dims(1);
+ const auto filter_width = filter_shape.Dims(2);
+ const auto filter_depth = filter_shape.Dims(3);
+ const auto output_height = output_shape.Dims(1);
+ const auto output_width = output_shape.Dims(2);
+ const auto output_depth = output_shape.Dims(3);
+ const auto output_offset = params.output_offset;
+ const auto output_activation_min = params.quantized_activation_min;
+ const auto output_activation_max = params.quantized_activation_max;
+ const auto groups = input_depth / filter_depth;
+ const auto filters_per_group = output_depth / groups;
+
+ if (pad_width > 0 || pad_height > 0 || dilation_width_factor > 1 ||
+ dilation_height_factor > 1) {
+ // use reference implementation
+ tflite::reference_integer_ops::ConvPerChannel(
+ params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data);
+ return;
+ }
+
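+  // Set up the aconv command word: signed data on both operands, with the
+  // input zero-point folded in via sbias1.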
+ union {
+ vconv_u8_t conv;
+ uint32_t raw;
+ } cmds;
+ cmds.conv.mode = 0;
+ cmds.conv.start = 0;
+ cmds.conv.stop = 7;
+ cmds.conv.sbias1 = input_offset;
+ cmds.conv.sdata1 = true;
+ cmds.conv.sbias2 = 0;
+ cmds.conv.sdata2 = true;
+
+ // Zero out accumulators.
+ vdup_b_x(v0, 0);
+ acset_v(ACC, v0);
+ vdup_b_x_m(ACC0, 0);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ for (int out_x = 0; out_x < output_width; /*out_x += 32*/ ++out_x) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+ auto group = out_channel / filters_per_group;
+
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ const int in_x = in_x_origin + dilation_width_factor * 0;
+
+ // Zero padding by omitting the areas outside the image.
+ const bool is_point_inside_image =
+ (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+ (in_y < input_height);
+ if (!is_point_inside_image) {
+ continue;
+ }
+
+ int q = filter_width * filter_depth;
+ for (int i = 0; i < q; i += 32) {
+ int count = std::min(q - i, 32);
+ count = std::min(
+ count, static_cast<int>((input_width - in_x) * filter_depth));
+ int input_offset = tflite::Offset(input_shape, batch, in_y, in_x,
+ group * filter_depth) +
+ i;
+ vdup_w_x_m(vm0, 0);
+ vdup_w_x_m(vm1, 0);
+ vld_b_l_xx(INA0, &input_data[input_offset], count);
+ int filter_offset =
+ tflite::Offset(filter_shape, out_channel, filter_y, 0, 0) + i;
+ vdup_w_x_m(FLTA0, 0);
+ vdup_w_x_m(FLTA4, 0);
+ if (count > 0) {
+ vld_b_l_xx(FLTA0, &filter_data[filter_offset],
+ std::min(count, 4));
+ }
+ if (count > 4) {
+ vld_b_l_xx(FLTA1, &filter_data[filter_offset + 4],
+ std::min(count - 4, 4));
+ }
+ if (count > 8) {
+ vld_b_l_xx(FLTA2, &filter_data[filter_offset + 8],
+ std::min(count - 8, 4));
+ }
+ if (count > 12) {
+ vld_b_l_xx(FLTA3, &filter_data[filter_offset + 12],
+ std::min(count - 12, 4));
+ }
+ if (count > 16) {
+ vld_b_l_xx(FLTA4, &filter_data[filter_offset + 16],
+ std::min(count - 16, 4));
+ }
+ if (count > 20) {
+ vld_b_l_xx(FLTA5, &filter_data[filter_offset + 20],
+ std::min(count - 20, 4));
+ }
+ if (count > 24) {
+ vld_b_l_xx(FLTA6, &filter_data[filter_offset + 24],
+ std::min(count - 24, 4));
+ }
+ if (count > 28) {
+ vld_b_l_xx(FLTA7, &filter_data[filter_offset + 28],
+ std::min(count - 28, 4));
+ }
+ aconv_vxv(ACC, INA0, cmds, FLTA0);
+ }
+ }
+ vcget(ACC);
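+          // Requantize the accumulators: add the per-channel bias, apply the
+          // quantized multiplier (left shift, doubling-high multiply, rounding
+          // right shift), add the output zero-point, clamp to the activation
+          // range, and saturate down to int8.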
+ vadd_w_vx_m(ACC0, ACC0, bias_data[out_channel]);
+ vsll_w_vx_m(ACC0, ACC0, LEFT_SHIFT(output_shift[out_channel]));
+ vdmulh_w_r_vx_m(ACC0, ACC0, output_multiplier[out_channel]);
+ vsha_w_r_vx_m(ACC0, ACC0, RIGHT_SHIFT(output_shift[out_channel]));
+ vadd_w_vx_m(ACC0, ACC0, output_offset);
+ vmin_w_vx_m(ACC0, ACC0, output_activation_max);
+ vmax_w_vx_m(ACC0, ACC0, output_activation_min);
+ vsraqs_b_vx(OUT0, ACC0, 0);
+ size_t output_offset =
+ tflite::Offset(output_shape, batch, out_y, out_x, out_channel);
+ vst_b_l_xx(OUT0, &output_data[output_offset], 1);
+ }
+ }
+ }
+ }
+}
+} // namespace
+
+void ConvS8(const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift,
+ const tflite::RuntimeShape& input_shape, const int8_t* input_data,
+ const tflite::RuntimeShape& filter_shape, const int8_t* filter_data,
+ const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data) {
+ const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const auto stride_width = params.stride_width;
+ const auto stride_height = params.stride_height;
+ const auto dilation_width_factor = params.dilation_width_factor;
+ const auto dilation_height_factor = params.dilation_height_factor;
+ const auto pad_width = params.padding_values.width;
+ const auto pad_height = params.padding_values.height;
+ const auto input_width = input_shape.Dims(2);
+ const auto input_depth = input_shape.Dims(3);
+ const auto filter_height = filter_shape.Dims(1);
+ const auto filter_width = filter_shape.Dims(2);
+ const auto filter_depth = filter_shape.Dims(3);
+ const auto output_width = output_shape.Dims(2);
+ const auto output_depth = output_shape.Dims(3);
+
+ // use generic implementation by default
+ auto fn = ConvS8Generic;
+
+ // special case of filter_depth = 4n
+ if (dilation_width_factor == 1 && dilation_height_factor == 1 &&
+ stride_width <= 2 && stride_height <= 2 && filter_depth % 4 == 0 &&
+ output_depth % 8 == 0 && output_width >= 8 && pad_width <= 1) {
+ fn = kelvin::opt::ConvS8D4;
+ }
+
+ // special case of filter depth = 32n
+ else if (dilation_width_factor == 1 && dilation_height_factor == 1 &&
+ stride_width <= 2 && stride_height <= 2 && filter_depth % 32 == 0) {
+ fn = kelvin::opt::ConvS8D32;
+ }
+
+ // special case of filter size 1x1
+ else if (filter_height == 1 && filter_width == 1 && stride_height == 1 &&
+ stride_width == 1 && dilation_height_factor == 1 &&
+ dilation_width_factor == 1 && pad_height == 0 && pad_width == 0 &&
+ (output_depth % 8) == 0 && (input_depth % 32) == 0) {
+ // TODO(ndodda): Uncomment when all tests pass.
+ // fn = kelvin::opt::ConvS8K1x1;
+ }
+
+ // special case of filter size 48x3x1x48
+ else if (batches == 1 && filter_height == 3 && filter_width == 1 &&
+ input_width == 1 && input_depth == 48 && output_depth == 48 &&
+ stride_height == 1 && stride_width == 1 && dilation_height_factor == 1 &&
+ dilation_width_factor == 1 && pad_height == 0 && pad_width == 0) {
+ fn = kelvin::opt::ConvS8K3x1D48;
+ }
+
+ fn(params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data);
+}
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8.h b/tflm/opt/conv_s8.h
new file mode 100644
index 0000000..02dd79b
--- /dev/null
+++ b/tflm/opt/conv_s8.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TFLM_OPT_CONV_S8_H_
+#define TFLM_OPT_CONV_S8_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+
+namespace kelvin::opt {
+
+// filter 1x1
+void ConvS8K1x1(const tflite::ConvParams& params,
+ const int32_t* output_multiplier, const int32_t* output_shift,
+ const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data,
+ const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data,
+ const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data);
+
+// filter depth 4n
+void ConvS8D4(const tflite::ConvParams& params,
+ const int32_t* output_multiplier, const int32_t* output_shift,
+ const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data,
+ const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data,
+ const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data);
+
+// filter depth 32n
+void ConvS8D32(const tflite::ConvParams& params,
+ const int32_t* output_multiplier, const int32_t* output_shift,
+ const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data,
+ const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data,
+ const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data);
+
+// filter size 48x3x1x48
+void ConvS8K3x1D48(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int8_t* output_data);
+
+} // namespace kelvin::opt
+
+#endif // TFLM_OPT_CONV_S8_H_
diff --git a/tflm/opt/conv_s8_1x1.cc b/tflm/opt/conv_s8_1x1.cc
new file mode 100644
index 0000000..9da99c3
--- /dev/null
+++ b/tflm/opt/conv_s8_1x1.cc
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+// Special case for 1x1 filter
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+
+void ConvS8K1x1(const tflite::ConvParams& params,
+ const int32_t* output_multiplier, const int32_t* output_shift,
+ const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data,
+ const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data,
+ const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data) {
+ const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const auto input_depth = input_shape.Dims(3);
+ const auto input_offset = params.input_offset;
+ const auto output_height = output_shape.Dims(1);
+ const auto output_width = output_shape.Dims(2);
+ const auto output_depth = output_shape.Dims(3);
+ const auto output_offset = params.output_offset;
+ const auto output_activation_min = params.quantized_activation_min;
+ const auto output_activation_max = params.quantized_activation_max;
+ // TODO: Support group convolutions.
+ int32_t bias[8 * 4];
+ int32_t mult[8 * 4];
+ int32_t shft[8 * 4];
+ union {
+ vconv_u8_t conv;
+ uint32_t raw;
+ } cmds;
+ cmds.conv.mode = 0;
+ cmds.conv.start = 0;
+ cmds.conv.stop = 7;
+ cmds.conv.sbias1 = input_offset;
+ cmds.conv.sdata1 = true;
+ cmds.conv.sbias2 = 0;
+ cmds.conv.sdata2 = true;
+ for (int zo_hi = 0; zo_hi < output_depth; zo_hi += 8) {
+ // Transpose the filter weights to support outer-product multiplication.
+ int8_t juggled_filter_data[1][1][1][input_depth / 4][8][4];
+ Filter_N_H_W_M<8>(filter_data + zo_hi * input_depth,
+ juggled_filter_data[0][0][0][0][0], 1, 1, input_depth);
+
+ Swizzle(bias_data + zo_hi, bias, 8);
+ Swizzle(output_multiplier + zo_hi, mult, 8);
+ Swizzle(output_shift + zo_hi, shft, 8, true);
+ int out = 0;
+ for (; out + 8 <= output_height * output_width * batches; out += 8) {
+ // Reset the accumulators to clear out the previous output.
+ vdup_b_x_m(v48, 0);
+ vdup_b_x_m(v52, 0);
+
+ int in = 0;
+ for (; in < input_depth; in += 32) {
+ vld_b_s_xx_m(v0, input_data + out * input_depth + in, input_depth);
+ vld_b_s_xx_m(v4, input_data + out * input_depth + in + 4 * input_depth,
+ input_depth);
+
+ vld_b_x_m(v8, juggled_filter_data[0][0][0][in / 4][0][0]);
+ vld_b_x_m(v12, juggled_filter_data[0][0][0][(in / 4) + 4][0][0]);
+
+ aconv_vxv(v48, v0, cmds, v8);
+ }
+
+ INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_activation_min,
+ output_activation_max, output_offset, v16,
+ v20, v24);
+
+ // Store the results to output memory.
+ int8_t* p_out = output_data + (out * output_depth) + zo_hi;
+ vstq_b_sp_xx(v48, p_out, output_depth);
+ vstq_b_sp_xx(v52, p_out, output_depth);
+ }
+ }
+}
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8_3x1_d48.cc b/tflm/opt/conv_s8_3x1_d48.cc
new file mode 100644
index 0000000..70a23b0
--- /dev/null
+++ b/tflm/opt/conv_s8_3x1_d48.cc
@@ -0,0 +1,321 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+// Special case for 48x3x1x48 filter
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+
+void ConvS8K3x1D48(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int8_t* output_data) {
+ const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int32_t input_offset = params.input_offset;
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int filter_depth = filter_shape.Dims(3);
+ const int output_height = output_shape.Dims(1);
+ const int output_depth = output_shape.Dims(3);
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+
+ TFLITE_DCHECK(batches == 1);
+ TFLITE_DCHECK(filter_depth == input_depth);
+ TFLITE_DCHECK(filter_height == 3);
+ TFLITE_DCHECK(filter_width == 1);
+ TFLITE_DCHECK(input_width == 1);
+ TFLITE_DCHECK(stride_width == 1);
+ TFLITE_DCHECK(stride_height == 1);
+ TFLITE_DCHECK(dilation_width_factor == 1);
+ TFLITE_DCHECK(dilation_height_factor == 1);
+ TFLITE_DCHECK(pad_width == 0);
+ TFLITE_DCHECK(pad_height == 0);
+
+ int32_t bias[48 * 4];
+ int32_t mult[48 * 4];
+ int32_t shft[48 * 4];
+ Swizzle(bias_data, bias, 48);
+ Swizzle(output_multiplier, mult, 48);
+ Swizzle(output_shift, shft, 48, true);
+
+ int8_t juggled_filter_data[48 / 8][3][1][48 / 4][8][4];
+ Filter_N_H_W_M<48>(filter_data, juggled_filter_data[0][0][0][0][0], 3, 1, 48);
+ union {
+ vconv_u8_t conv;
+ uint32_t raw;
+ } cmds;
+ cmds.conv.mode = 0;
+ cmds.conv.start = 0;
+ cmds.conv.stop = 7;
+ cmds.conv.sbias1 = input_offset;
+ cmds.conv.sdata1 = true;
+ cmds.conv.sbias2 = 0;
+ cmds.conv.sdata2 = true;
+
+ union {
+ vconv_u8_t conv;
+ uint32_t raw;
+ } cmds16;
+ cmds16.conv.mode = 0;
+ cmds16.conv.start = 0;
+ cmds16.conv.stop = 3;
+ cmds16.conv.sbias1 = input_offset;
+ cmds16.conv.sdata1 = true;
+ cmds16.conv.sbias2 = 0;
+ cmds16.conv.sdata2 = true;
+
+ for (int zo_hi = 0; zo_hi < output_depth; zo_hi += 8) {
+// For each pixel, the general flow for this kernel looks like:
+// 1) Reset accumulator and load activations into [v32, v46]
+// 2) For each group of 32 scalars in the pixel fan-in, run MAC pipeline
+// 2a) Load subset of activations from [v32, v46] to [v0, v7]
+// 2b) Load subset of weights
+// 2c) Run aconv
+// 3) Run the output pipeline and store.
+//
+// For step 1, we'll alias [v32, v46] to [L0, LE]. For most iterations,
+// we load all of these registers (10 pixels). For remainder iterations,
+// we load a subset and pad the rest with 0's. The data will be stored as
+// follows, where each letter represents 16 bytes of a pixel stored into
+// a register (capitalization used to help distinguish channels in a pixel):
+// L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD LE
+// Aa AB bB Cc CD dD Ee EF fF Gg GH hH Ii IJ jJ
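+// For example, L0 holds channels 0-31 of pixel A, and L1 holds channels
+// 32-47 of pixel A followed by channels 0-15 of pixel B.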
+#define L0 v32
+#define L1 v33
+#define L2 v34
+#define L3 v35
+#define L4 v36
+#define L5 v37
+#define L6 v38
+#define L7 v39
+#define L8 v40
+#define L9 v41
+#define LA v42
+#define LB v43
+#define LC v44
+#define LD v45
+#define LE v46
+
+// We run 5 iterations of step 2: 4 full iterations and one half iteration.
+// Because each pixel takes 1.5 registers, we have to interleave vmv_v and
+// vsliden_w_4_vv instructions to ensure the same output channels are stored
+// in each register per-pixel. As a refresher, vsliden_w_4_vv takes two
+// register arguments (X and Y), and returns the concatenation of the last
+// half of X and the first half of Y, e.g.:
+// L1 L2
+// AB bB
+// vsliden_w_4_vv(v1, L1, L2); -> v1 = Bb
+#define CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt) \
+ { \
+ /* 1/5 */ \
+ /* Ky = 0, IC:[0-31] */ \
+ vmv_v(v0, L0); /* Aa */ \
+ vsliden_w_4_vv(v1, L1, L2); /* Bb */ \
+ vmv_v(v2, L3); /* Cc */ \
+ vsliden_w_4_vv(v3, L4, L5); /* Dd */ \
+ vmv_v(v4, L6); /* Ee */ \
+ vsliden_w_4_vv(v5, L7, L8); /* Ff */ \
+ vmv_v(v6, L9); /* Gg */ \
+ vsliden_w_4_vv(v7, LA, LB); /* Hh */ \
+ vld_b_x_m(v56, p_flt + 128 * 0); \
+ vld_b_x_m(v60, p_flt + 128 * 1); \
+ aconv_vxv(v48, v0, cmds, v56); \
+ \
+ /* 2/5 */ \
+ /* Ky = 0, IC:[32-47]; Ky = 1, IC:[0-15] */ \
+ vmv_v(v0, L1); /* AB */ \
+ vsliden_w_4_vv(v1, L2, L3); /* BC */ \
+ vmv_v(v2, L4); /* CD */ \
+ vsliden_w_4_vv(v3, L5, L6); /* DE */ \
+ vmv_v(v4, L7); /* EF */ \
+ vsliden_w_4_vv(v5, L8, L9); /* FG */ \
+ vmv_v(v6, LA); /* GH */ \
+ vsliden_w_4_vv(v7, LB, LC); /* HI */ \
+ vld_b_x_m(v56, p_flt + 128 * 2); \
+ vld_b_x_m(v60, p_flt + 128 * 3); \
+ aconv_vxv(v48, v0, cmds, v56); \
+ \
+ /* 3/5 */ \
+ /* Ky = 1, IC:[16-47] */ \
+ vmv_v(v0, L2); /* bB */ \
+ vsliden_w_4_vv(v1, L3, L4); /* cC */ \
+ vmv_v(v2, L5); /* dD */ \
+ vsliden_w_4_vv(v3, L6, L7); /* eE */ \
+ vmv_v(v4, L8); /* fF */ \
+ vsliden_w_4_vv(v5, L9, LA); /* gG */ \
+ vmv_v(v6, LB); /* hH */ \
+ vsliden_w_4_vv(v7, LC, LD); /* iI */ \
+ vld_b_x_m(v56, p_flt + 128 * 4); \
+ vld_b_x_m(v60, p_flt + 128 * 5); \
+ aconv_vxv(v48, v0, cmds, v56); \
+ \
+ /* 4/5 */ \
+ /* Ky = 2, IC:[0-31] */ \
+ vmv_v(v0, L3); /* Cc */ \
+ vsliden_w_4_vv(v1, L4, L5); /* Dd */ \
+ vmv_v(v2, L6); /* Ee */ \
+ vsliden_w_4_vv(v3, L7, L8); /* Ff */ \
+ vmv_v(v4, L9); /* Gg */ \
+ vsliden_w_4_vv(v5, LA, LB); /* Hh */ \
+ vmv_v(v6, LC); /* Ii */ \
+ vsliden_w_4_vv(v7, LD, LE); /* Jj */ \
+ vld_b_x_m(v56, p_flt + 128 * 6); \
+ vld_b_x_m(v60, p_flt + 128 * 7); \
+ aconv_vxv(v48, v0, cmds, v56); \
+ \
+ /* 5/5 */ \
+ /* Ky = 2, IC:[32-47] half iteration */ \
+ vmv_v(v0, L4); /* C(D- ignored) */ \
+ vsliden_w_4_vv(v1, L5, L6); /* D(E- ignored) */ \
+ vmv_v(v2, L7); /* E(F- ignored) */ \
+ vsliden_w_4_vv(v3, L8, L9); /* F(G- ignored) */ \
+ vmv_v(v4, LA); /* G(H- ignored) */ \
+ vsliden_w_4_vv(v5, LB, LC); /* H(I- ignored) */ \
+ vmv_v(v6, LD); /* I(J- ignored) */ \
+ /* Pad the last iteration with the first pixel; ignored under cmds16. */ \
+ vsliden_w_4_vv(v7, LE, L0); /* J(A- ignored) */ \
+ vld_b_x_m(v56, p_flt + 128 * 8); /*Load once half iteration*/ \
+ /* cmds16 runs subset of outer product */ \
+ aconv_vxv(v48, v0, cmds16, v56); \
+ }
+
+ // Iterate over outputs
+ int out_y = 0;
+ for (; out_y + 8 <= output_height; out_y += 8) {
+ // Reset accumulator
+ vdup_w_x_m(v48, 0);
+ vdup_w_x_m(v52, 0);
+ acset_v(v48, v48);
+
+ const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0];
+ const int8_t* p_in = input_data + (out_y * input_width * input_depth);
+
+ // Load 10*48 activations into 10*48/32 = 15 registers.
+ vld_b_x_m(L0, p_in);
+ vld_b_x_m(L4, p_in + 32 * 4);
+ vld_b_x_m(L8, p_in + 32 * 8);
+ vld_b_x(LC, p_in + 32 * 12);
+ vld_b_x(LD, p_in + 32 * 13);
+ vld_b_x(LE, p_in + 32 * 14);
+
+ // MAC pipeline
+ CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt);
+
+ // Output pipeline
+ INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4,
+ shft + zo_hi * 4, output_activation_min,
+ output_activation_max, output_offset, v36,
+ v40, v44);
+ int8_t* p_out =
+ output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi);
+ vstq_b_sp_xx(v48, p_out, output_depth);
+ vstq_b_sp_xx(v52, p_out, output_depth);
+ }
+
+ // Left over minibatch
+ int remainder = output_height - out_y;
+ if (remainder != 0) {
+ // Reset accumulator
+ vdup_w_x_m(v48, 0);
+ vdup_w_x_m(v52, 0);
+ acset_v(v48, v48);
+
+ const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0];
+ const int8_t* p_in = input_data + (out_y * input_width * input_depth);
+
+ // Load (remainder + 2) * 48 activations
+ // L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD
+ // Aa AB bB Cc CD dD Ee EF fF Gg GH hH Ii I-
+ vld_b_x_m(L0, p_in);
+ vdup_w_x_m(L4, 0);
+ vdup_w_x_m(L8, 0);
+ vdup_w_x_m(LC, 0);
+ switch (remainder) {
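+ // Cases fall through intentionally: a remainder of k also loads every
+ // register needed by the smaller remainders.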
+ case 7:
+ vld_b_x(LD, p_in + 32 * 13);
+ vld_b_x(LC, p_in + 32 * 12);
+ case 6:
+ vld_b_x(LB, p_in + 32 * 11);
+ case 5:
+ vld_b_x(LA, p_in + 32 * 10);
+ vld_b_x(L9, p_in + 32 * 9);
+ case 4:
+ vld_b_x(L8, p_in + 32 * 8);
+ case 3:
+ vld_b_x(L7, p_in + 32 * 7);
+ vld_b_x(L6, p_in + 32 * 6);
+ case 2:
+ vld_b_x(L5, p_in + 32 * 5);
+ default:
+ break;
+ }
+ vld_b_x(L4, p_in + 32 * 4);
+
+ // MAC pipeline
+ CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt);
+
+ // Output pipeline
+ INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4,
+ shft + zo_hi * 4, output_activation_min,
+ output_activation_max, output_offset, v36,
+ v40, v44);
+
+ int8_t* p_out =
+ output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi);
+ uint8_t local_data[64];
+ vst_b_x(v48, local_data);
+ vst_b_x(v52, local_data + 32);
+ for (int i = 0; i < remainder; i++) {
+ memcpy(p_out + (i * output_depth), local_data + (i * 8), 8);
+ }
+ }
+
+#undef CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE
+#undef L0
+#undef L1
+#undef L2
+#undef L3
+#undef L4
+#undef L5
+#undef L6
+#undef L7
+#undef L8
+#undef L9
+#undef LA
+#undef LB
+#undef LC
+#undef LD
+#undef LE
+ }
+}
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8_d32.cc b/tflm/opt/conv_s8_d32.cc
new file mode 100644
index 0000000..6572ae8
--- /dev/null
+++ b/tflm/opt/conv_s8_d32.cc
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+// Special case for filter depth = 32n
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+
+// Fixed-point per-channel-quantization convolution kernel for filter depths
+// that are a multiple of 32.
+void ConvS8D32(const tflite::ConvParams& params,
+ const int32_t* output_multiplier, const int32_t* output_shift,
+ const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data,
+ const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data,
+ const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data) {
+ // Get parameters.
+ const int32_t input_offset = params.input_offset; // r = s(q - Z)
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int32_t output_offset = params.output_offset;
+
+ // Set min and max value of the output.
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+
+ // Consistency check.
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = input_shape.Dims(3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data) {
+ TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+ }
+
+ // Check dimensions of the tensors.
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int filter_input_depth = filter_shape.Dims(3);
+ const int groups = input_depth / filter_input_depth;
+ TFLITE_DCHECK_NE(groups, 0);
+ TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+ const int filters_per_group = output_depth / groups;
+ TFLITE_DCHECK_NE(filters_per_group, 0);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+ const int group = out_channel / filters_per_group;
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
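+ // Clear the four-register accumulator group (v60..v63) for this
+ // output element.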
+ vdup_w_x_m(v60, 0);
+ int32_t acc = 0;
+ for (int in_channel = 0; in_channel + 32 <= filter_input_depth;
+ in_channel += 32) {
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+
+ // Zero padding by omitting the areas outside the image.
+ const bool is_point_inside_image =
+ (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+ (in_y < input_height);
+
+ if (!is_point_inside_image) {
+ continue;
+ }
+
+ vld_b_x(v0, &input_data[tflite::Offset(
+ input_shape, batch, in_y, in_x,
+ group * filter_input_depth + in_channel)]);
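+ // Widen the s8 inputs to s16 across a register pair, then apply the
+ // input offset to both halves.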
+ vaddw_h_vx(v0, v0, 0);
+ vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset));
+ vadd_h_vx(v1, v1, static_cast<int16_t>(input_offset));
+ vld_b_x(v2, &filter_data[tflite::Offset(filter_shape,
+ out_channel, filter_y,
+ filter_x, in_channel)]);
+ vaddw_h_vx(v2, v2, 0);
+ vmulw_w_vv(v48, v0, v2);
+ vmulw_w_vv(v50, v1, v3);
+ vadd_w_vv_m(v60, v60, v48);
+ }
+ }
+ }
+ int32_t accumulators[32];
+ vst_w_x_m(v60, accumulators);
+ for (int i = 0; i < 32; ++i) {
+ acc += accumulators[i];
+ }
+
+ if (bias_data) {
+ acc += bias_data[out_channel];
+ }
+ acc = tflite::MultiplyByQuantizedMultiplier(
+ acc, output_multiplier[out_channel], output_shift[out_channel]);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[tflite::Offset(output_shape, batch, out_y, out_x,
+ out_channel)] = static_cast<int8_t>(acc);
+ }
+ }
+ }
+ }
+}
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8_d4.cc b/tflm/opt/conv_s8_d4.cc
new file mode 100644
index 0000000..0dd3e50
--- /dev/null
+++ b/tflm/opt/conv_s8_d4.cc
@@ -0,0 +1,384 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+// Special case for filter depth = 4n
+
+#include <cstdlib>
+#include <memory>
+
+#include "crt/kelvin.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace kelvin::opt {
+namespace {
+void Filter_8_H_W_M(const int8_t* input, int8_t* output, int H, int W, int M) {
+ const int8_t(&in)[8][H][W][M] = *(int8_t(*)[8][H][W][M])input;
+ int8_t(&out)[H][W][M / 4][8][4] = *(int8_t(*)[H][W][M / 4][8][4]) output;
+ assert(M >= 4);
+ for (int zo = 0; zo < 8; ++zo) {
+ for (int ky = 0; ky < H; ++ky) {
+ for (int kx = 0; kx < W; ++kx) {
+ for (int zi = 0; zi < M; ++zi) {
+ const int zi_hi = zi >> 2; // div4
+ const int zi_lo = zi & 3; // rem4
+ out[ky][kx][zi_hi][zo][zi_lo] = in[zo][ky][kx][zi];
+ }
+ }
+ }
+ }
+}
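+// For example, in[5][0][0][6] lands at out[0][0][1][5][2]
+// (zo = 5 -> (zo, zi_lo slot 5); zi = 6 -> (zi_hi 1, zi_lo 2)).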
+
+void Swizzle(const int32_t* input, int32_t* output, int N) {
+ const int32_t(&in)[N] = *(int32_t(*)[N])input;
+ int32_t(&out)[N * 4] = *(int32_t(*)[N * 4]) output;
+ // Convert to accumulator swizzle pattern.
+ for (int i = 0; i < N / 8; ++i) {
+ int32_t* out0 = out + i * 32 + 0;
+ int32_t* out1 = out + i * 32 + 16;
+ int32_t* out2 = out + i * 32 + 8;
+ int32_t* out3 = out + i * 32 + 24;
+ for (int j = 0; j < 4; ++j) {
+ const int32_t* p_in = in + i * 8;
+ for (int k = 0; k < 2; ++k) {
+ *out0++ = *p_in++;
+ *out1++ = *p_in++;
+ *out2++ = *p_in++;
+ *out3++ = *p_in++;
+ }
+ }
+ }
+}
+} // namespace
+
+void ConvS8D4(
+ const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int8_t* output_data) {
+ // Get parameters.
+ const int32_t input_offset = params.input_offset; // r = s(q - Z)
+ const int32_t neg_input_offset = -params.input_offset; // r = s(q - Z)
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int32_t output_offset = params.output_offset;
+
+ // Set min and max value of the output.
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+
+ // Consistency check.
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = input_shape.Dims(3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data) {
+ TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+ }
+
+ // Check dimensions of the tensors.
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int filter_input_depth = filter_shape.Dims(3);
+ const int groups = input_depth / filter_input_depth;
+ TFLITE_DCHECK_NE(groups, 0);
+ TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+ const int filters_per_group = output_depth / groups;
+ TFLITE_DCHECK_NE(filters_per_group, 0);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+
+ union {
+ vconv_u8_t conv;
+ uint32_t raw;
+ } cmds;
+ cmds.conv.mode = 0;
+ cmds.conv.start = 0;
+ cmds.conv.stop = 7;
+ cmds.conv.sbias1 = input_offset;
+ cmds.conv.sdata1 = true;
+ cmds.conv.sbias2 = 0;
+ cmds.conv.sdata2 = true;
+
+ const size_t swizzled_filter_data_size =
+ 8 * filter_height * filter_width * filter_input_depth;
+ // Memory from aligned_alloc must be released with free, not delete.
+ std::unique_ptr<int8_t, decltype(&::free)> swizzled_filter_data(
+ reinterpret_cast<int8_t*>(
+ ::aligned_alloc(32, swizzled_filter_data_size)),
+ ::free);
+ int8_t* p_swizzled_filter_data = swizzled_filter_data.get();
+ int32_t swizzled_bias_data[32];
+ int32_t swizzled_mult_data[32];
+ int32_t swizzled_shift_data[32];
+
+ for (int out_channel = 0; out_channel + 8 <= output_depth; out_channel += 8) {
+ Filter_8_H_W_M(filter_data + (out_channel * filter_height * filter_width *
+ filter_input_depth),
+ p_swizzled_filter_data, filter_height, filter_width,
+ filter_input_depth);
+ Swizzle(bias_data + out_channel, swizzled_bias_data, 8);
+ Swizzle(output_multiplier + out_channel, swizzled_mult_data, 8);
+ Swizzle(output_shift + out_channel, swizzled_shift_data, 8);
+ vld_w_x_m(v16, swizzled_bias_data);
+ vld_w_x_m(v20, swizzled_mult_data);
+ vld_w_x_m(v24, swizzled_shift_data);
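+ // Negate the shifts so the vsha_w_r_vv_m in the output pipeline applies
+ // them as right shifts.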
+ vrsub_w_vx_m(v24, v24, 0);
+
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int out_x = 0;
+ do {
+ int out_xs_this_iter = std::min(8, output_width - out_x);
+ // 8x accumulators
+ vdup_w_x_m(v48, 0);
+ vdup_w_x_m(v52, 0);
+ acset_v(v48, v48);
+ int in_channel = 0;
+ do {
+ int channels_this_iter = std::min(filter_input_depth, 32);
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ const bool is_row_inside_input =
+ (in_y >= 0) && (in_y < input_height);
+ if (!is_row_inside_input) {
+ continue;
+ }
+
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ int in_x[8];
+ bool right_pad = false;
+ int first_right_pad = -1;
+ for (int i = 0; i < 8; ++i) {
+ const int in_x_origin =
+ ((out_x + i) * stride_width) - pad_width;
+ in_x[i] = in_x_origin + dilation_width_factor * filter_x;
+ }
+ bool left_pad = (in_x[0] < 0);
+ for (int i = 7; i >= 0; --i) {
+ if (in_x[i] < input_width) {
+ break;
+ }
+ right_pad = true;
+ first_right_pad = i;
+ }
+
+ if (left_pad) {
+ vdup_b_x(v0, -input_offset);
+ vld_b_s_xx(
+ v1,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[1], in_channel)],
+ input_depth * stride_width);
+ vld_b_s_xx(
+ v2,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[2], in_channel)],
+ input_depth * stride_width);
+ vld_b_s_xx(
+ v3,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[3], in_channel)],
+ input_depth * stride_width);
+ vld_b_s_xx_m(
+ v4,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[4], in_channel)],
+ input_depth * stride_width);
+ } else if (right_pad) {
+ int first_pad = std::min(first_right_pad, out_xs_this_iter);
+ switch (first_pad) {
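+ // Intentional fallthrough: pad every column from first_pad onward with
+ // the negated input offset, which the sbias1 addition in the aconv
+ // cancels to zero.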
+ case 0:
+ vdup_b_x(v0, neg_input_offset);
+ case 1:
+ vdup_b_x(v1, neg_input_offset);
+ case 2:
+ vdup_b_x(v2, neg_input_offset);
+ case 3:
+ vdup_b_x(v3, neg_input_offset);
+ case 4:
+ vdup_b_x(v4, neg_input_offset);
+ case 5:
+ vdup_b_x(v5, neg_input_offset);
+ case 6:
+ vdup_b_x(v6, neg_input_offset);
+ case 7:
+ vdup_b_x(v7, neg_input_offset);
+ }
+ switch (8 - first_pad) { // Load the remaining unpadded columns.
+ case 0:
+ vld_b_s_xx(
+ v7,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[7], in_channel)],
+ input_depth * stride_width);
+ case 1:
+ vld_b_s_xx(
+ v6,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[6], in_channel)],
+ input_depth * stride_width);
+ case 2:
+ vld_b_s_xx(
+ v5,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[5], in_channel)],
+ input_depth * stride_width);
+ case 3:
+ vld_b_s_xx(
+ v4,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[4], in_channel)],
+ input_depth * stride_width);
+ case 4:
+ vld_b_s_xx(
+ v3,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[3], in_channel)],
+ input_depth * stride_width);
+ case 5:
+ vld_b_s_xx(
+ v2,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[2], in_channel)],
+ input_depth * stride_width);
+ case 6:
+ vld_b_s_xx(
+ v1,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[1], in_channel)],
+ input_depth * stride_width);
+ case 7:
+ vld_b_s_xx(
+ v0,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[0], in_channel)],
+ input_depth * stride_width);
+ }
+ } else if (!left_pad && !right_pad) {
+ // Inputs
+ vld_b_s_xx_m(
+ v0,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[0], in_channel)],
+ input_depth * stride_width);
+ vld_b_s_xx_m(
+ v4,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[4], in_channel)],
+ input_depth * stride_width);
+ } else {
+ vdup_b_x(v0, -input_offset);
+ vdup_b_x(v7, -input_offset);
+ vld_b_s_xx_m(
+ v1,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[1], in_channel)],
+ input_depth * stride_width);
+ vld_b_s_xx(
+ v5,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[5], in_channel)],
+ input_depth * stride_width);
+ vld_b_s_xx(
+ v6,
+ &input_data[tflite::Offset(input_shape, batch, in_y,
+ in_x[6], in_channel)],
+ input_depth * stride_width);
+ }
+ size_t local_filter_offset =
+ (filter_y * filter_width * 8 * input_depth) +
+ (filter_x * 8 * input_depth) + (in_channel * 8);
+ int8_t* p_local_filter_start =
+ p_swizzled_filter_data + local_filter_offset;
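+ // The post-incrementing load advances p_local_filter_start by one
+ // register group, so the second load fetches the next 128 bytes.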
+ vld_b_p_x_m(v8, p_local_filter_start);
+ vld_b_x_m(v12, p_local_filter_start);
+
+ cmds.conv.stop = (channels_this_iter / 4) - 1;
+ aconv_vxv(v48, v0, cmds, v8);
+ }
+ }
+ in_channel += channels_this_iter;
+ } while (in_channel < filter_input_depth);
+ vcget(v48);
+ vadd_w_vv_m(v48, v48, v16);
+ vadd_w_vv_m(v52, v52, v16);
+ vdmulh_w_rn_vv_m(v48, v48, v20);
+ vdmulh_w_rn_vv_m(v52, v52, v20);
+ vsha_w_r_vv_m(v48, v48, v24);
+ vsha_w_r_vv_m(v52, v52, v24);
+ vadd_w_vx_m(v48, v48, output_offset);
+ vadd_w_vx_m(v52, v52, output_offset);
+ vmin_w_vx_m(v48, v48, output_activation_max);
+ vmin_w_vx_m(v52, v52, output_activation_max);
+ vmax_w_vx_m(v48, v48, output_activation_min);
+ vmax_w_vx_m(v52, v52, output_activation_min);
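+ // Saturate and narrow each four-register accumulator group to 32 s8
+ // results.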
+ vsraqs_b_vx(v56, v48, 0);
+ vsraqs_b_vx(v57, v52, 0);
+ if (out_xs_this_iter >= 4) {
+ vstq_b_s_xx(v56,
+ &output_data[tflite::Offset(output_shape, batch, out_y,
+ out_x, out_channel)],
+ output_depth);
+ } else {
+ for (int i = 0; i < out_xs_this_iter; ++i) {
+ if (i > 0) {
+ vsliden_b_4_vv(v58, v56, v0);
+ vsliden_b_4_vv(v56, v58, v0);
+ }
+ vst_b_l_xx(v56,
+ &output_data[tflite::Offset(output_shape, batch, out_y,
+ out_x + i, out_channel)],
+ 8);
+ }
+ }
+ if (out_xs_this_iter == 8) {
+ vstq_b_s_xx(v57,
+ &output_data[tflite::Offset(output_shape, batch, out_y,
+ out_x + 4, out_channel)],
+ output_depth);
+ } else if (out_xs_this_iter > 4) {
+ for (int i = 4; i < out_xs_this_iter; ++i) {
+ if (i > 4) {
+ vsliden_b_4_vv(v58, v57, v0);
+ vsliden_b_4_vv(v57, v58, v0);
+ }
+ vst_b_l_xx(v57,
+ &output_data[tflite::Offset(output_shape, batch, out_y,
+ out_x + i, out_channel)],
+ 8);
+ }
+ }
+ out_x += out_xs_this_iter;
+ } while (out_x < output_width);
+ }
+ }
+ }
+}
+} // namespace kelvin::opt
diff --git a/tflm/opt/conv_util.h b/tflm/opt/conv_util.h
new file mode 100644
index 0000000..b9470aa
--- /dev/null
+++ b/tflm/opt/conv_util.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TFLM_OPT_CONV_UTIL_H_
+#define TFLM_OPT_CONV_UTIL_H_
+
+#include <cassert>
+#include <memory>
+
+#include "crt/kelvin.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tflm/opt/util.h"
+
+namespace kelvin::opt {
+/* clang-format off */
+constexpr const int swizzle[16] = {
+ 0, 4, 8, 12,
+ 2, 6, 10, 14,
+ 1, 5, 9, 13,
+ 3, 7, 11, 15,
+};
+/* clang-format on */
+
+constexpr int kFilterHeightIndex = 1;
+constexpr int kFilterWidthIndex = 2;
+constexpr int kFilterInputChannelIndex = 3;
+constexpr int kInputChannelIndex = 3;
+constexpr int kOutputChannelIndex = 3;
+
+#define INA0 v0
+#define FLTA0 v8
+#define FLTA1 v9
+#define FLTA2 v10
+#define FLTA3 v11
+#define FLTA4 v12
+#define FLTA5 v13
+#define FLTA6 v14
+#define FLTA7 v15
+#define ACC v48
+#define ACC0 v48
+#define OUT0 v56
+
+// N: number of output channels; H, W: filter height and width;
+// M: number of input channels.
+template <int N>
+inline void Filter_N_H_W_M(const int8_t* input, int8_t* output, int H, int W,
+ int M) {
+ // Convert: input [zo][ky][kx][zi] (N,H,W,M)
+ // output [zo.hi=N/8][ky][kx][zi_hi=M/4][zo.lo=8][zi_lo=4]
+ const int8_t(&in)[N][H][W][M] = *(int8_t(*)[N][H][W][M])input;
+ int8_t(&out)[N / 8][H][W][M / 4][8][4] =
+ *(int8_t(*)[N / 8][H][W][M / 4][8][4]) output;
+ assert(N >= 4 && M >= 4);
+ for (int zo = 0; zo < N; ++zo) {
+ for (int ky = 0; ky < H; ++ky) {
+ for (int kx = 0; kx < W; ++kx) {
+ for (int zi = 0; zi < M; ++zi) {
+ const int zo_hi = zo >> 3; // div8
+ const int zo_lo = zo & 7; // rem8
+ const int zi_hi = zi >> 2; // div4
+ const int zi_lo = zi & 3; // rem4
+ out[zo_hi][ky][kx][zi_hi][zo_lo][zi_lo] = in[zo][ky][kx][zi];
+ }
+ }
+ }
+ }
+}
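+// For example, with N = 8 and H = W = 1: in[5][0][0][6] lands at
+// out[0][0][0][1][5][2] (zo = 5 -> (0, 5); zi = 6 -> (1, 2)).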
+
+// Swizzle values, and duplicate 4 times for stripmining.
+inline void Swizzle(const int32_t* input, int32_t* output, int N,
+ bool negate = false) {
+ const int32_t(&in)[N] = *(int32_t(*)[N])input;
+ int32_t(&out)[N * 4] = *(int32_t(*)[N * 4]) output;
+ // Convert to accumulator swizzle pattern.
+ for (int i = 0; i < N / 8; ++i) {
+ int32_t* out0 = out + i * 32 + 0;
+ int32_t* out1 = out + i * 32 + 16;
+ int32_t* out2 = out + i * 32 + 8;
+ int32_t* out3 = out + i * 32 + 24;
+ for (int j = 0; j < 4; ++j) {
+ const int32_t* p_in = in + i * 8;
+ for (int k = 0; k < 2; ++k) {
+ *out0++ = *p_in++;
+ *out1++ = *p_in++;
+ *out2++ = *p_in++;
+ *out3++ = *p_in++;
+ }
+ }
+ }
+ if (negate) {
+ for (int i = 0; i < N * 4; ++i) {
+ out[i] = -out[i];
+ }
+ }
+}
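+// For example, with N = 8 and in = {0, 1, ..., 7}: out[0..7] = {0,4,0,4,...},
+// out[8..15] = {2,6,...}, out[16..23] = {1,5,...}, out[24..31] = {3,7,...};
+// each pair repeats four times for stripmining.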
+
+// Run output pipeline on int32 accumulators in [v48-v55] and store results
+// in v48 and v52. Clobbers [v48-v55].
+#define INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_min, \
+ output_max, output_offset, bias_reg, \
+ mult_reg, shift_reg) \
+ { \
+ vcget(v48); \
+ vld_w_x_m(bias_reg, bias); \
+ vld_w_x_m(mult_reg, mult); \
+ vld_w_x_m(shift_reg, shft); \
+ vadd_w_vv_m(v48, v48, bias_reg); \
+ vadd_w_vv_m(v52, v52, bias_reg); \
+ vdmulh_w_r_vv_m(v48, v48, mult_reg); \
+ vdmulh_w_r_vv_m(v52, v52, mult_reg); \
+ vsha_w_r_vv_m(v48, v48, shift_reg); \
+ vsha_w_r_vv_m(v52, v52, shift_reg); \
+ vadd_w_vx_m(v48, v48, output_offset); \
+ vadd_w_vx_m(v52, v52, output_offset); \
+ vmin_w_vx_m(v48, v48, output_max); \
+ vmin_w_vx_m(v52, v52, output_max); \
+ vmax_w_vx_m(v48, v48, output_min); \
+ vmax_w_vx_m(v52, v52, output_min); \
+ vsraqs_b_vx(v48, v48, 0); \
+ vsraqs_b_vx(v52, v52, 0); \
+ }
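+// See conv_s8_1x1.cc and conv_s8_3x1_d48.cc for typical usage: bias, mult and
+// shft must first be prepared with Swizzle() (negate = true for the shifts).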
+} // namespace kelvin::opt
+
+#endif // TFLM_OPT_CONV_UTIL_H_
diff --git a/tflm/opt/depthwise_conv_s16.cc b/tflm/opt/depthwise_conv_s16.cc
index 13ae125..c7db407 100644
--- a/tflm/opt/depthwise_conv_s16.cc
+++ b/tflm/opt/depthwise_conv_s16.cc
@@ -14,23 +14,32 @@
* limitations under the License.
*/
-#include <algorithm>
+// Depthwise convolution based on Kelvin ops
+// Data types: input: s16, filter: s8, bias s64
-#include "crt/kelvin.h"
-#include "tflm/opt/opt.h"
-#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
+#include "tflm/opt/conv_util.h"
namespace kelvin::opt {
+namespace {
+void DepthwiseConvS16K3x1(
+ const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ const int16_t* activations = input_data;
+ const int8_t* weights = filter_data;
+ const int64_t* biases = bias_data;
+ int channels = filter_shape.Dims(3);
+ int frames = input_shape.Dims(2);
+ int dilation = params.dilation_width_factor;
+ const int32_t* output_mult = output_multiplier;
+ int32_t output_activation_min = params.quantized_activation_min;
+ int32_t output_activation_max = params.quantized_activation_max;
+ int16_t* output = output_data;
-void DepthwiseConv2DKelvinS16K3x1(const int16_t* activations,
- const int8_t* weights,
- const int64_t* biases,
- int channels, int frames, int dilation,
- const int32_t* output_mult,
- const int32_t* output_shift,
- int32_t output_activation_min,
- int32_t output_activation_max,
- int16_t* output) {
for (int c = 0; c + 32 <= channels; c += 32) {
// Load weights and interleave into correct order [v58-v63].
// Because there are more activations than weights, interleave weights.
@@ -78,8 +87,8 @@
for (; frames_idx < frames; frames_idx += dilation) {
vld_h_p_xx(v4, local_activations0, step);
vld_h_p_xx(v5, local_activations1, step);
- vmulw_w_vv(v48, v58, v0); // Clobber accumulator
- vmulw_w_vv(v50, v59, v1); // Clobber accumulator
+ vmulw_w_vv(v48, v58, v0); // Clobber accumulator
+ vmulw_w_vv(v50, v59, v1); // Clobber accumulator
vadd_w_vv_m(v48, v48, v52); // Add bias.
vmulw_w_vv(v40, v60, v2);
vmulw_w_vv(v42, v61, v3);
@@ -118,4 +127,60 @@
// - one final loop handling remainder
}
+// Generic implementation; currently falls back to the TFLM reference kernel.
+void DepthwiseConvS16Generic(
+ const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ // TODO: Replace this fallback with a Kelvin-optimized implementation.
+ tflite::reference_integer_ops::DepthwiseConvPerChannel(
+ params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data);
+ return;
+}
+} // namespace
+
+void DepthwiseConvS16(
+ const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data) {
+ // Get parameters.
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+
+ if (params.padding_type == tflite::PaddingType::kValid && stride_width == 1 &&
+ stride_height == 1 && dilation_width_factor == 1 &&
+ dilation_height_factor == 1) {
+ // generic implementation by default
+ auto fn = DepthwiseConvS16Generic;
+
+ // special case of filter size 3x1
+ if (filter_height == 1 && filter_width == 3) {
+ fn = DepthwiseConvS16K3x1;
+ }
+
+ fn(params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data);
+ return;
+ }
+
+ // Use reference implementation
+ tflite::reference_integer_ops::DepthwiseConvPerChannel(
+ params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data);
+}
+
} // namespace kelvin::opt
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc
index c4ee35a..111fdc1 100644
--- a/tflm/opt/depthwise_conv_s8.cc
+++ b/tflm/opt/depthwise_conv_s8.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,39 +14,35 @@
* limitations under the License.
*/
-#include <algorithm>
+// Depthwise convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias s32
-#include "crt/kelvin.h"
-#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
-#include "tensorflow/lite/kernels/internal/runtime_shape.h"
-#include "tensorflow/lite/kernels/internal/types.h"
-#include "tflm/opt/opt.h"
+#include "tflm/opt/conv_util.h"
namespace kelvin::opt {
-
-void Swizzle(const int32_t* input, int32_t* output, int N) {
+namespace {
+// Reorders a vector to match the pattern after double-widening.
+// N must be a multiple of 4.
+void VectorSwizzle(const int32_t* input, int32_t* output, int N) {
+ assert(N >= 4 && N % 4 == 0);
const int32_t(&in)[N] = *(int32_t(*)[N])input;
- int32_t(&out)[N * 4] = *(int32_t(*)[N * 4]) output;
- // Convert to accumulator swizzle pattern.
- for (int i = 0; i < N / 8; ++i) {
- int32_t* out0 = out + i * 32 + 0;
- int32_t* out1 = out + i * 32 + 16;
- int32_t* out2 = out + i * 32 + 8;
- int32_t* out3 = out + i * 32 + 24;
- for (int j = 0; j < 4; ++j) {
- const int32_t* p_in = in + i * 8;
- for (int k = 0; k < 2; ++k) {
- *out0++ = *p_in++;
- *out1++ = *p_in++;
- *out2++ = *p_in++;
- *out3++ = *p_in++;
- }
+ int32_t(&out)[N] = *(int32_t(*)[N]) output;
+ const int32_t* p_in = in;
+ for (int i = 0; i < N / 4; ++i) {
+ int32_t* out0 = out + i + 0;
+ int32_t* out1 = out + i + 16;
+ int32_t* out2 = out + i + 8;
+ int32_t* out3 = out + i + 24;
+ *out0 = *p_in++;
+ *out1 = *p_in++;
+ *out2 = *p_in++;
+ *out3 = *p_in++;
}
}
-}
-void DWConv2DKelvin_d32(
+// special case of input depth = 32n
+void DepthwiseConvS8D32(
const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
@@ -57,8 +53,6 @@
) {
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
- const int dilation_width_factor = params.dilation_width_factor;
- const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32_t input_offset = params.input_offset;
@@ -73,15 +67,15 @@
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
- int32_t swizzled_bias_data[32 * 4];
- int32_t swizzled_shift_multi[32 * 4];
- int32_t swizzled_output_multi[32 * 4];
+ int32_t swizzled_bias_data[32];
+ int32_t swizzled_shift_multi[32];
+ int32_t swizzled_output_multi[32];
for (int in_channel = 0; in_channel + 32 <= input_depth; in_channel += 32) {
const int output_channel = in_channel;
- Swizzle(bias_data + output_channel, swizzled_bias_data, 32);
- Swizzle(output_multiplier + output_channel, swizzled_output_multi, 32);
- Swizzle(output_shift + output_channel, swizzled_shift_multi, 32);
+ VectorSwizzle(bias_data + output_channel, swizzled_bias_data, 32);
+ VectorSwizzle(output_multiplier + output_channel, swizzled_output_multi, 32);
+ VectorSwizzle(output_shift + output_channel, swizzled_shift_multi, 32);
vld_w_x_m(v20, swizzled_bias_data);
vld_w_x_m(v24, swizzled_output_multi);
@@ -94,6 +88,7 @@
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
+ vdup_w_x_m(v48, 0);
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + filter_y;
if ((in_y < 0) || (in_y >= input_height)) {
@@ -124,7 +119,7 @@
}
vadd_w_vv_m(v48, v48, v20); // add bias
- vdmulh_w_r_vv_m(v48, v48, v24);
+ vdmulh_w_rn_vv_m(v48, v48, v24);
vsha_w_r_vv_m(v48, v48, v28);
vadd_w_vx_m(v48, v48, output_offset);
vmax_w_vx_m(v48, v48, output_activation_min);
@@ -138,7 +133,24 @@
}
}
-void DepthwiseConv2DKelvin(
+// Generic implementation; currently falls back to the TFLM reference kernel.
+void DepthwiseConvS8Generic(
+ const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int8_t* output_data) {
+ // TODO: Replace this fallback with a Kelvin-optimized implementation.
+ tflite::reference_integer_ops::DepthwiseConvPerChannel(
+ params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data);
+ return;
+}
+} // namespace
+
+void DepthwiseConvS8(
const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
@@ -151,11 +163,7 @@
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
- const int pad_width = params.padding_values.width;
- const int pad_height = params.padding_values.height;
const int depth_multiplier = params.depth_multiplier;
- const int32_t input_offset = params.input_offset;
- const int32_t output_offset = params.output_offset;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
@@ -165,30 +173,33 @@
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
- const int input_height = input_shape.Dims(1);
- const int input_width = input_shape.Dims(2);
const int input_depth = input_shape.Dims(3);
- const int filter_height = filter_shape.Dims(1);
- const int filter_width = filter_shape.Dims(2);
- const int output_height = output_shape.Dims(1);
- const int output_width = output_shape.Dims(2);
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
- if (depth_multiplier == 1 && pad_height < 2 && pad_width < 2 &&
+ if (depth_multiplier == 1 &&
dilation_height_factor == 1 && dilation_width_factor == 1 &&
- stride_height == 1 && stride_width == 1 && output_depth % 32 == 0) {
- DWConv2DKelvin_d32(params, output_multiplier, output_shift, input_shape,
- input_data, filter_shape, filter_data, bias_shape,
- bias_data, output_shape, output_data);
+ stride_height <= 2 && stride_width <= 2) {
+ // generic implementation by default
+ auto fn = DepthwiseConvS8Generic;
+
+ // special case of output depth = 32n
+ if (output_depth % 32 == 0) {
+ fn = DepthwiseConvS8D32;
+ }
+
+ fn(params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data);
return;
}
+
+ // Use reference implementation
tflite::reference_integer_ops::DepthwiseConvPerChannel(
params, output_multiplier, output_shift, input_shape, input_data,
filter_shape, filter_data, bias_shape, bias_data, output_shape,
output_data);
- return;
}
-} // namespace kelvin::opt
\ No newline at end of file
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/elementwise_add_s16.cc b/tflm/opt/elementwise_add_s16.cc
index e4220f0..001113e 100644
--- a/tflm/opt/elementwise_add_s16.cc
+++ b/tflm/opt/elementwise_add_s16.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,16 +20,16 @@
namespace kelvin::opt {
-void elementwise_add_s16(const int16_t* input1, const int16_t* input2,
- const int32_t input1_offset, const int32_t input1_mult,
- const int32_t input1_shift,
- const int32_t input2_offset, const int32_t input2_mult,
- const int32_t input2_shift, const int32_t left_shift,
- int16_t* output, const int32_t output_offset,
- const int32_t output_mult, const int32_t output_shift,
- const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size) {
+void ElementwiseAddS16(const int16_t* input1, const int16_t* input2,
+ const int32_t input1_offset, const int32_t input1_mult,
+ const int32_t input1_shift, const int32_t input2_offset,
+ const int32_t input2_mult, const int32_t input2_shift,
+ const int32_t left_shift, int16_t* output,
+ const int32_t output_offset, const int32_t output_mult,
+ const int32_t output_shift,
+ const int32_t output_activation_min,
+ const int32_t output_activation_max,
+ const int32_t block_size) {
int blocks = block_size;
int vl;
getmaxvl_h(vl);
diff --git a/tflm/opt/elementwise_add_s32.cc b/tflm/opt/elementwise_add_s32.cc
index 483799a..ab2b3d1 100644
--- a/tflm/opt/elementwise_add_s32.cc
+++ b/tflm/opt/elementwise_add_s32.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,10 +18,10 @@
#include "tflm/opt/opt.h"
namespace kelvin::opt {
-void elementwise_add_s32(const int32_t* input1, const int32_t* input2,
- int32_t* output, const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size) {
+void ElementwiseAddS32(const int32_t* input1, const int32_t* input2,
+ int32_t* output, const int32_t output_activation_min,
+ const int32_t output_activation_max,
+ const int32_t block_size) {
int blocks = block_size;
int vl;
getmaxvl_w_m(vl);
diff --git a/tflm/opt/elementwise_add_s8.cc b/tflm/opt/elementwise_add_s8.cc
index ac83e1f..762d7af 100644
--- a/tflm/opt/elementwise_add_s8.cc
+++ b/tflm/opt/elementwise_add_s8.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,16 +20,16 @@
namespace kelvin::opt {
-void elementwise_add_s8(const int8_t* input1, const int8_t* input2,
- const int32_t input1_offset, const int32_t input1_mult,
- const int32_t input1_shift, const int32_t input2_offset,
- const int32_t input2_mult, const int32_t input2_shift,
- const int32_t left_shift, int8_t* output,
- const int32_t output_offset, const int32_t output_mult,
- const int32_t output_shift,
- const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size) {
+void ElementwiseAddS8(const int8_t* input1, const int8_t* input2,
+ const int32_t input1_offset, const int32_t input1_mult,
+ const int32_t input1_shift, const int32_t input2_offset,
+ const int32_t input2_mult, const int32_t input2_shift,
+ const int32_t left_shift, int8_t* output,
+ const int32_t output_offset, const int32_t output_mult,
+ const int32_t output_shift,
+ const int32_t output_activation_min,
+ const int32_t output_activation_max,
+ const int32_t block_size) {
int blocks = block_size;
int vl;
getmaxvl_b(vl);
diff --git a/tflm/opt/leaky_relu_s16.cc b/tflm/opt/leaky_relu_s16.cc
index 5cd1128..7427a6c 100644
--- a/tflm/opt/leaky_relu_s16.cc
+++ b/tflm/opt/leaky_relu_s16.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,13 +21,13 @@
#include "tflm/opt/util.h"
namespace kelvin::opt {
-void leaky_relu_s16(const int16_t* input, int16_t* output,
- const int32_t block_size, const int32_t input_zero_point,
- const int32_t output_zero_point,
- const int32_t output_multiplier_alpha,
- const int32_t output_shift_alpha,
- const int32_t output_multiplier_identity,
- const int32_t output_shift_identity) {
+void LeakyReluS16(const int16_t* input, int16_t* output,
+ const int32_t block_size, const int32_t input_zero_point,
+ const int32_t output_zero_point,
+ const int32_t output_multiplier_alpha,
+ const int32_t output_shift_alpha,
+ const int32_t output_multiplier_identity,
+ const int32_t output_shift_identity) {
constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min();
constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max();
int32_t right_shift_identity = std::min(output_shift_identity, 0L);
diff --git a/tflm/opt/leaky_relu_s8.cc b/tflm/opt/leaky_relu_s8.cc
index b32d260..8b30d19 100644
--- a/tflm/opt/leaky_relu_s8.cc
+++ b/tflm/opt/leaky_relu_s8.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,13 +22,13 @@
namespace kelvin::opt {
-void leaky_relu_s8(const int8_t* input, int8_t* output,
- const int32_t block_size, const int32_t input_zero_point,
- const int32_t output_zero_point,
- const int32_t output_multiplier_alpha,
- const int32_t output_shift_alpha,
- const int32_t output_multiplier_identity,
- const int32_t output_shift_identity) {
+void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size,
+ const int32_t input_zero_point,
+ const int32_t output_zero_point,
+ const int32_t output_multiplier_alpha,
+ const int32_t output_shift_alpha,
+ const int32_t output_multiplier_identity,
+ const int32_t output_shift_identity) {
constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min();
constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max();
int32_t right_shift_identity = std::min(output_shift_identity, 0L);
diff --git a/tflm/opt/max_pool_s8.cc b/tflm/opt/max_pool_s8.cc
index 5986746..544f85a 100644
--- a/tflm/opt/max_pool_s8.cc
+++ b/tflm/opt/max_pool_s8.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,11 +20,10 @@
#include "tensorflow/lite/kernels/internal/types.h"
namespace kelvin::opt {
-void MaxPoolGeneric(const tflite::PoolParams &params,
- const tflite::RuntimeShape &input_shape,
- const int8_t *input_data,
- const tflite::RuntimeShape &output_shape,
- int8_t *output_data) {
+void MaxPoolS8(const tflite::PoolParams &params,
+ const tflite::RuntimeShape &input_shape,
+ const int8_t *input_data,
+ const tflite::RuntimeShape &output_shape, int8_t *output_data) {
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
const int input_height = input_shape.Dims(1);
@@ -97,4 +96,4 @@
}
}
-} // namespace kelvin::opt
+}  // namespace kelvin::opt
diff --git a/tflm/opt/memcpy.cc b/tflm/opt/memcpy.cc
index 4669a83..29e0434 100644
--- a/tflm/opt/memcpy.cc
+++ b/tflm/opt/memcpy.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
namespace kelvin::opt {
-void *memcpy(void *dst, const void *src, size_t n) {
+void *Memcpy(void *dst, const void *src, size_t n) {
const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
uint8_t *d = reinterpret_cast<uint8_t *>(dst);
int vl;
diff --git a/tflm/opt/opt.h b/tflm/opt/opt.h
index 277f338..76d5218 100644
--- a/tflm/opt/opt.h
+++ b/tflm/opt/opt.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -24,121 +24,87 @@
/* clang-format on */
namespace kelvin::opt {
-void* memcpy(void* dst, const void* src, size_t n);
-void elementwise_add_s8(const int8_t* input1, const int8_t* input2,
- const int32_t input1_offset, const int32_t input1_mult,
- const int32_t input1_shift, const int32_t input2_offset,
- const int32_t input2_mult, const int32_t input2_shift,
- const int32_t left_shift, int8_t* output,
- const int32_t output_offset, const int32_t output_mult,
- const int32_t output_shift,
- const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size);
-void elementwise_add_s16(const int16_t* input1, const int16_t* input2,
- const int32_t input1_offset, const int32_t input1_mult,
- const int32_t input1_shift,
- const int32_t input2_offset, const int32_t input2_mult,
- const int32_t input2_shift, const int32_t left_shift,
- int16_t* output, const int32_t output_offset,
- const int32_t output_mult, const int32_t output_shift,
- const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size);
-void elementwise_add_s32(const int32_t* input1, const int32_t* input2,
- int32_t* output, const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size);
-void leaky_relu_s8(const int8_t* input, int8_t* output,
- const int32_t block_size, const int32_t input_zero_point,
- const int32_t output_zero_point,
- const int32_t output_multiplier_alpha,
- const int32_t output_shift_alpha,
- const int32_t output_multiplier_identity,
- const int32_t output_shift_identity);
-void leaky_relu_s16(const int16_t* input, int16_t* output,
- const int32_t block_size, const int32_t input_zero_point,
- const int32_t output_zero_point,
- const int32_t output_multiplier_alpha,
- const int32_t output_shift_alpha,
- const int32_t output_multiplier_identity,
- const int32_t output_shift_identity);
-void conv_per_channel_b32(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data);
-
-// Top level conv function, will invoke correct variant below.
-void conv_per_channel_b64(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data);
-void conv_per_channel_b64_1x1(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data);
-void conv_per_channel_b64_filter1xn_non_group(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data);
-void conv_per_channel_b64_filter1xn_group(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data);
-void conv_per_channel_b64_generic(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
- int16_t* output_data);
-
-void conv_per_channel_b8(
- const tflite::ConvParams& params, const int32_t* output_multiplier,
- const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
- const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
- int8_t* output_data);
-void DepthwiseConv2DKelvin(
+void* Memcpy(void* dst, const void* src, size_t n);
+void ElementwiseAddS8(const int8_t* input1, const int8_t* input2,
+ const int32_t input1_offset, const int32_t input1_mult,
+ const int32_t input1_shift, const int32_t input2_offset,
+ const int32_t input2_mult, const int32_t input2_shift,
+ const int32_t left_shift, int8_t* output,
+ const int32_t output_offset, const int32_t output_mult,
+ const int32_t output_shift,
+ const int32_t output_activation_min,
+ const int32_t output_activation_max,
+ const int32_t block_size);
+void ElementwiseAddS16(const int16_t* input1, const int16_t* input2,
+ const int32_t input1_offset, const int32_t input1_mult,
+ const int32_t input1_shift, const int32_t input2_offset,
+ const int32_t input2_mult, const int32_t input2_shift,
+ const int32_t left_shift, int16_t* output,
+ const int32_t output_offset, const int32_t output_mult,
+ const int32_t output_shift,
+ const int32_t output_activation_min,
+ const int32_t output_activation_max,
+ const int32_t block_size);
+void ElementwiseAddS32(const int32_t* input1, const int32_t* input2,
+ int32_t* output, const int32_t output_activation_min,
+ const int32_t output_activation_max,
+ const int32_t block_size);
+void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size,
+ const int32_t input_zero_point,
+ const int32_t output_zero_point,
+ const int32_t output_multiplier_alpha,
+ const int32_t output_shift_alpha,
+ const int32_t output_multiplier_identity,
+ const int32_t output_shift_identity);
+void LeakyReluS16(const int16_t* input, int16_t* output,
+ const int32_t block_size, const int32_t input_zero_point,
+ const int32_t output_zero_point,
+ const int32_t output_multiplier_alpha,
+ const int32_t output_shift_alpha,
+ const int32_t output_multiplier_identity,
+ const int32_t output_shift_identity);
+void ConvS16B32(const tflite::ConvParams& params,
+ const int32_t* output_multiplier, const int32_t* output_shift,
+ const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data,
+ const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data,
+ const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data,
+ const tflite::RuntimeShape& output_shape, int16_t* output_data);
+void ConvS16B64(const tflite::ConvParams& params,
+ const int32_t* output_multiplier, const int32_t* output_shift,
+ const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data,
+ const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data,
+ const tflite::RuntimeShape& bias_shape,
+ const int64_t* bias_data,
+ const tflite::RuntimeShape& output_shape, int16_t* output_data);
+void ConvS8(const tflite::ConvParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift,
+ const tflite::RuntimeShape& input_shape, const int8_t* input_data,
+ const tflite::RuntimeShape& filter_shape, const int8_t* filter_data,
+ const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data);
+void DepthwiseConvS8(
const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
int8_t* output_data);
-void DWConv2DKelvin_d32(
+void DepthwiseConvS16(
const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
- const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
- const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
- int8_t* output_data);
-void DepthwiseConv2DKelvinS16K3x1(
- const int16_t* activations, const int8_t* weights, const int64_t* biases,
- int channels, int frames, int dilation, const int32_t* output_mult,
- const int32_t* output_shift, int32_t output_activation_min,
- int32_t output_activation_max, int16_t* output);
-void MaxPoolGeneric(const tflite::PoolParams& params,
- const tflite::RuntimeShape& input_shape,
- const int8_t* input_data,
- const tflite::RuntimeShape& output_shape,
- int8_t* output_data);
+ const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int16_t* output_data);
+void MaxPoolS8(const tflite::PoolParams& params,
+ const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data);
} // namespace kelvin::opt
diff --git a/tflm/opt/util.h b/tflm/opt/util.h
index d94ef3e..d0c16db 100644
--- a/tflm/opt/util.h
+++ b/tflm/opt/util.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.