Add an 8x16 (8-bit weight x 16-bit activation) depthwise convolution kernel for Kelvin.
Change-Id: Ibb1beba0aad3d0f816c44982d4ec83e338f0d084
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
index 28419ce..f3f6e7d 100644
--- a/tflm/opt/BUILD
+++ b/tflm/opt/BUILD
@@ -4,6 +4,7 @@
name = "opt",
srcs = [
"conv.cc",
+ "depthwise_conv_s16.cc",
"elementwise_add_s16.cc",
"elementwise_add_s32.cc",
"elementwise_add_s8.cc",
diff --git a/tflm/opt/depthwise_conv_s16.cc b/tflm/opt/depthwise_conv_s16.cc
new file mode 100644
index 0000000..e05ef13
--- /dev/null
+++ b/tflm/opt/depthwise_conv_s16.cc
@@ -0,0 +1,108 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <algorithm>
+
+#include "crt/kelvin.h"
+#include "tflm/opt/opt.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace kelvin::opt {
+
+// Depthwise 1-D convolution with a 3x1 kernel and "valid" padding over an
+// input laid out as [frames][channels]: int16 activations, int8 weights,
+// int64 biases (assumed to fit in int32 -- see note below). Writes
+// frames - 2 output frames of int16, requantized per channel with
+// output_mult/output_shift and clamped to
+// [output_activation_min, output_activation_max].
+//
+// NOTE(review): only full groups of 32 channels are processed; remainder
+// channels (channels % 32) are currently left unwritten (see TODO at end).
+// NOTE(review): the preload below issues four vector loads unconditionally,
+// so this presumably requires frames >= 3 -- confirm callers guarantee that.
+void DepthwiseConv2DKelvinS16K3x1(const int16_t* activations,
+                                  const int8_t* weights,
+                                  const int64_t* biases,
+                                  int channels, int frames,
+                                  const int32_t* output_mult,
+                                  const int32_t* output_shift,
+                                  int32_t output_activation_min,
+                                  int32_t output_activation_max,
+                                  int16_t* output) {
+  for (int c = 0; c + 32 <= channels; c += 32) {
+    // Load weights and interleave into correct order [v58-v63].
+    // Because there are more activations than weights, interleave weights.
+    // One row of the 3x1 kernel per load: widen s8 -> s16 (vaddw with 0),
+    // then zip so the weight layout matches the zipped activation order.
+    const int8_t* local_weights0 = weights + c;
+    vld_b_p_xx(v0, local_weights0, channels);
+    vaddw_h_vx(v48, v0, 0);
+    vzip_h_vv(v58, v48, v49);
+
+    vld_b_p_xx(v1, local_weights0, channels);
+    vaddw_h_vx(v50, v1, 0);
+    vzip_h_vv(v60, v50, v51);
+
+    // Last kernel row: plain load, no pointer post-increment needed.
+    vld_b_x(v2, local_weights0);
+    vaddw_h_vx(v52, v2, 0);
+    vzip_h_vv(v62, v52, v53);
+
+    // Assume biases fit in 32-bit. This assumption is verified offline.
+    // Load biases and swizzle [v52-v55].
+    // (v52/v53 were scratch for the weight widening above and are dead by
+    // now, so reusing them for biases is safe.)
+    int32_t local_biases[32];
+    for (int j = 0; j < 32; j++) {
+      local_biases[j] = static_cast<int32_t>(biases[c + j]);
+    }
+    vld_w_x_m(v4, local_biases);
+    vzip_w_vv(v52, v4, v5);
+    vzip_w_vv(v54, v6, v7);
+
+    // Accumulators will be [v48 - v51].
+    // (v40-v47 hold the partial products for kernel rows 1 and 2 before
+    // they are folded into v48-v51.)
+    const int16_t* local_activations0 = activations + c;
+    const int16_t* local_activations1 = local_activations0 + 16;
+    int16_t* local_output = output + c;
+
+    // Registers [v0-v5 will be for loading activations]
+    // Preload for valid padding:
+    // v0/v1 = the 32 channels of frame 0, v2/v3 = frame 1 (16 channels per
+    // register); the loop streams in frame i+2 as v4/v5 each iteration.
+    vld_h_p_xx(v0, local_activations0, channels);
+    vld_h_p_xx(v1, local_activations1, channels);
+    vld_h_p_xx(v2, local_activations0, channels);
+    vld_h_p_xx(v3, local_activations1, channels);
+    int frames_left = frames - 2;
+
+    const int32_t* local_output_mult = output_mult + c;
+    const int32_t* local_output_shift = output_shift + c;
+
+    // Scratch for handing accumulators to the scalar requantization loop.
+    // NOTE(review): assumes a stack int32_t[32] satisfies any alignment
+    // requirement of vst_w_x_m -- confirm against the Kelvin ISA.
+    int32_t accumulators[32];
+    // One output frame per iteration:
+    //   out[f][ch] = sum_{k=0..2} w[k][ch] * act[f+k][ch] + bias[ch].
+    while (frames_left > 0) {
+      vld_h_p_xx(v4, local_activations0, channels);
+      vld_h_p_xx(v5, local_activations1, channels);
+      vmulw_w_vv(v48, v58, v0); // Clobber accumulator
+      vmulw_w_vv(v50, v59, v1); // Clobber accumulator
+      vadd_w_vv_m(v48, v48, v52); // Add bias.
+      vmulw_w_vv(v40, v60, v2);
+      vmulw_w_vv(v42, v61, v3);
+      vadd_w_vv_m(v48, v48, v40);
+      vmulw_w_vv(v44, v62, v4);
+      vmulw_w_vv(v46, v63, v5);
+      vadd_w_vv_m(v48, v48, v44);
+
+      vzip_w_vv(v48, v48, v49); // Swizzle accumulators
+      vzip_w_vv(v50, v50, v51);
+
+      vst_w_x_m(v48, accumulators); // Store accumulators
+
+      // Output pipeline in scalar, to preserve bit accuracy with the ARM CPU
+      // implementation.
+      for (int i = 0; i < 32; i++) {
+        int32_t result = tflite::MultiplyByQuantizedMultiplier(
+            static_cast<int64_t>(accumulators[i]), local_output_mult[i],
+            local_output_shift[i]);
+
+        local_output[i] = static_cast<int16_t>(
+            std::clamp(result, output_activation_min, output_activation_max));
+      }
+
+      // Slide registers
+      // (presumably vmvp_vv moves a register pair: v0/v1 <- v2/v3 then
+      // v2/v3 <- v4/v5, advancing the 3-frame window by one -- confirm
+      // against the Kelvin ISA.)
+      vmvp_vv(v0, v2, v3);
+      vmvp_vv(v2, v4, v5);
+
+      local_output += channels;
+      frames_left--;
+    }
+  }
+  // TODO(derekjchow): Handle channels % 32 cases.
+  // Break it down into:
+  // - one loop looking for 16 byte stripes
+  // - one final loop handling remainder
+}
+
+} // namespace kelvin::opt
\ No newline at end of file
diff --git a/tflm/opt/opt.h b/tflm/opt/opt.h
index 29741cb..f12596c 100644
--- a/tflm/opt/opt.h
+++ b/tflm/opt/opt.h
@@ -72,6 +72,11 @@
const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
int8_t* output_data);
+void DepthwiseConv2DKelvinS16K3x1(
+ const int16_t* activations, const int8_t* weights, const int64_t* biases,
+ int channels, int frames, const int32_t* output_mult,
+ const int32_t* output_shift, int32_t output_activation_min,
+ int32_t output_activation_max, int16_t* output);
} // namespace kelvin::opt
#endif // TFLM_OPT_OPT_H_