Add an 8x16 (8-bit weight x 16-bit activation) depthwise convolution kernel for Kelvin.
Change-Id: Ibb1beba0aad3d0f816c44982d4ec83e338f0d084
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
index 28419ce..f3f6e7d 100644
--- a/tflm/opt/BUILD
+++ b/tflm/opt/BUILD
@@ -4,6 +4,7 @@
name = "opt",
srcs = [
"conv.cc",
+ "depthwise_conv_s16.cc",
"elementwise_add_s16.cc",
"elementwise_add_s32.cc",
"elementwise_add_s8.cc",
diff --git a/tflm/opt/depthwise_conv_s16.cc b/tflm/opt/depthwise_conv_s16.cc
new file mode 100644
index 0000000..e05ef13
--- /dev/null
+++ b/tflm/opt/depthwise_conv_s16.cc
@@ -0,0 +1,108 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <algorithm>
+
+#include "crt/kelvin.h"
+#include "tflm/opt/opt.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace kelvin::opt {
+
+// Depthwise 1-D convolution with a 3x1 kernel and "valid" padding over an
+// input laid out as [frames][channels]: int16 activations, int8 weights,
+// int64 biases (assumed to fit in int32 -- see note below). Writes
+// frames - 2 output frames of int16, requantized per channel with
+// output_mult/output_shift and clamped to
+// [output_activation_min, output_activation_max].
+//
+// NOTE(review): only full groups of 32 channels are processed; remainder
+// channels (channels % 32) are currently left unwritten (see TODO at end).
+// NOTE(review): the preload below issues four vector loads unconditionally,
+// so this presumably requires frames >= 3 -- confirm callers guarantee that.
+void DepthwiseConv2DKelvinS16K3x1(const int16_t* activations,
+                                  const int8_t* weights,
+                                  const int64_t* biases,
+                                  int channels, int frames,
+                                  const int32_t* output_mult,
+                                  const int32_t* output_shift,
+                                  int32_t output_activation_min,
+                                  int32_t output_activation_max,
+                                  int16_t* output) {
+  for (int c = 0; c + 32 <= channels; c += 32) {
+    // Load weights and interleave into correct order [v58-v63].
+    // Because there are more activations than weights, interleave weights.
+    // One row of the 3x1 kernel per load: widen s8 -> s16 (vaddw with 0),
+    // then zip so the weight layout matches the zipped activation order.
+    const int8_t* local_weights0 = weights + c;
+    vld_b_p_xx(v0, local_weights0, channels);
+    vaddw_h_vx(v48, v0, 0);
+    vzip_h_vv(v58, v48, v49);
+
+    vld_b_p_xx(v1, local_weights0, channels);
+    vaddw_h_vx(v50, v1, 0);
+    vzip_h_vv(v60, v50, v51);
+
+    // Last kernel row: plain load, no pointer post-increment needed.
+    vld_b_x(v2, local_weights0);
+    vaddw_h_vx(v52, v2, 0);
+    vzip_h_vv(v62, v52, v53);
+
+    // Assume biases fit in 32-bit. This assumption is verified offline.
+    // Load biases and swizzle [v52-v55].
+    // (v52/v53 were scratch for the weight widening above and are dead by
+    // now, so reusing them for biases is safe.)
+    int32_t local_biases[32];
+    for (int j = 0; j < 32; j++) {
+      local_biases[j] = static_cast<int32_t>(biases[c + j]);
+    }
+    vld_w_x_m(v4, local_biases);
+    vzip_w_vv(v52, v4, v5);
+    vzip_w_vv(v54, v6, v7);
+
+    // Accumulators will be [v48 - v51].
+    // (v40-v47 hold the partial products for kernel rows 1 and 2 before
+    // they are folded into v48-v51.)
+    const int16_t* local_activations0 = activations + c;
+    const int16_t* local_activations1 = local_activations0 + 16;
+    int16_t* local_output = output + c;
+
+    // Registers [v0-v5 will be for loading activations]
+    // Preload for valid padding:
+    // v0/v1 = the 32 channels of frame 0, v2/v3 = frame 1 (16 channels per
+    // register); the loop streams in frame i+2 as v4/v5 each iteration.
+    vld_h_p_xx(v0, local_activations0, channels);
+    vld_h_p_xx(v1, local_activations1, channels);
+    vld_h_p_xx(v2, local_activations0, channels);
+    vld_h_p_xx(v3, local_activations1, channels);
+    int frames_left = frames - 2;
+
+    const int32_t* local_output_mult = output_mult + c;
+    const int32_t* local_output_shift = output_shift + c;
+
+    // Scratch for handing accumulators to the scalar requantization loop.
+    // NOTE(review): assumes a stack int32_t[32] satisfies any alignment
+    // requirement of vst_w_x_m -- confirm against the Kelvin ISA.
+    int32_t accumulators[32];
+    // One output frame per iteration:
+    //   out[f][ch] = sum_{k=0..2} w[k][ch] * act[f+k][ch] + bias[ch].
+    while (frames_left > 0) {
+      vld_h_p_xx(v4, local_activations0, channels);
+      vld_h_p_xx(v5, local_activations1, channels);
+      vmulw_w_vv(v48, v58, v0); // Clobber accumulator
+      vmulw_w_vv(v50, v59, v1); // Clobber accumulator
+      vadd_w_vv_m(v48, v48, v52); // Add bias.
+      vmulw_w_vv(v40, v60, v2);
+      vmulw_w_vv(v42, v61, v3);
+      vadd_w_vv_m(v48, v48, v40);
+      vmulw_w_vv(v44, v62, v4);
+      vmulw_w_vv(v46, v63, v5);
+      vadd_w_vv_m(v48, v48, v44);
+
+      vzip_w_vv(v48, v48, v49); // Swizzle accumulators
+      vzip_w_vv(v50, v50, v51);
+
+      vst_w_x_m(v48, accumulators); // Store accumulators
+
+      // Output pipeline in scalar, to preserve bit accuracy with the ARM CPU
+      // implementation.
+      for (int i = 0; i < 32; i++) {
+        int32_t result = tflite::MultiplyByQuantizedMultiplier(
+            static_cast<int64_t>(accumulators[i]), local_output_mult[i],
+            local_output_shift[i]);
+
+        local_output[i] = static_cast<int16_t>(
+            std::clamp(result, output_activation_min, output_activation_max));
+      }
+
+      // Slide registers
+      // (presumably vmvp_vv moves a register pair: v0/v1 <- v2/v3 then
+      // v2/v3 <- v4/v5, advancing the 3-frame window by one -- confirm
+      // against the Kelvin ISA.)
+      vmvp_vv(v0, v2, v3);
+      vmvp_vv(v2, v4, v5);
+
+      local_output += channels;
+      frames_left--;
+    }
+  }
+  // TODO(derekjchow): Handle channels % 32 cases.
+  // Break it down into:
+  // - one loop looking for 16 byte stripes
+  // - one final loop handling remainder
+}
+
+} // namespace kelvin::opt
\ No newline at end of file
diff --git a/tflm/opt/opt.h b/tflm/opt/opt.h
index 29741cb..f12596c 100644
--- a/tflm/opt/opt.h
+++ b/tflm/opt/opt.h
@@ -72,6 +72,11 @@
const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
int8_t* output_data);
+void DepthwiseConv2DKelvinS16K3x1(
+ const int16_t* activations, const int8_t* weights, const int64_t* biases,
+ int channels, int frames, const int32_t* output_mult,
+ const int32_t* output_shift, int32_t output_activation_min,
+ int32_t output_activation_max, int16_t* output);
} // namespace kelvin::opt
#endif // TFLM_OPT_OPT_H_