Support functions for LeakyRelu

- Add int8 and int16 helper methods for implementing a LeakyRelu kernel.

Change-Id: Id017f614fa6638273eb63f50e1196ca6da44ca51
diff --git a/tests/tflm/BUILD b/tests/tflm/BUILD
index 71f70ab..05c196b 100644
--- a/tests/tflm/BUILD
+++ b/tests/tflm/BUILD
@@ -1,4 +1,5 @@
 load("//build_tools/bazel:kelvin.bzl", "kelvin_test")
+
 package(default_visibility = ["//visibility:public"])
 
 kelvin_test(
@@ -10,10 +11,26 @@
         "//crt:crt_header",
         "@tflite-micro//tensorflow/lite/c:common",
         "@tflite-micro//tensorflow/lite/kernels/internal:tensor",
-        "@tflite-micro//tensorflow/lite/micro/kernels:kernel_runner",
-        "@tflite-micro//tensorflow/lite/micro/testing:micro_test",
         "@tflite-micro//tensorflow/lite/micro:micro_utils",
         "@tflite-micro//tensorflow/lite/micro:test_helpers",
+        "@tflite-micro//tensorflow/lite/micro/kernels:kernel_runner",
+        "@tflite-micro//tensorflow/lite/micro/testing:micro_test",
+    ],
+)
+
+kelvin_test(
+    name = "leaky_relu_test",  # Wraps tflite-micro's leaky_relu_test.cc.
+    srcs = [
+        "@tflite-micro//tensorflow/lite/micro/kernels:leaky_relu_test.cc",
+    ],
+    deps = [
+        "//crt:crt_header",
+        "@tflite-micro//tensorflow/lite/c:common",
+        "@tflite-micro//tensorflow/lite/kernels/internal:tensor",
+        "@tflite-micro//tensorflow/lite/micro:micro_utils",
+        "@tflite-micro//tensorflow/lite/micro:test_helpers",
+        "@tflite-micro//tensorflow/lite/micro/kernels:kernel_runner",
+        "@tflite-micro//tensorflow/lite/micro/testing:micro_test",
     ],
 )
 
@@ -26,9 +43,9 @@
         "//crt:crt_header",
         "@tflite-micro//tensorflow/lite/c:common",
         "@tflite-micro//tensorflow/lite/kernels/internal:tensor",
-        "@tflite-micro//tensorflow/lite/micro/kernels:kernel_runner",
-        "@tflite-micro//tensorflow/lite/micro/testing:micro_test",
         "@tflite-micro//tensorflow/lite/micro:micro_utils",
         "@tflite-micro//tensorflow/lite/micro:test_helpers",
+        "@tflite-micro//tensorflow/lite/micro/kernels:kernel_runner",
+        "@tflite-micro//tensorflow/lite/micro/testing:micro_test",
     ],
 )
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
index 2ba3900..8da2593 100644
--- a/tflm/opt/BUILD
+++ b/tflm/opt/BUILD
@@ -3,18 +3,20 @@
 cc_library(
     name = "opt",
     srcs = [
-        "elementwise_add_s8.cc",
         "elementwise_add_s16.cc",
         "elementwise_add_s32.cc",
+        "elementwise_add_s8.cc",
+        "leaky_relu_s16.cc",
+        "leaky_relu_s8.cc",
         "memcpy.cc",
     ],
     hdrs = [
         "opt.h",
         "util.h",
     ],
+    target_compatible_with = ["@kelvin_sw//platforms/cpu:kelvin"],
     deps = [
         "//crt:crt_header",
     ],
     alwayslink = True,
-    target_compatible_with = ["@kelvin_sw//platforms/cpu:kelvin"],
 )
diff --git a/tflm/opt/leaky_relu_s16.cc b/tflm/opt/leaky_relu_s16.cc
new file mode 100644
index 0000000..e3ac66d
--- /dev/null
+++ b/tflm/opt/leaky_relu_s16.cc
@@ -0,0 +1,73 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <limits>
+
+#include "crt/kelvin.h"
+#include "tflm/opt/opt.h"
+#include "tflm/opt/util.h"
+
+namespace kelvin::opt {
+// Quantized LeakyRelu over `block_size` int16 elements of `input`, written to
+// `output`. Every lane is rescaled twice -- once with the identity and once
+// with the alpha multiplier/shift -- and the sign of
+// (input - input_zero_point) selects which clamped result is stored.
+void leaky_relu_s16(const int16_t* input, int16_t* output,
+                    const int32_t block_size, const int32_t input_zero_point,
+                    const int32_t output_zero_point,
+                    const int32_t output_multiplier_alpha,
+                    const int32_t output_shift_alpha,
+                    const int32_t output_multiplier_identity,
+                    const int32_t output_shift_identity) {
+  constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min();
+  constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max();
+  // Split each signed shift into a right (<=0) and a left (>=0) component.
+  int32_t right_shift_identity = std::min<int32_t>(output_shift_identity, 0);
+  int32_t left_shift_identity = std::max<int32_t>(output_shift_identity, 0);
+  int32_t right_shift_alpha = std::min<int32_t>(output_shift_alpha, 0);
+  int32_t left_shift_alpha = std::max<int32_t>(output_shift_alpha, 0);
+  int blocks = block_size;
+  int vl;
+  getmaxvl_h(vl);
+  while (blocks > 0) {  // A negative block_size does no work.
+    int count = std::min(blocks, vl);
+    // Load data from the input, and widen.
+    vld_h_lp_xx(v0, input, count);
+    vaddw_w_vx(v0, v0, 0);
+
+    // Subtract out the provided offset from the inputs.
+    vsub_w_vx_m(vm0, vm0, input_zero_point);
+
+    // Rescale all lanes with the identity parameters, as if they were >=0.
+    vsll_w_vx_m(vm2, vm0, left_shift_identity);
+    vdmulh_w_r_vx_m(vm2, vm2, output_multiplier_identity);
+    vsha_w_vx_m(vm2, vm2, RIGHT_SHIFT(right_shift_identity));
+    vadd_w_vx_m(vm2, vm2, output_zero_point);
+    vmax_w_vx_m(vm2, vm2, quantized_output_min);
+    vmin_w_vx_m(vm2, vm2, quantized_output_max);
+
+    // Rescale all lanes with the alpha parameters, as if they were <0.
+    vsll_w_vx_m(vm1, vm0, left_shift_alpha);
+    vdmulh_w_r_vx_m(vm1, vm1, output_multiplier_alpha);
+    vsha_w_vx_m(vm1, vm1, RIGHT_SHIFT(right_shift_alpha));
+    vadd_w_vx_m(vm1, vm1, output_zero_point);
+    vmax_w_vx_m(vm1, vm1, quantized_output_min);
+    vmin_w_vx_m(vm1, vm1, quantized_output_max);
+
+    // Boolean masks: vm3 = (in >= 0); vm0 is then reused for (in < 0).
+    vge_w_vx_m(vm3, vm0, 0);
+    vlt_w_vx_m(vm0, vm0, 0);
+    // Keep `identity` where >=0 and `alpha` where <0, then sum both into vm0.
+    vmul_w_vv_m(vm2, vm2, vm3);
+    vmul_w_vv_m(vm0, vm1, vm0);
+    vadd_w_vv_m(vm0, vm0, vm2);
+
+    // Narrow/swizzle, and store to output.
+    vsrans_h_vx(v0, v0, 0);
+    vst_h_lp_xx(v0, output, count);
+    blocks -= count;
+  }
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/leaky_relu_s8.cc b/tflm/opt/leaky_relu_s8.cc
new file mode 100644
index 0000000..15b9218
--- /dev/null
+++ b/tflm/opt/leaky_relu_s8.cc
@@ -0,0 +1,76 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <limits>
+
+#include "crt/kelvin.h"
+#include "tflm/opt/opt.h"
+#include "tflm/opt/util.h"
+
+namespace kelvin::opt {
+
+// Quantized LeakyRelu over `block_size` int8 elements of `input`, written to
+// `output`. Each lane is rescaled with both the identity and the alpha
+// parameters; the sign of (input - input_zero_point) selects the result.
+void leaky_relu_s8(const int8_t* input, int8_t* output,
+                   const int32_t block_size, const int32_t input_zero_point,
+                   const int32_t output_zero_point,
+                   const int32_t output_multiplier_alpha,
+                   const int32_t output_shift_alpha,
+                   const int32_t output_multiplier_identity,
+                   const int32_t output_shift_identity) {
+  // Clamp to the int8 range of `output` rather than to int16 limits.
+  constexpr int32_t quantized_output_min = std::numeric_limits<int8_t>::min();
+  constexpr int32_t quantized_output_max = std::numeric_limits<int8_t>::max();
+  // Split each signed shift into a right (<=0) and a left (>=0) component.
+  int32_t right_shift_identity = std::min<int32_t>(output_shift_identity, 0);
+  int32_t left_shift_identity = std::max<int32_t>(output_shift_identity, 0);
+  int32_t right_shift_alpha = std::min<int32_t>(output_shift_alpha, 0);
+  int32_t left_shift_alpha = std::max<int32_t>(output_shift_alpha, 0);
+  int blocks = block_size;
+  int vl;
+  getmaxvl_b(vl);
+  while (blocks > 0) {  // A negative block_size does no work.
+    int count = std::min(blocks, vl);
+    // Load data from the input, and widen (now we can use vm0).
+    vld_b_lp_xx(v0, input, count);
+    vaddw_h_vx(v0, v0, 0);
+    vaddw_w_vx(v2, v1, 0);
+    vaddw_w_vx(v0, v0, 0);
+
+    // Subtract out the provided offset from the inputs.
+    vsub_w_vx_m(vm0, vm0, input_zero_point);
+
+    // Rescale all lanes with the identity parameters, as if they were >=0.
+    vsll_w_vx_m(vm2, vm0, left_shift_identity);
+    vdmulh_w_r_vx_m(vm2, vm2, output_multiplier_identity);
+    vsha_w_vx_m(vm2, vm2, RIGHT_SHIFT(right_shift_identity));
+    vadd_w_vx_m(vm2, vm2, output_zero_point);
+    vmax_w_vx_m(vm2, vm2, quantized_output_min);
+    vmin_w_vx_m(vm2, vm2, quantized_output_max);
+
+    // Rescale all lanes with the alpha parameters, as if they were <0.
+    vsll_w_vx_m(vm1, vm0, left_shift_alpha);
+    vdmulh_w_r_vx_m(vm1, vm1, output_multiplier_alpha);
+    vsha_w_vx_m(vm1, vm1, RIGHT_SHIFT(right_shift_alpha));
+    vadd_w_vx_m(vm1, vm1, output_zero_point);
+    vmax_w_vx_m(vm1, vm1, quantized_output_min);
+    vmin_w_vx_m(vm1, vm1, quantized_output_max);
+
+    // Boolean masks: vm3 = (in >= 0); vm0 is then reused for (in < 0).
+    vge_w_vx_m(vm3, vm0, 0);
+    vlt_w_vx_m(vm0, vm0, 0);
+    // Keep `identity` where >=0 and `alpha` where <0, then sum both into vm0.
+    vmul_w_vv_m(vm2, vm2, vm3);
+    vmul_w_vv_m(vm0, vm1, vm0);
+    vadd_w_vv_m(vm0, vm0, vm2);
+
+    // Narrow/swizzle, and store to output.
+    vsraqs_b_vx(v0, v0, 0);
+    vst_b_lp_xx(v0, output, count);
+    blocks -= count;
+  }
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/opt.h b/tflm/opt/opt.h
index 12075ab..6009ee3 100644
--- a/tflm/opt/opt.h
+++ b/tflm/opt/opt.h
@@ -31,6 +31,20 @@
                          int32_t* output, const int32_t output_activation_min,
                          const int32_t output_activation_max,
                          const int32_t block_size);
+void leaky_relu_s8(const int8_t* input, int8_t* output,  // int8 LeakyRelu.
+                   const int32_t block_size, const int32_t input_zero_point,
+                   const int32_t output_zero_point,  // Added after rescale.
+                   const int32_t output_multiplier_alpha,  // For inputs < 0.
+                   const int32_t output_shift_alpha,  // For inputs < 0.
+                   const int32_t output_multiplier_identity,  // Inputs >= 0.
+                   const int32_t output_shift_identity);  // Inputs >= 0.
+void leaky_relu_s16(const int16_t* input, int16_t* output,  // int16 LeakyRelu.
+                    const int32_t block_size, const int32_t input_zero_point,
+                    const int32_t output_zero_point,  // Added after rescale.
+                    const int32_t output_multiplier_alpha,  // For inputs < 0.
+                    const int32_t output_shift_alpha,  // For inputs < 0.
+                    const int32_t output_multiplier_identity,  // Inputs >= 0.
+                    const int32_t output_shift_identity);  // Inputs >= 0.
 }  // namespace kelvin::opt
 
 #endif  // TFLM_OPT_OPT_H_