Merge "Depthwise conv for Depth % == 16"
diff --git a/docs/kelvin_isa.md b/docs/kelvin_isa.md
index 4b9b06c..5aaac0c 100644
--- a/docs/kelvin_isa.md
+++ b/docs/kelvin_isa.md
@@ -85,6 +85,10 @@
 where a stripmine register must use a mod4 base aligned register (eg. v0, v4,
 v8, ...). Normal instruction and stripmine variants may be mixed together.
 
+Currently, neither the assembler nor kelvin_sim checks for invalid stripmine
+registers. Code that uses an invalid register (such as v1) will assemble and
+simulate, but will cause the FPGA to hang.
+
 When stripmining is used in conjunction with instructions which use a register
 index as a base to several registers, the offset of +4 (instead of +1) shall be
 used. e.g., {vm0,vm1} becomes {{v0,v1,v2,v3},{v4,v5,v6,v7}}.
@@ -753,7 +757,7 @@
 
 ### ACONV
 
-Convolution ALU operation.
+Performs the matrix multiply vs1*vs3, accumulating into the accumulator.
 
 **Encodings**
 
@@ -787,29 +791,32 @@
          (signed(SData2,Data2{7:0}) + signed(Bias2{8:0})){9:0}){18:0}
 ```
 
-Length (stop - start + 1) is in 32bit accumulator lane count, as all inputs will
-horizontally reduce to this size.
+vs1 goes to the *narrow* port of the matmul; 8 vectors are always used.
 
-The Start and Stop definition allows for a partial window of input values to be
-transpose broadcast into the convolution unit.
+vs3 goes to the *wide* port of the matmul; up to 8 vectors are used.
+
+vx2 specifies the control parameters for the operation and has the following
+format:
 
 Mode   | Mode | Usage
 :----: | :--: | :-----------------------------------------------:
 Common |      | Mode[1:0] Start[6:2] Stop[11:7]
 s8     | 0    | SData2[31] SBias2[30:22] SData1[21] SBias1[20:12]
 
-```
-# SIMD256
-acc.out = {v48..55}
-narrow0 = {v0..7}
-narrow1 = {v16..23}
-narrow2 = {v32..39}
-narrow3 = {v48..55}
-wide0   = {v8..15}
-wide1   = {v24..31}
-wide2   = {v40..47}
-wide3   = {v56..63}
-```
+Start and Stop control the window of input values that participate in the
+matmul:
+- On vs1, the window is in 4-byte words, applied to all 8 vectors at once.
+- On vs3, the window selects which of the registers vs3+0 to vs3+7 are used.
+- The operation takes (stop - start + 1) ticks to complete.
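+
+As an illustration (not part of the ISA), the s8-mode vx2 control word could
+be packed from the fields above along these lines; the helper below is a
+sketch, not a real library function:
+
+```
+#include <cstdint>
+
+// Packs the ACONV vx2 control word for s8 mode (Mode[1:0] = 0), following
+// the bit layout in the table above. The function name is illustrative.
+uint32_t MakeAconvCmdS8(uint32_t start, uint32_t stop, bool sdata1,
+                        uint32_t sbias1, bool sdata2, uint32_t sbias2) {
+  uint32_t cmd = 0;                            // Mode[1:0] = 0 (s8)
+  cmd |= (start & 0x1f) << 2;                  // Start[6:2]
+  cmd |= (stop & 0x1f) << 7;                   // Stop[11:7]
+  cmd |= (sbias1 & 0x1ff) << 12;               // SBias1[20:12]
+  cmd |= static_cast<uint32_t>(sdata1) << 21;  // SData1[21]
+  cmd |= (sbias2 & 0x1ff) << 22;               // SBias2[30:22]
+  cmd |= static_cast<uint32_t>(sdata2) << 31;  // SData2[31]
+  return cmd;
+}
+```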
+
+When using SIMD256, the following operands are valid:
+- vd: v48
+- vs1: v0, v16, v32, v48
+- vs3: v8, v24, v40, v56
+
+Notes:
+- v48 is used as vd but never written to.
+- v48-v55 will always be overwritten upon VCGET.
 
 ### VCGET
 
@@ -830,6 +837,8 @@
 
 ```
 
+v48 is the only valid vd for this instruction.
+
 ### ACSET
 
 Copy general registers into convolution accumulators.
@@ -847,6 +856,8 @@
   Accum{Y} = vd{Y}
 ```
 
+Note that v48 is used as vd but never written to.
+
 --------------------------------------------------------------------------------
 
 ### ACTR
@@ -860,13 +871,15 @@
 **Operation**
 
 ```
-assert(vd in {v48})
+assert(vd == v48)
 assert(vs1 in {v0, v16, v32, v48}
 for I in i32.typelen
   for J in i32.typelen
     ACCUM[J][I] = vs1[I][J]
 ```
 
+Note that v48 is used as vd but never written to.
+
 --------------------------------------------------------------------------------
 
 ### VCLB
@@ -1813,7 +1826,7 @@
 
 vslidep.[b,h,w].[1,2,3,4].vv vd, vs1, vs2 \
 vslidevp.[b,h,w].[1,2,3,4].vv.m vd, vs1, vs2 \
-vslidevp.[b,h,w].[1,2,3,4].vv.m vd, vs1, xs2
+vslidevp.[b,h,w].[1,2,3,4].vx.m vd, vs1, xs2
 
 **Operation**
 
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
index b9c0d76..9957e7a 100644
--- a/tflm/opt/BUILD
+++ b/tflm/opt/BUILD
@@ -22,17 +22,20 @@
         "conv_s8.cc",
         "conv_s8_1x1.cc",
         "conv_s8_3x1_d48.cc",
-        "conv_s8_d4.cc",
-        "conv_s8_d32.cc",
         "conv_s8_d1.cc",
+        "conv_s8_d32.cc",
+        "conv_s8_d4.cc",
         "depthwise_conv_s16.cc",
         "depthwise_conv_s8.cc",
         "elementwise_add_s16.cc",
         "elementwise_add_s32.cc",
         "elementwise_add_s8.cc",
+        "elementwise_mul_s16.cc",
+        "elementwise_mul_s8.cc",
         "leaky_relu_s16.cc",
         "leaky_relu_s8.cc",
         "logistic_s8.cc",
+        "max_pool_s16.cc",
         "max_pool_s8.cc",
         "memcpy.cc",
         "resize_nearest_neighbor_s8.cc",
diff --git a/tflm/opt/elementwise_add_s16.cc b/tflm/opt/elementwise_add_s16.cc
index 106742b..bb445c3 100644
--- a/tflm/opt/elementwise_add_s16.cc
+++ b/tflm/opt/elementwise_add_s16.cc
@@ -20,16 +20,28 @@
 
 namespace kelvin::opt {
 
-void ElementwiseAddS16(const int16_t* input1, const int16_t* input2,
-                       const int32_t input1_offset, const int32_t input1_mult,
-                       const int32_t input1_shift, const int32_t input2_offset,
-                       const int32_t input2_mult, const int32_t input2_shift,
-                       const int32_t left_shift, int16_t* output,
-                       const int32_t output_offset, const int32_t output_mult,
-                       const int32_t output_shift,
-                       const int32_t output_activation_min,
-                       const int32_t output_activation_max,
-                       const int32_t block_size) {
+void ElementwiseAddS16(const tflite::ArithmeticParams& params,
+                       const tflite::RuntimeShape& input1_shape,
+                       const int16_t* input1,
+                       const tflite::RuntimeShape& input2_shape,
+                       const int16_t* input2,
+                       const tflite::RuntimeShape& output_shape,
+                       int16_t* output) {
+  const int32_t input1_offset = params.input1_offset;
+  const int32_t input1_mult = params.input1_multiplier;
+  const int32_t input1_shift = params.input1_shift;
+  const int32_t input2_offset = params.input2_offset;
+  const int32_t input2_mult = params.input2_multiplier;
+  const int32_t input2_shift = params.input2_shift;
+  const int32_t left_shift = params.left_shift;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_mult = params.output_multiplier;
+  const int32_t output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  const int block_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
   int blocks = block_size;
   int vl;
   getmaxvl_h(vl);
diff --git a/tflm/opt/elementwise_add_s32.cc b/tflm/opt/elementwise_add_s32.cc
index ab2b3d1..e83d1eb 100644
--- a/tflm/opt/elementwise_add_s32.cc
+++ b/tflm/opt/elementwise_add_s32.cc
@@ -18,10 +18,18 @@
 #include "tflm/opt/opt.h"
 
 namespace kelvin::opt {
-void ElementwiseAddS32(const int32_t* input1, const int32_t* input2,
-                       int32_t* output, const int32_t output_activation_min,
-                       const int32_t output_activation_max,
-                       const int32_t block_size) {
+void ElementwiseAddS32(const tflite::ArithmeticParams& params,
+                       const tflite::RuntimeShape& input1_shape,
+                       const int32_t* input1,
+                       const tflite::RuntimeShape& input2_shape,
+                       const int32_t* input2,
+                       const tflite::RuntimeShape& output_shape,
+                       int32_t* output) {
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  const int block_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
   int blocks = block_size;
   int vl;
   getmaxvl_w_m(vl);
diff --git a/tflm/opt/elementwise_add_s8.cc b/tflm/opt/elementwise_add_s8.cc
index e664769..9d24449 100644
--- a/tflm/opt/elementwise_add_s8.cc
+++ b/tflm/opt/elementwise_add_s8.cc
@@ -20,16 +20,28 @@
 
 namespace kelvin::opt {
 
-void ElementwiseAddS8(const int8_t* input1, const int8_t* input2,
-                      const int32_t input1_offset, const int32_t input1_mult,
-                      const int32_t input1_shift, const int32_t input2_offset,
-                      const int32_t input2_mult, const int32_t input2_shift,
-                      const int32_t left_shift, int8_t* output,
-                      const int32_t output_offset, const int32_t output_mult,
-                      const int32_t output_shift,
-                      const int32_t output_activation_min,
-                      const int32_t output_activation_max,
-                      const int32_t block_size) {
+void ElementwiseAddS8(const tflite::ArithmeticParams& params,
+                      const tflite::RuntimeShape& input1_shape,
+                      const int8_t* input1,
+                      const tflite::RuntimeShape& input2_shape,
+                      const int8_t* input2,
+                      const tflite::RuntimeShape& output_shape,
+                      int8_t* output) {
+  const int32_t input1_offset = params.input1_offset;
+  const int32_t input1_mult = params.input1_multiplier;
+  const int32_t input1_shift = params.input1_shift;
+  const int32_t input2_offset = params.input2_offset;
+  const int32_t input2_mult = params.input2_multiplier;
+  const int32_t input2_shift = params.input2_shift;
+  const int32_t left_shift = params.left_shift;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_mult = params.output_multiplier;
+  const int32_t output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  const int block_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
   int blocks = block_size;
 
   const int32_t input1_shift_mul = 1 << LEFT_SHIFT(input1_shift);
diff --git a/tflm/opt/elementwise_mul_s16.cc b/tflm/opt/elementwise_mul_s16.cc
new file mode 100644
index 0000000..478201c
--- /dev/null
+++ b/tflm/opt/elementwise_mul_s16.cc
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crt/kelvin.h"
+#include "tflm/opt/opt.h"
+#include "tflm/opt/util.h"
+
+namespace kelvin::opt {
+void MulS16(const tflite::ArithmeticParams& params,
+            const tflite::RuntimeShape& input1_shape,
+            const int16_t* input1_data,
+            const tflite::RuntimeShape& input2_shape,
+            const int16_t* input2_data,
+            const tflite::RuntimeShape& output_shape, int16_t* output_data) {
+  const int32_t input1_offset = params.input1_offset;
+  const int32_t input2_offset = params.input2_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_mult = params.output_multiplier;
+  const int32_t output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  const int block_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  int blocks = block_size;
+  int vl;
+  getmaxvl_h(vl);
+  while (blocks) {
+    int count = std::min(blocks, vl);
+
+    // Widen input1 to 32-bit wide values (in vm0, vm1).
+    vld_h_lp_xx_m(vm0, input1_data, count);
+    vaddw_w_vx_m(vm0, vm0, input1_offset);
+
+    // Widen input2 to 32-bit wide values (in vm2, vm3).
+    vld_h_lp_xx_m(vm2, input2_data, count);
+    vaddw_w_vx_m(vm2, vm2, input2_offset);
+
+    // Multiply the rescaled inputs.
+    vmul_w_vv_m(vm0, vm0, vm2);
+    vmul_w_vv_m(vm1, vm1, vm3);
+
+    // Rescale the products.
+    rescale_m(vm0, vm0, output_mult, output_shift, output_offset);
+    rescale_m(vm1, vm1, output_mult, output_shift, output_offset);
+
+    // Clamp to the provided range.
+    vmin_w_vx_m(vm0, vm0, output_activation_max);
+    vmin_w_vx_m(vm1, vm1, output_activation_max);
+    vmax_w_vx_m(vm0, vm0, output_activation_min);
+    vmax_w_vx_m(vm1, vm1, output_activation_min);
+
+    // Swizzle and narrow back to halfwords.
+    vand_w_vx_m(vm0, vm0, 0xFFFF);
+    vand_w_vx_m(vm1, vm1, 0xFFFF);
+    vsll_w_vx_m(vm1, vm1, 16);
+    vor_vv_m(vm0, vm0, vm1);
+
+    // Store to memory.
+    vst_h_lp_xx_m(vm0, output_data, count);
+
+    blocks -= count;
+  }
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/elementwise_mul_s8.cc b/tflm/opt/elementwise_mul_s8.cc
new file mode 100644
index 0000000..6e4ef77
--- /dev/null
+++ b/tflm/opt/elementwise_mul_s8.cc
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crt/kelvin.h"
+#include "tflm/opt/opt.h"
+#include "tflm/opt/util.h"
+
+namespace kelvin::opt {
+void MulS8(const tflite::ArithmeticParams& params,
+           const tflite::RuntimeShape& input1_shape, const int8_t* input1_data,
+           const tflite::RuntimeShape& input2_shape, const int8_t* input2_data,
+           const tflite::RuntimeShape& output_shape, int8_t* output_data) {
+  const int32_t input1_offset = params.input1_offset;
+  const int32_t input2_offset = params.input2_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_mult = params.output_multiplier;
+  const int32_t output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  const int block_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  int blocks = block_size;
+
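+  // Main loop: process 96 int8 elements per iteration as three 32-byte
+  // chunks, widening each chunk to 32-bit lanes with the input offsets
+  // applied before multiplying.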
+  while (blocks >= 96) {
+    vld_b_lp_xx(v0, input1_data, 32);
+    vld_b_lp_xx(v8, input2_data, 32);
+
+    vaddw_h_vx(v2, v0, 0);
+    vaddw_h_vx(v10, v8, 0);
+    vaddw_w_vx(v4, v2, input1_offset);
+    vaddw_w_vx(v6, v3, input1_offset);
+    vaddw_w_vx(v12, v10, input2_offset);
+    vaddw_w_vx(v14, v11, input2_offset);
+
+    vld_b_lp_xx(v16, input1_data, 32);
+    vld_b_lp_xx(v24, input2_data, 32);
+
+    vaddw_h_vx(v18, v16, 0);
+    vaddw_h_vx(v26, v24, 0);
+    vaddw_w_vx(v20, v18, input1_offset);
+    vaddw_w_vx(v22, v19, input1_offset);
+    vaddw_w_vx(v28, v26, input2_offset);
+    vaddw_w_vx(v30, v27, input2_offset);
+
+    vld_b_lp_xx(v32, input1_data, 32);
+    vld_b_lp_xx(v40, input2_data, 32);
+
+    vaddw_h_vx(v34, v32, 0);
+    vaddw_h_vx(v42, v40, 0);
+    vaddw_w_vx(v36, v34, input1_offset);
+    vaddw_w_vx(v38, v35, input1_offset);
+    vaddw_w_vx(v44, v42, input2_offset);
+    vaddw_w_vx(v46, v43, input2_offset);
+
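+    // Multiply the offset-adjusted 32-bit inputs of each chunk.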
+    vmul_w_vv_m(v12, v4, v12);
+    vmul_w_vv_m(v28, v20, v28);
+    vmul_w_vv_m(v44, v36, v44);
+
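+    // Rescale each product: rounding doubling multiply-high by output_mult,
+    // a rounding arithmetic shift by output_shift, then add output_offset.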
+    vdmulh_w_r_vx_m(v12, v12, output_mult);
+    vdmulh_w_r_vx_m(v28, v28, output_mult);
+    vdmulh_w_r_vx_m(v44, v44, output_mult);
+    vsha_w_r_vx_m(v12, v12, -output_shift);
+    vsha_w_r_vx_m(v28, v28, -output_shift);
+    vsha_w_r_vx_m(v44, v44, -output_shift);
+    vadd_w_vx_m(v12, v12, output_offset);
+    vadd_w_vx_m(v28, v28, output_offset);
+    vadd_w_vx_m(v44, v44, output_offset);
+
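+    // Clamp to the output activation range.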
+    vmin_w_vx_m(v12, v12, output_activation_max);
+    vmin_w_vx_m(v28, v28, output_activation_max);
+    vmin_w_vx_m(v44, v44, output_activation_max);
+    vmax_w_vx_m(v12, v12, output_activation_min);
+    vmax_w_vx_m(v28, v28, output_activation_min);
+    vmax_w_vx_m(v44, v44, output_activation_min);
+
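+    // Saturate each chunk back to int8 and store 32 bytes.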
+    vsraqs_b_vx(v12, v12, 0);
+    vst_b_lp_xx(v12, output_data, 32);
+    vsraqs_b_vx(v28, v28, 0);
+    vst_b_lp_xx(v28, output_data, 32);
+    vsraqs_b_vx(v44, v44, 0);
+    vst_b_lp_xx(v44, output_data, 32);
+
+    blocks -= 96;
+  }
+
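+  // Remainder loop: handle up to 32 elements at a time using length-limited
+  // loads and stores.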
+  while (blocks) {
+    int count = std::min(blocks, 32);
+    vld_b_lp_xx(v0, input1_data, count);
+    vld_b_lp_xx(v8, input2_data, count);
+
+    vaddw_h_vx(v2, v0, 0);
+    vaddw_h_vx(v10, v8, 0);
+    vaddw_w_vx(v4, v2, input1_offset);
+    vaddw_w_vx(v6, v3, input1_offset);
+    vaddw_w_vx(v12, v10, input2_offset);
+    vaddw_w_vx(v14, v11, input2_offset);
+
+    vmul_w_vv_m(v16, v4, v12);
+
+    rescale_m(v16, v16, output_mult, output_shift, output_offset);
+
+    vmin_w_vx_m(v16, v16, output_activation_max);
+    vmax_w_vx_m(v16, v16, output_activation_min);
+
+    vsraqs_b_vx(v16, v16, 0);
+    vst_b_lp_xx(v16, output_data, count);
+
+    blocks -= count;
+  }
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/leaky_relu_s16.cc b/tflm/opt/leaky_relu_s16.cc
index 7427a6c..c750f84 100644
--- a/tflm/opt/leaky_relu_s16.cc
+++ b/tflm/opt/leaky_relu_s16.cc
@@ -21,13 +21,17 @@
 #include "tflm/opt/util.h"
 
 namespace kelvin::opt {
-void LeakyReluS16(const int16_t* input, int16_t* output,
-                  const int32_t block_size, const int32_t input_zero_point,
-                  const int32_t output_zero_point,
-                  const int32_t output_multiplier_alpha,
-                  const int32_t output_shift_alpha,
-                  const int32_t output_multiplier_identity,
-                  const int32_t output_shift_identity) {
+void LeakyReluS16(const tflite::LeakyReluParams &params,
+                  const tflite::RuntimeShape &input_shape, const int16_t *input,
+                  const tflite::RuntimeShape &output_shape, int16_t *output) {
+  const int32_t input_zero_point = params.input_offset;
+  const int32_t output_zero_point = params.output_offset;
+  const int32_t output_multiplier_alpha = params.output_multiplier_alpha;
+  const int32_t output_shift_alpha = params.output_shift_alpha;
+  const int32_t output_multiplier_identity = params.output_multiplier_identity;
+  const int32_t output_shift_identity = params.output_shift_identity;
+  const int block_size = MatchingFlatSize(input_shape, output_shape);
+
   constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min();
   constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max();
   int32_t right_shift_identity = std::min(output_shift_identity, 0L);
diff --git a/tflm/opt/leaky_relu_s8.cc b/tflm/opt/leaky_relu_s8.cc
index 8b30d19..8e43100 100644
--- a/tflm/opt/leaky_relu_s8.cc
+++ b/tflm/opt/leaky_relu_s8.cc
@@ -22,19 +22,24 @@
 
 namespace kelvin::opt {
 
-void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size,
-                 const int32_t input_zero_point,
-                 const int32_t output_zero_point,
-                 const int32_t output_multiplier_alpha,
-                 const int32_t output_shift_alpha,
-                 const int32_t output_multiplier_identity,
-                 const int32_t output_shift_identity) {
+void LeakyReluS8(const tflite::LeakyReluParams& params,
+                 const tflite::RuntimeShape& input_shape, const int8_t* input,
+                 const tflite::RuntimeShape& output_shape, int8_t* output) {
+  const int32_t input_zero_point = params.input_offset;
+  const int32_t output_zero_point = params.output_offset;
+  const int32_t output_multiplier_alpha = params.output_multiplier_alpha;
+  const int32_t output_shift_alpha = params.output_shift_alpha;
+  const int32_t output_multiplier_identity = params.output_multiplier_identity;
+  const int32_t output_shift_identity = params.output_shift_identity;
+  const int block_size = MatchingFlatSize(input_shape, output_shape);
+
   constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min();
   constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max();
   int32_t right_shift_identity = std::min(output_shift_identity, 0L);
   int32_t left_shift_identity = std::max(output_shift_identity, 0L);
   int32_t right_shift_alpha = std::min(output_shift_alpha, 0L);
   int32_t left_shift_alpha = std::max(output_shift_alpha, 0L);
+
   int blocks = block_size;
   int vl;
   getmaxvl_b(vl);
diff --git a/tflm/opt/max_pool_s16.cc b/tflm/opt/max_pool_s16.cc
new file mode 100644
index 0000000..4e3aa46
--- /dev/null
+++ b/tflm/opt/max_pool_s16.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crt/kelvin.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace kelvin::opt {
+void MaxPoolS16(const tflite::PoolParams &params,
+                const tflite::RuntimeShape &input_shape,
+                const int16_t *input_data,
+                const tflite::RuntimeShape &output_shape,
+                int16_t *output_data) {
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin =
+            (out_x * stride_width) - params.padding_values.width;
+        const int in_y_origin =
+            (out_y * stride_height) - params.padding_values.height;
+
+        // Compute the boundaries of the filter region clamped so as to
+        // ensure that the filter window fits in the input array.
+        const int filter_x_start = std::max(0, -in_x_origin);
+        const int filter_x_end =
+            std::min(params.filter_width, input_width - in_x_origin);
+        const int filter_y_start = std::max(0, -in_y_origin);
+        const int filter_y_end =
+            std::min(params.filter_height, input_height - in_y_origin);
+
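+        // Pool 16 channels (one int16 vector register) at a time.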
+        int channel = 0;
+        for (; channel + 16 <= depth; channel += 16) {
+          vdup_h_x(v0, params.quantized_activation_min);
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              const int16_t *local_input =
+                  input_data + Offset(input_shape, batch, in_y, in_x, channel);
+              vld_h_x(v1, local_input);
+              vmax_h_vv(v0, v0, v1);
+            }
+          }
+          vmin_h_vx(v0, v0, params.quantized_activation_max);
+          int16_t *local_output =
+              output_data + Offset(output_shape, batch, out_y, out_x, channel);
+          vst_h_x(v0, local_output);
+        }
+
+        if (channel == depth) {
+          continue;
+        }
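+        // Handle the remaining channels with length-limited vector ops.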
+        int remaining_channels = depth - channel;
+        vdup_h_x(v0, params.quantized_activation_min);
+        for (int filter_y = filter_y_start; filter_y < filter_y_end;
+             ++filter_y) {
+          for (int filter_x = filter_x_start; filter_x < filter_x_end;
+               ++filter_x) {
+            const int in_x = in_x_origin + filter_x;
+            const int in_y = in_y_origin + filter_y;
+            const int16_t *local_input =
+                input_data + Offset(input_shape, batch, in_y, in_x, channel);
+            vld_h_l_xx(v1, local_input, remaining_channels);
+            vmax_h_vv(v0, v0, v1);
+          }
+        }
+        vmin_h_vx(v0, v0, params.quantized_activation_max);
+        int16_t *local_output =
+            output_data + Offset(output_shape, batch, out_y, out_x, channel);
+        vst_h_l_xx(v0, local_output, remaining_channels);
+      }
+    }
+  }
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/opt.h b/tflm/opt/opt.h
index e5169b2..053fbd6 100644
--- a/tflm/opt/opt.h
+++ b/tflm/opt/opt.h
@@ -25,44 +25,36 @@
 
 namespace kelvin::opt {
 void* Memcpy(void* dst, const void* src, size_t n);
-void ElementwiseAddS8(const int8_t* input1, const int8_t* input2,
-                      const int32_t input1_offset, const int32_t input1_mult,
-                      const int32_t input1_shift, const int32_t input2_offset,
-                      const int32_t input2_mult, const int32_t input2_shift,
-                      const int32_t left_shift, int8_t* output,
-                      const int32_t output_offset, const int32_t output_mult,
-                      const int32_t output_shift,
-                      const int32_t output_activation_min,
-                      const int32_t output_activation_max,
-                      const int32_t block_size);
-void ElementwiseAddS16(const int16_t* input1, const int16_t* input2,
-                       const int32_t input1_offset, const int32_t input1_mult,
-                       const int32_t input1_shift, const int32_t input2_offset,
-                       const int32_t input2_mult, const int32_t input2_shift,
-                       const int32_t left_shift, int16_t* output,
-                       const int32_t output_offset, const int32_t output_mult,
-                       const int32_t output_shift,
-                       const int32_t output_activation_min,
-                       const int32_t output_activation_max,
-                       const int32_t block_size);
-void ElementwiseAddS32(const int32_t* input1, const int32_t* input2,
-                       int32_t* output, const int32_t output_activation_min,
-                       const int32_t output_activation_max,
-                       const int32_t block_size);
-void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size,
-                 const int32_t input_zero_point,
-                 const int32_t output_zero_point,
-                 const int32_t output_multiplier_alpha,
-                 const int32_t output_shift_alpha,
-                 const int32_t output_multiplier_identity,
-                 const int32_t output_shift_identity);
-void LeakyReluS16(const int16_t* input, int16_t* output,
-                  const int32_t block_size, const int32_t input_zero_point,
-                  const int32_t output_zero_point,
-                  const int32_t output_multiplier_alpha,
-                  const int32_t output_shift_alpha,
-                  const int32_t output_multiplier_identity,
-                  const int32_t output_shift_identity);
+void ElementwiseAddS8(const tflite::ArithmeticParams& params,
+                      const tflite::RuntimeShape& input1_shape,
+                      const int8_t* input1_data,
+                      const tflite::RuntimeShape& input2_shape,
+                      const int8_t* input2_data,
+                      const tflite::RuntimeShape& output_shape,
+                      int8_t* output_data);
+void ElementwiseAddS16(const tflite::ArithmeticParams& params,
+                       const tflite::RuntimeShape& input1_shape,
+                       const int16_t* input1_data,
+                       const tflite::RuntimeShape& input2_shape,
+                       const int16_t* input2_data,
+                       const tflite::RuntimeShape& output_shape,
+                       int16_t* output_data);
+void ElementwiseAddS32(const tflite::ArithmeticParams& params,
+                       const tflite::RuntimeShape& input1_shape,
+                       const int32_t* input1_data,
+                       const tflite::RuntimeShape& input2_shape,
+                       const int32_t* input2_data,
+                       const tflite::RuntimeShape& output_shape,
+                       int32_t* output_data);
+void LeakyReluS8(const tflite::LeakyReluParams& params,
+                 const tflite::RuntimeShape& input_shape,
+                 const int8_t* input_data,
+                 const tflite::RuntimeShape& output_shape, int8_t* output_data);
+void LeakyReluS16(const tflite::LeakyReluParams& params,
+                  const tflite::RuntimeShape& input_shape,
+                  const int16_t* input_data,
+                  const tflite::RuntimeShape& output_shape,
+                  int16_t* output_data);
 void ConvS16B32(const tflite::ConvParams& params,
                 const int32_t* output_multiplier, const int32_t* output_shift,
                 const tflite::RuntimeShape& input_shape,
@@ -105,30 +97,30 @@
                const tflite::RuntimeShape& input_shape,
                const int8_t* input_data,
                const tflite::RuntimeShape& output_shape, int8_t* output_data);
+void MaxPoolS16(const tflite::PoolParams& params,
+                const tflite::RuntimeShape& input_shape,
+                const int16_t* input_data,
+                const tflite::RuntimeShape& output_shape, int16_t* output_data);
+void MulS8(const tflite::ArithmeticParams& params,
+           const tflite::RuntimeShape& input1_shape, const int8_t* input1_data,
+           const tflite::RuntimeShape& input2_shape, const int8_t* input2_data,
+           const tflite::RuntimeShape& output_shape, int8_t* output_data);
+void MulS16(const tflite::ArithmeticParams& params,
+            const tflite::RuntimeShape& input1_shape,
+            const int16_t* input1_data,
+            const tflite::RuntimeShape& input2_shape,
+            const int16_t* input2_data,
+            const tflite::RuntimeShape& output_shape, int16_t* output_data);
 void LogisticS8(int32_t input_zero_point, int32_t input_range_radius,
                 int32_t input_multiplier, int32_t input_left_shift,
                 int32_t input_size, const int8_t* input_data,
                 int8_t* output_data);
-void KelvinResizeNearestNeighbor(
+void ResizeNearestNeighborS8(
     const tflite::ResizeNearestNeighborParams& op_params,
     const tflite::RuntimeShape& unextended_input_shape,
     const int8_t* input_data, const tflite::RuntimeShape& output_size_shape,
     const int32_t* output_size_data,
     const tflite::RuntimeShape& unextended_output_shape, int8_t* output_data);
-void KelvinResizeNN2x(const tflite::ResizeNearestNeighborParams& op_params,
-                      const tflite::RuntimeShape& input_shape,
-                      const tflite::RuntimeShape& output_shape,
-                      const int32_t input_height, const int32_t input_width,
-                      const int32_t output_height, const int32_t output_width,
-                      const int8_t* input_data, int8_t* output_data);
-void KelvinResizeNNGeneric(const tflite::ResizeNearestNeighborParams& op_params,
-                           const tflite::RuntimeShape& input_shape,
-                           const tflite::RuntimeShape& output_shape,
-                           const int32_t input_height,
-                           const int32_t input_width,
-                           const int32_t output_height,
-                           const int32_t output_width, const int8_t* input_data,
-                           int8_t* output_data);
 
 }  // namespace kelvin::opt
 
diff --git a/tflm/opt/resize_nearest_neighbor_s8.cc b/tflm/opt/resize_nearest_neighbor_s8.cc
index 1f5cafb..8da7ee9 100644
--- a/tflm/opt/resize_nearest_neighbor_s8.cc
+++ b/tflm/opt/resize_nearest_neighbor_s8.cc
@@ -21,12 +21,12 @@
 #include "tensorflow/lite/kernels/internal/types.h"
 #include "tflm/opt/opt.h"
 
-inline int32_t KelvinGetNearestNeighbor(const int input_value,
-                                        const int32_t input_size,
-                                        const int32_t output_size,
-                                        const bool align_corners,
-                                        const bool half_pixel_centers,
-                                        const float scale, const float offset) {
+namespace kelvin::opt {
+namespace {
+int32_t GetNearestNeighbor(const int input_value, const int32_t input_size,
+                           const int32_t output_size, const bool align_corners,
+                           const bool half_pixel_centers, const float scale,
+                           const float offset) {
   int32_t output_value = std::min(
       align_corners
           ? static_cast<int32_t>(
@@ -39,14 +39,12 @@
   return output_value;
 }
 
-namespace kelvin::opt {
-
-void KelvinResizeNN2x(const tflite::ResizeNearestNeighborParams& op_params,
-                      const tflite::RuntimeShape& input_shape,
-                      const tflite::RuntimeShape& output_shape,
-                      const int32_t input_height, const int32_t input_width,
-                      const int32_t output_height, const int32_t output_width,
-                      const int8_t* input_data, int8_t* output_data) {
+void ResizeNN2x(const tflite::ResizeNearestNeighborParams& op_params,
+                const tflite::RuntimeShape& input_shape,
+                const tflite::RuntimeShape& output_shape,
+                const int32_t input_height, const int32_t input_width,
+                const int32_t output_height, const int32_t output_width,
+                const int8_t* input_data, int8_t* output_data) {
   int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
   int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
   const int col_offset = input_shape.Dims(3);
@@ -54,7 +52,6 @@
   const int batch_offset = input_shape.Dims(1) * row_offset;
 
   const int8_t* input_ptr = input_data;
-  const int8_t* input_tmp_ptr = input_data;
   int8_t* output_ptr = output_data;
 
   for (int b = 0; b < batches; ++b) {
@@ -83,14 +80,12 @@
   }
 }
 
-void KelvinResizeNNGeneric(const tflite::ResizeNearestNeighborParams& op_params,
-                           const tflite::RuntimeShape& input_shape,
-                           const tflite::RuntimeShape& output_shape,
-                           const int32_t input_height,
-                           const int32_t input_width,
-                           const int32_t output_height,
-                           const int32_t output_width, const int8_t* input_data,
-                           int8_t* output_data) {
+void ResizeNNGeneric(const tflite::ResizeNearestNeighborParams& op_params,
+                     const tflite::RuntimeShape& input_shape,
+                     const tflite::RuntimeShape& output_shape,
+                     const int32_t input_height, const int32_t input_width,
+                     const int32_t output_height, const int32_t output_width,
+                     const int8_t* input_data, int8_t* output_data) {
   int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
   int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
   const int col_offset = input_shape.Dims(3);
@@ -113,12 +108,12 @@
 
   for (int b = 0; b < batches; ++b) {
     for (int y = 0; y < output_height; ++y) {
-      int32_t in_y = KelvinGetNearestNeighbor(
+      int32_t in_y = GetNearestNeighbor(
           y, input_height, output_height, op_params.align_corners,
           op_params.half_pixel_centers, y_scale, offset);
       const int8_t* y_input_ptr = input_ptr + in_y * row_offset;
       for (int x = 0; x < output_width; ++x) {
-        int32_t in_x = KelvinGetNearestNeighbor(
+        int32_t in_x = GetNearestNeighbor(
             x, input_width, output_width, op_params.align_corners,
             op_params.half_pixel_centers, x_scale, offset);
         const int8_t* x_input_ptr = y_input_ptr + in_x * col_offset;
@@ -130,8 +125,9 @@
     input_ptr += batch_offset;
   }
 }
+}  // namespace
 
-void KelvinResizeNearestNeighbor(
+void ResizeNearestNeighborS8(
     const tflite::ResizeNearestNeighborParams& op_params,
     const tflite::RuntimeShape& unextended_input_shape,
     const int8_t* input_data, const tflite::RuntimeShape& output_size_shape,
@@ -153,14 +149,13 @@
   int32_t output_width = output_size_data[1];
 
   if (output_height == 2 * input_height && output_width == 2 * input_width) {
-    KelvinResizeNN2x(op_params, input_shape, output_shape, input_height,
-                     input_width, output_height, output_width, input_data,
-                     output_data);
+    ResizeNN2x(op_params, input_shape, output_shape, input_height, input_width,
+               output_height, output_width, input_data, output_data);
 
   } else {
-    KelvinResizeNNGeneric(op_params, input_shape, output_shape, input_height,
-                          input_width, output_height, output_width, input_data,
-                          output_data);
+    ResizeNNGeneric(op_params, input_shape, output_shape, input_height,
+                    input_width, output_height, output_width, input_data,
+                    output_data);
   }
 }