Merge "Support running a different Kelvin benchmark binary"
diff --git a/docs/tflm_ops.md b/docs/tflm_ops.md
new file mode 100644
index 0000000..1154ebf
--- /dev/null
+++ b/docs/tflm_ops.md
@@ -0,0 +1,25 @@
+# Optimized ops in Kelvin TFLM
+
+The following tables list the currently optimized ops in Kelvin TFLM. The
+relevant source code is located [here](https://opensecura.googlesource.com/sw/kelvin/+/refs/heads/master/tflm/opt).
+
+## Non-Convolutional Ops
+
+| Op              | Supported Data Type | Comments                                  |
+| :-------------- | :-----------------: | :---------------------------------------- |
+| Elementwise Add | s8, s16, s32        | Rescaling with offset and shift, clamping |
+| Leaky ReLU      | s8, s16             |                                           |
+| Max Pooling     | s8                  |                                           |
+
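+"Rescaling with offset and shift" refers to the usual TFLite quantized-add
+scheme: each input is offset and rescaled by a per-tensor multiplier and
+shift before the sum, and the sum is rescaled, offset, and clamped to the
+output range. Roughly (a sketch of the reference semantics, not the Kelvin
+kernel itself; the variable names are placeholders):
+
+```c++
+int32_t a = (a_raw + input1_offset) * (1 << left_shift);
+int32_t b = (b_raw + input2_offset) * (1 << left_shift);
+a = tflite::MultiplyByQuantizedMultiplier(a, input1_multiplier, input1_shift);
+b = tflite::MultiplyByQuantizedMultiplier(b, input2_multiplier, input2_shift);
+int32_t out = tflite::MultiplyByQuantizedMultiplier(a + b, output_multiplier,
+                                                    output_shift);
+out = std::clamp(out + output_offset, act_min, act_max);
+```
+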
+## Convolutional Ops
+
+| Op               | Weights | Activation | Bias | Comments                                |
+| :--------------- | :-----: | :--------: | :--: | :-------------------------------------- |
+| Depthwise Conv2d | s8      |     s16    | s64  | filter size 3x1                         |
+| Depthwise Conv2d | s8      |     s8     | s64  | output depth % 32 == 0                  |
+| Conv2d           | s8      |     s16    | s32  |                                         |
+| Conv2d           | s8      |     s16    | s64  | filter size 1x1, filter depth % 32 == 0 |
+| Conv2d           | s8      |     s16    | s64  | filter size 1xn, grouped or ungrouped   |
+| Conv2d           | s8      |     s8     | s32  | filter size 1x1, output depth % 8 == 0  |
+| Conv2d           | s8      |     s8     | s32  | filter depth % 32 == 0                  |
+| Conv2d           | s8      |     s8     | s32  | filter shape == (48x3x1x48)             |
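+
+As an example of reading the table: a `Conv2d` with s8 weights, s16
+activations, s64 bias, and a 1x1 filter whose depth is a multiple of 32 hits
+the optimized path inside `kelvin::opt::ConvS16B64`; shapes that match no row
+fall back to a generic implementation. A minimal call sketch (the shapes and
+quantization parameters below are hypothetical placeholders):
+
+```c++
+// Hypothetical 1x1 convolution: depth-32 input, 16 output channels.
+tflite::RuntimeShape input_shape({1, 4, 4, 32});
+tflite::RuntimeShape filter_shape({16, 1, 1, 32});  // filter depth % 32 == 0
+tflite::RuntimeShape bias_shape({16});
+tflite::RuntimeShape output_shape({1, 4, 4, 16});
+kelvin::opt::ConvS16B64(params, output_multiplier, output_shift, input_shape,
+                        input_data, filter_shape, filter_data, bias_shape,
+                        bias_data, output_shape, output_data);
+```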
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
index e4d533b..28dde26 100644
--- a/tflm/opt/BUILD
+++ b/tflm/opt/BUILD
@@ -17,7 +17,13 @@
 cc_library(
     name = "opt",
     srcs = [
-        "conv.cc",
+        "conv_s16_b32.cc",
+        "conv_s16_b64.cc",
+        "conv_s8.cc",
+        "conv_s8_1x1.cc",
+        "conv_s8_3x1_d48.cc",
+        "conv_s8_d4.cc",
+        "conv_s8_d32.cc",
         "depthwise_conv_s16.cc",
         "depthwise_conv_s8.cc",
         "elementwise_add_s16.cc",
@@ -25,10 +31,12 @@
         "elementwise_add_s8.cc",
         "leaky_relu_s16.cc",
         "leaky_relu_s8.cc",
+        "max_pool_s8.cc",
         "memcpy.cc",
-        "max_pool_s8.cc"
     ],
     hdrs = [
+        "conv_s8.h",
+        "conv_util.h",
         "opt.h",
         "util.h",
     ],
diff --git a/tflm/opt/conv.cc b/tflm/opt/conv.cc
deleted file mode 100644
index 8d33848..0000000
--- a/tflm/opt/conv.cc
+++ /dev/null
@@ -1,731 +0,0 @@
-/*
- * Copyright 2023 Google LLC
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cassert>
-#include <memory>
-
-#include "crt/kelvin.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/runtime_shape.h"
-#include "tensorflow/lite/kernels/internal/types.h"
-#include "tflm/opt/opt.h"
-#include "tflm/opt/util.h"
-
-namespace kelvin::opt {
-namespace {
-/* clang-format off */
-constexpr const int swizzle[16] = {
-    0, 4, 8, 12,
-    2, 6, 10, 14,
-    1, 5, 9, 13,
-    3, 7, 11, 15,
-};
-/* clang-format on */
-
-constexpr int kFilterHeightIndex = 1;
-constexpr int kFilterWidthIndex = 2;
-constexpr int kFilterInputChannelIndex = 3;
-constexpr int kInputChannelIndex = 3;
-constexpr int kOutputChannelIndex = 3;
-}  // namespace
-
-void conv_per_channel_b32(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data) {
-  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const auto stride_width = params.stride_width;
-  const auto stride_height = params.stride_height;
-  const auto dilation_width_factor = params.dilation_width_factor;
-  const auto dilation_height_factor = params.dilation_height_factor;
-  const auto pad_width = params.padding_values.width;
-  const auto pad_height = params.padding_values.height;
-  const auto input_height = input_shape.Dims(1);
-  const auto input_width = input_shape.Dims(2);
-  const auto input_depth = input_shape.Dims(3);
-  const auto input_offset = params.input_offset;
-  const auto filter_height = filter_shape.Dims(1);
-  const auto filter_width = filter_shape.Dims(2);
-  const auto filter_depth = filter_shape.Dims(3);
-  const auto output_height = output_shape.Dims(1);
-  const auto output_width = output_shape.Dims(2);
-  const auto output_depth = output_shape.Dims(3);
-  const auto output_offset = params.output_offset;
-  const auto output_activation_min = params.quantized_activation_min;
-  const auto output_activation_max = params.quantized_activation_max;
-  const auto groups = input_depth / filter_depth;
-  const auto filters_per_group = output_depth / groups;
-
-  for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      const int in_y_origin = out_y * stride_height - pad_height;
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin = out_x * stride_width - pad_width;
-        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
-          auto group = out_channel / filters_per_group;
-          int32_t acc32 = 0;
-          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-            const int in_y = in_y_origin + dilation_height_factor * filter_y;
-            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-              const int in_x = in_x_origin + dilation_width_factor * filter_x;
-              const bool inside = (in_x >= 0) && (in_x < input_width) &&
-                                  (in_y >= 0) && (in_y < input_height);
-              if (!inside) {
-                continue;
-              }
-              int in_channel = 0;
-              do {
-                int load_count = std::min(filter_depth - in_channel, 16L);
-                int32_t input_swizzled[16];
-                const int16_t* p_input = &input_data[tflite::Offset(
-                    input_shape, batch, in_y, in_x,
-                    in_channel + group * filter_depth)];
-                for (int i = 0; i < 16; ++i) {
-                  int swizzle_idx = swizzle[i];
-                  if (swizzle_idx < load_count)
-                    input_swizzled[i] = *(p_input + swizzle_idx) + input_offset;
-                  else
-                    input_swizzled[i] = 0;
-                }
-                vld_w_l_xx(v0, input_swizzled, 4);
-                vld_w_l_xx(v1, input_swizzled + 4, 4);
-                vld_w_l_xx(v2, input_swizzled + 8, 4);
-                vld_w_l_xx(v3, input_swizzled + 12, 4);
-                vld_b_l_xx(v4,
-                           &filter_data[tflite::Offset(filter_shape,
-                                                       out_channel, filter_y,
-                                                       filter_x, in_channel)],
-                           load_count);
-                vaddw_h_vx(v4, v4, 0);
-                vaddw_w_vx(v6, v5, 0);
-                vaddw_w_vx(v4, v4, 0);
-
-                vmul_w_vv_m(vm0, vm0, vm1);
-                vadd_w_vv(v0, v0, v1);
-                vadd_w_vv(v0, v0, v2);
-                vadd_w_vv(v0, v0, v3);
-                int32_t acc_spill[4];
-                vst_w_l_xx(v0, acc_spill, 4);
-                for (int i = 0; i < 4; ++i) {
-                  acc32 += acc_spill[i];
-                }
-                in_channel += 16;
-              } while (in_channel + 16 <= filter_depth);
-            }
-          }
-          if (bias_data) {
-            acc32 = acc32 + bias_data[out_channel];
-          }
-          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
-              acc32, output_multiplier[out_channel], output_shift[out_channel]);
-          acc += output_offset;
-          acc = std::clamp(acc, output_activation_min, output_activation_max);
-          output_data[tflite::Offset(output_shape, batch, out_y, out_x,
-                                     out_channel)] = static_cast<int16_t>(acc);
-        }
-      }
-    }
-  }
-}
-
-// Accumulates in v0-v7. [v0-v3], [v4-v7] are sub accumulators for two outputs.
-// Load/swizzle filters use [v52-v63].
-// Input activations use [v32-v33].
-// No clobbers.
-void ukernel_s8_s16(const int16_t* input_data0,
-                    const int8_t* filter_data0,
-                    const int8_t* filter_data1,
-                    size_t n) {
-  n = n >> 5;
-  while (n > 0) {
-    // Load filters 0 to v58, v59
-    vld_b_p_x(v52, filter_data0);
-    vaddw_h_vx(v56, v52, 0);
-    vzip_h_vv(v58, v56, v57);
-
-    // Load activations
-    vld_h_p_x(v32, input_data0);
-    vld_h_p_x(v33, input_data0);
-
-    // Multiply filters0 * activations
-    vmulw_w_vv(v16, v58, v32);
-    vmulw_w_vv(v18, v59, v33);
-
-    // Accumulate v0
-    vadd_w_vv_m(v0, v0, v16);
-
-    // Load filters 1 to v62, v63
-    vld_b_p_x(v53, filter_data1);
-    vaddw_h_vx(v60, v53, 0);
-    vzip_h_vv(v62, v60, v61);
-
-    // Multiply filters1 * activations
-    vmulw_w_vv(v20, v62, v32);
-    vmulw_w_vv(v22, v63, v33);
-
-    // Accumulate v4
-    vadd_w_vv_m(v4, v4, v20);
-    n--;
-  }
-}
-
-void conv_per_channel_b64_1x1(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data) {
-  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const auto input_height = input_shape.Dims(1);
-  const auto input_width = input_shape.Dims(2);
-  const auto input_depth = input_shape.Dims(3);
-  const auto input_offset = params.input_offset;
-  const auto filter_input_depth = filter_shape.Dims(3);
-  const auto output_depth = output_shape.Dims(3);
-  const auto output_offset = params.output_offset;
-  const auto output_activation_min = params.quantized_activation_min;
-  const auto output_activation_max = params.quantized_activation_max;
-  const auto groups = input_depth / filter_input_depth;
-  const auto output_filters_per_group = output_depth / groups;
-
-  int32_t accumulators[8];
-  for (int bhw = 0; bhw < batches * input_height * input_width; bhw++) {
-    const int16_t* local_input = input_data + (bhw * input_depth);
-    int16_t* local_output = output_data + (bhw * output_depth);
-    for (int g = 0; g < groups; g++) {
-      const int16_t* group_input = local_input + (g * filter_input_depth);
-      for (int gc = 0; gc + 2 <= output_filters_per_group; gc += 2) {
-        int oc = (g * output_filters_per_group) + gc;
-        const int8_t* local_filters0 = filter_data + (oc * filter_input_depth);
-        const int8_t* local_filters1 = local_filters0 + filter_input_depth;
-
-        vdup_w_x_m(v0, 0);
-        vdup_w_x_m(v4, 0);
-        ukernel_s8_s16(group_input, local_filters0, local_filters1,
-                       filter_input_depth);
-        // sum accumulators
-        vadd_w_vv(v0, v0, v1);
-        vadd_w_vv(v2, v2, v3);
-        vadd_w_vv(v0, v0, v2);
-        vadd_w_vv(v4, v4, v5);
-        vadd_w_vv(v6, v6, v7);
-        vadd_w_vv(v4, v4, v6);
-
-        {
-          vst_w_x(v0, accumulators);
-          int64_t acc64 = bias_data[oc];
-          for (int i = 0; i < 8; i++) {
-            acc64 += accumulators[i];
-          }
-          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
-                acc64, output_multiplier[oc], output_shift[oc]);
-          acc += output_offset;
-          acc = std::clamp(acc, output_activation_min, output_activation_max);
-          local_output[oc] = static_cast<int16_t>(acc);
-        }
-
-        {
-          vst_w_x(v4, accumulators);
-          int64_t acc64 = bias_data[oc + 1];
-          for (int i = 0; i < 8; i++) {
-            acc64 += accumulators[i];
-          }
-          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
-                acc64, output_multiplier[oc + 1], output_shift[oc + 1]);
-          acc += output_offset;
-          acc = std::clamp(acc, output_activation_min, output_activation_max);
-          local_output[oc + 1] = static_cast<int16_t>(acc);
-        }
-      }
-    }
-  }
-}
-
-// Optimized for grouped convolutions, no dilation, 1xn filter
-void conv_per_channel_b64_filter1xn_group(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data) {
-  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const auto stride_width = params.stride_width;
-  const auto pad_width = params.padding_values.width;
-  const auto input_width = input_shape.Dims(2);
-  const auto input_depth = input_shape.Dims(3);
-  const auto input_offset = params.input_offset;
-  const auto filter_width = filter_shape.Dims(2);
-  const auto filter_depth = filter_shape.Dims(3);
-  const auto output_width = output_shape.Dims(2);
-  const auto output_depth = output_shape.Dims(3);
-  const auto output_offset = params.output_offset;
-  const auto output_activation_min = params.quantized_activation_min;
-  const auto output_activation_max = params.quantized_activation_max;
-
-  const auto groups = input_depth / filter_depth;
-  const auto output_filters_per_group = output_depth / groups;
-
-  int32_t accumulators[8];
-  for (int g = 0; g < groups; g++) {
-    for (int gc = 0; gc + 2 <= output_filters_per_group; gc += 2) {
-      int oc = (g * output_filters_per_group) + gc;
-      for (int b = 0; b < batches; ++b) {
-        for (int out_x = 0; out_x < output_width; ++out_x) {
-          const int in_x_origin = out_x * stride_width - pad_width;
-          const int8_t* local_filters0 =
-              filter_data + (oc * filter_width * filter_depth);
-          const int8_t* local_filters1 =
-              local_filters0 + (filter_width * filter_depth);
-          const int16_t* local_input = input_data +
-            (b * input_width * input_depth) +
-            (in_x_origin * input_depth) +
-            (g * filter_depth);
-          int16_t* local_output = output_data +
-              (b * output_width * output_depth) +
-              (out_x * output_depth);
-
-          int64_t acc64_0 = 0;
-          int64_t acc64_1 = 0;
-          vdup_w_x_m(v0, 0);
-          vdup_w_x_m(v4, 0);
-          for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-            const int8_t* local_filters0x =
-                local_filters0 + (filter_x * filter_depth);
-            const int8_t* local_filters1x =
-                local_filters1 + (filter_x * filter_depth);
-            const int16_t* local_inputx =
-                local_input + (filter_x * input_depth);
-
-            ukernel_s8_s16(local_inputx, local_filters0x, local_filters1x,
-                           filter_depth);
-          }
-
-          // sum accumulators
-          vadd_w_vv(v0, v0, v1);
-          vadd_w_vv(v2, v2, v3);
-          vadd_w_vv(v0, v0, v2);
-          vadd_w_vv(v4, v4, v5);
-          vadd_w_vv(v6, v6, v7);
-          vadd_w_vv(v4, v4, v6);
-
-          {
-            vst_w_x(v0, accumulators);
-            for (int i = 0; i < 8; i++) {
-              acc64_0 += accumulators[i];
-            }
-            acc64_0 += bias_data[oc];
-            int32_t acc = tflite::MultiplyByQuantizedMultiplier(
-                  acc64_0, output_multiplier[oc], output_shift[oc]);
-            acc += output_offset;
-            acc = std::clamp(acc, output_activation_min, output_activation_max);
-            local_output[oc] = static_cast<int16_t>(acc);
-          }
-
-          {
-            vst_w_x(v4, accumulators);
-            for (int i = 0; i < 8; i++) {
-              acc64_1 += accumulators[i];
-            }
-            acc64_1 += bias_data[oc + 1];
-            int32_t acc = tflite::MultiplyByQuantizedMultiplier(
-                  acc64_1, output_multiplier[oc + 1], output_shift[oc + 1]);
-            acc += output_offset;
-            acc = std::clamp(acc, output_activation_min, output_activation_max);
-            local_output[oc + 1] = static_cast<int16_t>(acc);
-          }
-        }
-      }
-    }
-  }
-}
-
-// Optimized for no group, no dilation, 1xn filter.
-void conv_per_channel_b64_filter1xn_non_group(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data) {
-  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const auto stride_width = params.stride_width;
-  const auto pad_width = params.padding_values.width;
-  const auto input_width = input_shape.Dims(2);
-  const auto input_depth = input_shape.Dims(3);
-  const auto input_offset = params.input_offset;
-  const auto filter_width = filter_shape.Dims(2);
-  const auto filter_depth = filter_shape.Dims(3);
-  const auto output_width = output_shape.Dims(2);
-  const auto output_depth = output_shape.Dims(3);
-  const auto output_offset = params.output_offset;
-  const auto output_activation_min = params.quantized_activation_min;
-  const auto output_activation_max = params.quantized_activation_max;
-  int32_t accumulators[8];
-  for (int oc = 0; oc + 2 <= output_depth; oc += 2) {
-    for (int batch = 0; batch < batches; ++batch) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin = out_x * stride_width - pad_width;
-
-        const int8_t* local_filters0 =
-            filter_data + (oc * filter_width * filter_depth);
-        const int8_t* local_filters1 =
-            local_filters0 + (filter_width * filter_depth);
-        const int16_t* local_input = input_data +
-            (batch * input_width * input_depth) +
-            (in_x_origin * input_depth);
-        int16_t* local_output = output_data +
-            (batch * output_width * output_depth) +
-            (out_x * output_depth);
-
-        vdup_w_x_m(v0, 0);
-        vdup_w_x_m(v4, 0);
-        ukernel_s8_s16(local_input, local_filters0, local_filters1,
-                       filter_width * filter_depth);
-        // sum accumulators
-        vadd_w_vv(v0, v0, v1);
-        vadd_w_vv(v2, v2, v3);
-        vadd_w_vv(v0, v0, v2);
-        vadd_w_vv(v4, v4, v5);
-        vadd_w_vv(v6, v6, v7);
-        vadd_w_vv(v4, v4, v6);
-        {
-          vst_w_x(v0, accumulators);
-          int64_t acc64 = bias_data[oc];
-          for (int i = 0; i < 8; i++) {
-            acc64 += accumulators[i];
-          }
-          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
-                acc64, output_multiplier[oc], output_shift[oc]);
-          acc += output_offset;
-          acc = std::clamp(acc, output_activation_min, output_activation_max);
-          local_output[oc] = static_cast<int16_t>(acc);
-        }
-
-        {
-          vst_w_x(v4, accumulators);
-          int64_t acc64 = bias_data[oc + 1];
-          for (int i = 0; i < 8; i++) {
-            acc64 += accumulators[i];
-          }
-          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
-                acc64, output_multiplier[oc + 1], output_shift[oc + 1]);
-          acc += output_offset;
-          acc = std::clamp(acc, output_activation_min, output_activation_max);
-          local_output[oc + 1] = static_cast<int16_t>(acc);
-        }
-      }
-    }
-  }
-}
-
-void conv_per_channel_b64_generic(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data) {
-  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const auto stride_width = params.stride_width;
-  const auto stride_height = params.stride_height;
-  const auto dilation_width_factor = params.dilation_width_factor;
-  const auto dilation_height_factor = params.dilation_height_factor;
-  const auto pad_width = params.padding_values.width;
-  const auto pad_height = params.padding_values.height;
-  const auto input_height = input_shape.Dims(1);
-  const auto input_width = input_shape.Dims(2);
-  const auto input_depth = input_shape.Dims(3);
-  const auto input_offset = params.input_offset;
-  const auto filter_height = filter_shape.Dims(1);
-  const auto filter_width = filter_shape.Dims(2);
-  const auto filter_depth = filter_shape.Dims(3);
-  const auto output_height = output_shape.Dims(1);
-  const auto output_width = output_shape.Dims(2);
-  const auto output_depth = output_shape.Dims(3);
-  const auto output_offset = params.output_offset;
-  const auto output_activation_min = params.quantized_activation_min;
-  const auto output_activation_max = params.quantized_activation_max;
-  const auto groups = input_depth / filter_depth;
-  const auto filters_per_group = output_depth / groups;
-  for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      const int in_y_origin = out_y * stride_height - pad_height;
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin = out_x * stride_width - pad_width;
-        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
-          auto group = out_channel / filters_per_group;
-          int64_t acc64 = 0;
-          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-            const int in_y = in_y_origin + dilation_height_factor * filter_y;
-            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-              const int in_x = in_x_origin + dilation_width_factor * filter_x;
-              const bool inside = (in_x >= 0) && (in_x < input_width) &&
-                                  (in_y >= 0) && (in_y < input_height);
-              if (!inside) {
-                continue;
-              }
-
-              int in_channel = 0;
-              do {
-                int load_count = std::min(filter_depth - in_channel, 16L);
-                int32_t input_swizzled[16];
-                const int16_t* p_input = &input_data[tflite::Offset(
-                    input_shape, batch, in_y, in_x,
-                    in_channel + group * filter_depth)];
-                for (int i = 0; i < 16; ++i) {
-                  int swizzle_idx = swizzle[i];
-                  if (swizzle_idx < load_count)
-                    input_swizzled[i] = *(p_input + swizzle_idx) + input_offset;
-                  else
-                    input_swizzled[i] = 0;
-                }
-                vld_w_l_xx(v0, input_swizzled, 4);
-                vld_w_l_xx(v1, input_swizzled + 4, 4);
-                vld_w_l_xx(v2, input_swizzled + 8, 4);
-                vld_w_l_xx(v3, input_swizzled + 12, 4);
-                vld_b_l_xx(v4,
-                           &filter_data[tflite::Offset(filter_shape,
-                                                       out_channel, filter_y,
-                                                       filter_x, in_channel)],
-                           load_count);
-                vaddw_h_vx(v4, v4, 0);
-                vaddw_w_vx(v6, v5, 0);
-                vaddw_w_vx(v4, v4, 0);
-
-                vmul_w_vv_m(vm0, vm0, vm1);
-                vadd_w_vv(v0, v0, v1);
-                vadd_w_vv(v0, v0, v2);
-                vadd_w_vv(v0, v0, v3);
-                int32_t acc32[4];
-                vst_w_l_xx(v0, acc32, 4);
-                for (int i = 0; i < 4; ++i) {
-                  acc64 += acc32[i];
-                }
-                in_channel += 16;
-              } while (in_channel + 16 <= filter_depth);
-            }
-          }
-          if (bias_data) {
-            acc64 = acc64 + bias_data[out_channel];
-          }
-          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
-              acc64, output_multiplier[out_channel], output_shift[out_channel]);
-          acc += output_offset;
-          acc = std::clamp(acc, output_activation_min, output_activation_max);
-          output_data[tflite::Offset(output_shape, batch, out_y, out_x,
-                                     out_channel)] = static_cast<int16_t>(acc);
-        }
-      }
-    }
-  }
-}
-
-void conv_per_channel_b64(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data) {
-  if (filter_shape.Dims(kFilterHeightIndex) == 1 &&
-      output_shape.Dims(kOutputChannelIndex) % 2 == 0) {
-    if (filter_shape.Dims(kFilterWidthIndex) == 1 &&
-        filter_shape.Dims(kFilterInputChannelIndex) % 32 == 0) {
-      kelvin::opt::conv_per_channel_b64_1x1(
-          params, output_multiplier, output_shift, input_shape, input_data,
-          filter_shape, filter_data, bias_shape, bias_data, output_shape,
-          output_data);
-      return;
-    }
-
-    // TODO(derekjchow): Check for valid padding
-    bool group_conv = !(input_shape.Dims(kInputChannelIndex) ==
-        filter_shape.Dims(kFilterInputChannelIndex));
-    int32_t fan_in = filter_shape.Dims(kFilterWidthIndex) *
-        filter_shape.Dims(kFilterInputChannelIndex);
-    if (!group_conv && fan_in % 32 == 0) {
-      kelvin::opt::conv_per_channel_b64_filter1xn_non_group(
-          params, output_multiplier, output_shift, input_shape, input_data,
-          filter_shape, filter_data, bias_shape, bias_data, output_shape,
-          output_data);
-      return;
-    }
-
-    if (fan_in % 32 == 0) {
-      kelvin::opt::conv_per_channel_b64_filter1xn_group(
-          params, output_multiplier, output_shift, input_shape, input_data,
-          filter_shape, filter_data, bias_shape, bias_data, output_shape,
-          output_data);
-      return;
-    }
-  }
-
-  kelvin::opt::conv_per_channel_b64_generic(
-      params, output_multiplier, output_shift, input_shape, input_data,
-      filter_shape, filter_data, bias_shape, bias_data, output_shape,
-      output_data);
-}
-
-#define INA0 v0
-#define FLTA0 v8
-#define FLTA1 v9
-#define FLTA2 v10
-#define FLTA3 v11
-#define FLTA4 v12
-#define FLTA5 v13
-#define FLTA6 v14
-#define FLTA7 v15
-#define ACC v48
-#define ACC0 v48
-#define OUT0 v56
-void conv_per_channel_b8(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int8_t* output_data) {
-  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const auto stride_width = params.stride_width;
-  const auto stride_height = params.stride_height;
-  const auto dilation_width_factor = params.dilation_width_factor;
-  const auto dilation_height_factor = params.dilation_height_factor;
-  const auto pad_width = params.padding_values.width;
-  const auto pad_height = params.padding_values.height;
-  const auto input_height = input_shape.Dims(1);
-  const auto input_width = input_shape.Dims(2);
-  const auto input_depth = input_shape.Dims(3);
-  const auto input_offset = params.input_offset;
-  const auto filter_height = filter_shape.Dims(1);
-  const auto filter_width = filter_shape.Dims(2);
-  const auto filter_depth = filter_shape.Dims(3);
-  const auto output_height = output_shape.Dims(1);
-  const auto output_width = output_shape.Dims(2);
-  const auto output_depth = output_shape.Dims(3);
-  const auto output_offset = params.output_offset;
-  const auto output_activation_min = params.quantized_activation_min;
-  const auto output_activation_max = params.quantized_activation_max;
-  const auto groups = input_depth / filter_depth;
-  const auto filters_per_group = output_depth / groups;
-  union {
-    vconv_u8_t conv;
-    uint32_t raw;
-  } cmds;
-  cmds.conv.mode = 0;
-  cmds.conv.start = 0;
-  cmds.conv.stop = 7;
-  cmds.conv.sbias1 = input_offset;
-  cmds.conv.sdata1 = true;
-  cmds.conv.sbias2 = 0;
-  cmds.conv.sdata2 = true;
-
-  // Zero out accumulators.
-  vdup_b_x(v0, 0);
-  acset_v(ACC, v0);
-  vdup_b_x_m(ACC0, 0);
-  for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      const int in_y_origin = (out_y * stride_height) - pad_height;
-      for (int out_x = 0; out_x < output_width; /*out_x += 32*/ ++out_x) {
-        const int in_x_origin = (out_x * stride_width) - pad_width;
-        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
-          auto group = out_channel / filters_per_group;
-
-          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-            const int in_y = in_y_origin + dilation_height_factor * filter_y;
-            const int in_x = in_x_origin + dilation_width_factor * 0;
-
-            // Zero padding by omitting the areas outside the image.
-            const bool is_point_inside_image =
-                (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                (in_y < input_height);
-            if (!is_point_inside_image) {
-              continue;
-            }
-
-            int q = filter_width * filter_depth;
-            for (int i = 0; i < q; i += 32) {
-              int count = std::min(q - i, 32);
-              count = std::min(
-                  count, static_cast<int>((input_width - in_x) * filter_depth));
-              int input_offset = tflite::Offset(input_shape, batch, in_y, in_x,
-                                                group * filter_depth) +
-                                 i;
-              vdup_w_x_m(vm0, 0);
-              vdup_w_x_m(vm1, 0);
-              vld_b_l_xx(INA0, &input_data[input_offset], count);
-              int filter_offset =
-                  tflite::Offset(filter_shape, out_channel, filter_y, 0, 0) + i;
-              vdup_w_x_m(FLTA0, 0);
-              vdup_w_x_m(FLTA4, 0);
-              if (count > 0) {
-                vld_b_l_xx(FLTA0, &filter_data[filter_offset],
-                           std::min(count, 4));
-              }
-              if (count > 4) {
-                vld_b_l_xx(FLTA1, &filter_data[filter_offset + 4],
-                           std::min(count - 4, 4));
-              }
-              if (count > 8) {
-                vld_b_l_xx(FLTA2, &filter_data[filter_offset + 8],
-                           std::min(count - 8, 4));
-              }
-              if (count > 12) {
-                vld_b_l_xx(FLTA3, &filter_data[filter_offset + 12],
-                           std::min(count - 12, 4));
-              }
-              if (count > 16) {
-                vld_b_l_xx(FLTA4, &filter_data[filter_offset + 16],
-                           std::min(count - 16, 4));
-              }
-              if (count > 20) {
-                vld_b_l_xx(FLTA5, &filter_data[filter_offset + 20],
-                           std::min(count - 20, 4));
-              }
-              if (count > 24) {
-                vld_b_l_xx(FLTA6, &filter_data[filter_offset + 24],
-                           std::min(count - 24, 4));
-              }
-              if (count > 28) {
-                vld_b_l_xx(FLTA7, &filter_data[filter_offset + 28],
-                           std::min(count - 28, 4));
-              }
-              aconv_vxv(ACC, INA0, cmds, FLTA0);
-            }
-          }
-          vcget(ACC);
-          vadd_w_vx_m(ACC0, ACC0, bias_data[out_channel]);
-          vsll_w_vx_m(ACC0, ACC0, LEFT_SHIFT(output_shift[out_channel]));
-          vdmulh_w_r_vx_m(ACC0, ACC0, output_multiplier[out_channel]);
-          vsha_w_r_vx_m(ACC0, ACC0, RIGHT_SHIFT(output_shift[out_channel]));
-          vadd_w_vx_m(ACC0, ACC0, output_offset);
-          vmin_w_vx_m(ACC0, ACC0, output_activation_max);
-          vmax_w_vx_m(ACC0, ACC0, output_activation_min);
-          vsraqs_b_vx(OUT0, ACC0, 0);
-          size_t output_offset =
-              tflite::Offset(output_shape, batch, out_y, out_x, out_channel);
-          vst_b_l_xx(OUT0, &output_data[output_offset], 1);
-        }
-      }
-    }
-  }
-}
-}  // namespace kelvin::opt
diff --git a/tflm/opt/conv_s16_b32.cc b/tflm/opt/conv_s16_b32.cc
new file mode 100644
index 0000000..07625d0
--- /dev/null
+++ b/tflm/opt/conv_s16_b32.cc
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s16, filter: s8, bias: s32
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+namespace {
+void ConvS16B32Generic(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const auto stride_width = params.stride_width;
+  const auto stride_height = params.stride_height;
+  const auto dilation_width_factor = params.dilation_width_factor;
+  const auto dilation_height_factor = params.dilation_height_factor;
+  const auto pad_width = params.padding_values.width;
+  const auto pad_height = params.padding_values.height;
+  const auto input_height = input_shape.Dims(1);
+  const auto input_width = input_shape.Dims(2);
+  const auto input_depth = input_shape.Dims(3);
+  const auto input_offset = params.input_offset;
+  const auto filter_height = filter_shape.Dims(1);
+  const auto filter_width = filter_shape.Dims(2);
+  const auto filter_depth = filter_shape.Dims(3);
+  const auto output_height = output_shape.Dims(1);
+  const auto output_width = output_shape.Dims(2);
+  const auto output_depth = output_shape.Dims(3);
+  const auto output_offset = params.output_offset;
+  const auto output_activation_min = params.quantized_activation_min;
+  const auto output_activation_max = params.quantized_activation_max;
+  const auto groups = input_depth / filter_depth;
+  const auto filters_per_group = output_depth / groups;
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = out_y * stride_height - pad_height;
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = out_x * stride_width - pad_width;
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          auto group = out_channel / filters_per_group;
+          int32_t acc32 = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + dilation_height_factor * filter_y;
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              const bool inside = (in_x >= 0) && (in_x < input_width) &&
+                                  (in_y >= 0) && (in_y < input_height);
+              if (!inside) {
+                continue;
+              }
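+              // Process the filter depth in tiles of 16 channels; the
+              // activations are swizzled so their word-lane order matches the
+              // interleaving produced when the s8 filter values are widened.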
+              int in_channel = 0;
+              do {
+                int load_count = std::min(filter_depth - in_channel, 16L);
+                int32_t input_swizzled[16];
+                const int16_t* p_input = &input_data[tflite::Offset(
+                    input_shape, batch, in_y, in_x,
+                    in_channel + group * filter_depth)];
+                for (int i = 0; i < 16; ++i) {
+                  int swizzle_idx = swizzle[i];
+                  if (swizzle_idx < load_count)
+                    input_swizzled[i] = *(p_input + swizzle_idx) + input_offset;
+                  else
+                    input_swizzled[i] = 0;
+                }
+                vld_w_l_xx(v0, input_swizzled, 4);
+                vld_w_l_xx(v1, input_swizzled + 4, 4);
+                vld_w_l_xx(v2, input_swizzled + 8, 4);
+                vld_w_l_xx(v3, input_swizzled + 12, 4);
+                vld_b_l_xx(v4,
+                           &filter_data[tflite::Offset(filter_shape,
+                                                       out_channel, filter_y,
+                                                       filter_x, in_channel)],
+                           load_count);
+                vaddw_h_vx(v4, v4, 0);
+                vaddw_w_vx(v6, v5, 0);
+                vaddw_w_vx(v4, v4, 0);
+
+                vmul_w_vv_m(vm0, vm0, vm1);
+                vadd_w_vv(v0, v0, v1);
+                vadd_w_vv(v0, v0, v2);
+                vadd_w_vv(v0, v0, v3);
+                int32_t acc_spill[4];
+                vst_w_l_xx(v0, acc_spill, 4);
+                for (int i = 0; i < 4; ++i) {
+                  acc32 += acc_spill[i];
+                }
+                in_channel += 16;
+              } while (in_channel + 16 <= filter_depth);
+            }
+          }
+          if (bias_data) {
+            acc32 = acc32 + bias_data[out_channel];
+          }
+          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+              acc32, output_multiplier[out_channel], output_shift[out_channel]);
+          acc += output_offset;
+          acc = std::clamp(acc, output_activation_min, output_activation_max);
+          output_data[tflite::Offset(output_shape, batch, out_y, out_x,
+                                     out_channel)] = static_cast<int16_t>(acc);
+        }
+      }
+    }
+  }
+}
+}  // namespace
+
+void ConvS16B32(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // Generic implementation by default.
+  auto fn = ConvS16B32Generic;
+
+  // Special cases can be added below.
+
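+  // A sketch of what a future special case could look like; ConvS16B32K1x1
+  // is a hypothetical kernel name, not part of this change:
+  //
+  //   if (filter_shape.Dims(1) == 1 && filter_shape.Dims(2) == 1 &&
+  //       filter_shape.Dims(3) % 32 == 0) {
+  //     fn = ConvS16B32K1x1;
+  //   }
+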
+  fn(params, output_multiplier, output_shift, input_shape, input_data,
+     filter_shape, filter_data, bias_shape, bias_data, output_shape,
+     output_data);
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/conv_s16_b64.cc b/tflm/opt/conv_s16_b64.cc
new file mode 100644
index 0000000..48823dd
--- /dev/null
+++ b/tflm/opt/conv_s16_b64.cc
@@ -0,0 +1,454 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s16, filter: s8, bias: s64
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+namespace {
+// Accumulates in v0-v7. [v0-v3], [v4-v7] are sub accumulators for two outputs.
+// Load/swizzle filters use [v52-v63].
+// Input activations use [v32-v33].
+// No clobbers.
+void ConvUkernelS8S16(const int16_t* input_data0, const int8_t* filter_data0,
+                      const int8_t* filter_data1, size_t n) {
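+  // Each iteration consumes 32 activations (two 16-element halfword loads),
+  // so convert n from an element count to an iteration count.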
+  n = n >> 5;
+  while (n > 0) {
+    // Load filters 0 to v58, v59
+    vld_b_p_x(v52, filter_data0);
+    vaddw_h_vx(v56, v52, 0);
+    vzip_h_vv(v58, v56, v57);
+
+    // Load activations
+    vld_h_p_x(v32, input_data0);
+    vld_h_p_x(v33, input_data0);
+
+    // Multiply filters0 * activations
+    vmulw_w_vv(v16, v58, v32);
+    vmulw_w_vv(v18, v59, v33);
+
+    // Accumulate v0
+    vadd_w_vv_m(v0, v0, v16);
+
+    // Load filters 1 to v62, v63
+    vld_b_p_x(v53, filter_data1);
+    vaddw_h_vx(v60, v53, 0);
+    vzip_h_vv(v62, v60, v61);
+
+    // Multiply filters1 * activations
+    vmulw_w_vv(v20, v62, v32);
+    vmulw_w_vv(v22, v63, v33);
+
+    // Accumulate v4
+    vadd_w_vv_m(v4, v4, v20);
+    n--;
+  }
+}
+
+void ConvS16B64K1x1(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const auto input_height = input_shape.Dims(1);
+  const auto input_width = input_shape.Dims(2);
+  const auto input_depth = input_shape.Dims(3);
+  const auto filter_input_depth = filter_shape.Dims(3);
+  const auto output_depth = output_shape.Dims(3);
+  const auto output_offset = params.output_offset;
+  const auto output_activation_min = params.quantized_activation_min;
+  const auto output_activation_max = params.quantized_activation_max;
+  const auto groups = input_depth / filter_input_depth;
+  const auto output_filters_per_group = output_depth / groups;
+
+  int32_t accumulators[8];
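+  // With a 1x1 filter every spatial position is independent, so batch,
+  // height, and width collapse into a single loop.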
+  for (int bhw = 0; bhw < batches * input_height * input_width; bhw++) {
+    const int16_t* local_input = input_data + (bhw * input_depth);
+    int16_t* local_output = output_data + (bhw * output_depth);
+    for (int g = 0; g < groups; g++) {
+      const int16_t* group_input = local_input + (g * filter_input_depth);
+      for (int gc = 0; gc + 2 <= output_filters_per_group; gc += 2) {
+        int oc = (g * output_filters_per_group) + gc;
+        const int8_t* local_filters0 = filter_data + (oc * filter_input_depth);
+        const int8_t* local_filters1 = local_filters0 + filter_input_depth;
+
+        vdup_w_x_m(v0, 0);
+        vdup_w_x_m(v4, 0);
+        ConvUkernelS8S16(group_input, local_filters0, local_filters1,
+                         filter_input_depth);
+        // sum accumulators
+        vadd_w_vv(v0, v0, v1);
+        vadd_w_vv(v2, v2, v3);
+        vadd_w_vv(v0, v0, v2);
+        vadd_w_vv(v4, v4, v5);
+        vadd_w_vv(v6, v6, v7);
+        vadd_w_vv(v4, v4, v6);
+
+        {
+          vst_w_x(v0, accumulators);
+          int64_t acc64 = bias_data[oc];
+          for (int i = 0; i < 8; i++) {
+            acc64 += accumulators[i];
+          }
+          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+              acc64, output_multiplier[oc], output_shift[oc]);
+          acc += output_offset;
+          acc = std::clamp(acc, output_activation_min, output_activation_max);
+          local_output[oc] = static_cast<int16_t>(acc);
+        }
+
+        {
+          vst_w_x(v4, accumulators);
+          int64_t acc64 = bias_data[oc + 1];
+          for (int i = 0; i < 8; i++) {
+            acc64 += accumulators[i];
+          }
+          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+              acc64, output_multiplier[oc + 1], output_shift[oc + 1]);
+          acc += output_offset;
+          acc = std::clamp(acc, output_activation_min, output_activation_max);
+          local_output[oc + 1] = static_cast<int16_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+// Optimized for grouped convolutions, no dilation, 1xn filter
+void ConvS16B64K1xnGroup(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const auto stride_width = params.stride_width;
+  const auto pad_width = params.padding_values.width;
+  const auto input_width = input_shape.Dims(2);
+  const auto input_depth = input_shape.Dims(3);
+  const auto filter_width = filter_shape.Dims(2);
+  const auto filter_depth = filter_shape.Dims(3);
+  const auto output_width = output_shape.Dims(2);
+  const auto output_depth = output_shape.Dims(3);
+  const auto output_offset = params.output_offset;
+  const auto output_activation_min = params.quantized_activation_min;
+  const auto output_activation_max = params.quantized_activation_max;
+
+  const auto groups = input_depth / filter_depth;
+  const auto output_filters_per_group = output_depth / groups;
+
+  int32_t accumulators[8];
+  for (int g = 0; g < groups; g++) {
+    for (int gc = 0; gc + 2 <= output_filters_per_group; gc += 2) {
+      int oc = (g * output_filters_per_group) + gc;
+      for (int b = 0; b < batches; ++b) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin = out_x * stride_width - pad_width;
+          const int8_t* local_filters0 =
+              filter_data + (oc * filter_width * filter_depth);
+          const int8_t* local_filters1 =
+              local_filters0 + (filter_width * filter_depth);
+          const int16_t* local_input =
+              input_data + (b * input_width * input_depth) +
+              (in_x_origin * input_depth) + (g * filter_depth);
+          int16_t* local_output = output_data +
+                                  (b * output_width * output_depth) +
+                                  (out_x * output_depth);
+
+          int64_t acc64_0 = 0;
+          int64_t acc64_1 = 0;
+          vdup_w_x_m(v0, 0);
+          vdup_w_x_m(v4, 0);
+          for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+            const int8_t* local_filters0x =
+                local_filters0 + (filter_x * filter_depth);
+            const int8_t* local_filters1x =
+                local_filters1 + (filter_x * filter_depth);
+            const int16_t* local_inputx =
+                local_input + (filter_x * input_depth);
+
+            ConvUkernelS8S16(local_inputx, local_filters0x, local_filters1x,
+                             filter_depth);
+          }
+
+          // sum accumulators
+          vadd_w_vv(v0, v0, v1);
+          vadd_w_vv(v2, v2, v3);
+          vadd_w_vv(v0, v0, v2);
+          vadd_w_vv(v4, v4, v5);
+          vadd_w_vv(v6, v6, v7);
+          vadd_w_vv(v4, v4, v6);
+
+          {
+            vst_w_x(v0, accumulators);
+            for (int i = 0; i < 8; i++) {
+              acc64_0 += accumulators[i];
+            }
+            acc64_0 += bias_data[oc];
+            int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+                acc64_0, output_multiplier[oc], output_shift[oc]);
+            acc += output_offset;
+            acc = std::clamp(acc, output_activation_min, output_activation_max);
+            local_output[oc] = static_cast<int16_t>(acc);
+          }
+
+          {
+            vst_w_x(v4, accumulators);
+            for (int i = 0; i < 8; i++) {
+              acc64_1 += accumulators[i];
+            }
+            acc64_1 += bias_data[oc + 1];
+            int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+                acc64_1, output_multiplier[oc + 1], output_shift[oc + 1]);
+            acc += output_offset;
+            acc = std::clamp(acc, output_activation_min, output_activation_max);
+            local_output[oc + 1] = static_cast<int16_t>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Optimized for no group, no dilation, 1xn filter.
+void ConvS16B64K1xnNonGroup(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const auto stride_width = params.stride_width;
+  const auto pad_width = params.padding_values.width;
+  const auto input_width = input_shape.Dims(2);
+  const auto input_depth = input_shape.Dims(3);
+  const auto filter_width = filter_shape.Dims(2);
+  const auto filter_depth = filter_shape.Dims(3);
+  const auto output_width = output_shape.Dims(2);
+  const auto output_depth = output_shape.Dims(3);
+  const auto output_offset = params.output_offset;
+  const auto output_activation_min = params.quantized_activation_min;
+  const auto output_activation_max = params.quantized_activation_max;
+  int32_t accumulators[8];
+  for (int oc = 0; oc + 2 <= output_depth; oc += 2) {
+    for (int batch = 0; batch < batches; ++batch) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = out_x * stride_width - pad_width;
+
+        const int8_t* local_filters0 =
+            filter_data + (oc * filter_width * filter_depth);
+        const int8_t* local_filters1 =
+            local_filters0 + (filter_width * filter_depth);
+        const int16_t* local_input = input_data +
+                                     (batch * input_width * input_depth) +
+                                     (in_x_origin * input_depth);
+        int16_t* local_output = output_data +
+                                (batch * output_width * output_depth) +
+                                (out_x * output_depth);
+
+        vdup_w_x_m(v0, 0);
+        vdup_w_x_m(v4, 0);
+        ConvUkernelS8S16(local_input, local_filters0, local_filters1,
+                         filter_width * filter_depth);
+        // sum accumulators
+        vadd_w_vv(v0, v0, v1);
+        vadd_w_vv(v2, v2, v3);
+        vadd_w_vv(v0, v0, v2);
+        vadd_w_vv(v4, v4, v5);
+        vadd_w_vv(v6, v6, v7);
+        vadd_w_vv(v4, v4, v6);
+        {
+          vst_w_x(v0, accumulators);
+          int64_t acc64 = bias_data[oc];
+          for (int i = 0; i < 8; i++) {
+            acc64 += accumulators[i];
+          }
+          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+              acc64, output_multiplier[oc], output_shift[oc]);
+          acc += output_offset;
+          acc = std::clamp(acc, output_activation_min, output_activation_max);
+          local_output[oc] = static_cast<int16_t>(acc);
+        }
+
+        {
+          vst_w_x(v4, accumulators);
+          int64_t acc64 = bias_data[oc + 1];
+          for (int i = 0; i < 8; i++) {
+            acc64 += accumulators[i];
+          }
+          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+              acc64, output_multiplier[oc + 1], output_shift[oc + 1]);
+          acc += output_offset;
+          acc = std::clamp(acc, output_activation_min, output_activation_max);
+          local_output[oc + 1] = static_cast<int16_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+void ConvS16B64Generic(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const auto stride_width = params.stride_width;
+  const auto stride_height = params.stride_height;
+  const auto dilation_width_factor = params.dilation_width_factor;
+  const auto dilation_height_factor = params.dilation_height_factor;
+  const auto pad_width = params.padding_values.width;
+  const auto pad_height = params.padding_values.height;
+  const auto input_height = input_shape.Dims(1);
+  const auto input_width = input_shape.Dims(2);
+  const auto input_depth = input_shape.Dims(3);
+  const auto input_offset = params.input_offset;
+  const auto filter_height = filter_shape.Dims(1);
+  const auto filter_width = filter_shape.Dims(2);
+  const auto filter_depth = filter_shape.Dims(3);
+  const auto output_height = output_shape.Dims(1);
+  const auto output_width = output_shape.Dims(2);
+  const auto output_depth = output_shape.Dims(3);
+  const auto output_offset = params.output_offset;
+  const auto output_activation_min = params.quantized_activation_min;
+  const auto output_activation_max = params.quantized_activation_max;
+  const auto groups = input_depth / filter_depth;
+  const auto filters_per_group = output_depth / groups;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = out_y * stride_height - pad_height;
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = out_x * stride_width - pad_width;
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          auto group = out_channel / filters_per_group;
+          int64_t acc64 = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + dilation_height_factor * filter_y;
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              const bool inside = (in_x >= 0) && (in_x < input_width) &&
+                                  (in_y >= 0) && (in_y < input_height);
+              if (!inside) {
+                continue;
+              }
+
+              int in_channel = 0;
+              do {
+                int load_count = std::min(filter_depth - in_channel, 16L);
+                int32_t input_swizzled[16];
+                const int16_t* p_input = &input_data[tflite::Offset(
+                    input_shape, batch, in_y, in_x,
+                    in_channel + group * filter_depth)];
+                for (int i = 0; i < 16; ++i) {
+                  int swizzle_idx = swizzle[i];
+                  if (swizzle_idx < load_count)
+                    input_swizzled[i] = *(p_input + swizzle_idx) + input_offset;
+                  else
+                    input_swizzled[i] = 0;
+                }
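+                // Load the 16 swizzled words into v0..v3 so the input lanes
+                // line up with the lane order of the widened weights below.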
+                vld_w_l_xx(v0, input_swizzled, 4);
+                vld_w_l_xx(v1, input_swizzled + 4, 4);
+                vld_w_l_xx(v2, input_swizzled + 8, 4);
+                vld_w_l_xx(v3, input_swizzled + 12, 4);
+                vld_b_l_xx(v4,
+                           &filter_data[tflite::Offset(filter_shape,
+                                                       out_channel, filter_y,
+                                                       filter_x, in_channel)],
+                           load_count);
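+                // Widen the s8 weights to s16 (v4:v5), then to s32 (v4..v7);
+                // v5 is widened into v6:v7 first so it is not clobbered.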
+                vaddw_h_vx(v4, v4, 0);
+                vaddw_w_vx(v6, v5, 0);
+                vaddw_w_vx(v4, v4, 0);
+
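+                // Multiply the widened inputs by the widened weights, then
+                // reduce the four partial-sum registers into v0.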
+                vmul_w_vv_m(vm0, vm0, vm1);
+                vadd_w_vv(v0, v0, v1);
+                vadd_w_vv(v0, v0, v2);
+                vadd_w_vv(v0, v0, v3);
+                int32_t acc32[4];
+                vst_w_l_xx(v0, acc32, 4);
+                for (int i = 0; i < 4; ++i) {
+                  acc64 += acc32[i];
+                }
+                in_channel += 16;
+              } while (in_channel < filter_depth);
+            }
+          }
+          if (bias_data) {
+            acc64 = acc64 + bias_data[out_channel];
+          }
+          int32_t acc = tflite::MultiplyByQuantizedMultiplier(
+              acc64, output_multiplier[out_channel], output_shift[out_channel]);
+          acc += output_offset;
+          acc = std::clamp(acc, output_activation_min, output_activation_max);
+          output_data[tflite::Offset(output_shape, batch, out_y, out_x,
+                                     out_channel)] = static_cast<int16_t>(acc);
+        }
+      }
+    }
+  }
+}
+}  // namespace
+
+void ConvS16B64(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const auto input_depth = input_shape.Dims(3);
+  const auto filter_height = filter_shape.Dims(1);
+  const auto filter_width = filter_shape.Dims(2);
+  const auto filter_depth = filter_shape.Dims(3);
+  const auto output_depth = output_shape.Dims(3);
+
+  // generic implementation by default
+  auto fn = ConvS16B64Generic;
+
+  // special cases
+  if (filter_height == 1 && output_depth % 2 == 0) {
+    // 1x1 filter, filter depth = 32n
+    if (filter_width == 1 && filter_depth % 32 == 0) {
+      fn = ConvS16B64K1x1;
+    }
+
+    // 1xn non-group filter
+    bool group_conv = !(input_depth == filter_depth);
+    int32_t fan_in = filter_width * filter_depth;
+    if (!group_conv && fan_in % 32 == 0) {
+      fn = ConvS16B64K1xnNonGroup;
+    }
+
+    // 1xn group filter
+    if (group_conv && fan_in % 32 == 0) {
+      fn = ConvS16B64K1xnGroup;
+    }
+  }
+
+  fn(params, output_multiplier, output_shift, input_shape, input_data,
+     filter_shape, filter_data, bias_shape, bias_data, output_shape,
+     output_data);
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8.cc b/tflm/opt/conv_s8.cc
new file mode 100644
index 0000000..7d7d0ba
--- /dev/null
+++ b/tflm/opt/conv_s8.cc
@@ -0,0 +1,240 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+
+#include "tflm/opt/conv_s8.h"
+
+#include <algorithm>
+
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+namespace {
+void ConvS8Generic(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int8_t* output_data) {
+  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const auto stride_width = params.stride_width;
+  const auto stride_height = params.stride_height;
+  const auto dilation_width_factor = params.dilation_width_factor;
+  const auto dilation_height_factor = params.dilation_height_factor;
+  const auto pad_width = params.padding_values.width;
+  const auto pad_height = params.padding_values.height;
+  const auto input_height = input_shape.Dims(1);
+  const auto input_width = input_shape.Dims(2);
+  const auto input_depth = input_shape.Dims(3);
+  const auto input_offset = params.input_offset;
+  const auto filter_height = filter_shape.Dims(1);
+  const auto filter_width = filter_shape.Dims(2);
+  const auto filter_depth = filter_shape.Dims(3);
+  const auto output_height = output_shape.Dims(1);
+  const auto output_width = output_shape.Dims(2);
+  const auto output_depth = output_shape.Dims(3);
+  const auto output_offset = params.output_offset;
+  const auto output_activation_min = params.quantized_activation_min;
+  const auto output_activation_max = params.quantized_activation_max;
+  const auto groups = input_depth / filter_depth;
+  const auto filters_per_group = output_depth / groups;
+
+  if (pad_width > 0 || pad_height > 0 || dilation_width_factor > 1 ||
+      dilation_height_factor > 1) {
+    // use reference implementation
+    tflite::reference_integer_ops::ConvPerChannel(
+        params, output_multiplier, output_shift, input_shape, input_data,
+        filter_shape, filter_data, bias_shape, bias_data, output_shape,
+        output_data);
+    return;
+  }
+
+  union {
+    vconv_u8_t conv;
+    uint32_t raw;
+  } cmds;
+  cmds.conv.mode = 0;
+  cmds.conv.start = 0;
+  cmds.conv.stop = 7;
+  cmds.conv.sbias1 = input_offset;
+  cmds.conv.sdata1 = true;
+  cmds.conv.sbias2 = 0;
+  cmds.conv.sdata2 = true;
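+  // With start=0 and stop=7 the command consumes all eight 4-byte lanes (32
+  // channels) per input register; sbias1 folds in the input zero-point.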
+
+  // Zero out accumulators.
+  vdup_b_x(v0, 0);
+  acset_v(ACC, v0);
+  vdup_b_x_m(ACC0, 0);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          auto group = out_channel / filters_per_group;
+
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + dilation_height_factor * filter_y;
+            const int in_x = in_x_origin + dilation_width_factor * 0;
+
+            // Zero padding by omitting the areas outside the image.
+            const bool is_point_inside_image =
+                (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                (in_y < input_height);
+            if (!is_point_inside_image) {
+              continue;
+            }
+
+            int q = filter_width * filter_depth;
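+            // Walk the row's filter_width * filter_depth fan-in in 32-byte
+            // tiles, clipping each load to what remains inside the row.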
+            for (int i = 0; i < q; i += 32) {
+              int count = std::min(q - i, 32);
+              count = std::min(
+                  count, static_cast<int>((input_width - in_x) * filter_depth));
+              int in_offset = tflite::Offset(input_shape, batch, in_y, in_x,
+                                             group * filter_depth) +
+                              i;
+              vdup_w_x_m(vm0, 0);
+              vdup_w_x_m(vm1, 0);
+              vld_b_l_xx(INA0, &input_data[in_offset], count);
+              int filter_offset =
+                  tflite::Offset(filter_shape, out_channel, filter_y, 0, 0) + i;
+              vdup_w_x_m(FLTA0, 0);
+              vdup_w_x_m(FLTA4, 0);
+              if (count > 0) {
+                vld_b_l_xx(FLTA0, &filter_data[filter_offset],
+                           std::min(count, 4));
+              }
+              if (count > 4) {
+                vld_b_l_xx(FLTA1, &filter_data[filter_offset + 4],
+                           std::min(count - 4, 4));
+              }
+              if (count > 8) {
+                vld_b_l_xx(FLTA2, &filter_data[filter_offset + 8],
+                           std::min(count - 8, 4));
+              }
+              if (count > 12) {
+                vld_b_l_xx(FLTA3, &filter_data[filter_offset + 12],
+                           std::min(count - 12, 4));
+              }
+              if (count > 16) {
+                vld_b_l_xx(FLTA4, &filter_data[filter_offset + 16],
+                           std::min(count - 16, 4));
+              }
+              if (count > 20) {
+                vld_b_l_xx(FLTA5, &filter_data[filter_offset + 20],
+                           std::min(count - 20, 4));
+              }
+              if (count > 24) {
+                vld_b_l_xx(FLTA6, &filter_data[filter_offset + 24],
+                           std::min(count - 24, 4));
+              }
+              if (count > 28) {
+                vld_b_l_xx(FLTA7, &filter_data[filter_offset + 28],
+                           std::min(count - 28, 4));
+              }
+              aconv_vxv(ACC, INA0, cmds, FLTA0);
+            }
+          }
+          vcget(ACC);
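+          // Requantization pipeline: add the bias, apply the per-channel
+          // multiplier and shift, add the output zero-point, clamp to the
+          // activation range, and saturate down to int8.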
+          vadd_w_vx_m(ACC0, ACC0, bias_data[out_channel]);
+          vsll_w_vx_m(ACC0, ACC0, LEFT_SHIFT(output_shift[out_channel]));
+          vdmulh_w_r_vx_m(ACC0, ACC0, output_multiplier[out_channel]);
+          vsha_w_r_vx_m(ACC0, ACC0, RIGHT_SHIFT(output_shift[out_channel]));
+          vadd_w_vx_m(ACC0, ACC0, output_offset);
+          vmin_w_vx_m(ACC0, ACC0, output_activation_max);
+          vmax_w_vx_m(ACC0, ACC0, output_activation_min);
+          vsraqs_b_vx(OUT0, ACC0, 0);
+          size_t out_offset =
+              tflite::Offset(output_shape, batch, out_y, out_x, out_channel);
+          vst_b_l_xx(OUT0, &output_data[out_offset], 1);
+        }
+      }
+    }
+  }
+}
+}  // namespace
+
+void ConvS8(const tflite::ConvParams& params, const int32_t* output_multiplier,
+            const int32_t* output_shift,
+            const tflite::RuntimeShape& input_shape, const int8_t* input_data,
+            const tflite::RuntimeShape& filter_shape, const int8_t* filter_data,
+            const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+            const tflite::RuntimeShape& output_shape, int8_t* output_data) {
+  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const auto stride_width = params.stride_width;
+  const auto stride_height = params.stride_height;
+  const auto dilation_width_factor = params.dilation_width_factor;
+  const auto dilation_height_factor = params.dilation_height_factor;
+  const auto pad_width = params.padding_values.width;
+  const auto pad_height = params.padding_values.height;
+  const auto input_width = input_shape.Dims(2);
+  const auto input_depth = input_shape.Dims(3);
+  const auto filter_height = filter_shape.Dims(1);
+  const auto filter_width = filter_shape.Dims(2);
+  const auto filter_depth = filter_shape.Dims(3);
+  const auto output_width = output_shape.Dims(2);
+  const auto output_depth = output_shape.Dims(3);
+
+  // use generic implementation by default
+  auto fn = ConvS8Generic;
+
+  // special case of filter_depth = 4n
+  if (dilation_width_factor == 1 && dilation_height_factor == 1 &&
+      stride_width <= 2 && stride_height <= 2 && filter_depth % 4 == 0 &&
+      output_depth % 8 == 0 && output_width >= 8 && pad_width <= 1) {
+    fn = kelvin::opt::ConvS8D4;
+  }
+
+  // special case of filter depth = 32n
+  else if (dilation_width_factor == 1 && dilation_height_factor == 1 &&
+      stride_width <= 2 && stride_height <= 2 && filter_depth % 32 == 0) {
+    fn = kelvin::opt::ConvS8D32;
+  }
+
+  // special case of filter size 1x1
+  else if (filter_height == 1 && filter_width == 1 && stride_height == 1 &&
+      stride_width == 1 && dilation_height_factor == 1 &&
+      dilation_width_factor == 1 && pad_height == 0 && pad_width == 0 &&
+      (output_depth % 8) == 0 && (input_depth % 32) == 0) {
+    // TODO(ndodda): uncomment once all tests pass
+    // fn = kelvin::opt::ConvS8K1x1;
+  }
+
+  // special case of filter size 48x3x1x48
+  else if (batches == 1 && filter_height == 3 && filter_width == 1 &&
+      input_width == 1 && input_depth == 48 && output_depth == 48 &&
+      stride_height == 1 && stride_width == 1 && dilation_height_factor == 1 &&
+      dilation_width_factor == 1 && pad_height == 0 && pad_width == 0) {
+    fn = kelvin::opt::ConvS8K3x1D48;
+  }
+
+  fn(params, output_multiplier, output_shift, input_shape, input_data,
+     filter_shape, filter_data, bias_shape, bias_data, output_shape,
+     output_data);
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8.h b/tflm/opt/conv_s8.h
new file mode 100644
index 0000000..02dd79b
--- /dev/null
+++ b/tflm/opt/conv_s8.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TFLM_OPT_CONV_S8_H_
+#define TFLM_OPT_CONV_S8_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+
+namespace kelvin::opt {
+
+// filter 1x1
+void ConvS8K1x1(const tflite::ConvParams& params,
+                const int32_t* output_multiplier, const int32_t* output_shift,
+                const tflite::RuntimeShape& input_shape,
+                const int8_t* input_data,
+                const tflite::RuntimeShape& filter_shape,
+                const int8_t* filter_data,
+                const tflite::RuntimeShape& bias_shape,
+                const int32_t* bias_data,
+                const tflite::RuntimeShape& output_shape, int8_t* output_data);
+
+// filter depth 4n
+void ConvS8D4(const tflite::ConvParams& params,
+              const int32_t* output_multiplier, const int32_t* output_shift,
+              const tflite::RuntimeShape& input_shape,
+              const int8_t* input_data,
+              const tflite::RuntimeShape& filter_shape,
+              const int8_t* filter_data,
+              const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+              const tflite::RuntimeShape& output_shape, int8_t* output_data);
+
+// filter depth 32n
+void ConvS8D32(const tflite::ConvParams& params,
+               const int32_t* output_multiplier, const int32_t* output_shift,
+               const tflite::RuntimeShape& input_shape,
+               const int8_t* input_data,
+               const tflite::RuntimeShape& filter_shape,
+               const int8_t* filter_data,
+               const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+               const tflite::RuntimeShape& output_shape, int8_t* output_data);
+
+// filter size 48x3x1x48
+void ConvS8K3x1D48(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int8_t* output_data);
+
+}  // namespace kelvin::opt
+
+#endif  // TFLM_OPT_CONV_S8_H_
diff --git a/tflm/opt/conv_s8_1x1.cc b/tflm/opt/conv_s8_1x1.cc
new file mode 100644
index 0000000..9da99c3
--- /dev/null
+++ b/tflm/opt/conv_s8_1x1.cc
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+// Special case for 1x1 filter
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+
+void ConvS8K1x1(const tflite::ConvParams& params,
+                const int32_t* output_multiplier, const int32_t* output_shift,
+                const tflite::RuntimeShape& input_shape,
+                const int8_t* input_data,
+                const tflite::RuntimeShape& filter_shape,
+                const int8_t* filter_data,
+                const tflite::RuntimeShape& bias_shape,
+                const int32_t* bias_data,
+                const tflite::RuntimeShape& output_shape, int8_t* output_data) {
+  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const auto input_depth = input_shape.Dims(3);
+  const auto input_offset = params.input_offset;
+  const auto output_height = output_shape.Dims(1);
+  const auto output_width = output_shape.Dims(2);
+  const auto output_depth = output_shape.Dims(3);
+  const auto output_offset = params.output_offset;
+  const auto output_activation_min = params.quantized_activation_min;
+  const auto output_activation_max = params.quantized_activation_max;
+  // TODO: Support group convolutions.
+  int32_t bias[8 * 4];
+  int32_t mult[8 * 4];
+  int32_t shft[8 * 4];
+  union {
+    vconv_u8_t conv;
+    uint32_t raw;
+  } cmds;
+  cmds.conv.mode = 0;
+  cmds.conv.start = 0;
+  cmds.conv.stop = 7;
+  cmds.conv.sbias1 = input_offset;
+  cmds.conv.sdata1 = true;
+  cmds.conv.sbias2 = 0;
+  cmds.conv.sdata2 = true;
+  for (int zo_hi = 0; zo_hi < output_depth; zo_hi += 8) {
+    // Transpose filter weights to support outer-product multiplication.
+    int8_t juggled_filter_data[1][1][1][input_depth / 4][8][4];
+    Filter_N_H_W_M<8>(filter_data, juggled_filter_data[0][0][0][0][0], 1, 1,
+                      input_depth);
+
+    Swizzle(bias_data, bias, 8);
+    Swizzle(output_multiplier, mult, 8);
+    Swizzle(output_shift, shft, 8, true);
+    int out = 0;
+    for (; out + 8 <= output_height * output_width * batches; out += 8) {
+      // Reset the accumulators to clear out stale outputs.
+      vdup_b_x_m(v48, 0);
+      vdup_b_x_m(v52, 0);
+
+      int in = 0;
+      for (; in < input_depth; in += 32) {
+        vld_b_s_xx_m(v0, input_data + out * input_depth + in, input_depth);
+        vld_b_s_xx_m(v4, input_data + out * input_depth + in + 4 * input_depth,
+                     input_depth);
+
+        vld_b_x_m(v8, juggled_filter_data[0][0][0][in / 32][0][0]);
+        vld_b_x_m(v12, juggled_filter_data[0][0][0][(in / 32) + 4][0][0]);
+
+        aconv_vxv(v48, v0, cmds, v8);
+      }
+
+      INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_activation_min,
+                                    output_activation_max, output_offset, v16,
+                                    v20, v24);
+
+      // Store the results to output memory.
+      int8_t* p_out = output_data + (out * output_depth) + zo_hi;
+      vstq_b_sp_xx(v48, p_out, output_depth);
+      vstq_b_sp_xx(v52, p_out, output_depth);
+    }
+  }
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8_3x1_d48.cc b/tflm/opt/conv_s8_3x1_d48.cc
new file mode 100644
index 0000000..70a23b0
--- /dev/null
+++ b/tflm/opt/conv_s8_3x1_d48.cc
@@ -0,0 +1,325 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+// Special case for 48x3x1x48 filter
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+
+void ConvS8K3x1D48(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int8_t* output_data) {
+  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int32_t input_offset = params.input_offset;
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_depth = filter_shape.Dims(3);
+  const int output_height = output_shape.Dims(1);
+  const int output_depth = output_shape.Dims(3);
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK(batches == 1);
+  TFLITE_DCHECK(filter_depth == input_depth);
+  TFLITE_DCHECK(filter_height == 3);
+  TFLITE_DCHECK(filter_width == 1);
+  TFLITE_DCHECK(input_width == 1);
+  TFLITE_DCHECK(stride_width == 1);
+  TFLITE_DCHECK(stride_height == 1);
+  TFLITE_DCHECK(dilation_width_factor == 1);
+  TFLITE_DCHECK(dilation_height_factor == 1);
+  TFLITE_DCHECK(pad_width == 0);
+  TFLITE_DCHECK(pad_height == 0);
+
+  int32_t bias[48 * 4];
+  int32_t mult[48 * 4];
+  int32_t shft[48 * 4];
+  Swizzle(bias_data, bias, 48);
+  Swizzle(output_multiplier, mult, 48);
+  Swizzle(output_shift, shft, 48, true);
+
+  int8_t juggled_filter_data[48 / 8][3][1][48 / 4][8][4];
+  Filter_N_H_W_M<48>(filter_data, juggled_filter_data[0][0][0][0][0], 3, 1, 48);
+  union {
+    vconv_u8_t conv;
+    uint32_t raw;
+  } cmds;
+  cmds.conv.mode = 0;
+  cmds.conv.start = 0;
+  cmds.conv.stop = 7;
+  cmds.conv.sbias1 = input_offset;
+  cmds.conv.sdata1 = true;
+  cmds.conv.sbias2 = 0;
+  cmds.conv.sdata2 = true;
+
+  union {
+    vconv_u8_t conv;
+    uint32_t raw;
+  } cmds16;
+  cmds16.conv.mode = 0;
+  cmds16.conv.start = 0;
+  cmds16.conv.stop = 3;
+  cmds16.conv.sbias1 = input_offset;
+  cmds16.conv.sdata1 = true;
+  cmds16.conv.sbias2 = 0;
+  cmds16.conv.sdata2 = true;
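+  // cmds16 stops at lane 3 instead of 7, so the half iteration of the MAC
+  // pipeline below only accumulates 16 of the 32 loaded channels.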
+
+  for (int zo_hi = 0; zo_hi < output_depth; zo_hi += 8) {
+// For each pixel, the general flow for this kernel looks like:
+// 1) Reset accumulator and load activations into [v32, v46]
+// 2) For each group of 32 scalars in the pixel fan-in, run MAC pipeline
+//    2a) Load subset of activations from [v32, v46] to [v0, v7]
+//    2b) Load subset of weights
+//    2c) Run aconv
+// 3) Run the output pipeline and store.
+//
+// For step 1, we'll alias [v32, v46] to [L0, LE]. For most iterations,
+// we load all of these registers (10 pixels). For remainder iterations,
+// we load a subset and pad the rest with 0's. The data will be stored as
+// follows, where each letter represents 16 bytes of a pixel stored into
+// a register (capitalization used to help distinguish channels in a pixel):
+// L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD LE
+// Aa AB bB Cc CD dD Ee EF fF Gg GH hH Ii IJ jJ
+#define L0 v32
+#define L1 v33
+#define L2 v34
+#define L3 v35
+#define L4 v36
+#define L5 v37
+#define L6 v38
+#define L7 v39
+#define L8 v40
+#define L9 v41
+#define LA v42
+#define LB v43
+#define LC v44
+#define LD v45
+#define LE v46
+
+// We run 5 iterations of step 2: 4 full iterations and one half iteration.
+// Because each pixel takes 1.5 registers, we have to interleave vmv_v and
+// vsliden_w_4_vv instructions to ensure the same output channels are stored
+// in each register per-pixel. As a refresher, vsliden_w_4_vv takes two
+// register arguments (X and Y), and returns the concatenation of the last
+// half of X and the first half of Y, i.e.:
+// L1 L2
+// AB bB
+// vsliden_w_4_vv(v1, L1, L2); -> v1 = Bb
+#define CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt)              \
+  {                                                                  \
+    /* 1/5 */                                                        \
+    /* Ky = 0, IC:[0-31] */                                          \
+    vmv_v(v0, L0);              /* Aa */                             \
+    vsliden_w_4_vv(v1, L1, L2); /* Bb */                             \
+    vmv_v(v2, L3);              /* Cc */                             \
+    vsliden_w_4_vv(v3, L4, L5); /* Dd */                             \
+    vmv_v(v4, L6);              /* Ee */                             \
+    vsliden_w_4_vv(v5, L7, L8); /* Ff */                             \
+    vmv_v(v6, L9);              /* Gg */                             \
+    vsliden_w_4_vv(v7, LA, LB); /* Hh */                             \
+    vld_b_x_m(v56, p_flt + 128 * 0);                                 \
+    vld_b_x_m(v60, p_flt + 128 * 1);                                 \
+    aconv_vxv(v48, v0, cmds, v56);                                   \
+                                                                     \
+    /* 2/5 */                                                        \
+    /* Ky = 0, IC:[32-47]; Ky = 1, IC:[0-15] */                      \
+    vmv_v(v0, L1);              /* AB */                             \
+    vsliden_w_4_vv(v1, L2, L3); /* BC */                             \
+    vmv_v(v2, L4);              /* CD */                             \
+    vsliden_w_4_vv(v3, L5, L6); /* DE */                             \
+    vmv_v(v4, L7);              /* EF */                             \
+    vsliden_w_4_vv(v5, L8, L9); /* FG */                             \
+    vmv_v(v6, LA);              /* GH */                             \
+    vsliden_w_4_vv(v7, LB, LC); /* HI */                             \
+    vld_b_x_m(v56, p_flt + 128 * 2);                                 \
+    vld_b_x_m(v60, p_flt + 128 * 3);                                 \
+    aconv_vxv(v48, v0, cmds, v56);                                   \
+                                                                     \
+    /* 3/5 */                                                        \
+    /* Ky = 1, IC:[16-47] */                                         \
+    vmv_v(v0, L2);              /* bB */                             \
+    vsliden_w_4_vv(v1, L3, L4); /* cC */                             \
+    vmv_v(v2, L5);              /* dD */                             \
+    vsliden_w_4_vv(v3, L6, L7); /* eE */                             \
+    vmv_v(v4, L8);              /* fF */                             \
+    vsliden_w_4_vv(v5, L9, LA); /* gG */                             \
+    vmv_v(v6, LB);              /* hH */                             \
+    vsliden_w_4_vv(v7, LC, LD); /* iI */                             \
+    vld_b_x_m(v56, p_flt + 128 * 4);                                 \
+    vld_b_x_m(v60, p_flt + 128 * 5);                                 \
+    aconv_vxv(v48, v0, cmds, v56);                                   \
+                                                                     \
+    /* 4/5 */                                                        \
+    /* Ky = 2, IC:[0-31] */                                          \
+    vmv_v(v0, L3);              /* Cc */                             \
+    vsliden_w_4_vv(v1, L4, L5); /* Dd */                             \
+    vmv_v(v2, L6);              /* Ee */                             \
+    vsliden_w_4_vv(v3, L7, L8); /* Ff */                             \
+    vmv_v(v4, L9);              /* Gg */                             \
+    vsliden_w_4_vv(v5, LA, LB); /* Hh */                             \
+    vmv_v(v6, LC);              /* Ii */                             \
+    vsliden_w_4_vv(v7, LD, LE); /* Jj */                             \
+    vld_b_x_m(v56, p_flt + 128 * 6);                                 \
+    vld_b_x_m(v60, p_flt + 128 * 7);                                 \
+    aconv_vxv(v48, v0, cmds, v56);                                   \
+                                                                     \
+    /* 5/5 */                                                        \
+    /* Ky = 2, IC:[32-47] half iteration */                          \
+    vmv_v(v0, L4);              /* C(D- ignored) */                  \
+    vsliden_w_4_vv(v1, L5, L6); /* D(E- ignored) */                  \
+    vmv_v(v2, L7);              /* E(F- ignored) */                  \
+    vsliden_w_4_vv(v3, L8, L9); /* F(G- ignored) */                  \
+    vmv_v(v4, LA);              /* G(H- ignored) */                  \
+    vsliden_w_4_vv(v5, LB, LC); /* H(I- ignored) */                  \
+    vmv_v(v6, LD);              /* I(J- ignored) */                  \
+    /* Pad last iteration with first pixel. Gets ignored by cmds16 */ \
+    vsliden_w_4_vv(v7, LE, L0);      /* J(A- ignored) */             \
+    vld_b_x_m(v56, p_flt + 128 * 8); /* Load once, half iteration */  \
+    /* cmds16 runs subset of outer product */                        \
+    aconv_vxv(v48, v0, cmds16, v56);                                 \
+  }
+
+    // Iterate over outputs
+    int out_y = 0;
+    for (; out_y + 8 <= output_height; out_y += 8) {
+      // Reset accumulator
+      vdup_w_x_m(v48, 0);
+      vdup_w_x_m(v52, 0);
+      acset_v(v48, v48);
+
+      const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0];
+      const int8_t* p_in = input_data + (out_y * input_width * input_depth);
+
+      // Load 10*48 activations into 10*48/32 = 15 registers
+      vld_b_x_m(L0, p_in);
+      vld_b_x_m(L4, p_in + 32 * 4);
+      vld_b_x_m(L8, p_in + 32 * 8);
+      vld_b_x(LC, p_in + 32 * 12);
+      vld_b_x(LD, p_in + 32 * 13);
+      vld_b_x(LE, p_in + 32 * 14);
+
+      // MAC pipeline
+      CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt);
+
+      // Output pipeline
+      INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4,
+                                    shft + zo_hi * 4, output_activation_min,
+                                    output_activation_max, output_offset, v36,
+                                    v40, v44);
+      int8_t* p_out =
+          output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi);
+      vstq_b_sp_xx(v48, p_out, output_depth);
+      vstq_b_sp_xx(v52, p_out, output_depth);
+    }
+
+    // Handle the leftover output rows.
+    int remainder = output_height - out_y;
+    if (remainder != 0) {
+      // Reset accumulator
+      vdup_w_x_m(v48, 0);
+      vdup_w_x_m(v52, 0);
+      acset_v(v48, v48);
+
+      const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0];
+      const int8_t* p_in = input_data + (out_y * input_width * input_depth);
+
+      // Load (remainder + 2) * 48 activations
+      // L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD
+      // AA AB BB CC CD DD EE EF FF GG GH HH II I-
+      vld_b_x_m(L0, p_in);
+      vdup_w_x_m(L4, 0);
+      vdup_w_x_m(L8, 0);
+      vdup_w_x_m(LC, 0);
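+      // Deliberate fallthrough: each larger remainder loads its extra rows,
+      // then falls through to the loads shared with smaller remainders.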
+      switch (remainder) {
+        case 7:
+          vld_b_x(LD, p_in + 32 * 13);
+          vld_b_x(LC, p_in + 32 * 12);
+        case 6:
+          vld_b_x(LB, p_in + 32 * 11);
+        case 5:
+          vld_b_x(LA, p_in + 32 * 10);
+          vld_b_x(L9, p_in + 32 * 9);
+        case 4:
+          vld_b_x(L8, p_in + 32 * 8);
+        case 3:
+          vld_b_x(L7, p_in + 32 * 7);
+          vld_b_x(L6, p_in + 32 * 6);
+        case 2:
+          vld_b_x(L5, p_in + 32 * 5);
+        default:
+          break;
+      }
+      vld_b_x(L4, p_in + 32 * 4);
+
+      // MAC pipeline
+      CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt);
+
+      // Output pipeline
+      INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4,
+                                    shft + zo_hi * 4, output_activation_min,
+                                    output_activation_max, output_offset, v36,
+                                    v40, v44);
+
+      int8_t* p_out =
+          output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi);
+      uint8_t local_data[64];
+      vst_b_x(v48, local_data);
+      vst_b_x(v52, local_data + 32);
+      for (int i = 0; i < remainder; i++) {
+        memcpy(p_out + (i * output_depth), local_data + (i * 8), 8);
+      }
+    }
+
+#undef CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE
+#undef L0
+#undef L1
+#undef L2
+#undef L3
+#undef L4
+#undef L5
+#undef L6
+#undef L7
+#undef L8
+#undef L9
+#undef LA
+#undef LB
+#undef LC
+#undef LD
+#undef LE
+  }
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8_d32.cc b/tflm/opt/conv_s8_d32.cc
new file mode 100644
index 0000000..6572ae8
--- /dev/null
+++ b/tflm/opt/conv_s8_d32.cc
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+// Special case for filter depth = 32n
+
+#include "tflm/opt/conv_util.h"
+
+namespace kelvin::opt {
+
+// Fixed-point per-channel-quantized convolution kernel (filter depth 32n).
+void ConvS8D32(const tflite::ConvParams& params,
+               const int32_t* output_multiplier, const int32_t* output_shift,
+               const tflite::RuntimeShape& input_shape,
+               const int8_t* input_data,
+               const tflite::RuntimeShape& filter_shape,
+               const int8_t* filter_data,
+               const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+               const tflite::RuntimeShape& output_shape, int8_t* output_data) {
+  // Get parameters.
+  const int32_t input_offset = params.input_offset;  // r = s(q - Z)
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t output_offset = params.output_offset;
+
+  // Set min and max value of the output.
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  const int groups = input_depth / filter_input_depth;
+  TFLITE_DCHECK_NE(groups, 0);
+  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+  const int filters_per_group = output_depth / groups;
+  TFLITE_DCHECK_NE(filters_per_group, 0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+    for (int batch = 0; batch < batches; ++batch) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
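+          // v60..v63 accumulate 32 lanes of partial sums, one per input
+          // channel of the current 32-channel slice.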
+          vdup_w_x_m(v60, 0);
+          int32_t acc = 0;
+          for (int in_channel = 0; in_channel + 32 <= filter_input_depth;
+               in_channel += 32) {
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              const int in_y = in_y_origin + dilation_height_factor * filter_y;
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+
+                if (!is_point_inside_image) {
+                  continue;
+                }
+
+                vld_b_x(v0, &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                       in_x, in_channel)]);
+                vaddw_h_vx(v0, v0, 0);
+                vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset));
+                vadd_h_vx(v1, v1, static_cast<int16_t>(input_offset));
+                vld_b_x(v2, &filter_data[tflite::Offset(filter_shape,
+                                                        out_channel, filter_y,
+                                                        filter_x, in_channel)]);
+                vaddw_h_vx(v2, v2, 0);
+                vmulw_w_vv(v48, v0, v2);
+                vmulw_w_vv(v50, v1, v3);
+                vadd_w_vv_m(v60, v60, v48);
+              }
+            }
+          }
+          int32_t accumulators[32];
+          vst_w_x_m(v60, accumulators);
+          for (int i = 0; i < 32; ++i) {
+            acc += accumulators[i];
+          }
+
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          acc = tflite::MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[tflite::Offset(output_shape, batch, out_y, out_x,
+                                     out_channel)] = static_cast<int8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/conv_s8_d4.cc b/tflm/opt/conv_s8_d4.cc
new file mode 100644
index 0000000..0dd3e50
--- /dev/null
+++ b/tflm/opt/conv_s8_d4.cc
@@ -0,0 +1,395 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
+// Special case for filter depth = 4n
+
+#include <cstdlib>
+#include <memory>
+
+#include "crt/kelvin.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace kelvin::opt {
+namespace {
+void Filter_8_H_W_M(const int8_t* input, int8_t* output, int H, int W, int M) {
+  const int8_t(&in)[8][H][W][M] = *(int8_t(*)[8][H][W][M])input;
+  int8_t(&out)[H][W][M / 4][8][4] = *(int8_t(*)[H][W][M / 4][8][4]) output;
+  assert(M >= 4);
+  for (int zo = 0; zo < 8; ++zo) {
+    for (int ky = 0; ky < H; ++ky) {
+      for (int kx = 0; kx < W; ++kx) {
+        for (int zi = 0; zi < M; ++zi) {
+          const int zi_hi = zi >> 2;  // div4
+          const int zi_lo = zi & 3;   // rem4
+          out[ky][kx][zi_hi][zo][zi_lo] = in[zo][ky][kx][zi];
+        }
+      }
+    }
+  }
+}
+
+void Swizzle(const int32_t* input, int32_t* output, int N) {
+  const int32_t(&in)[N] = *(int32_t(*)[N])input;
+  int32_t(&out)[N * 4] = *(int32_t(*)[N * 4]) output;
+  // Convert to accumulator swizzle pattern.
+  for (int i = 0; i < N / 8; ++i) {
+    int32_t* out0 = out + i * 32 + 0;
+    int32_t* out1 = out + i * 32 + 16;
+    int32_t* out2 = out + i * 32 + 8;
+    int32_t* out3 = out + i * 32 + 24;
+    for (int j = 0; j < 4; ++j) {
+      const int32_t* p_in = in + i * 8;
+      for (int k = 0; k < 2; ++k) {
+        *out0++ = *p_in++;
+        *out1++ = *p_in++;
+        *out2++ = *p_in++;
+        *out3++ = *p_in++;
+      }
+    }
+  }
+}
+}  // namespace
+
+void ConvS8D4(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int8_t* output_data) {
+  // Get parameters.
+  const int32_t input_offset = params.input_offset;  // r = s(q - Z)
+  const int32_t neg_input_offset = -params.input_offset;  // used for padding
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t output_offset = params.output_offset;
+
+  // Set min and max value of the output.
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  const int groups = input_depth / filter_input_depth;
+  TFLITE_DCHECK_NE(groups, 0);
+  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+  const int filters_per_group = output_depth / groups;
+  TFLITE_DCHECK_NE(filters_per_group, 0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  union {
+    vconv_u8_t conv;
+    uint32_t raw;
+  } cmds;
+  cmds.conv.mode = 0;
+  cmds.conv.start = 0;
+  cmds.conv.stop = 7;
+  cmds.conv.sbias1 = input_offset;
+  cmds.conv.sdata1 = true;
+  cmds.conv.sbias2 = 0;
+  cmds.conv.sdata2 = true;
+
+  const size_t swizzled_filter_data_size =
+      8 * filter_height * filter_width * filter_input_depth;
+  std::unique_ptr<int8_t, decltype(&::free)> swizzled_filter_data(
+      reinterpret_cast<int8_t*>(
+          ::aligned_alloc(32, swizzled_filter_data_size)),
+      ::free);
+  int8_t* p_swizzled_filter_data = swizzled_filter_data.get();
+  int32_t swizzled_bias_data[32];
+  int32_t swizzled_mult_data[32];
+  int32_t swizzled_shift_data[32];
+
+  for (int out_channel = 0; out_channel + 8 <= output_depth; out_channel += 8) {
+    Filter_8_H_W_M(filter_data + (out_channel * filter_height * filter_width *
+                                  filter_input_depth),
+                   p_swizzled_filter_data, filter_height, filter_width,
+                   filter_input_depth);
+    Swizzle(bias_data + out_channel, swizzled_bias_data, 8);
+    Swizzle(output_multiplier + out_channel, swizzled_mult_data, 8);
+    Swizzle(output_shift + out_channel, swizzled_shift_data, 8);
+    vld_w_x_m(v16, swizzled_bias_data);
+    vld_w_x_m(v20, swizzled_mult_data);
+    vld_w_x_m(v24, swizzled_shift_data);
+    vrsub_w_vx_m(v24, v24, 0);
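+    // The swizzled shifts are negated up front so vsha_w_r_vv_m below can
+    // apply them as rounding right-shift amounts.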
+
+    for (int batch = 0; batch < batches; ++batch) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        int out_x = 0;
+        do {
+          int out_xs_this_iter = std::min(8, output_width - out_x);
+          // 8x accumulators
+          vdup_w_x_m(v48, 0);
+          vdup_w_x_m(v52, 0);
+          acset_v(v48, v48);
+          int in_channel = 0;
+          do {
+            int channels_this_iter = std::min(filter_input_depth, 32);
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              const int in_y = in_y_origin + dilation_height_factor * filter_y;
+              const bool is_row_inside_input =
+                  (in_y >= 0) && (in_y < input_height);
+              if (!is_row_inside_input) {
+                continue;
+              }
+
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                int in_x[8];
+                bool right_pad = false;
+                int first_right_pad = -1;
+                for (int i = 0; i < 8; ++i) {
+                  const int in_x_origin =
+                      ((out_x + i) * stride_width) - pad_width;
+                  in_x[i] = in_x_origin + dilation_width_factor * filter_x;
+                }
+                bool left_pad = (in_x[0] < 0);
+                for (int i = 7; i >= 0; --i) {
+                  if (in_x[i] < input_width) {
+                    break;
+                  }
+                  right_pad = true;
+                  first_right_pad = i;
+                }
+
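+                // Classify this strip of up to 8 output columns: left_pad if
+                // the first column falls left of the image, right_pad if any
+                // trailing column (first_right_pad onward) falls past it.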
+                if (left_pad) {
+                  vdup_b_x(v0, -input_offset);
+                  vld_b_s_xx(
+                      v1,
+                      &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                 in_x[1], in_channel)],
+                      input_depth * stride_width);
+                  vld_b_s_xx(
+                      v2,
+                      &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                 in_x[2], in_channel)],
+                      input_depth * stride_width);
+                  vld_b_s_xx(
+                      v3,
+                      &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                 in_x[3], in_channel)],
+                      input_depth * stride_width);
+                  vld_b_s_xx_m(
+                      v4,
+                      &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                 in_x[4], in_channel)],
+                      input_depth * stride_width);
+                } else if (right_pad) {
+                  int first_pad = std::min(first_right_pad, out_xs_this_iter);
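+                  // Fall through: pad every column from first_pad onward
+                  // with the negated input offset (cancels after sbias1).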
+                  switch (first_pad) {
+                    case 0:
+                      vdup_b_x(v0, neg_input_offset);
+                    case 1:
+                      vdup_b_x(v1, neg_input_offset);
+                    case 2:
+                      vdup_b_x(v2, neg_input_offset);
+                    case 3:
+                      vdup_b_x(v3, neg_input_offset);
+                    case 4:
+                      vdup_b_x(v4, neg_input_offset);
+                    case 5:
+                      vdup_b_x(v5, neg_input_offset);
+                    case 6:
+                      vdup_b_x(v6, neg_input_offset);
+                    case 7:
+                      vdup_b_x(v7, neg_input_offset);
+                  }
+                  switch (8 - first_pad) {  // Load the remaining columns.
+                    case 0:
+                      vld_b_s_xx(
+                          v7,
+                          &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                     in_x[7], in_channel)],
+                          input_depth * stride_width);
+                    case 1:
+                      vld_b_s_xx(
+                          v6,
+                          &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                     in_x[6], in_channel)],
+                          input_depth * stride_width);
+                    case 2:
+                      vld_b_s_xx(
+                          v5,
+                          &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                     in_x[5], in_channel)],
+                          input_depth * stride_width);
+                    case 3:
+                      vld_b_s_xx(
+                          v4,
+                          &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                     in_x[4], in_channel)],
+                          input_depth * stride_width);
+                    case 4:
+                      vld_b_s_xx(
+                          v3,
+                          &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                     in_x[3], in_channel)],
+                          input_depth * stride_width);
+                    case 5:
+                      vld_b_s_xx(
+                          v2,
+                          &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                     in_x[2], in_channel)],
+                          input_depth * stride_width);
+                    case 6:
+                      vld_b_s_xx(
+                          v1,
+                          &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                     in_x[1], in_channel)],
+                          input_depth * stride_width);
+                    case 7:
+                      vld_b_s_xx(
+                          v0,
+                          &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                     in_x[0], in_channel)],
+                          input_depth * stride_width);
+                  }
+                } else if (!left_pad && !right_pad) {
+                  // Inputs
+                  vld_b_s_xx_m(
+                      v0,
+                      &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                 in_x[0], in_channel)],
+                      input_depth * stride_width);
+                  vld_b_s_xx_m(
+                      v4,
+                      &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                 in_x[4], in_channel)],
+                      input_depth * stride_width);
+                } else {
+                  vdup_b_x(v0, -input_offset);
+                  vdup_b_x(v7, -input_offset);
+                  vld_b_s_xx_m(
+                      v1,
+                      &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                 in_x[1], in_channel)],
+                      input_depth * stride_width);
+                  vld_b_s_xx(
+                      v5,
+                      &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                 in_x[5], in_channel)],
+                      input_depth * stride_width);
+                  vld_b_s_xx(
+                      v6,
+                      &input_data[tflite::Offset(input_shape, batch, in_y,
+                                                 in_x[6], in_channel)],
+                      input_depth * stride_width);
+                }
+                size_t local_filter_offset =
+                    (filter_y * filter_width * 8 * filter_input_depth) +
+                    (filter_x * 8 * filter_input_depth) + (in_channel * 8);
+                int8_t* p_local_filter_start =
+                    p_swizzled_filter_data + local_filter_offset;
+                vld_b_p_x_m(v8, p_local_filter_start);
+                vld_b_x_m(v12, p_local_filter_start);
+
+                cmds.conv.stop = (channels_this_iter / 4) - 1;
+                aconv_vxv(v48, v0, cmds, v8);
+              }
+            }
+            in_channel += channels_this_iter;
+          } while (in_channel < filter_input_depth);
+          vcget(v48);
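+          // Requantize both accumulator groups: bias, rounding multiply-high,
+          // shift, output zero-point, clamp, then saturate to int8.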
+          vadd_w_vv_m(v48, v48, v16);
+          vadd_w_vv_m(v52, v52, v16);
+          vdmulh_w_rn_vv_m(v48, v48, v20);
+          vdmulh_w_rn_vv_m(v52, v52, v20);
+          vsha_w_r_vv_m(v48, v48, v24);
+          vsha_w_r_vv_m(v52, v52, v24);
+          vadd_w_vx_m(v48, v48, output_offset);
+          vadd_w_vx_m(v52, v52, output_offset);
+          vmin_w_vx_m(v48, v48, output_activation_max);
+          vmin_w_vx_m(v52, v52, output_activation_max);
+          vmax_w_vx_m(v48, v48, output_activation_min);
+          vmax_w_vx_m(v52, v52, output_activation_min);
+          vsraqs_b_vx(v56, v48, 0);
+          vsraqs_b_vx(v57, v52, 0);
+          if (out_xs_this_iter >= 4) {
+            vstq_b_s_xx(v56,
+                        &output_data[tflite::Offset(output_shape, batch, out_y,
+                                                    out_x, out_channel)],
+                        output_depth);
+          } else {
+            for (int i = 0; i < out_xs_this_iter; ++i) {
+              if (i > 0) {
+                vsliden_b_4_vv(v58, v56, v0);
+                vsliden_b_4_vv(v56, v58, v0);
+              }
+              vst_b_l_xx(v56,
+                        &output_data[tflite::Offset(output_shape, batch, out_y,
+                                                    out_x + i, out_channel)],
+                        8);
+            }
+          }
+          if (out_xs_this_iter == 8) {
+            vstq_b_s_xx(v57,
+                        &output_data[tflite::Offset(output_shape, batch, out_y,
+                                                    out_x + 4, out_channel)],
+                        output_depth);
+          } else if (out_xs_this_iter > 4) {
+            for (int i = 4; i < out_xs_this_iter; ++i) {
+              if (i > 4) {
+                vsliden_b_4_vv(v58, v57, v0);
+                vsliden_b_4_vv(v57, v58, v0);
+              }
+              vst_b_l_xx(v57,
+                         &output_data[tflite::Offset(output_shape, batch, out_y,
+                                                     out_x + i, out_channel)],
+                         8);
+            }
+          }
+          out_x += out_xs_this_iter;
+        } while (out_x < output_width);
+      }
+    }
+  }
+}
+}  // namespace kelvin::opt
diff --git a/tflm/opt/conv_util.h b/tflm/opt/conv_util.h
new file mode 100644
index 0000000..b9470aa
--- /dev/null
+++ b/tflm/opt/conv_util.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TFLM_OPT_CONV_UTIL_H_
+#define TFLM_OPT_CONV_UTIL_H_
+
+#include <cassert>
+#include <memory>
+
+#include "crt/kelvin.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tflm/opt/util.h"
+
+namespace kelvin::opt {
+/* clang-format off */
+constexpr const int swizzle[16] = {
+    0, 4, 8, 12,
+    2, 6, 10, 14,
+    1, 5, 9, 13,
+    3, 7, 11, 15,
+};
+/* clang-format on */
+
+constexpr int kFilterHeightIndex = 1;
+constexpr int kFilterWidthIndex = 2;
+constexpr int kFilterInputChannelIndex = 3;
+constexpr int kInputChannelIndex = 3;
+constexpr int kOutputChannelIndex = 3;
+
+#define INA0 v0
+#define FLTA0 v8
+#define FLTA1 v9
+#define FLTA2 v10
+#define FLTA3 v11
+#define FLTA4 v12
+#define FLTA5 v13
+#define FLTA6 v14
+#define FLTA7 v15
+#define ACC v48
+#define ACC0 v48
+#define OUT0 v56
+
+// H, W: filter height and width; N: number of output channels;
+// M: number of input channels.
+template <int N>
+inline void Filter_N_H_W_M(const int8_t* input, int8_t* output, int H, int W,
+                           int M) {
+  // Convert: input  [zo][ky][kx][zi] (N,H,W,M)
+  //          output [zo_hi=N/8][ky][kx][zi_hi=M/4][zo_lo=8][zi_lo=4]
+  const int8_t(&in)[N][H][W][M] = *(int8_t(*)[N][H][W][M])input;
+  int8_t(&out)[N / 8][H][W][M / 4][8][4] =
+      *(int8_t(*)[N / 8][H][W][M / 4][8][4]) output;
+  assert(N >= 4 && M >= 4);
+  for (int zo = 0; zo < N; ++zo) {
+    for (int ky = 0; ky < H; ++ky) {
+      for (int kx = 0; kx < W; ++kx) {
+        for (int zi = 0; zi < M; ++zi) {
+          const int zo_hi = zo >> 3;  // div8
+          const int zo_lo = zo & 7;   // rem8
+          const int zi_hi = zi >> 2;  // div4
+          const int zi_lo = zi & 3;   // rem4
+          out[zo_hi][ky][kx][zi_hi][zo_lo][zi_lo] = in[zo][ky][kx][zi];
+        }
+      }
+    }
+  }
+}
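+// Example (illustrative): with N = 8, M = 4, H = W = 1, element
+// in[5][0][0][2] lands at out[0][0][0][0][5][2].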
+
+// Swizzle values and duplicate them 4 times for stripmining.
+inline void Swizzle(const int32_t* input, int32_t* output, int N,
+                    bool negate = false) {
+  const int32_t(&in)[N] = *(int32_t(*)[N])input;
+  int32_t(&out)[N * 4] = *(int32_t(*)[N * 4]) output;
+  // Convert to accumulator swizzle pattern.
+  for (int i = 0; i < N / 8; ++i) {
+    int32_t* out0 = out + i * 32 + 0;
+    int32_t* out1 = out + i * 32 + 16;
+    int32_t* out2 = out + i * 32 + 8;
+    int32_t* out3 = out + i * 32 + 24;
+    for (int j = 0; j < 4; ++j) {
+      const int32_t* p_in = in + i * 8;
+      for (int k = 0; k < 2; ++k) {
+        *out0++ = *p_in++;
+        *out1++ = *p_in++;
+        *out2++ = *p_in++;
+        *out3++ = *p_in++;
+      }
+    }
+  }
+  if (negate) {
+    for (int i = 0; i < N * 4; ++i) {
+      out[i] = -out[i];
+    }
+  }
+}
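+// Example: for N = 8 (no negation), the 32 outputs are
+//   out[0..7]   = {in[0], in[4]} x4,  out[8..15]  = {in[2], in[6]} x4,
+//   out[16..23] = {in[1], in[5]} x4,  out[24..31] = {in[3], in[7]} x4.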
+
+// Run the output pipeline on the int32 accumulators in [v48-v55] and store
+// the packed int8 results in v48 and v52. Loads bias/mult/shft into
+// bias_reg/mult_reg/shift_reg; clobbers [v48-v55].
+#define INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_min,        \
+                                      output_max, output_offset, bias_reg, \
+                                      mult_reg, shift_reg)                 \
+  {                                                                        \
+    vcget(v48);                                                            \
+    vld_w_x_m(bias_reg, bias);                                             \
+    vld_w_x_m(mult_reg, mult);                                             \
+    vld_w_x_m(shift_reg, shft);                                            \
+    vadd_w_vv_m(v48, v48, bias_reg);                                       \
+    vadd_w_vv_m(v52, v52, bias_reg);                                       \
+    vdmulh_w_r_vv_m(v48, v48, mult_reg);                                   \
+    vdmulh_w_r_vv_m(v52, v52, mult_reg);                                   \
+    vsha_w_r_vv_m(v48, v48, shift_reg);                                    \
+    vsha_w_r_vv_m(v52, v52, shift_reg);                                    \
+    vadd_w_vx_m(v48, v48, output_offset);                                  \
+    vadd_w_vx_m(v52, v52, output_offset);                                  \
+    vmin_w_vx_m(v48, v48, output_max);                                     \
+    vmin_w_vx_m(v52, v52, output_max);                                     \
+    vmax_w_vx_m(v48, v48, output_min);                                     \
+    vmax_w_vx_m(v52, v52, output_min);                                     \
+    vsraqs_b_vx(v48, v48, 0);                                              \
+    vsraqs_b_vx(v52, v52, 0);                                              \
+  }
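+
+// Usage sketch (register choices and names are illustrative): after an
+// aconv sequence,
+//   INT32_TO_INT8_OUTPUT_PIPELINE(swizzled_bias, swizzled_mult,
+//                                 swizzled_shift, output_activation_min,
+//                                 output_activation_max, output_offset,
+//                                 v16, v20, v24);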
+}  // namespace kelvin::opt
+
+#endif  // TFLM_OPT_CONV_UTIL_H_
diff --git a/tflm/opt/depthwise_conv_s16.cc b/tflm/opt/depthwise_conv_s16.cc
index 13ae125..c7db407 100644
--- a/tflm/opt/depthwise_conv_s16.cc
+++ b/tflm/opt/depthwise_conv_s16.cc
@@ -14,23 +14,32 @@
  * limitations under the License.
  */
 
-#include <algorithm>
+// Depthwise convolution based on Kelvin ops
+// Data types: input: s16, filter: s8, bias: s64
 
-#include "crt/kelvin.h"
-#include "tflm/opt/opt.h"
-#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
+#include "tflm/opt/conv_util.h"
 
 namespace kelvin::opt {
+namespace {
+void DepthwiseConvS16K3x1(
+    const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const int16_t* activations = input_data;
+  const int8_t* weights = filter_data;
+  const int64_t* biases = bias_data;
+  int channels = filter_shape.Dims(3);
+  int frames = input_shape.Dims(2);
+  int dilation = params.dilation_width_factor;
+  const int32_t* output_mult = output_multiplier;
+  int32_t output_activation_min = params.quantized_activation_min;
+  int32_t output_activation_max = params.quantized_activation_max;
+  int16_t* output = output_data;
 
-void DepthwiseConv2DKelvinS16K3x1(const int16_t* activations,
-                                  const int8_t* weights,
-                                  const int64_t* biases,
-                                  int channels, int frames, int dilation,
-                                  const int32_t* output_mult,
-                                  const int32_t* output_shift,
-                                  int32_t output_activation_min,
-                                  int32_t output_activation_max,
-                                  int16_t* output) {
   for (int c = 0; c + 32 <= channels; c += 32) {
     // Load weights and interleave into correct order [v58-v63].
     // Because there are more activations than weights, interleave weights.
@@ -78,8 +87,8 @@
       for (; frames_idx < frames; frames_idx += dilation) {
         vld_h_p_xx(v4, local_activations0, step);
         vld_h_p_xx(v5, local_activations1, step);
-        vmulw_w_vv(v48, v58, v0);  // Clobber accumulator
-        vmulw_w_vv(v50, v59, v1);  // Clobber accumulator
+        vmulw_w_vv(v48, v58, v0);    // Clobber accumulator
+        vmulw_w_vv(v50, v59, v1);    // Clobber accumulator
         vadd_w_vv_m(v48, v48, v52);  // Add bias.
         vmulw_w_vv(v40, v60, v2);
         vmulw_w_vv(v42, v61, v3);
@@ -118,4 +127,60 @@
   //   - one final loop handling remainder
 }
 
+// Generic implementation (currently delegates to the reference kernel).
+void DepthwiseConvS16Generic(
+    const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // TBD: replace this fallback with a Kelvin vector implementation.
+  tflite::reference_integer_ops::DepthwiseConvPerChannel(
+      params, output_multiplier, output_shift, input_shape, input_data,
+      filter_shape, filter_data, bias_shape, bias_data, output_shape,
+      output_data);
+}
+}  // namespace
+
+void DepthwiseConvS16(
+    const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+
+  if (params.padding_type == tflite::PaddingType::kValid && stride_width == 1 &&
+      stride_height == 1 && dilation_width_factor == 1 &&
+      dilation_height_factor == 1) {
+    // Generic implementation by default.
+    auto fn = DepthwiseConvS16Generic;
+
+    // Special case: 3x1 filter.
+    if (filter_height == 1 && filter_width == 3) {
+      fn = DepthwiseConvS16K3x1;
+    }
+
+    fn(params, output_multiplier, output_shift, input_shape, input_data,
+       filter_shape, filter_data, bias_shape, bias_data, output_shape,
+       output_data);
+    return;
+  }
+
+  // Fall back to the reference implementation.
+  tflite::reference_integer_ops::DepthwiseConvPerChannel(
+      params, output_multiplier, output_shift, input_shape, input_data,
+      filter_shape, filter_data, bias_shape, bias_data, output_shape,
+      output_data);
+}
+
 }  // namespace kelvin::opt
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc
index c4ee35a..111fdc1 100644
--- a/tflm/opt/depthwise_conv_s8.cc
+++ b/tflm/opt/depthwise_conv_s8.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,39 +14,35 @@
  * limitations under the License.
  */
 
-#include <algorithm>
+// Depthwise convolution based on Kelvin ops
+// Data types: input: s8, filter: s8, bias: s32
 
-#include "crt/kelvin.h"
-#include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
-#include "tensorflow/lite/kernels/internal/runtime_shape.h"
-#include "tensorflow/lite/kernels/internal/types.h"
-#include "tflm/opt/opt.h"
+#include "tflm/opt/conv_util.h"
 
 namespace kelvin::opt {
-
-void Swizzle(const int32_t* input, int32_t* output, int N) {
+namespace {
+// Reorders a vector to match the pattern after double-widening.
+// N must be a multiple of 4.
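+// Example: with N = 32, out[i] = in[4 * i], out[16 + i] = in[4 * i + 1],
+// out[8 + i] = in[4 * i + 2], out[24 + i] = in[4 * i + 3], for i in [0, 8).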
+void VectorSwizzle(const int32_t* input, int32_t* output, int N) {
+  assert(N >= 4 && N % 4 == 0);
   const int32_t(&in)[N] = *(int32_t(*)[N])input;
-  int32_t(&out)[N * 4] = *(int32_t(*)[N * 4]) output;
-  // Convert to accumulator swizzle pattern.
-  for (int i = 0; i < N / 8; ++i) {
-    int32_t* out0 = out + i * 32 + 0;
-    int32_t* out1 = out + i * 32 + 16;
-    int32_t* out2 = out + i * 32 + 8;
-    int32_t* out3 = out + i * 32 + 24;
-    for (int j = 0; j < 4; ++j) {
-      const int32_t* p_in = in + i * 8;
-      for (int k = 0; k < 2; ++k) {
-        *out0++ = *p_in++;
-        *out1++ = *p_in++;
-        *out2++ = *p_in++;
-        *out3++ = *p_in++;
-      }
+  int32_t(&out)[N] = *(int32_t(*)[N]) output;
+  const int32_t* p_in = in;
+  for (int i = 0; i < N / 4; ++i) {
+    int32_t* out0 = out + i + 0;
+    int32_t* out1 = out + i + 16;
+    int32_t* out2 = out + i + 8;
+    int32_t* out3 = out + i + 24;
+    *out0 = *p_in++;
+    *out1 = *p_in++;
+    *out2 = *p_in++;
+    *out3 = *p_in++;
-    }
-  }
-}
+  }
+}
 
-void DWConv2DKelvin_d32(
+// Special case: input depth is a multiple of 32.
+void DepthwiseConvS8D32(
     const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
     const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
     const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
@@ -57,8 +53,6 @@
 ) {
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
   const int pad_width = params.padding_values.width;
   const int pad_height = params.padding_values.height;
   const int32_t input_offset = params.input_offset;
@@ -73,15 +67,15 @@
   const int filter_width = filter_shape.Dims(2);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
-  int32_t swizzled_bias_data[32 * 4];
-  int32_t swizzled_shift_multi[32 * 4];
-  int32_t swizzled_output_multi[32 * 4];
+  int32_t swizzled_bias_data[32];
+  int32_t swizzled_shift_multi[32];
+  int32_t swizzled_output_multi[32];
 
   for (int in_channel = 0; in_channel + 32 <= input_depth; in_channel += 32) {
     const int output_channel = in_channel;
-    Swizzle(bias_data + output_channel, swizzled_bias_data, 32);
-    Swizzle(output_multiplier + output_channel, swizzled_output_multi, 32);
-    Swizzle(output_shift + output_channel, swizzled_shift_multi, 32);
+    VectorSwizzle(bias_data + output_channel, swizzled_bias_data, 32);
+    VectorSwizzle(output_multiplier + output_channel, swizzled_output_multi,
+                  32);
+    VectorSwizzle(output_shift + output_channel, swizzled_shift_multi, 32);
 
     vld_w_x_m(v20, swizzled_bias_data);
     vld_w_x_m(v24, swizzled_output_multi);
@@ -94,6 +88,7 @@
           const int in_x_origin = (out_x * stride_width) - pad_width;
           const int in_y_origin = (out_y * stride_height) - pad_height;
 
+          vdup_w_x_m(v48, 0);
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
             const int in_y = in_y_origin + filter_y;
             if ((in_y < 0) || (in_y >= input_height)) {
@@ -124,7 +119,7 @@
           }
 
           vadd_w_vv_m(v48, v48, v20);  // add bias
-          vdmulh_w_r_vv_m(v48, v48, v24);
+          vdmulh_w_rn_vv_m(v48, v48, v24);
           vsha_w_r_vv_m(v48, v48, v28);
           vadd_w_vx_m(v48, v48, output_offset);
           vmax_w_vx_m(v48, v48, output_activation_min);
@@ -138,7 +133,24 @@
   }
 }
 
-void DepthwiseConv2DKelvin(
+// Generic implementation (currently delegates to the reference kernel).
+void DepthwiseConvS8Generic(
+    const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int8_t* output_data) {
+  // TBD: replace this fallback with a Kelvin vector implementation.
+  tflite::reference_integer_ops::DepthwiseConvPerChannel(
+      params, output_multiplier, output_shift, input_shape, input_data,
+      filter_shape, filter_data, bias_shape, bias_data, output_shape,
+      output_data);
+}
+}  // namespace
+
+void DepthwiseConvS8(
     const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
     const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
     const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
@@ -151,11 +163,7 @@
   const int stride_height = params.stride_height;
   const int dilation_width_factor = params.dilation_width_factor;
   const int dilation_height_factor = params.dilation_height_factor;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
   const int depth_multiplier = params.depth_multiplier;
-  const int32_t input_offset = params.input_offset;
-  const int32_t output_offset = params.output_offset;
   const int32_t output_activation_min = params.quantized_activation_min;
   const int32_t output_activation_max = params.quantized_activation_max;
 
@@ -165,30 +173,33 @@
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
 
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
   const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
   const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
   TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
 
-  if (depth_multiplier == 1 && pad_height < 2 && pad_width < 2 &&
+  if (depth_multiplier == 1 &&
       dilation_height_factor == 1 && dilation_width_factor == 1 &&
-      stride_height == 1 && stride_width == 1 && output_depth % 32 == 0) {
-    DWConv2DKelvin_d32(params, output_multiplier, output_shift, input_shape,
-                       input_data, filter_shape, filter_data, bias_shape,
-                       bias_data, output_shape, output_data);
+      stride_height <= 2 && stride_width <= 2) {
+    // Generic implementation by default.
+    auto fn = DepthwiseConvS8Generic;
+
+    // Special case: output depth is a multiple of 32.
+    if (output_depth % 32 == 0) {
+      fn = DepthwiseConvS8D32;
+    }
+
+    fn(params, output_multiplier, output_shift, input_shape, input_data,
+       filter_shape, filter_data, bias_shape, bias_data, output_shape,
+       output_data);
     return;
   }
+
+  // Fall back to the reference implementation.
   tflite::reference_integer_ops::DepthwiseConvPerChannel(
       params, output_multiplier, output_shift, input_shape, input_data,
       filter_shape, filter_data, bias_shape, bias_data, output_shape,
       output_data);
-  return;
 }
-}  // namespace kelvin::opt
\ No newline at end of file
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/elementwise_add_s16.cc b/tflm/opt/elementwise_add_s16.cc
index e4220f0..001113e 100644
--- a/tflm/opt/elementwise_add_s16.cc
+++ b/tflm/opt/elementwise_add_s16.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,16 +20,16 @@
 
 namespace kelvin::opt {
 
-void elementwise_add_s16(const int16_t* input1, const int16_t* input2,
-                         const int32_t input1_offset, const int32_t input1_mult,
-                         const int32_t input1_shift,
-                         const int32_t input2_offset, const int32_t input2_mult,
-                         const int32_t input2_shift, const int32_t left_shift,
-                         int16_t* output, const int32_t output_offset,
-                         const int32_t output_mult, const int32_t output_shift,
-                         const int32_t output_activation_min,
-                         const int32_t output_activation_max,
-                         const int32_t block_size) {
+void ElementwiseAddS16(const int16_t* input1, const int16_t* input2,
+                       const int32_t input1_offset, const int32_t input1_mult,
+                       const int32_t input1_shift, const int32_t input2_offset,
+                       const int32_t input2_mult, const int32_t input2_shift,
+                       const int32_t left_shift, int16_t* output,
+                       const int32_t output_offset, const int32_t output_mult,
+                       const int32_t output_shift,
+                       const int32_t output_activation_min,
+                       const int32_t output_activation_max,
+                       const int32_t block_size) {
   int blocks = block_size;
   int vl;
   getmaxvl_h(vl);
diff --git a/tflm/opt/elementwise_add_s32.cc b/tflm/opt/elementwise_add_s32.cc
index 483799a..ab2b3d1 100644
--- a/tflm/opt/elementwise_add_s32.cc
+++ b/tflm/opt/elementwise_add_s32.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,10 +18,10 @@
 #include "tflm/opt/opt.h"
 
 namespace kelvin::opt {
-void elementwise_add_s32(const int32_t* input1, const int32_t* input2,
-                         int32_t* output, const int32_t output_activation_min,
-                         const int32_t output_activation_max,
-                         const int32_t block_size) {
+void ElementwiseAddS32(const int32_t* input1, const int32_t* input2,
+                       int32_t* output, const int32_t output_activation_min,
+                       const int32_t output_activation_max,
+                       const int32_t block_size) {
   int blocks = block_size;
   int vl;
   getmaxvl_w_m(vl);
diff --git a/tflm/opt/elementwise_add_s8.cc b/tflm/opt/elementwise_add_s8.cc
index ac83e1f..762d7af 100644
--- a/tflm/opt/elementwise_add_s8.cc
+++ b/tflm/opt/elementwise_add_s8.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,16 +20,16 @@
 
 namespace kelvin::opt {
 
-void elementwise_add_s8(const int8_t* input1, const int8_t* input2,
-                        const int32_t input1_offset, const int32_t input1_mult,
-                        const int32_t input1_shift, const int32_t input2_offset,
-                        const int32_t input2_mult, const int32_t input2_shift,
-                        const int32_t left_shift, int8_t* output,
-                        const int32_t output_offset, const int32_t output_mult,
-                        const int32_t output_shift,
-                        const int32_t output_activation_min,
-                        const int32_t output_activation_max,
-                        const int32_t block_size) {
+void ElementwiseAddS8(const int8_t* input1, const int8_t* input2,
+                      const int32_t input1_offset, const int32_t input1_mult,
+                      const int32_t input1_shift, const int32_t input2_offset,
+                      const int32_t input2_mult, const int32_t input2_shift,
+                      const int32_t left_shift, int8_t* output,
+                      const int32_t output_offset, const int32_t output_mult,
+                      const int32_t output_shift,
+                      const int32_t output_activation_min,
+                      const int32_t output_activation_max,
+                      const int32_t block_size) {
   int blocks = block_size;
   int vl;
   getmaxvl_b(vl);
diff --git a/tflm/opt/leaky_relu_s16.cc b/tflm/opt/leaky_relu_s16.cc
index 5cd1128..7427a6c 100644
--- a/tflm/opt/leaky_relu_s16.cc
+++ b/tflm/opt/leaky_relu_s16.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,13 +21,13 @@
 #include "tflm/opt/util.h"
 
 namespace kelvin::opt {
-void leaky_relu_s16(const int16_t* input, int16_t* output,
-                    const int32_t block_size, const int32_t input_zero_point,
-                    const int32_t output_zero_point,
-                    const int32_t output_multiplier_alpha,
-                    const int32_t output_shift_alpha,
-                    const int32_t output_multiplier_identity,
-                    const int32_t output_shift_identity) {
+void LeakyReluS16(const int16_t* input, int16_t* output,
+                  const int32_t block_size, const int32_t input_zero_point,
+                  const int32_t output_zero_point,
+                  const int32_t output_multiplier_alpha,
+                  const int32_t output_shift_alpha,
+                  const int32_t output_multiplier_identity,
+                  const int32_t output_shift_identity) {
   constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min();
   constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max();
   int32_t right_shift_identity = std::min(output_shift_identity, 0L);
diff --git a/tflm/opt/leaky_relu_s8.cc b/tflm/opt/leaky_relu_s8.cc
index b32d260..8b30d19 100644
--- a/tflm/opt/leaky_relu_s8.cc
+++ b/tflm/opt/leaky_relu_s8.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,13 +22,13 @@
 
 namespace kelvin::opt {
 
-void leaky_relu_s8(const int8_t* input, int8_t* output,
-                   const int32_t block_size, const int32_t input_zero_point,
-                   const int32_t output_zero_point,
-                   const int32_t output_multiplier_alpha,
-                   const int32_t output_shift_alpha,
-                   const int32_t output_multiplier_identity,
-                   const int32_t output_shift_identity) {
+void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size,
+                 const int32_t input_zero_point,
+                 const int32_t output_zero_point,
+                 const int32_t output_multiplier_alpha,
+                 const int32_t output_shift_alpha,
+                 const int32_t output_multiplier_identity,
+                 const int32_t output_shift_identity) {
   constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min();
   constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max();
   int32_t right_shift_identity = std::min(output_shift_identity, 0L);
diff --git a/tflm/opt/max_pool_s8.cc b/tflm/opt/max_pool_s8.cc
index 5986746..544f85a 100644
--- a/tflm/opt/max_pool_s8.cc
+++ b/tflm/opt/max_pool_s8.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,11 +20,10 @@
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace kelvin::opt {
-void MaxPoolGeneric(const tflite::PoolParams &params,
-                    const tflite::RuntimeShape &input_shape,
-                    const int8_t *input_data,
-                    const tflite::RuntimeShape &output_shape,
-                    int8_t *output_data) {
+void MaxPoolS8(const tflite::PoolParams &params,
+               const tflite::RuntimeShape &input_shape,
+               const int8_t *input_data,
+               const tflite::RuntimeShape &output_shape, int8_t *output_data) {
   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
   const int depth = MatchingDim(input_shape, 3, output_shape, 3);
   const int input_height = input_shape.Dims(1);
@@ -97,4 +96,4 @@
   }
 }
 
-} // namespace kelvin::opt
+}  // namespace kelvin::opt
diff --git a/tflm/opt/memcpy.cc b/tflm/opt/memcpy.cc
index 4669a83..29e0434 100644
--- a/tflm/opt/memcpy.cc
+++ b/tflm/opt/memcpy.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 namespace kelvin::opt {
 
-void *memcpy(void *dst, const void *src, size_t n) {
+void *Memcpy(void *dst, const void *src, size_t n) {
   const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
   uint8_t *d = reinterpret_cast<uint8_t *>(dst);
   int vl;
diff --git a/tflm/opt/opt.h b/tflm/opt/opt.h
index 277f338..76d5218 100644
--- a/tflm/opt/opt.h
+++ b/tflm/opt/opt.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,121 +24,87 @@
 /* clang-format on */
 
 namespace kelvin::opt {
-void* memcpy(void* dst, const void* src, size_t n);
-void elementwise_add_s8(const int8_t* input1, const int8_t* input2,
-                        const int32_t input1_offset, const int32_t input1_mult,
-                        const int32_t input1_shift, const int32_t input2_offset,
-                        const int32_t input2_mult, const int32_t input2_shift,
-                        const int32_t left_shift, int8_t* output,
-                        const int32_t output_offset, const int32_t output_mult,
-                        const int32_t output_shift,
-                        const int32_t output_activation_min,
-                        const int32_t output_activation_max,
-                        const int32_t block_size);
-void elementwise_add_s16(const int16_t* input1, const int16_t* input2,
-                         const int32_t input1_offset, const int32_t input1_mult,
-                         const int32_t input1_shift,
-                         const int32_t input2_offset, const int32_t input2_mult,
-                         const int32_t input2_shift, const int32_t left_shift,
-                         int16_t* output, const int32_t output_offset,
-                         const int32_t output_mult, const int32_t output_shift,
-                         const int32_t output_activation_min,
-                         const int32_t output_activation_max,
-                         const int32_t block_size);
-void elementwise_add_s32(const int32_t* input1, const int32_t* input2,
-                         int32_t* output, const int32_t output_activation_min,
-                         const int32_t output_activation_max,
-                         const int32_t block_size);
-void leaky_relu_s8(const int8_t* input, int8_t* output,
-                   const int32_t block_size, const int32_t input_zero_point,
-                   const int32_t output_zero_point,
-                   const int32_t output_multiplier_alpha,
-                   const int32_t output_shift_alpha,
-                   const int32_t output_multiplier_identity,
-                   const int32_t output_shift_identity);
-void leaky_relu_s16(const int16_t* input, int16_t* output,
-                    const int32_t block_size, const int32_t input_zero_point,
-                    const int32_t output_zero_point,
-                    const int32_t output_multiplier_alpha,
-                    const int32_t output_shift_alpha,
-                    const int32_t output_multiplier_identity,
-                    const int32_t output_shift_identity);
-void conv_per_channel_b32(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data);
-
-// Top level conv function, will invoke correct variant below.
-void conv_per_channel_b64(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data);
-void conv_per_channel_b64_1x1(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data);
-void conv_per_channel_b64_filter1xn_non_group(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data);
-void conv_per_channel_b64_filter1xn_group(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data);
-void conv_per_channel_b64_generic(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int16_t* output_data);
-
-void conv_per_channel_b8(
-    const tflite::ConvParams& params, const int32_t* output_multiplier,
-    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
-    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int8_t* output_data);
-void DepthwiseConv2DKelvin(
+void* Memcpy(void* dst, const void* src, size_t n);
+void ElementwiseAddS8(const int8_t* input1, const int8_t* input2,
+                      const int32_t input1_offset, const int32_t input1_mult,
+                      const int32_t input1_shift, const int32_t input2_offset,
+                      const int32_t input2_mult, const int32_t input2_shift,
+                      const int32_t left_shift, int8_t* output,
+                      const int32_t output_offset, const int32_t output_mult,
+                      const int32_t output_shift,
+                      const int32_t output_activation_min,
+                      const int32_t output_activation_max,
+                      const int32_t block_size);
+void ElementwiseAddS16(const int16_t* input1, const int16_t* input2,
+                       const int32_t input1_offset, const int32_t input1_mult,
+                       const int32_t input1_shift, const int32_t input2_offset,
+                       const int32_t input2_mult, const int32_t input2_shift,
+                       const int32_t left_shift, int16_t* output,
+                       const int32_t output_offset, const int32_t output_mult,
+                       const int32_t output_shift,
+                       const int32_t output_activation_min,
+                       const int32_t output_activation_max,
+                       const int32_t block_size);
+void ElementwiseAddS32(const int32_t* input1, const int32_t* input2,
+                       int32_t* output, const int32_t output_activation_min,
+                       const int32_t output_activation_max,
+                       const int32_t block_size);
+void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size,
+                 const int32_t input_zero_point,
+                 const int32_t output_zero_point,
+                 const int32_t output_multiplier_alpha,
+                 const int32_t output_shift_alpha,
+                 const int32_t output_multiplier_identity,
+                 const int32_t output_shift_identity);
+void LeakyReluS16(const int16_t* input, int16_t* output,
+                  const int32_t block_size, const int32_t input_zero_point,
+                  const int32_t output_zero_point,
+                  const int32_t output_multiplier_alpha,
+                  const int32_t output_shift_alpha,
+                  const int32_t output_multiplier_identity,
+                  const int32_t output_shift_identity);
+void ConvS16B32(const tflite::ConvParams& params,
+                const int32_t* output_multiplier, const int32_t* output_shift,
+                const tflite::RuntimeShape& input_shape,
+                const int16_t* input_data,
+                const tflite::RuntimeShape& filter_shape,
+                const int8_t* filter_data,
+                const tflite::RuntimeShape& bias_shape,
+                const int32_t* bias_data,
+                const tflite::RuntimeShape& output_shape, int16_t* output_data);
+void ConvS16B64(const tflite::ConvParams& params,
+                const int32_t* output_multiplier, const int32_t* output_shift,
+                const tflite::RuntimeShape& input_shape,
+                const int16_t* input_data,
+                const tflite::RuntimeShape& filter_shape,
+                const int8_t* filter_data,
+                const tflite::RuntimeShape& bias_shape,
+                const int64_t* bias_data,
+                const tflite::RuntimeShape& output_shape, int16_t* output_data);
+void ConvS8(const tflite::ConvParams& params, const int32_t* output_multiplier,
+            const int32_t* output_shift,
+            const tflite::RuntimeShape& input_shape, const int8_t* input_data,
+            const tflite::RuntimeShape& filter_shape, const int8_t* filter_data,
+            const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
+            const tflite::RuntimeShape& output_shape, int8_t* output_data);
+void DepthwiseConvS8(
     const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
     const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
     const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
     const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
     const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
     int8_t* output_data);
-void DWConv2DKelvin_d32(
+void DepthwiseConvS16(
     const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
     const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
-    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
     const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
-    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
-    int8_t* output_data);
-void DepthwiseConv2DKelvinS16K3x1(
-    const int16_t* activations, const int8_t* weights, const int64_t* biases,
-    int channels, int frames, int dilation, const int32_t* output_mult,
-    const int32_t* output_shift, int32_t output_activation_min,
-    int32_t output_activation_max, int16_t* output);
-void MaxPoolGeneric(const tflite::PoolParams& params,
-                    const tflite::RuntimeShape& input_shape,
-                    const int8_t* input_data,
-                    const tflite::RuntimeShape& output_shape,
-                    int8_t* output_data);
+    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int16_t* output_data);
+void MaxPoolS8(const tflite::PoolParams& params,
+               const tflite::RuntimeShape& input_shape,
+               const int8_t* input_data,
+               const tflite::RuntimeShape& output_shape, int8_t* output_data);
 
 }  // namespace kelvin::opt
 
diff --git a/tflm/opt/util.h b/tflm/opt/util.h
index d94ef3e..d0c16db 100644
--- a/tflm/opt/util.h
+++ b/tflm/opt/util.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Google LLC
+ * Copyright 2024 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.