Specialized Conv kernel for 3-channel input tensors

* This kernel helps reduce the bottleneck caused by the input conv layer for 3-channel inputs.
* The kernel handles nxnx3 inputs with a 3x3 filter and stride 2.
* Reduces cycle count by ~51x.

Change-Id: I7978508f32f8974c1236ba00c828a0d04f88c8e9
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
index 9957e7a..0330215 100644
--- a/tflm/opt/BUILD
+++ b/tflm/opt/BUILD
@@ -21,6 +21,7 @@
         "conv_s16_b64.cc",
         "conv_s8.cc",
         "conv_s8_1x1.cc",
+        "conv_s8_I3xD8.cc",
         "conv_s8_3x1_d48.cc",
         "conv_s8_d1.cc",
         "conv_s8_d32.cc",
diff --git a/tflm/opt/conv_s8.cc b/tflm/opt/conv_s8.cc
index 10e8f41..6ae4eeb 100644
--- a/tflm/opt/conv_s8.cc
+++ b/tflm/opt/conv_s8.cc
@@ -202,6 +202,13 @@
   return; \
 }
 
+  if (dilation_width_factor == 1 && dilation_height_factor == 1 &&
+      stride_width == 2 && stride_height == 2 && filter_height == 3 &&
+      filter_width == 3 && filter_depth == 3 && input_depth == 3 &&
+      output_depth % 8 == 0 && output_width % 4 == 0 &&
+      output_height % 2 == 0 && pad_height == 0 && pad_width == 0) {
+    RUN_KERNEL(kelvin::opt::ConvS8I3xD8);
+  }
+
   // special case of filter size 1x1
   if (filter_height == 1 && filter_width == 1 && stride_height == 1 &&
       stride_width == 1 && dilation_height_factor == 1 &&
diff --git a/tflm/opt/conv_s8.h b/tflm/opt/conv_s8.h
index 96cea2e..a95b1e6 100644
--- a/tflm/opt/conv_s8.h
+++ b/tflm/opt/conv_s8.h
@@ -85,6 +85,16 @@
                const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
                const tflite::RuntimeShape& output_shape, int8_t* output_data);
 
+// input depth 3, output depth a multiple of 8, 3x3 filter, stride 2
+void ConvS8I3xD8(const tflite::ConvParams& params,
+                 const int32_t* output_multiplier, const int32_t* output_shift,
+                 const tflite::RuntimeShape& input_shape,
+                 const int8_t* input_data,
+                 const tflite::RuntimeShape& filter_shape,
+                 const int8_t* filter_data,
+                 const tflite::RuntimeShape& bias_shape,
+                 const int32_t* bias_data,
+                 const tflite::RuntimeShape& output_shape, int8_t* output_data);
 // filter size 48x3x1x48
 void ConvS8K3x1D48(
     const tflite::ConvParams& params, const int32_t* output_multiplier,
diff --git a/tflm/opt/conv_s8_I3xD8.cc b/tflm/opt/conv_s8_I3xD8.cc
new file mode 100644
index 0000000..370bd97
--- /dev/null
+++ b/tflm/opt/conv_s8_I3xD8.cc
@@ -0,0 +1,537 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdlib>
+#include <memory>
+
+#include "crt/kelvin.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tflm/opt/conv_s8.h"
+#include "tflm/opt/conv_util.h"
+#include "tflm/opt/opt.h"
+
+namespace kelvin::opt {
+namespace {
+
+void VectorSwizzle8(const int32_t* input, int32_t* output, int32_t* output2) {
+  // Swizzle to achieve the following pattern:
+  // out 1 : [0, 2, 1, 3, 0, 2, 1, 3]
+  // out 2 : [4, 6, 5, 7, 4, 6, 5, 7]
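+  // This interleaved order matches the accumulator lane layout produced by
+  // aconv (see POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC), so the per-channel
+  // bias/multiplier/shift values line up with the accumulators without any
+  // further shuffling.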
+
+  const int32_t(&in)[8] = *(int32_t(*)[8])input;
+  int32_t(&out)[8] = *(int32_t(*)[8])output;
+  int32_t(&out2)[8] = *(int32_t(*)[8])output2;
+
+  out[0] = in[0];
+  out[2] = in[1];
+  out[1] = in[2];
+  out[3] = in[3];
+  out[4] = in[0];
+  out[6] = in[1];
+  out[5] = in[2];
+  out[7] = in[3];
+
+  out2[0] = in[4];
+  out2[2] = in[5];
+  out2[1] = in[6];
+  out2[3] = in[7];
+  out2[4] = in[4];
+  out2[6] = in[5];
+  out2[5] = in[6];
+  out2[7] = in[7];
+}
+
+void PaddedFilter_N_H_W_M(const int8_t* input, int8_t* output, int N, int H,
+                          int W, int M) {
+  if (M != 3) {
+    MicroPrintf("Filter shuffling can only handle M(input_depth) == 3");
+    exit(-1);
+  }
+
+  const int8_t(&in)[N][H][W][M] = *(int8_t(*)[N][H][W][M])input;
+  int8_t(&out)[N / 8][3][8 * 4 * 3] = *(int8_t(*)[N / 8][3][8 * 4 * 3]) output;
+  int group = 0;
+  // Filter data is reorganized into groups of 8 output channels, flattening
+  // each filter row. The 9th value of each flattened row is followed by three
+  // zero pads (9000 9000 9000 9000), so 8 channels align as (c0 c1 c2 c3).
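+  // Resulting layout per group of 8 output channels and per filter row
+  // (96 bytes = three 32-byte vectors, 4 bytes per channel per vector):
+  //   bytes  0-31 : {x0.c0 x0.c1 x0.c2 x1.c0} for channels 0..7
+  //   bytes 32-63 : {x1.c1 x1.c2 x2.c0 x2.c1} for channels 0..7
+  //   bytes 64-95 : {x2.c2 0 0 0}             for channels 0..7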
+  for (int ky = 0; ky < H; ++ky) {
+    int filter_element[N / 8]{0};
+    for (int kx = 0; kx < W; ++kx) {
+      for (int output_channel = 0; output_channel < N; ++output_channel) {
+        for (int input_channel = 0; input_channel < M; ++input_channel) {
+          group = output_channel >> 3;
+          if (kx == 1 && input_channel == 0) {
+            continue;
+          }
+          if (kx == 2 && (input_channel < 2)) {
+            continue;
+          }
+
+          if (kx == 0 && input_channel == 2) {
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx + 1][0];
+            filter_element[group] += 1;
+          } else if (kx == 1 && input_channel == 2) {
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx + 1][0];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx + 1][1];
+            filter_element[group] += 1;
+          } else if (kx == 2 && input_channel == 2) {
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] = 0;
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] = 0;
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] = 0;
+            filter_element[group] += 1;
+          } else {
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace
+
+// IN:
+//  - v0, v4 (input pixels)
+//  - v11 (zeroed register)
+// OUT:
+//  - v0, v1, v2, v3, v4, v5, v6, v7 (reforged input columns)
+// CLOBBERS:
+//  - None
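+// Each slide advances the input window by one stride-2 output column:
+// 3 halfwords = 6 bytes = 2 RGB pixels (out_x + 1), and 3 words = 12 bytes
+// = 4 RGB pixels (out_x + 2 from v0/v4, out_x + 3 from v1/v5).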
+#define FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV \
+  {                                                  \
+    vsliden_h_3_vv(v1, v0, v11);                     \
+    vsliden_w_3_vv(v2, v0, v11);                     \
+    vsliden_w_3_vv(v3, v1, v11);                     \
+    vsliden_h_3_vv(v5, v4, v11);                     \
+    vsliden_w_3_vv(v6, v4, v11);                     \
+    vsliden_w_3_vv(v7, v5, v11);                     \
+  }
+
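+// Requantization and de-swizzle of the aconv accumulators:
+// add bias (v35-v38), apply the fixed-point output multiplier (v12-v15) and
+// shift (v16-v19), add the output zero-point, clamp to the activation range,
+// then scatter the swizzled int32 lanes into per-pixel int8 channel order.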
+#define POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC  \
+  {                                               \
+    vadd_w_vv_m(v48, v48, v35);                   \
+    vadd_w_vv_m(v52, v52, v35);                   \
+    vdmulh_w_rn_vv_m(v48, v48, v12);              \
+    vdmulh_w_rn_vv_m(v52, v52, v12);              \
+    vsha_w_r_vv_m(v48, v48, v16);                 \
+    vsha_w_r_vv_m(v52, v52, v16);                 \
+    vadd_w_vx_m(v48, v48, output_offset);         \
+    vadd_w_vx_m(v52, v52, output_offset);         \
+    vmin_w_vx_m(v48, v48, output_activation_max); \
+    vmin_w_vx_m(v52, v52, output_activation_max); \
+    vmax_w_vx_m(v48, v48, output_activation_min); \
+    vmax_w_vx_m(v52, v52, output_activation_min); \
+    vst_w_x_m(v48, &acc_out32[0]);                \
+    vst_w_x_m(v52, &acc_out32[32]);               \
+    for (int i = 0; i < 4; i++) {                 \
+      acc_out8[0][i][0] = acc_out32[i * 16 + 0];  \
+      acc_out8[0][i][2] = acc_out32[i * 16 + 1];  \
+      acc_out8[0][i][1] = acc_out32[i * 16 + 2];  \
+      acc_out8[0][i][3] = acc_out32[i * 16 + 3];  \
+      acc_out8[1][i][0] = acc_out32[i * 16 + 4];  \
+      acc_out8[1][i][2] = acc_out32[i * 16 + 5];  \
+      acc_out8[1][i][1] = acc_out32[i * 16 + 6];  \
+      acc_out8[1][i][3] = acc_out32[i * 16 + 7];  \
+      acc_out8[0][i][4] = acc_out32[i * 16 + 8];  \
+      acc_out8[0][i][6] = acc_out32[i * 16 + 9];  \
+      acc_out8[0][i][5] = acc_out32[i * 16 + 10]; \
+      acc_out8[0][i][7] = acc_out32[i * 16 + 11]; \
+      acc_out8[1][i][4] = acc_out32[i * 16 + 12]; \
+      acc_out8[1][i][6] = acc_out32[i * 16 + 13]; \
+      acc_out8[1][i][5] = acc_out32[i * 16 + 14]; \
+      acc_out8[1][i][7] = acc_out32[i * 16 + 15]; \
+    }                                             \
+  }
+
+void ConvS8I3xD8(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int8_t* output_data) {
+  // Get parameters.
+  const int32_t input_offset = params.input_offset;       // r = s(q - Z)
+  const int32_t neg_input_offset = -params.input_offset;  // r = s(q - Z)
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.Dims(2), 3);
+  TFLITE_DCHECK_EQ(filter_shape.Dims(1), 3);
+  TFLITE_DCHECK_EQ(input_shape.Dims(3), 3);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  const int groups = input_depth / filter_input_depth;
+  TFLITE_DCHECK_NE(groups, 0);
+  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+  const int filters_per_group = output_depth / groups;
+  TFLITE_DCHECK_NE(filters_per_group, 0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
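+  // aconv command word: sdata1/sbias1 fold the input zero-point into the
+  // signed input operand; start = 0, stop = 2 covers the three 32-byte
+  // filter vectors used per call (one filter row).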
+  union {
+    vconv_u8_t conv;
+    uint32_t raw;
+  } cmds;
+  cmds.conv.mode = 0;
+  cmds.conv.start = 0;
+  cmds.conv.stop = 2;
+  cmds.conv.sbias1 = input_offset;
+  cmds.conv.sdata1 = true;
+  cmds.conv.sbias2 = 0;
+  cmds.conv.sdata2 = true;
+
+  // Reg Map:
+  // v0-v7   : input patches[0..7]
+  // v8-v10  : filter row 1 (registers used for aconv)
+  // v11     : vdup 0 used during vsliden
+  // v12-v15 : swizzled output multipliers
+  // v16-v19 : swizzled output shifts (negated)
+  // v24-v26 : filter row 2 (registers used for aconv)
+  // v30     : negative offset mask
+  // v35-v38 : swizzled biases
+  // v40-v42 : filter row 3 (registers used for aconv)
+  // v48-v55 : accumulators for aconv
+
+  int8_t acc_out8[2][4][8];
+  int32_t acc_out32[64];
+  int out_channel = 0;
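+  // 36 bytes per output channel: 3 filter rows x (9 taps + 3 zero pads),
+  // i.e. 288 bytes per group of 8 output channels.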
+  const size_t swizzled_filter_data_size = output_depth * 3 * 3 * 4;
+  std::unique_ptr<int8_t> swizzled_filter_data(reinterpret_cast<int8_t*>(
+      ::aligned_alloc(32, swizzled_filter_data_size)));
+  int8_t* p_swizzled_filter_data = swizzled_filter_data.get();
+
+  PaddedFilter_N_H_W_M(filter_data, p_swizzled_filter_data, output_depth,
+                       filter_height, filter_width, filter_input_depth);
+
+  // Structure of the padded filter data per group of 8 channels:
+  // row 1 at bytes 0-95, row 2 at bytes 96-191, row 3 at bytes 192-287.
+
+  do {
+    int32_t temp_data_shuffle[2][8]{0};
+
+    VectorSwizzle8(bias_data + out_channel, &temp_data_shuffle[0][0],
+                   &temp_data_shuffle[1][0]);
+    vld_w_x(v35, &temp_data_shuffle[0][0]);
+    vld_w_x(v36, &temp_data_shuffle[1][0]);
+    vmv_v(v37, v35);
+    vmv_v(v38, v36);
+
+    VectorSwizzle8(output_multiplier + out_channel, &temp_data_shuffle[0][0],
+                   &temp_data_shuffle[1][0]);
+    vld_w_x(v12, &temp_data_shuffle[0][0]);
+    vld_w_x(v13, &temp_data_shuffle[1][0]);
+    vmv_v(v14, v12);
+    vmv_v(v15, v13);
+
+    VectorSwizzle8(output_shift + out_channel, &temp_data_shuffle[0][0],
+                   &temp_data_shuffle[1][0]);
+    vld_w_x(v16, &temp_data_shuffle[0][0]);
+    vld_w_x(v17, &temp_data_shuffle[1][0]);
+    vmv_v(v18, v16);
+    vmv_v(v19, v17);
+    vrsub_w_vx_m(v16, v16, 0);
+    vdup_b_x(v11, 0);  // used for vsliden
+
+    int8_t mask[32] = {0};
+    for (int i = 24; i < 32; ++i) {
+      mask[i] = neg_input_offset;
+    }
+    vld_b_x(v30, mask);  // mask to negate the input offset
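+    // Only lanes 24..31 are set: right-edge tiles load just 24 valid input
+    // bytes (8 pixels), and forcing the tail lanes to -input_offset cancels
+    // the +input_offset applied by aconv, so they contribute zero.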
+
+    int fil_channels_offset = out_channel / 8;
+
+    // Load the filter; this is done once per group of 8 output channels.
+    vld_b_x(v8, p_swizzled_filter_data + fil_channels_offset * 288);  // row 1
+    vld_b_x(v9,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 32);  // row 1
+    vld_b_x(v10,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 64);  // row 1
+
+    vld_b_x(v24,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 96);  // row 2
+    vld_b_x(v25,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 128);  // row 2
+    vld_b_x(v26,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 160);  // row 2
+
+    vld_b_x(v40,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 192);  // row 3
+    vld_b_x(v41,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 224);  // row 3
+    vld_b_x(v42,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 256);  // row 3
+
+    for (int batch = 0; batch < batches; ++batch) {
+      int8_t* p_output = output_data +
+                         (batch * output_height * output_width * output_depth) +
+                         out_channel;
+      const int8_t* p_input =
+          input_data + (batch * input_height * input_width * input_depth);
+      for (int out_y = 0; out_y + 2 < output_height; out_y += 2) {
+        const int in_y_origin = (out_y * stride_height);
+        for (int out_x = 0; out_x + 4 < output_width; out_x += 4) {
+          const int in_x_origin = (out_x * stride_width);
+
+          vdup_w_x_m(v48, 0);
+          vdup_w_x_m(v52, 0);
+
+          acset_v(v48, v48);
+
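+          // One 4(x) x 2(y) x 8(channel) output tile: the accumulator is
+          // seeded with zeros, then three aconv calls (one per filter row)
+          // accumulate into it.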
+          // inputs row 0 and row 2
+          vld_b_x(v0, p_input + (in_y_origin * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          vld_b_x(v4, p_input +
+                          ((in_y_origin + 2) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+
+          // Data slide strategy:
+          // v0 loads 10 RGB pixels of row 0 (the first 30 bytes):
+          //   0 1 2 3 4 5 6 7 8 9
+          // 1st vslide corresponds to out_x + 1: 2 3 4 5 6 7 8 9 (stride == 2)
+          // 2nd vslide corresponds to out_x + 2: 4 5 6 7 8 9     (stride == 2)
+          // 3rd vslide corresponds to out_x + 3: 6 7 8 9         (stride == 2)
+          FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+
+          aconv_vxv(v48, v0, cmds, v8);  // filter r1
+
+          // inputs row 1 and row 3
+          vld_b_x(v0, p_input +
+                          ((in_y_origin + 1) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          vld_b_x(v4, p_input +
+                          ((in_y_origin + 3) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+          aconv_vxv(v48, v0, cmds, v24);  // filter r2
+
+          // inputs row 2 and row 4
+          vld_b_x(v0, p_input +
+                          ((in_y_origin + 2) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          vld_b_x(v4, p_input +
+                          ((in_y_origin + 4) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+
+          FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+          aconv_vxv(v48, v0, cmds, v40);  // filter r3
+
+          vcget(v48);
+          actr_v(v48, v48);
+          vcget(v48);
+
+          //     (x0,y0)   (x0, y1)
+          // v48 (0 2 1 3  0 2 1 3) -- even registers
+          // v49 (4 6 5 7  4 6 5 7) -- odd registers
+          POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC;
+
+          for (int i = 0; i < 4; ++i) {
+            memcpy((p_output + ((out_x + i) * output_depth) +
+                    (out_y * output_width * output_depth)),
+                   &acc_out8[0][i][0], 8 * sizeof(int8_t));
+            memcpy((p_output + ((out_x + i) * output_depth) +
+                    ((out_y + 1) * output_width * output_depth)),
+                   &acc_out8[1][i][0], 8 * sizeof(int8_t));
+          }
+        }
+
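+        // Last 4 output columns of this row pair: only 24 input bytes
+        // (8 pixels) per row are loaded, and v30 forces the tail lanes to
+        // -input_offset so they contribute zero after aconv's +input_offset.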
+        int in_x_origin = (output_width - 4) * stride_width;
+        vdup_w_x_m(v48, 0);
+        vdup_w_x_m(v52, 0);
+        acset_v(v48, v48);
+
+        vld_b_l_xx(v0,
+                   p_input + (in_y_origin * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vor_vv(v0, v0, v30);
+        vor_vv(v4, v4, v30);
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v8);  // filter r1
+
+        // inputs row 1 and row 3
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 1) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 3) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vor_vv(v0, v0, v30);
+        vor_vv(v4, v4, v30);
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v24);  // filter r2
+
+        // inputs row 2 and row 4
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 4) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vor_vv(v0, v0, v30);
+        vor_vv(v4, v4, v30);
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v40);  // filter r3
+
+        vcget(v48);
+        actr_v(v48, v48);
+        vcget(v48);
+
+        POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC;
+
+        for (int i = 0; i < 4; ++i) {
+          memcpy((p_output + ((output_width - 4 + i) * output_depth) +
+                  (out_y * output_width * output_depth)),
+                 &acc_out8[0][i][0], 8 * sizeof(int8_t));
+
+          memcpy((p_output + ((output_width - 4 + i) * output_depth) +
+                  ((out_y + 1) * output_width * output_depth)),
+                 &acc_out8[1][i][0], 8 * sizeof(int8_t));
+        }
+      }
+      int load_until = 32;
+      bool negate_offset = false;
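+      // Last two output rows (out_y == output_height - 2): for filter row 3
+      // the lower output row would need input row in_y_origin + 4, which may
+      // lie past the image, so v4 is filled with -input_offset instead of
+      // loaded.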
+      for (int out_x = 0; out_x + 4 <= output_width; out_x += 4) {
+        const int in_x_origin = (out_x * stride_width);
+        const int in_y_origin = (output_height - 2) * stride_height;
+
+        if (out_x + 4 == output_width) {
+          load_until = 24;
+          negate_offset = true;
+        }
+
+        vdup_w_x_m(v48, 0);
+        vdup_w_x_m(v52, 0);
+        acset_v(v48, v48);
+
+        // inputs row 0 and row 2
+        vld_b_l_xx(v0,
+                   p_input + (in_y_origin * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        if (negate_offset) {
+          vor_vv(v0, v0, v30);
+          vor_vv(v4, v4, v30);
+        }
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV
+        aconv_vxv(v48, v0, cmds, v8);  // filter r1
+
+        // inputs row 1 and row 3
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 1) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 3) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        if (negate_offset) {
+          vor_vv(v0, v0, v30);
+          vor_vv(v4, v4, v30);
+        }
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v24);  // filter r2
+
+        // inputs row 2 and row 4
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        vdup_b_x_m(v4, neg_input_offset);
+        if (negate_offset) {
+          vor_vv(v0, v0, v30);
+          vor_vv(v4, v4, v30);
+        }
+        vsliden_h_3_vv(v1, v0, v11);
+        vsliden_w_3_vv(v2, v0, v11);
+        vsliden_w_3_vv(v3, v1, v11);
+        aconv_vxv(v48, v0, cmds, v40);  // filter r3
+
+        vcget(v48);
+        actr_v(v48, v48);
+        vcget(v48);
+        POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC;
+
+        for (int i = 0; i < 4; ++i) {
+          memcpy((p_output + ((out_x + i) * output_depth) +
+                  ((output_height - 2) * output_width * output_depth)),
+                 &acc_out8[0][i][0], 8 * sizeof(int8_t));
+
+          memcpy((p_output + ((out_x + i) * output_depth) +
+                  ((output_height - 1) * output_width * output_depth)),
+                 &acc_out8[1][i][0], 8 * sizeof(int8_t));
+        }
+      }
+    }
+    out_channel += 8;
+  } while (out_channel < output_depth);
+}
+
+#undef FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV
+#undef POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC
+
+}  // namespace kelvin::opt
\ No newline at end of file