Specialized conv kernel for 3-channel input tensors
*This kernel reduces the bottleneck caused by the input conv layer for 3-channel inputs.
*The kernel handles nxnx3 inputs with a 3x3 filter, stride 2, and an output depth that is a multiple of 8.
*It reduces the cycle count by ~51x.
Change-Id: I7978508f32f8974c1236ba00c828a0d04f88c8e9
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
index 9957e7a..0330215 100644
--- a/tflm/opt/BUILD
+++ b/tflm/opt/BUILD
@@ -21,6 +21,7 @@
"conv_s16_b64.cc",
"conv_s8.cc",
"conv_s8_1x1.cc",
+ "conv_s8_I3xD8.cc",
"conv_s8_3x1_d48.cc",
"conv_s8_d1.cc",
"conv_s8_d32.cc",
diff --git a/tflm/opt/conv_s8.cc b/tflm/opt/conv_s8.cc
index 10e8f41..6ae4eeb 100644
--- a/tflm/opt/conv_s8.cc
+++ b/tflm/opt/conv_s8.cc
@@ -202,6 +202,13 @@
return; \
}
+ if (dilation_width_factor == 1 && dilation_height_factor == 1 &&
+ stride_width == 2 && stride_height == 2 && filter_depth == 3 &&
+ input_depth == 3 && output_depth % 8 == 0 && output_width % 4 == 0 &&
+ output_height % 2 == 0 && pad_height == 0 && pad_width == 0) {
+ RUN_KERNEL(kelvin::opt::ConvS8I3xD8);
+ }
+
// special case of filter size 1x1
if (filter_height == 1 && filter_width == 1 && stride_height == 1 &&
stride_width == 1 && dilation_height_factor == 1 &&
diff --git a/tflm/opt/conv_s8.h b/tflm/opt/conv_s8.h
index 96cea2e..a95b1e6 100644
--- a/tflm/opt/conv_s8.h
+++ b/tflm/opt/conv_s8.h
@@ -85,6 +85,16 @@
const tflite::RuntimeShape& bias_shape, const int32_t* bias_data,
const tflite::RuntimeShape& output_shape, int8_t* output_data);
+// 3x3 filter, input depth 3, output depth a multiple of 8, stride 2, no
+// padding; requires output width a multiple of 4 and an even output height
+// (see the dispatch conditions in conv_s8.cc).
+void ConvS8I3xD8(const tflite::ConvParams& params,
+                 const int32_t* output_multiplier, const int32_t* output_shift,
+                 const tflite::RuntimeShape& input_shape,
+                 const int8_t* input_data,
+                 const tflite::RuntimeShape& filter_shape,
+                 const int8_t* filter_data,
+                 const tflite::RuntimeShape& bias_shape,
+                 const int32_t* bias_data,
+                 const tflite::RuntimeShape& output_shape, int8_t* output_data);
// filter size 48x3x1x48
void ConvS8K3x1D48(
const tflite::ConvParams& params, const int32_t* output_multiplier,
diff --git a/tflm/opt/conv_s8_I3xD8.cc b/tflm/opt/conv_s8_I3xD8.cc
new file mode 100644
index 0000000..370bd97
--- /dev/null
+++ b/tflm/opt/conv_s8_I3xD8.cc
@@ -0,0 +1,537 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "crt/kelvin.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tflm/opt/conv_s8.h"
+#include "tflm/opt/conv_util.h"
+#include "tflm/opt/opt.h"
+
+namespace kelvin::opt {
+namespace {
+
+// Reorders 8 per-channel int32 values (bias / multiplier / shift) into the
+// interleaved lane order produced by the aconv accumulator: channels
+// {0,2,1,3} into `output` and {4,6,5,7} into `output2`, each group
+// duplicated so lanes 0-3 and 4-7 cover the two output rows of a tile
+// (see the de-swizzle in POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC).
+void VectorSwizzle8(const int32_t* input, int32_t* output, int32_t* output2) {
+  // swizzle to achieve the following pattern
+  // out 1 : [0, 2, 1, 3, 0, 2, 1, 3]
+  // out 2 : [4, 6, 5, 7, 4, 6, 5, 7]
+
+  const int32_t(&in)[8] = *(int32_t(*)[8])input;
+  int32_t(&out)[8] = *(int32_t(*)[8])output;
+  int32_t(&out2)[8] = *(int32_t(*)[8])output2;
+
+  out[0] = in[0];
+  out[2] = in[1];
+  out[1] = in[2];
+  out[3] = in[3];
+  out[4] = in[0];
+  out[6] = in[1];
+  out[5] = in[2];
+  out[7] = in[3];
+
+  out2[0] = in[4];
+  out2[2] = in[5];
+  out2[1] = in[6];
+  out2[3] = in[7];
+  out2[4] = in[4];
+  out2[6] = in[5];
+  out2[5] = in[6];
+  out2[7] = in[7];
+}
+
+// Repacks an OHWI [N][3][3][M=3] filter into [N/8][3][96]: for each group of
+// 8 output channels and each filter row ky, the row's 9 taps (3 pixels x 3
+// input channels, scan order) are padded to 12 bytes with three trailing
+// zeros and emitted in 4-byte chunks, chunk-major across the 8 channels:
+// [taps 0-3 of ch0..ch7][taps 4-7 of ch0..ch7][tap 8 + 3 zeros of ch0..ch7].
+// This is the 96-byte-per-row operand layout loaded for aconv.
+void PaddedFilter_N_H_W_M(const int8_t* input, int8_t* output, int N, int H,
+                          int W, int M) {
+  // Guard: the chunking below hard-codes 3 input channels.
+  if (M != 3) {
+    MicroPrintf("Filter shuffling can only handle M(input_depth) == 3");
+    exit(-1);
+  }
+
+  const int8_t(&in)[N][H][W][M] = *(int8_t(*)[N][H][W][M])input;
+  int8_t(&out)[N / 8][3][8 * 4 * 3] = *(int8_t(*)[N / 8][3][8 * 4 * 3]) output;
+  int group = 0;
+  // Filter data is being reorganized into groups of 8 channels, flattening
+  // each row. The 9th element of the 3x3 filter is padded (9000 9000 9000
+  // 9000); the 8 channels are aligned this way (c0 c1 c2 c3).
+  for (int ky = 0; ky < H; ++ky) {
+    // Per-group write cursor into out[group][ky][...].
+    int filter_element[N / 8]{0};
+    for (int kx = 0; kx < W; ++kx) {
+      for (int output_channel = 0; output_channel < N; ++output_channel) {
+        for (int input_channel = 0; input_channel < M; ++input_channel) {
+          group = output_channel >> 3;
+          // Taps (1,0), (2,0) and (2,1) are emitted early by the look-ahead
+          // branches below, so skip them when their own (kx, channel) comes.
+          if (kx == 1 && input_channel == 0) {
+            continue;
+          }
+          if (kx == 2 && (input_channel < 2)) {
+            continue;
+          }
+
+          if (kx == 0 && input_channel == 2) {
+            // Finish chunk 0: tap (0,2) followed by tap (1,0).
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx + 1][0];
+            filter_element[group] += 1;
+          } else if (kx == 1 && input_channel == 2) {
+            // Finish chunk 1: taps (1,2), (2,0), (2,1).
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx + 1][0];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx + 1][1];
+            filter_element[group] += 1;
+          } else if (kx == 2 && input_channel == 2) {
+            // Chunk 2: tap (2,2) plus three zero pads.
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] = 0;
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] = 0;
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] = 0;
+            filter_element[group] += 1;
+          } else {
+            // Ordinary tap: copy through in scan order.
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+          }
+        }
+      }
+    }
+  }
+}
+} // namespace
+
+// Builds the four stride-2 column windows for each of the two loaded input
+// rows by sliding the packed RGB pixels past a zero register (see the
+// "data slide strategy" comment at the call sites).
+// IN:
+// - v0, v4 (input pixels)
+// - v11 (zeroed register)
+// OUT:
+// - v0, v1, v2, v3, v4, v5, v6, v7 (reforged input columns)
+// CLOBBERS:
+// - None
+#define FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV \
+  {                                                  \
+    vsliden_h_3_vv(v1, v0, v11);                     \
+    vsliden_w_3_vv(v2, v0, v11);                     \
+    vsliden_w_3_vv(v3, v1, v11);                     \
+    vsliden_h_3_vv(v5, v4, v11);                     \
+    vsliden_w_3_vv(v6, v4, v11);                     \
+    vsliden_w_3_vv(v7, v5, v11);                     \
+  }
+
+// Requantizes the 64 int32 aconv accumulators (v48-v55) down to int8 and
+// de-swizzles them into acc_out8[row][col][channel] (2 output rows x 4
+// columns x 8 channels). Pipeline: add bias (v35-v38), rounding doubling
+// high multiply by the output multiplier (v12-v15), rounding arithmetic
+// shift (v16-v19 hold the negated output_shift — see setup in ConvS8I3xD8),
+// add the output zero point, clamp to the quantized activation range.
+// The scalar loop then undoes the accumulator lane order
+// (0 2 1 3 / 4 6 5 7, with the two output rows interleaved in lane halves).
+#define POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC \
+  {                                              \
+    vadd_w_vv_m(v48, v48, v35);                  \
+    vadd_w_vv_m(v52, v52, v35);                  \
+    vdmulh_w_rn_vv_m(v48, v48, v12);             \
+    vdmulh_w_rn_vv_m(v52, v52, v12);             \
+    vsha_w_r_vv_m(v48, v48, v16);                \
+    vsha_w_r_vv_m(v52, v52, v16);                \
+    vadd_w_vx_m(v48, v48, output_offset);        \
+    vadd_w_vx_m(v52, v52, output_offset);        \
+    vmin_w_vx_m(v48, v48, output_activation_max); \
+    vmin_w_vx_m(v52, v52, output_activation_max); \
+    vmax_w_vx_m(v48, v48, output_activation_min); \
+    vmax_w_vx_m(v52, v52, output_activation_min); \
+    vst_w_x_m(v48, &acc_out32[0]);               \
+    vst_w_x_m(v52, &acc_out32[32]);              \
+    for (int i = 0; i < 4; i++) {                \
+      acc_out8[0][i][0] = acc_out32[i * 16 + 0]; \
+      acc_out8[0][i][2] = acc_out32[i * 16 + 1]; \
+      acc_out8[0][i][1] = acc_out32[i * 16 + 2]; \
+      acc_out8[0][i][3] = acc_out32[i * 16 + 3]; \
+      acc_out8[1][i][0] = acc_out32[i * 16 + 4]; \
+      acc_out8[1][i][2] = acc_out32[i * 16 + 5]; \
+      acc_out8[1][i][1] = acc_out32[i * 16 + 6]; \
+      acc_out8[1][i][3] = acc_out32[i * 16 + 7]; \
+      acc_out8[0][i][4] = acc_out32[i * 16 + 8]; \
+      acc_out8[0][i][6] = acc_out32[i * 16 + 9]; \
+      acc_out8[0][i][5] = acc_out32[i * 16 + 10]; \
+      acc_out8[0][i][7] = acc_out32[i * 16 + 11]; \
+      acc_out8[1][i][4] = acc_out32[i * 16 + 12]; \
+      acc_out8[1][i][6] = acc_out32[i * 16 + 13]; \
+      acc_out8[1][i][5] = acc_out32[i * 16 + 14]; \
+      acc_out8[1][i][7] = acc_out32[i * 16 + 15]; \
+    }                                            \
+  }
+
+// Quantized 3x3, stride-2 conv for input depth 3 and output depth a
+// multiple of 8, no padding. Computes one 2-row x 4-column x 8-channel
+// output tile per aconv sequence; the right-most column block and the
+// bottom two rows are handled by tail code that blanks out-of-window input
+// bytes with the negated input offset so they contribute zero.
+void ConvS8I3xD8(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int8_t* output_data) {
+  // Get parameters.
+  const int32_t input_offset = params.input_offset;  // r = s(q - Z)
+  const int32_t neg_input_offset =
+      -params.input_offset;  // used to blank padded input lanes
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.Dims(2), 3);
+  TFLITE_DCHECK_EQ(filter_shape.Dims(1), 3);
+  TFLITE_DCHECK_EQ(input_shape.Dims(3), 3);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  const int groups = input_depth / filter_input_depth;
+  TFLITE_DCHECK_NE(groups, 0);
+  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+  const int filters_per_group = output_depth / groups;
+  TFLITE_DCHECK_NE(filters_per_group, 0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  // aconv command word: convolve over the 3-register operand group
+  // [start..stop], folding the input offset into every input byte (sbias1).
+  union {
+    vconv_u8_t conv;
+    uint32_t raw;
+  } cmds;
+  cmds.conv.mode = 0;
+  cmds.conv.start = 0;
+  cmds.conv.stop = 2;
+  cmds.conv.sbias1 = input_offset;
+  cmds.conv.sdata1 = true;
+  cmds.conv.sbias2 = 0;
+  cmds.conv.sdata2 = true;
+
+  // Reg Map:
+  // v0-v7   : input patches[0..7]
+  // v8-v10  : filter row 1 (registers used for aconv)
+  // v11     : vdup 0 used during vsliden
+  // v12-v15 : swizzled output multipliers
+  // v16-v19 : swizzled output shifts (negated for vsha)
+  // v24-v26 : filter row 2 (registers used for aconv)
+  // v30     : negative offset mask
+  // v35-v38 : swizzled biases
+  // v40-v42 : filter row 3 (registers used for aconv)
+  // v48-v55 : accumulators for aconv
+
+  int8_t acc_out8[2][4][8];  // de-swizzled tile: [row][column][channel]
+  int32_t acc_out32[64];     // requantized accumulator lanes
+  int out_channel = 0;
+  // Per channel: 3 rows x 12 padded taps (9 taps + 3 zeros).
+  const size_t swizzled_filter_data_size = output_depth * 3 * 3 * 4;
+  // aligned_alloc storage must be released with free(); unique_ptr's
+  // default deleter would call delete on it (undefined behavior).
+  std::unique_ptr<int8_t, decltype(&::free)> swizzled_filter_data(
+      reinterpret_cast<int8_t*>(
+          ::aligned_alloc(32, swizzled_filter_data_size)),
+      ::free);
+  int8_t* p_swizzled_filter_data = swizzled_filter_data.get();
+
+  PaddedFilter_N_H_W_M(filter_data, p_swizzled_filter_data, output_depth,
+                       filter_height, filter_width, filter_input_depth);
+
+  // Structure of the padded filter data per 8-channel group: row 1 occupies
+  // bytes 0-95, row 2 bytes 96-191, row 3 bytes 192-287.
+
+  do {
+    int32_t temp_data_shuffle[2][8]{0};
+
+    // Biases -> v35..v38 (duplicated across register pairs for the _m ops).
+    VectorSwizzle8(bias_data + out_channel, &temp_data_shuffle[0][0],
+                   &temp_data_shuffle[1][0]);
+    vld_w_x(v35, &temp_data_shuffle[0][0]);
+    vld_w_x(v36, &temp_data_shuffle[1][0]);
+    vmv_v(v37, v35);
+    vmv_v(v38, v36);
+
+    // Output multipliers -> v12..v15.
+    VectorSwizzle8(output_multiplier + out_channel, &temp_data_shuffle[0][0],
+                   &temp_data_shuffle[1][0]);
+    vld_w_x(v12, &temp_data_shuffle[0][0]);
+    vld_w_x(v13, &temp_data_shuffle[1][0]);
+    vmv_v(v14, v12);
+    vmv_v(v15, v13);
+
+    // Output shifts -> v16..v19, negated (vrsub) for the rounding shift.
+    VectorSwizzle8(output_shift + out_channel, &temp_data_shuffle[0][0],
+                   &temp_data_shuffle[1][0]);
+    vld_w_x(v16, &temp_data_shuffle[0][0]);
+    vld_w_x(v17, &temp_data_shuffle[1][0]);
+    vmv_v(v18, v16);
+    vmv_v(v19, v17);
+    vrsub_w_vx_m(v16, v16, 0);
+    vdup_b_x(v11, 0);  // used for vsliden
+
+    // Bytes 24-31 hold -input_offset; OR-ing this over lanes that a short
+    // (24-byte) load left zeroed makes those pixels contribute zero once
+    // aconv adds the input offset back in.
+    int8_t mask[32] = {0};
+    for (int i = 24; i < 32; ++i) {
+      mask[i] = neg_input_offset;
+    }
+    vld_b_x(v30, mask);  // mask to negate input offset
+
+    int fil_channels_offset = out_channel / 8;
+
+    // Load filter rows; done once per group of 8 output channels.
+    vld_b_x(v8, p_swizzled_filter_data + fil_channels_offset * 288);  // row 1
+    vld_b_x(v9,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 32);  // row 1
+    vld_b_x(v10,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 64);  // row 1
+
+    vld_b_x(v24,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 96);  // row 2
+    vld_b_x(v25,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 128);  // row 2
+    vld_b_x(v26,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 160);  // row 2
+
+    vld_b_x(v40,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 192);  // row 3
+    vld_b_x(v41,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 224);  // row 3
+    vld_b_x(v42,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 256);  // row 3
+
+    for (int batch = 0; batch < batches; ++batch) {
+      int8_t* p_output = output_data +
+                         (batch * output_height * output_width * output_depth) +
+                         out_channel;
+      const int8_t* p_input =
+          input_data + (batch * input_height * input_width * input_depth);
+      // Interior tiles: 2 output rows x 4 output columns at a time; the last
+      // column block and the last two rows are handled by the tails below.
+      for (int out_y = 0; out_y + 2 < output_height; out_y += 2) {
+        const int in_y_origin = (out_y * stride_height);
+        for (int out_x = 0; out_x + 4 < output_width; out_x += 4) {
+          const int in_x_origin = (out_x * stride_width);
+
+          vdup_w_x_m(v48, 0);
+          vdup_w_x_m(v52, 0);
+
+          acset_v(v48, v48);
+
+          // inputs row 0 and row 2
+          vld_b_x(v0, p_input + (in_y_origin * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          vld_b_x(v4, p_input +
+                          ((in_y_origin + 2) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+
+          // Data slide strategy: v0 holds 10 RGB pixels of the row (its
+          // first 30 bytes): 0 1 2 3 4 5 6 7 8 9
+          // 1st vslide -> out_x + 1 : 2 3 4 5 6 7 8 9  (stride == 2)
+          // 2nd vslide -> out_x + 2 : 4 5 6 7 8 9      (stride == 2)
+          // 3rd vslide -> out_x + 3 : 6 7 8 9          (stride == 2)
+          FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+
+          aconv_vxv(v48, v0, cmds, v8);  // filter r1
+
+          // inputs row 1 and row 3
+          vld_b_x(v0, p_input +
+                          ((in_y_origin + 1) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          vld_b_x(v4, p_input +
+                          ((in_y_origin + 3) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+          aconv_vxv(v48, v0, cmds, v24);  // filter r2
+
+          // inputs row 2 and row 4
+          vld_b_x(v0, p_input +
+                          ((in_y_origin + 2) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          vld_b_x(v4, p_input +
+                          ((in_y_origin + 4) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+
+          FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+          aconv_vxv(v48, v0, cmds, v40);  // filter r3
+
+          vcget(v48);
+          actr_v(v48, v48);
+          vcget(v48);
+
+          // (x0,y0) (x0, y1)
+          // v48 (0 2 1 3 0 2 1 3) -- even registers
+          // v49 (4 6 5 7 4 6 5 7) -- odd registers
+          POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC;
+
+          for (int i = 0; i < 4; ++i) {
+            memcpy((p_output + ((out_x + i) * output_depth) +
+                    (out_y * output_width * output_depth)),
+                   &acc_out8[0][i][0], 8 * sizeof(int8_t));
+            memcpy((p_output + ((out_x + i) * output_depth) +
+                    ((out_y + 1) * output_width * output_depth)),
+                   &acc_out8[1][i][0], 8 * sizeof(int8_t));
+          }
+        }
+
+        // Right-edge tail (out_x == output_width - 4): only 24 input bytes
+        // per row are loaded; v30 overwrites the zero-filled tail lanes
+        // with -input_offset so they contribute zero.
+        int in_x_origin = (output_width - 4) * stride_width;
+        vdup_w_x_m(v48, 0);
+        vdup_w_x_m(v52, 0);
+        acset_v(v48, v48);
+
+        vld_b_l_xx(v0,
+                   p_input + (in_y_origin * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vor_vv(v0, v0, v30);
+        vor_vv(v4, v4, v30);
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v8);  // filter r1
+
+        // inputs row 1 and row 3
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 1) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 3) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vor_vv(v0, v0, v30);
+        vor_vv(v4, v4, v30);
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v24);  // filter r2
+
+        // inputs row 2 and row 4
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 4) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vor_vv(v0, v0, v30);
+        vor_vv(v4, v4, v30);
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v40);  // filter r3
+
+        vcget(v48);
+        actr_v(v48, v48);
+        vcget(v48);
+
+        POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC;
+
+        for (int i = 0; i < 4; ++i) {
+          memcpy((p_output + ((output_width - 4 + i) * output_depth) +
+                  (out_y * output_width * output_depth)),
+                 &acc_out8[0][i][0], 8 * sizeof(int8_t));
+
+          memcpy((p_output + ((output_width - 4 + i) * output_depth) +
+                  ((out_y + 1) * output_width * output_depth)),
+                 &acc_out8[1][i][0], 8 * sizeof(int8_t));
+        }
+      }
+      // Bottom tail: last two output rows. The tile's 4th input row is not
+      // read; vdup_b_x_m fills v4-v7 with -input_offset so it contributes
+      // zero. The final column block additionally masks its short-loaded
+      // tail bytes (negate_offset).
+      int load_until = 32;
+      bool negate_offset = false;
+      for (int out_x = 0; out_x + 4 <= output_width; out_x += 4) {
+        const int in_x_origin = (out_x * stride_width);
+        const int in_y_origin = (output_height - 2) * stride_height;
+
+        if (out_x + 4 == output_width) {
+          load_until = 24;
+          negate_offset = true;
+        }
+
+        vdup_w_x_m(v48, 0);
+        vdup_w_x_m(v52, 0);
+        acset_v(v48, v48);
+
+        // inputs row 0 and row 2
+        vld_b_l_xx(v0,
+                   p_input + (in_y_origin * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        if (negate_offset) {
+          vor_vv(v0, v0, v30);
+          vor_vv(v4, v4, v30);
+        }
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV
+        aconv_vxv(v48, v0, cmds, v8);  // filter r1
+
+        // inputs row 1 and row 3
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 1) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 3) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        if (negate_offset) {
+          vor_vv(v0, v0, v30);
+          vor_vv(v4, v4, v30);
+        }
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v24);  // filter r2
+
+        // input row 2; row 4 is synthesized as -input_offset below
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        vdup_b_x_m(v4, neg_input_offset);
+        if (negate_offset) {
+          vor_vv(v0, v0, v30);
+          vor_vv(v4, v4, v30);
+        }
+        // Only v0's windows need sliding; v4-v7 already hold the constant.
+        vsliden_h_3_vv(v1, v0, v11);
+        vsliden_w_3_vv(v2, v0, v11);
+        vsliden_w_3_vv(v3, v1, v11);
+        aconv_vxv(v48, v0, cmds, v40);  // filter r3
+
+        vcget(v48);
+        actr_v(v48, v48);
+        vcget(v48);
+        POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC;
+
+        for (int i = 0; i < 4; ++i) {
+          memcpy((p_output + ((out_x + i) * output_depth) +
+                  ((output_height - 2) * output_width * output_depth)),
+                 &acc_out8[0][i][0], 8 * sizeof(int8_t));
+
+          memcpy((p_output + ((out_x + i) * output_depth) +
+                  ((output_height - 1) * output_width * output_depth)),
+                 &acc_out8[1][i][0], 8 * sizeof(int8_t));
+        }
+      }
+    }
+    out_channel += 8;
+  } while (out_channel < output_depth);
+}
+
+#undef FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV
+#undef POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC
+
+} // namespace kelvin::opt
\ No newline at end of file