Specialized Conv kernel to handle depth-3 input tensors

* This kernel reduces the bottleneck caused by the input conv layer for
  inputs of depth 3.
* The kernel handles NxNx3 inputs with a 3x3 filter.
* Reduces the cycle count by ~51x.

Change-Id: I7978508f32f8974c1236ba00c828a0d04f88c8e9
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD index 9957e7a..0330215 100644 --- a/tflm/opt/BUILD +++ b/tflm/opt/BUILD
@@ -21,6 +21,7 @@ "conv_s16_b64.cc", "conv_s8.cc", "conv_s8_1x1.cc", + "conv_s8_I3xD8.cc", "conv_s8_3x1_d48.cc", "conv_s8_d1.cc", "conv_s8_d32.cc",
diff --git a/tflm/opt/conv_s8.cc b/tflm/opt/conv_s8.cc index 10e8f41..6ae4eeb 100644 --- a/tflm/opt/conv_s8.cc +++ b/tflm/opt/conv_s8.cc
@@ -202,6 +202,17 @@
     return; \
   }
 
+  // Specialized kernel for a 3x3 filter over a depth-3 input with stride 2.
+  // The filter size has to be checked here as well: ConvS8I3xD8 only
+  // verifies it through TFLITE_DCHECK.
+  if (dilation_width_factor == 1 && dilation_height_factor == 1 &&
+      stride_width == 2 && stride_height == 2 && filter_height == 3 &&
+      filter_width == 3 && filter_depth == 3 && input_depth == 3 &&
+      output_depth % 8 == 0 && output_width % 4 == 0 &&
+      output_height % 2 == 0 && pad_height == 0 && pad_width == 0) {
+    RUN_KERNEL(kelvin::opt::ConvS8I3xD8);
+  }
+
   // special case of filter size 1x1
   if (filter_height == 1 && filter_width == 1 && stride_height == 1 &&
       stride_width == 1 && dilation_height_factor == 1 &&
diff --git a/tflm/opt/conv_s8.h b/tflm/opt/conv_s8.h index 96cea2e..a95b1e6 100644 --- a/tflm/opt/conv_s8.h +++ b/tflm/opt/conv_s8.h
@@ -85,6 +85,16 @@ const tflite::RuntimeShape& bias_shape, const int32_t* bias_data, const tflite::RuntimeShape& output_shape, int8_t* output_data); +// filter size 3x3, input depth 3, output depth 8n, stride 2 +void ConvS8I3xD8(const tflite::ConvParams& params, + const int32_t* output_multiplier, const int32_t* output_shift, + const tflite::RuntimeShape& input_shape, + const int8_t* input_data, + const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, + const tflite::RuntimeShape& output_shape, int8_t* output_data); // filter size 48x3x1x48 void ConvS8K3x1D48( const tflite::ConvParams& params, const int32_t* output_multiplier,
diff --git a/tflm/opt/conv_s8_I3xD8.cc b/tflm/opt/conv_s8_I3xD8.cc new file mode 100644 index 0000000..370bd97 --- /dev/null +++ b/tflm/opt/conv_s8_I3xD8.cc
@@ -0,0 +1,584 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "crt/kelvin.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tflm/opt/conv_s8.h"
+#include "tflm/opt/conv_util.h"
+#include "tflm/opt/opt.h"
+
+namespace kelvin::opt {
+namespace {
+
+// Releases buffers obtained from ::aligned_alloc, which have to be returned
+// with ::free rather than delete.
+struct AlignedFree {
+  void operator()(void* ptr) const { ::free(ptr); }
+};
+
+// Splits eight consecutive int32 values into two 8-lane vectors:
+//   output  : [in0, in2, in1, in3, in0, in2, in1, in3]
+//   output2 : [in4, in6, in5, in7, in4, in6, in5, in7]
+// `input` must point at 8 readable values; `output` and `output2` must each
+// point at 8 writable values.
+void VectorSwizzle8(const int32_t* input, int32_t* output, int32_t* output2) {
+  const int32_t(&in)[8] = *(int32_t(*)[8])input;
+  int32_t(&out)[8] = *(int32_t(*)[8])output;
+  int32_t(&out2)[8] = *(int32_t(*)[8])output2;
+
+  out[0] = in[0];
+  out[2] = in[1];
+  out[1] = in[2];
+  out[3] = in[3];
+  out[4] = in[0];
+  out[6] = in[1];
+  out[5] = in[2];
+  out[7] = in[3];
+
+  out2[0] = in[4];
+  out2[2] = in[5];
+  out2[1] = in[6];
+  out2[3] = in[7];
+  out2[4] = in[4];
+  out2[6] = in[5];
+  out2[5] = in[6];
+  out2[7] = in[7];
+}
+
+// Repacks an N x H x W x M filter (output channel, row, column, input
+// channel) into the layout that ConvS8I3xD8 loads into its filter registers.
+// Only M == 3 is supported; any other value logs an error and exits.
+//
+// `output` is indexed as [N / 8][3][8 * 4 * 3]: one 96-byte entry per group
+// of 8 output channels and per filter row. The 9 values of a filter row
+// (3 columns x 3 input channels) are padded with 3 zeros to 12 bytes and cut
+// into three 4-byte chunks; the 96 bytes hold chunk 0 of the 8 channels,
+// then chunk 1 of the 8 channels, then chunk 2 of the 8 channels.
+void PaddedFilter_N_H_W_M(const int8_t* input, int8_t* output, int N, int H,
+                          int W, int M) {
+  if (M != 3) {
+    MicroPrintf("Filter shuffling can only handle M(input_depth) == 3");
+    exit(-1);
+  }
+
+  const int8_t(&in)[N][H][W][M] = *(int8_t(*)[N][H][W][M])input;
+  int8_t(&out)[N / 8][3][8 * 4 * 3] =
+      *(int8_t(*)[N / 8][3][8 * 4 * 3]) output;
+  int group = 0;
+  // Each pass over kx emits one 4-byte chunk per output channel:
+  //   kx == 0 : (kx0,c0) (kx0,c1) (kx0,c2) (kx1,c0)
+  //   kx == 1 : (kx1,c1) (kx1,c2) (kx2,c0) (kx2,c1)
+  //   kx == 2 : (kx2,c2) 0 0 0
+  for (int ky = 0; ky < H; ++ky) {
+    int filter_element[N / 8]{0};
+    for (int kx = 0; kx < W; ++kx) {
+      for (int output_channel = 0; output_channel < N; ++output_channel) {
+        for (int input_channel = 0; input_channel < M; ++input_channel) {
+          group = output_channel >> 3;
+          if (kx == 1 && input_channel == 0) {
+            continue;
+          }
+          if (kx == 2 && (input_channel < 2)) {
+            continue;
+          }
+
+          if (kx == 0 && input_channel == 2) {
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx + 1][0];
+            filter_element[group] += 1;
+          } else if (kx == 1 && input_channel == 2) {
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx + 1][0];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx + 1][1];
+            filter_element[group] += 1;
+          } else if (kx == 2 && input_channel == 2) {
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] = 0;
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] = 0;
+            filter_element[group] += 1;
+            out[group][ky][filter_element[group]] = 0;
+            filter_element[group] += 1;
+          } else {
+            out[group][ky][filter_element[group]] =
+                in[output_channel][ky][kx][input_channel];
+            filter_element[group] += 1;
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace
+
+// Derives the shifted copies of the two loaded input rows that aconv consumes
+// (see the slide-strategy comment in ConvS8I3xD8).
+// IN:
+//   - v0, v4 (input pixels)
+//   - v11 (zeroed register)
+// OUT:
+//   - v0, v1, v2, v3, v4, v5, v6, v7 (reforged input columns)
+// CLOBBERS:
+//   - None
+#define FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV \
+  { \
+    vsliden_h_3_vv(v1, v0, v11); \
+    vsliden_w_3_vv(v2, v0, v11); \
+    vsliden_w_3_vv(v3, v1, v11); \
+    vsliden_h_3_vv(v5, v4, v11); \
+    vsliden_w_3_vv(v6, v4, v11); \
+    vsliden_w_3_vv(v7, v5, v11); \
+  }
+
+// Requantizes the int32 accumulators in v48-v55: adds the bias (v35-v38),
+// applies the output multiplier (v12-v15) and shift (v16-v19), adds
+// output_offset and clamps to [output_activation_min, output_activation_max].
+// The result is stored to acc_out32 and de-swizzled into
+// acc_out8[row][column][channel].
+#define POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC \
+  { \
+    vadd_w_vv_m(v48, v48, v35); \
+    vadd_w_vv_m(v52, v52, v35); \
+    vdmulh_w_rn_vv_m(v48, v48, v12); \
+    vdmulh_w_rn_vv_m(v52, v52, v12); \
+    vsha_w_r_vv_m(v48, v48, v16); \
+    vsha_w_r_vv_m(v52, v52, v16); \
+    vadd_w_vx_m(v48, v48, output_offset); \
+    vadd_w_vx_m(v52, v52, output_offset); \
+    vmin_w_vx_m(v48, v48, output_activation_max); \
+    vmin_w_vx_m(v52, v52, output_activation_max); \
+    vmax_w_vx_m(v48, v48, output_activation_min); \
+    vmax_w_vx_m(v52, v52, output_activation_min); \
+    vst_w_x_m(v48, &acc_out32[0]); \
+    vst_w_x_m(v52, &acc_out32[32]); \
+    for (int i = 0; i < 4; i++) { \
+      acc_out8[0][i][0] = acc_out32[i * 16 + 0]; \
+      acc_out8[0][i][2] = acc_out32[i * 16 + 1]; \
+      acc_out8[0][i][1] = acc_out32[i * 16 + 2]; \
+      acc_out8[0][i][3] = acc_out32[i * 16 + 3]; \
+      acc_out8[1][i][0] = acc_out32[i * 16 + 4]; \
+      acc_out8[1][i][2] = acc_out32[i * 16 + 5]; \
+      acc_out8[1][i][1] = acc_out32[i * 16 + 6]; \
+      acc_out8[1][i][3] = acc_out32[i * 16 + 7]; \
+      acc_out8[0][i][4] = acc_out32[i * 16 + 8]; \
+      acc_out8[0][i][6] = acc_out32[i * 16 + 9]; \
+      acc_out8[0][i][5] = acc_out32[i * 16 + 10]; \
+      acc_out8[0][i][7] = acc_out32[i * 16 + 11]; \
+      acc_out8[1][i][4] = acc_out32[i * 16 + 12]; \
+      acc_out8[1][i][6] = acc_out32[i * 16 + 13]; \
+      acc_out8[1][i][5] = acc_out32[i * 16 + 14]; \
+      acc_out8[1][i][7] = acc_out32[i * 16 + 15]; \
+    } \
+  }
+
+// int8 convolution specialized for a 3x3 filter over a depth-3 input with
+// stride 2, processing 8 output channels and a 4 (wide) x 2 (high) output
+// tile per step. The caller is expected to guarantee output_depth % 8 == 0,
+// output_width % 4 == 0 and output_height % 2 == 0. The right-most tile
+// column and the bottom tile row are handled separately so that input
+// positions past the image edge are replaced with -input_offset.
+void ConvS8I3xD8(
+    const tflite::ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int8_t* output_data) {
+  // Get parameters.
+  const int32_t input_offset = params.input_offset;  // r = s(q - Z)
+  const int32_t neg_input_offset = -params.input_offset;  // r = s(q - Z)
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.Dims(2), 3);
+  TFLITE_DCHECK_EQ(filter_shape.Dims(1), 3);
+  TFLITE_DCHECK_EQ(input_shape.Dims(3), 3);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  TFLITE_DCHECK_EQ(output_depth % 8, 0);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  const int groups = input_depth / filter_input_depth;
+  TFLITE_DCHECK_NE(groups, 0);
+  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+  const int filters_per_group = output_depth / groups;
+  TFLITE_DCHECK_NE(filters_per_group, 0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  union {
+    vconv_u8_t conv;
+    uint32_t raw;
+  } cmds;
+  cmds.conv.mode = 0;
+  cmds.conv.start = 0;
+  cmds.conv.stop = 2;
+  cmds.conv.sbias1 = input_offset;
+  cmds.conv.sdata1 = true;
+  cmds.conv.sbias2 = 0;
+  cmds.conv.sdata2 = true;
+
+  // Reg Map:
+  // v0-v7   : input patches[0..7]
+  // v8-v10  : filter row 1 (registers used for aconv)
+  // v11     : vdup 0 used during vsliden
+  // v12-v15 : swizzled output multipliers
+  // v16-v19 : swizzled output shifts
+  // v24-v26 : filter row 2 (registers used for aconv)
+  // v30     : mask holding -input_offset in its last 8 bytes
+  // v35-v38 : swizzled biases
+  // v40-v42 : filter row 3 (registers used for aconv)
+  // v48-v55 : accumulators for aconv
+
+  int8_t acc_out8[2][4][8];
+  int32_t acc_out32[64];
+  int out_channel = 0;
+  const size_t swizzled_filter_data_size = output_depth * 3 * 3 * 4;
+  // aligned_alloc memory has to be released with free(), hence the custom
+  // deleter.
+  std::unique_ptr<int8_t, AlignedFree> swizzled_filter_data(
+      static_cast<int8_t*>(::aligned_alloc(32, swizzled_filter_data_size)));
+  int8_t* p_swizzled_filter_data = swizzled_filter_data.get();
+  if (p_swizzled_filter_data == nullptr) {
+    MicroPrintf("Failed to allocate the swizzled filter buffer");
+    exit(-1);
+  }
+
+  PaddedFilter_N_H_W_M(filter_data, p_swizzled_filter_data, output_depth,
+                       filter_height, filter_width, filter_input_depth);
+
+  // Layout of the padded filter data per group of 8 output channels:
+  // row 1 in bytes 0-95, row 2 in bytes 96-191, row 3 in bytes 192-287.
+
+  do {
+    int32_t temp_data_shuffle[2][8]{0};
+
+    // The bias tensor is optional; the zero-initialized scratch buffer then
+    // yields a zero bias.
+    if (bias_data) {
+      VectorSwizzle8(bias_data + out_channel, &temp_data_shuffle[0][0],
+                     &temp_data_shuffle[1][0]);
+    }
+    vld_w_x(v35, &temp_data_shuffle[0][0]);
+    vld_w_x(v36, &temp_data_shuffle[1][0]);
+    vmv_v(v37, v35);
+    vmv_v(v38, v36);
+
+    VectorSwizzle8(output_multiplier + out_channel, &temp_data_shuffle[0][0],
+                   &temp_data_shuffle[1][0]);
+    vld_w_x(v12, &temp_data_shuffle[0][0]);
+    vld_w_x(v13, &temp_data_shuffle[1][0]);
+    vmv_v(v14, v12);
+    vmv_v(v15, v13);
+
+    VectorSwizzle8(output_shift + out_channel, &temp_data_shuffle[0][0],
+                   &temp_data_shuffle[1][0]);
+    vld_w_x(v16, &temp_data_shuffle[0][0]);
+    vld_w_x(v17, &temp_data_shuffle[1][0]);
+    vmv_v(v18, v16);
+    vmv_v(v19, v17);
+    vrsub_w_vx_m(v16, v16, 0);
+    vdup_b_x(v11, 0);  // used for vsliden
+
+    int8_t mask[32] = {0};
+    for (int i = 24; i < 32; ++i) {
+      mask[i] = neg_input_offset;
+    }
+    vld_b_x(v30, mask);  // mask to negate input offset
+
+    int fil_channels_offset = out_channel / 8;
+
+    // load filter this is done once per change in 8 channels
+    vld_b_x(v8, p_swizzled_filter_data + fil_channels_offset * 288);  // row 1
+    vld_b_x(v9,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 32);  // row 1
+    vld_b_x(v10,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 64);  // row 1
+
+    vld_b_x(v24,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 96);  // row 2
+    vld_b_x(v25,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 128);  // row 2
+    vld_b_x(v26,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 160);  // row 2
+
+    vld_b_x(v40,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 192);  // row 3
+    vld_b_x(v41,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 224);  // row 3
+    vld_b_x(v42,
+            p_swizzled_filter_data + fil_channels_offset * 288 + 256);  // row 3
+
+    for (int batch = 0; batch < batches; ++batch) {
+      int8_t* p_output = output_data +
+                         (batch * output_height * output_width * output_depth) +
+                         out_channel;
+      const int8_t* p_input =
+          input_data + (batch * input_height * input_width * input_depth);
+      // Interior tile rows; the bottom tile row is handled after this loop.
+      for (int out_y = 0; out_y + 2 < output_height; out_y += 2) {
+        const int in_y_origin = (out_y * stride_height);
+        // Interior tile columns; the right-most tile is handled below.
+        for (int out_x = 0; out_x + 4 < output_width; out_x += 4) {
+          const int in_x_origin = (out_x * stride_width);
+
+          vdup_w_x_m(v48, 0);
+          vdup_w_x_m(v52, 0);
+
+          acset_v(v48, v48);
+
+          // inputs row 0 and row 2
+          vld_b_x(v0, p_input + (in_y_origin * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          vld_b_x(v4, p_input +
+                          ((in_y_origin + 2) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+
+          // Data slide strategy (stride == 2): v0 holds the first 10 RGB
+          // pixels of the row (30 values), i.e. pixels 0 1 2 3 4 5 6 7 8 9.
+          //   1st slide -> out_x + 1 : pixels 2 3 4 5 6 7 8 9
+          //   2nd slide -> out_x + 2 : pixels 4 5 6 7 8 9
+          //   3rd slide -> out_x + 3 : pixels 6 7 8 9
+          FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+
+          aconv_vxv(v48, v0, cmds, v8);  // filter r1
+
+          // inputs row 1 and row 3
+          vld_b_x(v0, p_input +
+                          ((in_y_origin + 1) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          vld_b_x(v4, p_input +
+                          ((in_y_origin + 3) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+          aconv_vxv(v48, v0, cmds, v24);  // filter r2
+
+          // row 2 and row4
+          vld_b_x(v0, p_input +
+                          ((in_y_origin + 2) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+          vld_b_x(v4, p_input +
+                          ((in_y_origin + 4) * input_width * input_depth) +
+                          (in_x_origin * input_depth));
+
+          FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+          aconv_vxv(v48, v0, cmds, v40);  // filter r3
+
+          vcget(v48);
+          actr_v(v48, v48);
+          vcget(v48);
+
+          // (x0,y0) (x0, y1)
+          // v48 (0 2 1 3 0 2 1 3) -- even registers
+          // v49 (4 6 5 7 4 6 5 7) -- odd registers
+          POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC;
+
+          for (int i = 0; i < 4; ++i) {
+            memcpy((p_output + ((out_x + i) * output_depth) +
+                    (out_y * output_width * output_depth)),
+                   &acc_out8[0][i][0], 8 * sizeof(int8_t));
+            memcpy((p_output + ((out_x + i) * output_depth) +
+                    ((out_y + 1) * output_width * output_depth)),
+                   &acc_out8[1][i][0], 8 * sizeof(int8_t));
+          }
+        }
+
+        // Right-most tile of this tile row: 24 bytes are loaded per input
+        // row and the v30 mask is OR-ed into the loaded vectors afterwards.
+        int in_x_origin = (output_width - 4) * stride_width;
+        vdup_w_x_m(v48, 0);
+        vdup_w_x_m(v52, 0);
+        acset_v(v48, v48);
+
+        vld_b_l_xx(v0,
+                   p_input + (in_y_origin * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vor_vv(v0, v0, v30);
+        vor_vv(v4, v4, v30);
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v8);  // filter r1
+
+        // inputs row 1 and row 3
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 1) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 3) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vor_vv(v0, v0, v30);
+        vor_vv(v4, v4, v30);
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v24);  // filter r2
+
+        // row 2 and row4
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 4) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   24);
+        vor_vv(v0, v0, v30);
+        vor_vv(v4, v4, v30);
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v40);  // filter r3
+
+        vcget(v48);
+        actr_v(v48, v48);
+        vcget(v48);
+
+        POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC;
+
+        for (int i = 0; i < 4; ++i) {
+          memcpy((p_output + ((output_width - 4 + i) * output_depth) +
+                  (out_y * output_width * output_depth)),
+                 &acc_out8[0][i][0], 8 * sizeof(int8_t));
+
+          memcpy((p_output + ((output_width - 4 + i) * output_depth) +
+                  ((out_y + 1) * output_width * output_depth)),
+                 &acc_out8[1][i][0], 8 * sizeof(int8_t));
+        }
+      }
+      // Bottom tile row. Input row in_y_origin + 4 is not loaded here;
+      // vdup_b_x_m fills v4 with -input_offset instead. The last tile in x
+      // additionally switches to 24-byte loads plus the v30 mask.
+      int load_until = 32;
+      bool negate_offset = false;
+      for (int out_x = 0; out_x + 4 <= output_width; out_x += 4) {
+        const int in_x_origin = (out_x * stride_width);
+        const int in_y_origin = (output_height - 2) * stride_height;
+
+        if (out_x + 4 == output_width) {
+          load_until = 24;
+          negate_offset = true;
+        }
+
+        vdup_w_x_m(v48, 0);
+        vdup_w_x_m(v52, 0);
+        acset_v(v48, v48);
+
+        // inputs row 0 and row 2
+        vld_b_l_xx(v0,
+                   p_input + (in_y_origin * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        if (negate_offset) {
+          vor_vv(v0, v0, v30);
+          vor_vv(v4, v4, v30);
+        }
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v8);  // filter r1
+
+        // inputs row 1 and row 3
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 1) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        vld_b_l_xx(v4,
+                   p_input + ((in_y_origin + 3) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        if (negate_offset) {
+          vor_vv(v0, v0, v30);
+          vor_vv(v4, v4, v30);
+        }
+        FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV;
+        aconv_vxv(v48, v0, cmds, v24);  // filter r2
+
+        // row 2 and row4
+        vld_b_l_xx(v0,
+                   p_input + ((in_y_origin + 2) * input_width * input_depth) +
+                       (in_x_origin * input_depth),
+                   load_until);
+        vdup_b_x_m(v4, neg_input_offset);
+        if (negate_offset) {
+          vor_vv(v0, v0, v30);
+          vor_vv(v4, v4, v30);
+        }
+        vsliden_h_3_vv(v1, v0, v11);
+        vsliden_w_3_vv(v2, v0, v11);
+        vsliden_w_3_vv(v3, v1, v11);
+        aconv_vxv(v48, v0, cmds, v40);  // filter r3
+
+        vcget(v48);
+        actr_v(v48, v48);
+        vcget(v48);
+        POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC;
+
+        for (int i = 0; i < 4; ++i) {
+          memcpy((p_output + ((out_x + i) * output_depth) +
+                  ((output_height - 2) * output_width * output_depth)),
+                 &acc_out8[0][i][0], 8 * sizeof(int8_t));
+
+          memcpy((p_output + ((out_x + i) * output_depth) +
+                  ((output_height - 1) * output_width * output_depth)),
+                 &acc_out8[1][i][0], 8 * sizeof(int8_t));
+        }
+      }
+    }
+    out_channel += 8;
+  } while (out_channel < output_depth);
+}
+
+#undef FORGING_4_INPUT_COLUMNS_INTO_V0_V7_FOR_ACONV
+#undef POST_PROCESS_ACONV_INT32_ACC_TO_INT8_ACC
+
+}  // namespace kelvin::opt