/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Convolution based on Kelvin ops
// Data types: input: s8, filter: s8, bias: s32
// Special case for a 48x3x1x48 filter
// (output channels x height x width x input channels)
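//
// For reference, each output value this kernel produces follows the standard
// per-channel-quantized convolution (a sketch of the semantics, not code from
// this file):
//
//   acc = bias[oc];
//   for (ky = 0; ky < 3; ++ky)
//     for (ic = 0; ic < 48; ++ic)
//       acc += filter[oc][ky][0][ic] * (input[y + ky][ic] + input_offset);
//   out[y][oc] = clamp(rescale(acc, output_multiplier[oc], output_shift[oc])
//                          + output_offset,
//                      output_activation_min, output_activation_max);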
#include "tflm/opt/conv_util.h"
namespace kelvin::opt {
void ConvS8K3x1D48(
    const tflite::ConvParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
    int8_t* output_data) {
  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int32_t input_offset = params.input_offset;
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int filter_depth = filter_shape.Dims(3);
  const int output_height = output_shape.Dims(1);
  const int output_depth = output_shape.Dims(3);
  const int32_t output_offset = params.output_offset;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK(batches == 1);
  TFLITE_DCHECK(filter_depth == input_depth);
  TFLITE_DCHECK(filter_height == 3);
  TFLITE_DCHECK(filter_width == 1);
  TFLITE_DCHECK(input_width == 1);
  TFLITE_DCHECK(stride_width == 1);
  TFLITE_DCHECK(stride_height == 1);
  TFLITE_DCHECK(dilation_width_factor == 1);
  TFLITE_DCHECK(dilation_height_factor == 1);
  TFLITE_DCHECK(pad_width == 0);
  TFLITE_DCHECK(pad_height == 0);
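  // With batches == 1, stride 1, no padding or dilation, and a 3x1 filter
  // over a width-1 input, output row y reads input rows y..y + 2, so
  // output_height == input_height - 2.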
  int32_t bias[48 * 4];
  int32_t mult[48 * 4];
  int32_t shft[48 * 4];
  Swizzle<48>(bias_data, bias);
  Swizzle<48>(output_multiplier, mult);
  Swizzle<48, true>(output_shift, shft);
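  // Swizzle<48> (conv_util.h) is assumed here to expand each of the 48
  // per-channel values into four slots (48 * 4 entries total), so the
  // parameters for the 8-channel group starting at zo_hi begin at offset
  // zo_hi * 4 below. The extra template argument on the shift swizzle is
  // assumed to fold in the sign adjustment the output pipeline expects.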
  int8_t juggled_filter_data[48 / 8][3][1][48 / 4][8][4];
  Filter_N_H_W_M<48>(filter_data, juggled_filter_data[0][0][0][0][0], 3, 1,
                     48);
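  // juggled_filter_data groups the OHWI filter as
  // [oc / 8][ky][kx][ic / 4][8 oc][4 ic] so aconv can consume 8x4 tiles
  // directly. One output-channel group (8 oc x 3 taps x 48 ic = 1152 bytes)
  // is exactly the nine 128-byte chunks the MAC pipeline loads at
  // p_flt + 128 * 0 .. p_flt + 128 * 8.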
  union {
    vconv_u8_t conv;
    uint32_t raw;
  } cmds;
  cmds.conv.mode = 0;
  cmds.conv.start = 0;
  cmds.conv.stop = 7;
  cmds.conv.sbias1 = input_offset;
  cmds.conv.sdata1 = true;
  cmds.conv.sbias2 = 0;
  cmds.conv.sdata2 = true;
  union {
    vconv_u8_t conv;
    uint32_t raw;
  } cmds16;
  cmds16.conv.mode = 0;
  cmds16.conv.start = 0;
  cmds16.conv.stop = 3;
  cmds16.conv.sbias1 = input_offset;
  cmds16.conv.sdata1 = true;
  cmds16.conv.sbias2 = 0;
  cmds16.conv.sdata2 = true;
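  // As configured here, cmds (stop = 7) is assumed to drive an aconv against
  // all eight weight registers loaded into v56..v63, covering 32 input
  // channels per pass; cmds16 (stop = 3) drives only v56..v59 for the
  // 16-channel half pass. sbias1 = input_offset folds the activation zero
  // point into the MACs, and sdata1/sdata2 are assumed to mark both operands
  // as signed int8.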
  for (int zo_hi = 0; zo_hi < output_depth; zo_hi += 8) {
    // For each pixel, the general flow for this kernel looks like:
    // 1) Reset the accumulator and load activations into [v32, v46].
    // 2) For each group of 32 scalars in the pixel fan-in, run the MAC
    //    pipeline:
    //    2a) Load a subset of activations from [v32, v46] into [v0, v7].
    //    2b) Load a subset of weights.
    //    2c) Run aconv.
    // 3) Run the output pipeline and store.
    //
    // For step 1, we alias [v32, v46] to [L0, LE]. For most iterations, we
    // load all of these registers (10 pixels). For remainder iterations, we
    // load a subset and pad the rest with 0's. The data is stored as follows,
    // where each letter represents 16 bytes of a pixel stored into a register
    // (capitalization used to help distinguish channels within a pixel):
    // L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD LE
    // Aa AB bB Cc CD dD Ee EF fF Gg GH hH Ii IJ jJ
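    // (10 pixels x 48 channels = 480 bytes, i.e. 15 32-byte registers.)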
#define L0 v32
#define L1 v33
#define L2 v34
#define L3 v35
#define L4 v36
#define L5 v37
#define L6 v38
#define L7 v39
#define L8 v40
#define L9 v41
#define LA v42
#define LB v43
#define LC v44
#define LD v45
#define LE v46
    // We run 5 iterations of step 2: 4 full iterations and one half
    // iteration. Because each pixel takes 1.5 registers, we have to
    // interleave vmv_v and vsliden_w_4_vv instructions to ensure the same
    // output channels are stored in each register per-pixel. As a refresher,
    // vsliden_w_4_vv takes two register arguments (X and Y) and returns the
    // concatenation of the last half of X and the first half of Y, i.e.:
    // L1 L2
    // AB bB
    // vsliden_w_4_vv(v1, L1, L2); -> v1 = Bb
#define CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt) \
  { \
    /* 1/5 */ \
    /* Ky = 0, IC:[0-31] */ \
    vmv_v(v0, L0); /* Aa */ \
    vsliden_w_4_vv(v1, L1, L2); /* Bb */ \
    vmv_v(v2, L3); /* Cc */ \
    vsliden_w_4_vv(v3, L4, L5); /* Dd */ \
    vmv_v(v4, L6); /* Ee */ \
    vsliden_w_4_vv(v5, L7, L8); /* Ff */ \
    vmv_v(v6, L9); /* Gg */ \
    vsliden_w_4_vv(v7, LA, LB); /* Hh */ \
    vld_b_x_m(v56, p_flt + 128 * 0); \
    vld_b_x_m(v60, p_flt + 128 * 1); \
    aconv_vxv(v48, v0, cmds, v56); \
    \
    /* 2/5 */ \
    /* Ky = 0, IC:[32-47]; Ky = 1, IC:[0-15] */ \
    vmv_v(v0, L1); /* AB */ \
    vsliden_w_4_vv(v1, L2, L3); /* BC */ \
    vmv_v(v2, L4); /* CD */ \
    vsliden_w_4_vv(v3, L5, L6); /* DE */ \
    vmv_v(v4, L7); /* EF */ \
    vsliden_w_4_vv(v5, L8, L9); /* FG */ \
    vmv_v(v6, LA); /* GH */ \
    vsliden_w_4_vv(v7, LB, LC); /* HI */ \
    vld_b_x_m(v56, p_flt + 128 * 2); \
    vld_b_x_m(v60, p_flt + 128 * 3); \
    aconv_vxv(v48, v0, cmds, v56); \
    \
    /* 3/5 */ \
    /* Ky = 1, IC:[16-47] */ \
    vmv_v(v0, L2); /* bB */ \
    vsliden_w_4_vv(v1, L3, L4); /* cC */ \
    vmv_v(v2, L5); /* dD */ \
    vsliden_w_4_vv(v3, L6, L7); /* eE */ \
    vmv_v(v4, L8); /* fF */ \
    vsliden_w_4_vv(v5, L9, LA); /* gG */ \
    vmv_v(v6, LB); /* hH */ \
    vsliden_w_4_vv(v7, LC, LD); /* iI */ \
    vld_b_x_m(v56, p_flt + 128 * 4); \
    vld_b_x_m(v60, p_flt + 128 * 5); \
    aconv_vxv(v48, v0, cmds, v56); \
    \
    /* 4/5 */ \
    /* Ky = 2, IC:[0-31] */ \
    vmv_v(v0, L3); /* Cc */ \
    vsliden_w_4_vv(v1, L4, L5); /* Dd */ \
    vmv_v(v2, L6); /* Ee */ \
    vsliden_w_4_vv(v3, L7, L8); /* Ff */ \
    vmv_v(v4, L9); /* Gg */ \
    vsliden_w_4_vv(v5, LA, LB); /* Hh */ \
    vmv_v(v6, LC); /* Ii */ \
    vsliden_w_4_vv(v7, LD, LE); /* Jj */ \
    vld_b_x_m(v56, p_flt + 128 * 6); \
    vld_b_x_m(v60, p_flt + 128 * 7); \
    aconv_vxv(v48, v0, cmds, v56); \
    \
    /* 5/5 */ \
    /* Ky = 2, IC:[32-47] half iteration */ \
    vmv_v(v0, L4); /* C(D - ignored) */ \
    vsliden_w_4_vv(v1, L5, L6); /* D(E - ignored) */ \
    vmv_v(v2, L7); /* E(F - ignored) */ \
    vsliden_w_4_vv(v3, L8, L9); /* F(G - ignored) */ \
    vmv_v(v4, LA); /* G(H - ignored) */ \
    vsliden_w_4_vv(v5, LB, LC); /* H(I - ignored) */ \
    vmv_v(v6, LD); /* I(J - ignored) */ \
    /* Pad the last slide with the first pixel; ignored under cmds16. */ \
    vsliden_w_4_vv(v7, LE, L0); /* J(A - ignored) */ \
    vld_b_x_m(v56, p_flt + 128 * 8); /* Load only once for half iteration */ \
    /* cmds16 runs a subset of the outer product */ \
    aconv_vxv(v48, v0, cmds16, v56); \
  }
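    // Across the five passes the macro accumulates 4 * 32 + 16 = 144 products
    // per output channel, i.e. the full 3 taps * 48 input channels.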
    // Iterate over outputs
    int out_y = 0;
    for (; out_y + 8 <= output_height; out_y += 8) {
      // Reset the accumulator
      vdup_w_x_m(v48, 0);
      vdup_w_x_m(v52, 0);
      acset_v(v48, v48);
      const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0];
      const int8_t* p_in = input_data + (out_y * input_width * input_depth);
      // Load 10 * 48 = 480 activation bytes into 480 / 32 = 15 registers
      vld_b_x_m(L0, p_in);
      vld_b_x_m(L4, p_in + 32 * 4);
      vld_b_x_m(L8, p_in + 32 * 8);
      vld_b_x(LC, p_in + 32 * 12);
      vld_b_x(LD, p_in + 32 * 13);
      vld_b_x(LE, p_in + 32 * 14);
      // MAC pipeline
      CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt);
      // Output pipeline
      INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4,
                                    shft + zo_hi * 4, output_activation_min,
                                    output_activation_max, output_offset, v36,
                                    v40, v44);
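      // The quantized int8 results are now ready to store; each vstq_b_sp_xx
      // below is assumed to write its quad register group with a stride of
      // output_depth bytes and post-increment p_out (the "sp" form), so each
      // of the 8 output rows receives its 8-channel slice at offset zo_hi.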
      int8_t* p_out =
          output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi);
      vstq_b_sp_xx(v48, p_out, output_depth);
      vstq_b_sp_xx(v52, p_out, output_depth);
    }
    // Leftover output rows (a partial minibatch of fewer than 8)
    int remainder = output_height - out_y;
    if (remainder != 0) {
      // Reset the accumulator
      vdup_w_x_m(v48, 0);
      vdup_w_x_m(v52, 0);
      acset_v(v48, v48);
      const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0];
      const int8_t* p_in = input_data + (out_y * input_width * input_depth);
      // Load (remainder + 2) * 48 activations; unused registers stay zeroed.
      // Shown for remainder == 7 (smaller remainders load a prefix):
      // L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD
      // Aa AB bB Cc CD dD Ee EF fF Gg GH hH Ii I-
      vld_b_x_m(L0, p_in);
      vdup_w_x_m(L4, 0);
      vdup_w_x_m(L8, 0);
      vdup_w_x_m(LC, 0);
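      // Each case below intentionally falls through: case N tops up the
      // registers needed for N + 2 input pixels. L4 is loaded unconditionally
      // after the switch since even remainder == 1 needs it.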
      switch (remainder) {
        case 7:
          vld_b_x(LD, p_in + 32 * 13);
          vld_b_x(LC, p_in + 32 * 12);
        case 6:
          vld_b_x(LB, p_in + 32 * 11);
        case 5:
          vld_b_x(LA, p_in + 32 * 10);
          vld_b_x(L9, p_in + 32 * 9);
        case 4:
          vld_b_x(L8, p_in + 32 * 8);
        case 3:
          vld_b_x(L7, p_in + 32 * 7);
          vld_b_x(L6, p_in + 32 * 6);
        case 2:
          vld_b_x(L5, p_in + 32 * 5);
        default:
          break;
      }
      vld_b_x(L4, p_in + 32 * 4);
      // MAC pipeline
      CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt);
      // Output pipeline
      INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4,
                                    shft + zo_hi * 4, output_activation_min,
                                    output_activation_max, output_offset, v36,
                                    v40, v44);
      int8_t* p_out =
          output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi);
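      // Fewer than 8 rows remain, so spill the quantized results to a scratch
      // buffer (8 rows x 8 channels = 64 bytes) and copy out only the valid
      // 8-channel slice of each remaining row.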
      uint8_t local_data[64];
      vst_b_x(v0, local_data);
      vst_b_x(v1, local_data + 32);
      for (int i = 0; i < remainder; i++) {
        memcpy(p_out + (i * output_depth), local_data + (i * 8), 8);
      }
    }
#undef CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE
#undef L0
#undef L1
#undef L2
#undef L3
#undef L4
#undef L5
#undef L6
#undef L7
#undef L8
#undef L9
#undef LA
#undef LB
#undef LC
#undef LD
#undef LE
  }
}
}  // namespace kelvin::opt