blob: 1bb15aa88fc9de393dc227c6d0d3846ca14582a1 [file]
diff --git a/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_sym8sxsym16s.c b/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_sym8sxsym16s.c
index b9905e9..990b713 100644
--- a/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_sym8sxsym16s.c
+++ b/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_sym8sxsym16s.c
@@ -49,6 +49,24 @@ static inline ae_int32x2 MultiplyByQuantizedMultiplier_ref(ae_int64 d_x,
return result;
}
+static inline ae_int32x2 MultiplyByQuantizedMultiplier_x2_opt(ae_int64 d_x1, ae_int64 d_x2, /* Scales two 64-bit values by one shared multiplier/shift pair and returns both results in a single ae_int32x2. */
+ int32_t quantized_multiplier, /* 32-bit multiplier; only a 16-bit reduced form of it is used below */
+ int shift) { /* exponent; enters the computation as a right shift by (-shift - 17) */
+ ae_int32x2 d_q_mul = AE_MOVDA32(quantized_multiplier); /* multiplier moved into a 32x2 vector */
+ ae_int16x4 d_red_mul16 = AE_ROUND16X4F32SASYM(d_q_mul, d_q_mul); /* reduce the multiplier to 16-bit lanes (rounding/saturation per the intrinsic -- TODO confirm against the ISA reference) */
+ ae_int32x2 d_red_mul32 = AE_SEXT32X2D16_32(d_red_mul16); /* widen the reduced multiplier back to 32-bit lanes */
+ ae_int64 qL1 = AE_MUL32U_LL(d_red_mul32, AE_MOVINT32X2_FROMINT64(d_x1)); /* partial product with the low word of d_x1, going by the _LL suffix (presumably unsigned on that word -- verify) */
+ ae_int64 qL2 = AE_MUL32U_LL(d_red_mul32, AE_MOVINT32X2_FROMINT64(d_x2)); /* same for d_x2 */
+ ae_int64 qH1 = AE_SLAI64(AE_MUL32_LH(d_red_mul32, AE_MOVINT32X2_FROMINT64(d_x1)), 32); /* partial product with the high word of d_x1 (per the _LH suffix), moved up by 32 bits */
+ ae_int64 qH2 = AE_SLAI64(AE_MUL32_LH(d_red_mul32, AE_MOVINT32X2_FROMINT64(d_x2)), 32); /* same for d_x2 */
+ ae_int64 q1 = AE_ADD64(qL1, qH1); /* combine the two partial products for d_x1 */
+ ae_int64 q2 = AE_ADD64(qL2, qH2); /* combine the two partial products for d_x2 */
+ q1 = AE_SRAA64(q1, (-shift-17)); /* apply the caller's shift; NOTE(review): the 17 presumably accounts for the 16-bit multiplier and the final 64->32 rounding step -- confirm */
+ q2 = AE_SRAA64(q2, (-shift-17)); /* same shift for the second value */
+ ae_int32x2 result = AE_ROUND32X2F64SASYM(q1, q2); /* narrow both 64-bit values into the two 32-bit lanes of the result (rounding per the intrinsic) */
+ return result; /* lane order follows the (q1, q2) argument order of the rounding intrinsic */
+}
+
static WORD32 conv_x_left_pad(
WORD32 x_padding,
WORD32 kernel_width,
@@ -129,6 +147,160 @@ static WORD32 conv_x_right_pad(
return out_width_over_x_r_pad;
}
+static WORD32 xa_nn_conv2d_std_per_chan_sym8sxsym16s_no_circ_buf( /* Convolution path that reads p_inp directly and computes 2 output columns x 4 output channels per innermost pass; always returns 0. */
+ WORD16* __restrict__ p_out, /* destination for the 16-bit results */
+ const WORD16* __restrict__ p_inp, /* 16-bit input, read through ae_int16x4 pointers (assumes suitable alignment -- TODO confirm with caller) */
+ const WORD8* __restrict__ p_kernel, /* 8-bit kernel weights */
+ const WORD64* __restrict__ p_bias, /* 64-bit bias per output channel; dereferenced without a null check */
+ WORD32 input_height, /* used only to clip the kernel row range */
+ WORD32 input_width, /* enters the input row stride and the start offset */
+ WORD32 input_channels, /* enters all input/kernel strides */
+ WORD32 kernel_height, /* upper bound for the kernel row loop */
+ WORD32 kernel_width, /* with input_channels, gives the length of one kernel row */
+ WORD32 out_channels, /* consumed in groups of 4 */
+ WORD32 x_stride, /* horizontal step between output columns */
+ WORD32 y_stride, /* vertical step between output rows */
+ WORD32 x_padding, /* not referenced in the body */
+ WORD32 y_padding, /* subtracted when locating the top input row */
+ WORD32 out_height, /* outer loop bound */
+ WORD32 out_width, /* consumed in pairs of columns */
+ WORD32 input_zero_bias, /* not referenced in the body */
+ WORD32 * p_out_multiplier, /* per-output-channel multiplier, read only */
+ WORD32 * p_out_shift, /* per-output-channel shift, read only */
+ WORD32 out_zero_bias, /* not referenced in the body */
+ WORD32 out_data_format /* not referenced in the body */
+ )
+ {
+
+ const WORD16 *p_dst0_0 = p_out + 0; /* p_dst0_x: store cursors for the first output column of a pair, channel lanes 0..3 */
+ const WORD16 *p_dst0_1 = p_out + 1;
+ const WORD16 *p_dst0_2 = p_out + 2;
+ const WORD16 *p_dst0_3 = p_out + 3;
+ const WORD16 *p_dst1_0 = p_out + out_channels + 0; /* p_dst1_x: same lanes, out_channels elements further on (second output column of the pair) */
+ const WORD16 *p_dst1_1 = p_out + out_channels + 1;
+ const WORD16 *p_dst1_2 = p_out + out_channels + 2;
+ const WORD16 *p_dst1_3 = p_out + out_channels + 3; /* NOTE(review): all eight cursors are declared const but are cast to non-const at the stores below */
+ int kernel_out_ch_offset = kernel_height * kernel_width * input_channels; /* kernel elements per output channel */
+ int input_x_offset = (input_channels * x_stride) / 4; /* distance between adjacent output columns, in ae_int16x4 units */
+ int p_inp_vec_stride = (input_width * input_channels) / 4; /* one input row, in ae_int16x4 units */
+ int p_kern_vec_stride = kernel_width * input_channels; /* one kernel row, in WORD8 elements */
+ int vec_len = kernel_width * input_channels; /* elements consumed per kernel row by the inner loop */
+ for (int out_y = 0; out_y < out_height; ++out_y) {
+ for (int out_x = 0; out_x < out_width; out_x += 2) { /* two output columns per pass; there is no tail handling for an odd out_width */
+ for (int out_ch = 0; out_ch < out_channels; out_ch += 4) { /* four output channels per pass; there is no tail handling either */
+ ae_int64 out0_0 = p_bias[out_ch + 0]; /* out0_x: accumulators for the first column, seeded with the bias */
+ ae_int64 out0_1 = p_bias[out_ch + 1];
+ ae_int64 out0_2 = p_bias[out_ch + 2];
+ ae_int64 out0_3 = p_bias[out_ch + 3];
+ ae_int64 out1_0 = p_bias[out_ch + 0]; /* out1_x: accumulators for the second column, seeded with the same bias */
+ ae_int64 out1_1 = p_bias[out_ch + 1];
+ ae_int64 out1_2 = p_bias[out_ch + 2];
+ ae_int64 out1_3 = p_bias[out_ch + 3];
+
+ out0_0 = AE_SLAI64(out0_0, 8); /* bias is pre-shifted left by 8 and the sums are shifted right by 8 after accumulation; presumably this matches the scaling of the AE_L8X4F kernel loads -- TODO confirm */
+ out0_1 = AE_SLAI64(out0_1, 8);
+ out0_2 = AE_SLAI64(out0_2, 8);
+ out0_3 = AE_SLAI64(out0_3, 8);
+ out1_0 = AE_SLAI64(out1_0, 8);
+ out1_1 = AE_SLAI64(out1_1, 8);
+ out1_2 = AE_SLAI64(out1_2, 8);
+ out1_3 = AE_SLAI64(out1_3, 8);
+
+ int in_x_o = out_x * x_stride; /* leftmost input column for the first output column; no x padding term is applied */
+ int in_y_o = out_y * y_stride - y_padding; /* top input row; can be negative because of y_padding */
+ int k_y_min = -in_y_o; /* first kernel row that lands on input row 0 or below it */
+ int k_y_max = input_height - in_y_o; /* one past the last kernel row that lands inside the input */
+ k_y_min = (k_y_min < 0) ? 0 : k_y_min; /* clamp both bounds into [0, kernel_height] so rows outside the input are skipped */
+ k_y_min = (k_y_min < kernel_height) ? k_y_min : kernel_height;
+ k_y_max = (k_y_max < 0) ? 0 : k_y_max;
+ k_y_max = (k_y_max < kernel_height) ? k_y_max : kernel_height;
+ const ae_int16x4 *p_inp_vec = /* input cursor at the first kept row, column in_x_o, channel 0 */
+ (ae_int16x4 *)&p_inp[((in_y_o + k_y_min) * input_width + in_x_o) *
+ input_channels +
+ 0];
+ const WORD8 *p_kern_vec = /* kernel cursor for output channel out_ch at kernel row k_y_min */
+ &p_kernel[(((out_ch + 0) * kernel_height + k_y_min) * kernel_width +
+ 0) *
+ input_channels +
+ 0];
+ for (int k_y = k_y_min; k_y < k_y_max; ++k_y) {
+ const ae_int16x4 *p_inp_vec0 = p_inp_vec; /* input row for the first output column */
+ const ae_int16x4 *p_inp_vec1 = p_inp_vec + input_x_offset; /* same input row, shifted to the second output column */
+ const WORD8 *p_kern_vec0 = p_kern_vec; /* kernel row for out_ch + 0 */
+ const WORD8 *p_kern_vec1 = p_kern_vec0 + kernel_out_ch_offset; /* same kernel position for out_ch + 1 */
+ const WORD8 *p_kern_vec2 = p_kern_vec1 + kernel_out_ch_offset; /* ... out_ch + 2 */
+ const WORD8 *p_kern_vec3 = p_kern_vec2 + kernel_out_ch_offset; /* ... out_ch + 3 */
+ p_inp_vec += p_inp_vec_stride; /* row cursors move on now; the per-row copies above feed the inner loop */
+ p_kern_vec += p_kern_vec_stride;
+ ae_int16x4 d_inp0; /* input values, first column */
+ ae_int16x4 d_inp1; /* input values, second column */
+ ae_int16x4 d_kern0; /* kernel values for the four output channels */
+ ae_int16x4 d_kern1;
+ ae_int16x4 d_kern2;
+ ae_int16x4 d_kern3;
+ for (int i = 0; i < vec_len; i += 4) { /* four elements per pass along one kernel row */
+ AE_L16X4_IP(d_inp0, p_inp_vec0, 8); /* load four 16-bit inputs; the 8 is presumably the byte post-increment -- verify against the intrinsic reference */
+ AE_L16X4_IP(d_inp1, p_inp_vec1, 8);
+ AE_L8X4F_IP(d_kern0, p_kern_vec0, 4); /* load four 8-bit weights into 16-bit lanes; the 4 is presumably the byte post-increment */
+ AE_L8X4F_IP(d_kern1, p_kern_vec1, 4);
+ AE_L8X4F_IP(d_kern2, p_kern_vec2, 4);
+ AE_L8X4F_IP(d_kern3, p_kern_vec3, 4);
+ AE_MULAAAAQ16(out0_0, d_inp0, d_kern0); /* accumulate input x kernel into the 64-bit sum (presumably a 4-lane multiply-accumulate -- verify) */
+ AE_MULAAAAQ16(out0_1, d_inp0, d_kern1);
+ AE_MULAAAAQ16(out0_2, d_inp0, d_kern2);
+ AE_MULAAAAQ16(out0_3, d_inp0, d_kern3);
+ AE_MULAAAAQ16(out1_0, d_inp1, d_kern0); /* second column reuses the same kernel vectors */
+ AE_MULAAAAQ16(out1_1, d_inp1, d_kern1);
+ AE_MULAAAAQ16(out1_2, d_inp1, d_kern2);
+ AE_MULAAAAQ16(out1_3, d_inp1, d_kern3);
+ }
+ }
+
+ out0_0 = AE_SRAI64(out0_0, 8); /* undo the left shift by 8 applied to the bias before accumulation */
+ out0_1 = AE_SRAI64(out0_1, 8);
+ out0_2 = AE_SRAI64(out0_2, 8);
+ out0_3 = AE_SRAI64(out0_3, 8);
+ out1_0 = AE_SRAI64(out1_0, 8);
+ out1_1 = AE_SRAI64(out1_1, 8);
+ out1_2 = AE_SRAI64(out1_2, 8);
+ out1_3 = AE_SRAI64(out1_3, 8);
+
+ ae_int32x2 acc_vec0 = MultiplyByQuantizedMultiplier_x2_opt( /* requantize both columns of channel out_ch + 0 with that channel's multiplier and shift */
+ out0_0, out1_0, p_out_multiplier[out_ch + 0],
+ p_out_shift[out_ch + 0]);
+ ae_int32x2 acc_vec1 = MultiplyByQuantizedMultiplier_x2_opt( /* channel out_ch + 1 */
+ out0_1, out1_1, p_out_multiplier[out_ch + 1],
+ p_out_shift[out_ch + 1]);
+ ae_int32x2 acc_vec2 = MultiplyByQuantizedMultiplier_x2_opt( /* channel out_ch + 2 */
+ out0_2, out1_2, p_out_multiplier[out_ch + 2],
+ p_out_shift[out_ch + 2]);
+ ae_int32x2 acc_vec3 = MultiplyByQuantizedMultiplier_x2_opt( /* channel out_ch + 3 */
+ out0_3, out1_3, p_out_multiplier[out_ch + 3],
+ p_out_shift[out_ch + 3]);
+ ae_int16x4 d1 = AE_SAT16X4(acc_vec0, acc_vec1); /* narrow channels 0 and 1 (both columns) into four 16-bit lanes, saturating per the intrinsic name */
+ ae_int16x4 d2 = AE_SAT16X4(acc_vec2, acc_vec3); /* same for channels 2 and 3 */
+ AE_S16_0_XP(AE_SEL16_6543(d1, d1), (ae_int16 *)p_dst0_0, 8); /* one 16-bit store per statement; AE_SEL16_nnnn presumably rotates the wanted lane into the stored position -- verify */
+ AE_S16_0_XP(AE_SEL16_5432(d1, d1), (ae_int16 *)p_dst1_0, 8); /* the 8 is presumably a byte post-increment, i.e. four WORD16 to the next channel group */
+ AE_S16_0_XP(AE_SEL16_4321(d1, d1), (ae_int16 *)p_dst0_1, 8);
+ AE_S16_0_XP(d1, (ae_int16 *)p_dst1_1, 8); /* last lane needs no select */
+ AE_S16_0_XP(AE_SEL16_6543(d2, d2), (ae_int16 *)p_dst0_2, 8);
+ AE_S16_0_XP(AE_SEL16_5432(d2, d2), (ae_int16 *)p_dst1_2, 8);
+ AE_S16_0_XP(AE_SEL16_4321(d2, d2), (ae_int16 *)p_dst0_3, 8);
+ AE_S16_0_XP(d2, (ae_int16 *)p_dst1_3, 8);
+ }
+ p_dst0_0 += out_channels; /* extra out_channels step per column pair; assuming the stores already advanced each cursor across one column, this skips the other column of the pair -- TODO confirm */
+ p_dst0_1 += out_channels;
+ p_dst0_2 += out_channels;
+ p_dst0_3 += out_channels;
+ p_dst1_0 += out_channels;
+ p_dst1_1 += out_channels;
+ p_dst1_2 += out_channels;
+ p_dst1_3 += out_channels;
+ }
+ }
+ return 0; /* no failure path in this function */
+}
+
WORD32 xa_nn_conv2d_std_per_chan_sym8sxsym16s(
WORD16* __restrict__ p_out,
const WORD16* __restrict__ p_inp,
@@ -180,6 +352,35 @@ WORD32 xa_nn_conv2d_std_per_chan_sym8sxsym16s(
XA_NNLIB_ARG_CHK_COND((p_out_shift[itr] < -31 || p_out_shift[itr] > 31), -1);
}
+ if ( !(x_padding) && !(input_channels & 0x3) && !(out_channels & 0x3) && !(out_width & 0x1) && (out_data_format == 0) && ((out_width-1)*x_stride <=(input_width-kernel_width) ) )
+ {
+ int ret_val=0;
+ ret_val=xa_nn_conv2d_std_per_chan_sym8sxsym16s_no_circ_buf(p_out,
+ p_inp,
+ p_kernel,
+ p_bias,
+ input_height,
+ input_width,
+ input_channels,
+ kernel_height,
+ kernel_width,
+ out_channels,
+ x_stride,
+ y_stride,
+ x_padding,
+ y_padding,
+ out_height,
+ out_width,
+ input_zero_bias,
+ p_out_multiplier,
+ p_out_shift,
+ out_zero_bias,
+ out_data_format
+ );
+
+ return ret_val;
+ }
+
WORD32 j;
WORD32 input_bytewidth = 2;
VOID *pp_inp = (VOID *)p_inp;