Depthwise conv for Depth % == 16
Change-Id: Icb11cccefb8725e8ece8b478db38b1d8e83f8d4f
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc
index a11c3d2..453a2c6 100644
--- a/tflm/opt/depthwise_conv_s8.cc
+++ b/tflm/opt/depthwise_conv_s8.cc
@@ -25,6 +25,7 @@
// Reorders a vector to match the pattern after double-widening.
// N must be a multiple of 4.
+// Working only for mutliples of 32
void VectorSwizzle(const int32_t* input, int32_t* output, int N) {
assert(N >= 4 && N % 4 == 0);
const int32_t(&in)[N] = *(int32_t(*)[N])input;
@@ -2673,6 +2674,118 @@
}
}
+void DepthwiseConvS8D16(
+ const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int8_t* output_data
+
+) {
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int32_t input_offset = params.input_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int output_depth = output_shape.Dims(3);
+ for (int in_channel = 0; in_channel + 16 <= input_depth; in_channel += 16) {
+ const int output_channel = in_channel;
+
+ vld_w_x(v24, output_multiplier);
+ vld_w_x(v25, output_multiplier + 8);
+ vld_w_x(v28, output_shift);
+ vld_w_x(v29, output_shift + 8);
+ vrsub_w_vx(v28, v28, 0);
+ vrsub_w_vx(v29, v29, 0);
+
+ for (int batch = 0; batch < batches; ++batch) {
+ const int8_t* p_output =
+ output_data + (batch * output_width * output_height * output_depth) +
+ output_channel;
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int y_offset = (output_depth * output_width * out_y);
+
+ if (bias_data) {
+ vld_w_x(v48, bias_data);
+ vld_w_x(v49, bias_data + 8);
+ } else {
+ vdup_w_x(v48, 0);
+ vdup_w_x(v49, 0);
+ }
+
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ const int in_y = in_y_origin + filter_y;
+ if ((in_y < 0) || (in_y >= input_height)) {
+ continue;
+ }
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ if ((in_x < 0) || (in_x >= input_width)) {
+ continue;
+ }
+
+ const int8_t* in_p =
+ input_data +
+ (batch * input_height * input_width * input_depth) +
+ (in_y * input_width * input_depth) + (in_x * input_depth) +
+ in_channel;
+
+ const int8_t* fl_p = filter_data +
+ (filter_y * filter_width * input_depth) +
+ (filter_x * input_depth) + in_channel;
+
+ vld_b_l_xx(v0, in_p, 16);
+ vld_b_l_xx(v4, fl_p, 16);
+
+ vaddw_h_vx(v0, v0, 0);
+ vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset));
+ vadd_h_vx(v1, v1, static_cast<int16_t>(input_offset));
+ vzip_h_vv(v0, v0, v1);
+
+ vaddw_h_vx(v4, v4, static_cast<int16_t>(0));
+ vzip_h_vv(v4, v4, v5);
+ vmulw_w_vv(v8, v0, v4);
+
+ vadd_w_vv(v48, v48, v8);
+ vadd_w_vv(v49, v49, v9);
+ }
+ }
+
+ vdmulh_w_rn_vv(v48, v48, v24);
+ vdmulh_w_rn_vv(v49, v49, v25);
+ vsha_w_r_vv(v48, v48, v28);
+ vsha_w_r_vv(v49, v49, v29);
+
+ vadd_w_vx(v48, v48, output_offset);
+ vadd_w_vx(v49, v49, output_offset);
+ vmax_w_vx(v48, v48, output_activation_min);
+ vmax_w_vx(v49, v49, output_activation_min);
+ vmin_w_vx(v48, v48, output_activation_max);
+ vmin_w_vx(v49, v49, output_activation_max);
+ vsraqs_b_vx_m(v48, v48, 0);
+ vsraqs_b_vx(v49, v49, 0);
+ vst_b_l_xx(v48, p_output + (out_x * output_depth) + y_offset, 16);
+ }
+ }
+ }
+ }
+}
+
// generic implementation based on Kelvin ops
void DepthwiseConvS8Generic(
const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
@@ -2746,8 +2859,9 @@
RUN_KERNEL(DepthwiseConvS83x3D32);
}
RUN_KERNEL(DepthwiseConvS8D32);
+ } else if (output_depth % 16 == 0) {
+ RUN_KERNEL(DepthwiseConvS8D16);
}
-
RUN_KERNEL(DepthwiseConvS8Generic);
}