5x5 DepthwiseConv, 4 outputs per loop iteration
Change-Id: I8a5bd0d7bcbfaea740f3ca1de012993771b78436
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc
index aa7b6bb..feb21cb 100644
--- a/tflm/opt/depthwise_conv_s8.cc
+++ b/tflm/opt/depthwise_conv_s8.cc
@@ -472,7 +472,26 @@
#undef FLT_2_0
#undef FLT_2_1
#undef FLT_2_2
+#undef INPUT_0_0
+#undef INPUT_0_1
+#undef INPUT_0_2
+#undef INPUT_0_3
+#undef INPUT_0_4
+#undef INPUT_0_5
+#undef INPUT_1_0
+#undef INPUT_1_1
+#undef INPUT_1_2
+#undef INPUT_1_3
+#undef INPUT_1_4
+#undef INPUT_1_5
+#undef INPUT_2_0
+#undef INPUT_2_1
+#undef INPUT_2_2
+#undef INPUT_2_3
+#undef INPUT_2_4
+#undef INPUT_2_5
#undef COMPUTE
+#undef INPUT_PTRS
}
// special case of input depth = 32n, filter shape of 3x3
@@ -654,6 +673,8 @@
const int stride_height = params.stride_height;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
+ assert(pad_width == 2);
+ assert(pad_height == 2);
const int32_t input_offset = params.input_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_activation_min = params.quantized_activation_min;
@@ -669,6 +690,108 @@
int32_t swizzled_shift_multi[32];
int32_t swizzled_output_multi[32];
+// INPUT_Y_X: vector register alias for the input tap at row Y, column X of the 5x5 window.
+#define INPUT_0_0 v26  // Rows 0-2 are interleaved column by column (v26..v40): three consecutive registers = rows 0,1,2 of one column.
+#define INPUT_0_1 v29
+#define INPUT_0_2 v32
+#define INPUT_0_3 v35
+#define INPUT_0_4 v38
+#define INPUT_1_0 v27
+#define INPUT_1_1 v30
+#define INPUT_1_2 v33
+#define INPUT_1_3 v36
+#define INPUT_1_4 v39
+#define INPUT_2_0 v28
+#define INPUT_2_1 v31
+#define INPUT_2_2 v34
+#define INPUT_2_3 v37
+#define INPUT_2_4 v40
+#define INPUT_3_0 v41  // Rows 3 and 4 are laid out row by row (v41..v45 and v47..v51).
+#define INPUT_3_1 v42
+#define INPUT_3_2 v43
+#define INPUT_3_3 v44
+#define INPUT_3_4 v45
+#define INPUT_4_0 v47
+#define INPUT_4_1 v48
+#define INPUT_4_2 v49
+#define INPUT_4_3 v50
+#define INPUT_4_4 v51
+
+#define INPUT_0_5 v53
+#define INPUT_1_5 v54
+#define INPUT_2_5 v55
+#define INPUT_3_5 v46  // Sits between row 3 and row 4 registers; COMPUTE pairs it with FLT_HOLE.
+#define INPUT_4_5 v52  // NOTE(review): v52 is also handed to the output pipeline in COMPUTE — confirm this alias is never loaded.
+
+#define FLT_0_0 v0  // FLT_Y_X: filter tap at row Y, column X; same row/column register layout as INPUT_Y_X.
+#define FLT_0_1 v3
+#define FLT_0_2 v6
+#define FLT_0_3 v9
+#define FLT_0_4 v12
+#define FLT_1_0 v1
+#define FLT_1_1 v4
+#define FLT_1_2 v7
+#define FLT_1_3 v10
+#define FLT_1_4 v13
+#define FLT_2_0 v2
+#define FLT_2_1 v5
+#define FLT_2_2 v8
+#define FLT_2_3 v11
+#define FLT_2_4 v14
+#define FLT_3_0 v15
+#define FLT_3_1 v16
+#define FLT_3_2 v17
+#define FLT_3_3 v18
+#define FLT_3_4 v19
+#define FLT_HOLE v20  // Padding slot between filter rows 3 and 4; the kernel fills it with zeros via vdup_b_x(FLT_HOLE, 0).
+#define FLT_4_0 v21
+#define FLT_4_1 v22
+#define FLT_4_2 v23
+#define FLT_4_3 v24
+#define FLT_4_4 v25
+
+#define COMPUTE() /* One output position: load swizzled bias into v60, run nine conv steps over the INPUT/FLT registers, run the int32-to-int8 output pipeline with v56/v52 (set up by the caller from swizzled_output_multi / swizzled_shift_multi), store v60 to p_output. Does not advance p_output. */ \
+ vld_w_x_m(v60, swizzled_bias_data); \
+ adwinit_v(v60, v60); \
+ /* Taps (row,col) named by each step's comment: 0,0 1,0 2,0 */ \
+ adwconv_vxv(v60, INPUT_0_0, cmds, FLT_0_0); \
+ /* 0,1 1,1 2,1 */ \
+ adwconv_vxv(v60, INPUT_0_1, cmds, FLT_0_1); \
+ /* 0,2 1,2 2,2 */ \
+ adwconv_vxv(v60, INPUT_0_2, cmds, FLT_0_2); \
+ /* 0,3 1,3 2,3 */ \
+ adwconv_vxv(v60, INPUT_0_3, cmds, FLT_0_3); \
+ /* 0,4 1,4 2,4 */ \
+ adwconv_vxv(v60, INPUT_0_4, cmds, FLT_0_4); \
+ /* 3,0 3,1 3,2 */ \
+ adwconv_vxv(v60, INPUT_3_0, cmds, FLT_3_0); \
+ /* 3,3 3,4 hole (FLT_HOLE, zero-filled during kernel setup) */ \
+ adwconv_vxv(v60, INPUT_3_3, cmds, FLT_3_3); \
+ /* hole 4,0 4,1 (starts at INPUT_3_5 / FLT_HOLE) */ \
+ adwconv_vxv(v60, INPUT_3_5, cmds, FLT_HOLE); \
+ /* 4,2 4,3 4,4 (last step uses vdwconv rather than adwconv) */ \
+ vdwconv_vxv(v60, INPUT_4_2, cmds, FLT_4_2); \
+ INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(v60, v56, v52, \
+ output_activation_min, \
+ output_activation_max, \
+ output_offset); \
+ vsraqs_b_vx(v60, v60, 0); \
+ vst_b_x(v60, p_output);
+
+#define INPUT_PTRS(_strides) /* Declares in_x_origin, in_y_origin and row pointers p_in_0..p_in_4 in the caller's scope for output (batch, out_y, out_x) and channel in_channel; the pointers start _strides columns to the right of in_x_origin. */ \
+ const int in_x_origin = (out_x * stride_width) - pad_width; \
+ const int in_y_origin = (out_y * stride_height) - pad_height; \
+ const int8_t* p_in_0 = input_data + \
+ (batch * input_height * input_width * input_depth) + \
+ (in_y_origin * input_width * input_depth) + \
+ ((in_x_origin + _strides) * input_depth) + \
+ in_channel; \
+ const int8_t* p_in_1 = p_in_0 + (input_width * input_depth); /* each following row is one input row (input_width * input_depth bytes) further */ \
+ const int8_t* p_in_2 = p_in_1 + (input_width * input_depth); \
+ const int8_t* p_in_3 = p_in_2 + (input_width * input_depth); \
+ const int8_t* p_in_4 = p_in_3 + (input_width * input_depth); \
+ (void)p_in_4; /* presumably silences unused-variable warnings where row 4 is padding and p_in_4 is never read */
+
for (int in_channel = 0; in_channel + 32 <= input_depth; in_channel += 32) {
const int output_channel = in_channel;
VectorSwizzle(bias_data + output_channel, swizzled_bias_data, 32);
@@ -691,260 +814,1039 @@
// Don't reorder me!
const int8_t* p_flt0 = filter_data + in_channel;
const int32_t stride = input_depth;
- vld_b_sp_xx_m(v0, p_flt0, stride);
- vld_b_sp_xx_m(v4, p_flt0, stride);
- vld_b_sp_xx_m(v8, p_flt0, stride);
- vld_b_sp_xx_m(v12, p_flt0, stride);
- vld_b_sp_xx_m(v16, p_flt0, stride);
- vld_b_sp_xx_m(v20, p_flt0, stride);
- vld_b_sp_xx(v24, p_flt0, stride);
+ vld_b_sp_xx(FLT_0_0, p_flt0, stride);
+ vld_b_sp_xx(FLT_0_1, p_flt0, stride);
+ vld_b_sp_xx(FLT_0_2, p_flt0, stride);
+ vld_b_sp_xx(FLT_0_3, p_flt0, stride);
+ vld_b_sp_xx(FLT_0_4, p_flt0, stride);
+ vld_b_sp_xx(FLT_1_0, p_flt0, stride);
+ vld_b_sp_xx(FLT_1_1, p_flt0, stride);
+ vld_b_sp_xx(FLT_1_2, p_flt0, stride);
+ vld_b_sp_xx(FLT_1_3, p_flt0, stride);
+ vld_b_sp_xx(FLT_1_4, p_flt0, stride);
+ vld_b_sp_xx(FLT_2_0, p_flt0, stride);
+ vld_b_sp_xx(FLT_2_1, p_flt0, stride);
+ vld_b_sp_xx(FLT_2_2, p_flt0, stride);
+ vld_b_sp_xx(FLT_2_3, p_flt0, stride);
+ vld_b_sp_xx(FLT_2_4, p_flt0, stride);
+ vld_b_sp_xx(FLT_3_0, p_flt0, stride);
+ vld_b_sp_xx(FLT_3_1, p_flt0, stride);
+ vld_b_sp_xx(FLT_3_2, p_flt0, stride);
+ vld_b_sp_xx(FLT_3_3, p_flt0, stride);
+ vld_b_sp_xx(FLT_3_4, p_flt0, stride);
+ vld_b_sp_xx(FLT_4_0, p_flt0, stride);
+ vld_b_sp_xx(FLT_4_1, p_flt0, stride);
+ vld_b_sp_xx(FLT_4_2, p_flt0, stride);
+ vld_b_sp_xx(FLT_4_3, p_flt0, stride);
+ vld_b_sp_xx(FLT_4_4, p_flt0, stride);
+ vdup_b_x(FLT_HOLE, 0);
- // Extra two registers to get our
- // total usage to a multiple of 3 for dwconv.
- vdup_b_x(v25, 0);
- vdup_b_x(v26, 0);
-
+ vld_w_x_m(v56, swizzled_output_multi);
+ vld_w_x_m(v52, swizzled_shift_multi);
+ vrsub_w_vx_m(v52, v52, 0);
for (int batch = 0; batch < batches; ++batch) {
const int8_t* p_output = output_data + (batch * output_height * output_width * output_depth) + output_channel;
- for (int out_y = 0; out_y < output_height; ++out_y) {
- const int y_offset = out_y * output_width * output_depth;
- for (int out_x = 0; out_x < output_width; ++out_x) {
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
+ int out_y = 0;
+ // Done
+ { // out_y = 0;
+ int out_x = 0;
+ vdup_b_x(INPUT_0_0, -input_offset);
+ vdup_b_x(INPUT_0_1, -input_offset);
+ vdup_b_x(INPUT_0_2, -input_offset);
+ vdup_b_x(INPUT_0_3, -input_offset);
+ vdup_b_x(INPUT_0_4, -input_offset);
- bool top_pad = in_y_origin < 0;
- bool left_pad = in_x_origin < 0;
- int top_pad_count = top_pad ? 0 - in_y_origin : 0;
- int left_pad_count = left_pad ? 0 - in_x_origin : 0;
- bool bottom_pad = (in_y_origin + 4) >= input_height;
- bool right_pad = (in_x_origin + 4) >= input_width;
- int bottom_pad_count = std::abs(bottom_pad ? (in_y_origin + 4) - input_height + 1: 0);
- int right_pad_count = std::abs(right_pad ? (in_x_origin + 4) - input_width + 1 : 0);
- bool padding_required = top_pad || left_pad || bottom_pad || right_pad;
- assert(top_pad_count <= pad_height);
- assert(bottom_pad_count <= pad_height);
- assert(left_pad_count <= pad_width);
- assert(right_pad_count <= pad_width);
- assert(!(left_pad && right_pad));
- const int8_t* p_in_0 = input_data +
- (batch * input_height * input_width * input_depth) +
- (in_y_origin * input_width * input_depth) +
- (in_x_origin * input_depth) +
- in_channel;
- const int8_t* p_in_1 = p_in_0 + (input_width * input_depth);
- const int8_t* p_in_2 = p_in_1 + (input_width * input_depth);
- const int8_t* p_in_3 = p_in_2 + (input_width * input_depth);
- const int8_t* p_in_4 = p_in_3 + (input_width * input_depth);
- // Extra two registers to get our
- // total usage to a multiple of 3 for dwconv.
- vdup_b_x(v52, -input_offset);
- vdup_b_x(v53, -input_offset);
- if (!padding_required) {
- vld_b_sp_xx(v27, p_in_0, input_depth);
- vld_b_sp_xx_m(v28, p_in_0, input_depth);
- vld_b_sp_xx_m(v32, p_in_1, input_depth);
- vld_b_sp_xx(v36, p_in_1, input_depth);
- vld_b_sp_xx(v37, p_in_2, input_depth);
- vld_b_sp_xx(v38, p_in_2, input_depth);
- vld_b_sp_xx(v39, p_in_2, input_depth);
- vld_b_sp_xx(v40, p_in_2, input_depth);
- vld_b_sp_xx(v41, p_in_2, input_depth);
- vld_b_sp_xx(v42, p_in_3, input_depth);
- vld_b_sp_xx(v43, p_in_3, input_depth);
- vld_b_sp_xx(v44, p_in_3, input_depth);
- vld_b_sp_xx(v45, p_in_3, input_depth);
- vld_b_sp_xx(v46, p_in_3, input_depth);
- vld_b_sp_xx(v47, p_in_4, input_depth);
- vld_b_sp_xx_m(v48, p_in_4, input_depth);
- } else {
- // Top row
- if (top_pad_count >= 1) {
- vdup_b_x(v27, -input_offset);
- vdup_b_x_m(v28, -input_offset);
- } else {
- switch (left_pad_count) {
- case 2:
- vdup_b_x(v28, -input_offset);
- case 1:
- vdup_b_x(v27, -input_offset);
- }
- switch (left_pad_count) {
- case 0:
- vld_b_x(v27, p_in_0);
- case 1:
- vld_b_x(v28, p_in_0 + input_depth);
- }
- vld_b_x(v29, p_in_0 + (2 * input_depth));
- switch (right_pad_count) {
- case 2:
- vdup_b_x(v30, -input_offset);
- case 1:
- vdup_b_x(v31, -input_offset);
- }
- switch (right_pad_count) {
- case 0:
- vld_b_x(v31, p_in_0 + (4 * input_depth));
- case 1:
- vld_b_x(v30, p_in_0 + (3 * input_depth));
- }
- }
+ vdup_b_x(INPUT_1_0, -input_offset);
+ vdup_b_x(INPUT_1_1, -input_offset);
+ vdup_b_x(INPUT_1_2, -input_offset);
+ vdup_b_x(INPUT_1_3, -input_offset);
+ vdup_b_x(INPUT_1_4, -input_offset);
+ { // out_x == 0
+ INPUT_PTRS(2);
- // 2nd row
- if (top_pad_count == 2) {
- vdup_b_x_m(v32, -input_offset);
- vdup_b_x(v36, -input_offset);
- } else {
- switch (left_pad_count) {
- case 2:
- vdup_b_x(v33, -input_offset);
- case 1:
- vdup_b_x(v32, -input_offset);
- }
- switch (left_pad_count) {
- case 0:
- vld_b_x(v32, p_in_1);
- case 1:
- vld_b_x(v33, p_in_1 + input_depth);
- }
- vld_b_x(v34, p_in_1 + (2 * input_depth));
- switch (right_pad_count) {
- case 2:
- vdup_b_x(v35, -input_offset);
- case 1:
- vdup_b_x(v36, -input_offset);
- }
- switch (right_pad_count) {
- case 0:
- vld_b_x(v36, p_in_1 + (4 * input_depth));
- case 1:
- vld_b_x(v35, p_in_1 + (3 * input_depth));
- }
- }
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vdup_b_x(INPUT_2_1, -input_offset);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
- // 3rd row
- switch (left_pad_count) {
- case 2:
- vdup_b_x(v38, -input_offset);
- case 1:
- vdup_b_x(v37, -input_offset);
- }
- switch (left_pad_count) {
- case 0:
- vld_b_x(v37, p_in_2);
- case 1:
- vld_b_x(v38, p_in_2 + input_depth);
- }
- vld_b_x(v39, p_in_2 + (2 * input_depth));
- switch (right_pad_count) {
- case 2:
- vdup_b_x(v40, -input_offset);
- case 1:
- vdup_b_x(v41, -input_offset);
- }
- switch (right_pad_count) {
- case 0:
- vld_b_x(v41, p_in_2 + (4 * input_depth));
- case 1:
- vld_b_x(v40, p_in_2 + (3 * input_depth));
- }
+ vdup_b_x(INPUT_3_0, -input_offset);
+ vdup_b_x(INPUT_3_1, -input_offset);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
- // 4th row
- if (bottom_pad_count == 2) {
- vdup_b_x(v42, -input_offset);
- vdup_b_x(v43, -input_offset);
- vdup_b_x(v44, -input_offset);
- vdup_b_x(v45, -input_offset);
- vdup_b_x(v46, -input_offset);
- } else {
- switch (left_pad_count) {
- case 2:
- vdup_b_x(v43, -input_offset);
- case 1:
- vdup_b_x(v42, -input_offset);
- }
- switch (left_pad_count) {
- case 0:
- vld_b_x(v42, p_in_3);
- case 1:
- vld_b_x(v43, p_in_3 + input_depth);
- }
- switch (right_pad_count) {
- case 2:
- vdup_b_x(v45, -input_offset);
- case 1:
- vdup_b_x(v46, -input_offset);
- }
- vld_b_x(v44, p_in_3 + (2 * input_depth));
- switch (right_pad_count) {
- case 0:
- vld_b_x(v46, p_in_3 + (4 * input_depth));
- case 1:
- vld_b_x(v45, p_in_3 + (3 * input_depth));
- }
- }
+ vdup_b_x(INPUT_4_0, -input_offset);
+ vdup_b_x(INPUT_4_1, -input_offset);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_4, p_in_4, input_depth);
- // 5th row
- if (bottom_pad_count >= 1) {
- vdup_b_x(v47, -input_offset);
- vdup_b_x(v48, -input_offset);
- vdup_b_x(v49, -input_offset);
- vdup_b_x(v50, -input_offset);
- vdup_b_x(v51, -input_offset);
- } else {
- switch (left_pad_count) {
- case 2:
- vdup_b_x(v48, -input_offset);
- case 1:
- vdup_b_x(v47, -input_offset);
- }
- switch (left_pad_count) {
- case 0:
- vld_b_x(v47, p_in_4);
- case 1:
- vld_b_x(v48, p_in_4 + input_depth);
- }
- vld_b_x(v49, p_in_4 + (2 * input_depth));
- switch (right_pad_count) {
- case 2:
- vdup_b_x(v50, -input_offset);
- case 1:
- vdup_b_x(v51, -input_offset);
- }
- switch (right_pad_count) {
- case 0:
- vld_b_x(v51, p_in_4 + (4 * input_depth));
- case 1:
- vld_b_x(v50, p_in_4 + (3 * input_depth));
- }
- }
- }
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x == 1
+ INPUT_PTRS(1);
+
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ vdup_b_x(INPUT_3_0, -input_offset);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+
+ vdup_b_x(INPUT_4_0, -input_offset);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_4, p_in_4, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ for (; out_x < output_width - pad_width; ++out_x) {
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+
+ vld_b_sp_xx(INPUT_4_0, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_4, p_in_4, input_depth);
+ COMPUTE();
+ p_output += output_depth;
+ }
+ { // out_x == output_width - 2
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vdup_b_x(INPUT_3_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_4_0, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vdup_b_x(INPUT_4_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x == output_width - 1
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_3, -input_offset);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vdup_b_x(INPUT_3_3, -input_offset);
+ vdup_b_x(INPUT_3_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_4_0, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vdup_b_x(INPUT_4_3, -input_offset);
+ vdup_b_x(INPUT_4_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ ++out_y;
+ }
+ // Done
+ { // out_y = 1;
+ int out_x = 0;
+ vdup_b_x(INPUT_0_0, -input_offset);
+ vdup_b_x(INPUT_0_1, -input_offset);
+ vdup_b_x(INPUT_0_2, -input_offset);
+ vdup_b_x(INPUT_0_3, -input_offset);
+ vdup_b_x(INPUT_0_4, -input_offset);
+ { // out_x = 0;
+ INPUT_PTRS(2);
+
+ vdup_b_x(INPUT_1_0, -input_offset);
+ vdup_b_x(INPUT_1_1, -input_offset);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vdup_b_x(INPUT_2_1, -input_offset);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ vdup_b_x(INPUT_3_0, -input_offset);
+ vdup_b_x(INPUT_3_1, -input_offset);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+
+ vdup_b_x(INPUT_4_0, -input_offset);
+ vdup_b_x(INPUT_4_1, -input_offset);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_4, p_in_4, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x = 1;
+ INPUT_PTRS(1);
+
+ vdup_b_x(INPUT_1_0, -input_offset);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ vdup_b_x(INPUT_3_0, -input_offset);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+
+ vdup_b_x(INPUT_4_0, -input_offset);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_4, p_in_4, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ for (; out_x < output_width - pad_width; ++out_x) {
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+
+ vld_b_sp_xx(INPUT_4_0, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_4, p_in_4, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ }
+ { // out_x = output_width - 2
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vdup_b_x(INPUT_1_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vdup_b_x(INPUT_3_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_4_0, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vdup_b_x(INPUT_4_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x = output_width - 1
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vdup_b_x(INPUT_1_3, -input_offset);
+ vdup_b_x(INPUT_1_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_3, -input_offset);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vdup_b_x(INPUT_3_3, -input_offset);
+ vdup_b_x(INPUT_3_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_4_0, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vdup_b_x(INPUT_4_3, -input_offset);
+ vdup_b_x(INPUT_4_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
+ }
+ ++out_y;
+ }
+ // Done
+ for (; out_y < output_height - pad_height; ++out_y) {
+ int out_x = 0;
+ { // out_x == 0
+ INPUT_PTRS(2);
+
+ vdup_b_x(INPUT_0_0, -input_offset);
+ vdup_b_x(INPUT_0_1, -input_offset);
+ vdup_b_x(INPUT_1_0, -input_offset);
+ vdup_b_x(INPUT_1_1, -input_offset);
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vdup_b_x(INPUT_2_1, -input_offset);
+ vdup_b_x(INPUT_3_0, -input_offset);
+ vdup_b_x(INPUT_3_1, -input_offset);
+ vdup_b_x(INPUT_4_0, -input_offset);
+ vdup_b_x(INPUT_4_1, -input_offset);
+
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_4, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_4, p_in_4, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x == 1: window column 0 is left padding (filled with -input_offset), columns 1-4 are loaded from the input.
+ INPUT_PTRS(1);
+ vdup_b_x(INPUT_0_0, -input_offset);
+ vdup_b_x(INPUT_1_0, -input_offset);
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vdup_b_x(INPUT_3_0, -input_offset);
+ vdup_b_x(INPUT_4_0, -input_offset);
+
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_4, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_4, p_in_4, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;  // Output pointer advances in output-tensor units (was input_depth), matching every other column case.
+ ++out_x;
+ }
+ for (; out_x + 4 <= output_width - pad_width; out_x += 4) {
+ INPUT_PTRS(0);
vld_w_x_m(v60, swizzled_bias_data);
adwinit_v(v60, v60);
- adwconv_vxv(v60, v27, cmds, v0);
- adwconv_vxv(v60, v30, cmds, v3);
- adwconv_vxv(v60, v33, cmds, v6);
- adwconv_vxv(v60, v36, cmds, v9);
- adwconv_vxv(v60, v39, cmds, v12);
- adwconv_vxv(v60, v42, cmds, v15);
- adwconv_vxv(v60, v45, cmds, v18);
- adwconv_vxv(v60, v48, cmds, v21);
- vdwconv_vxv(v60, v51, cmds, v24);
+ // Load top 3x8, in column-major order
+ vld_b_sp_xx(v26, p_in_0, input_depth);
+ vld_b_sp_xx(v27, p_in_1, input_depth);
+ vld_b_sp_xx(v28, p_in_2, input_depth);
+ vld_b_sp_xx(v29, p_in_0, input_depth);
+ vld_b_sp_xx(v30, p_in_1, input_depth);
+ vld_b_sp_xx(v31, p_in_2, input_depth);
+ vld_b_sp_xx(v32, p_in_0, input_depth);
+ vld_b_sp_xx(v33, p_in_1, input_depth);
+ vld_b_sp_xx(v34, p_in_2, input_depth);
+ vld_b_sp_xx(v35, p_in_0, input_depth);
+ vld_b_sp_xx(v36, p_in_1, input_depth);
+ vld_b_sp_xx(v37, p_in_2, input_depth);
+ vld_b_sp_xx(v38, p_in_0, input_depth);
+ vld_b_sp_xx(v39, p_in_1, input_depth);
+ vld_b_sp_xx(v40, p_in_2, input_depth);
+ vld_b_sp_xx(v41, p_in_0, input_depth);
+ vld_b_sp_xx(v42, p_in_1, input_depth);
+ vld_b_sp_xx(v43, p_in_2, input_depth);
+ vld_b_sp_xx(v44, p_in_0, input_depth);
+ vld_b_sp_xx(v45, p_in_1, input_depth);
+ vld_b_sp_xx(v46, p_in_2, input_depth);
+ vld_b_sp_xx(v47, p_in_0, input_depth);
+ vld_b_sp_xx(v48, p_in_1, input_depth);
+ vld_b_sp_xx(v49, p_in_2, input_depth);
- vld_w_x_m(v56, swizzled_output_multi);
- vdmulh_w_rn_vv_m(v60, v60, v56);
- vld_w_x_m(v56, swizzled_shift_multi);
- vrsub_w_vx_m(v56, v56, 0);
- vsha_w_r_vv_m(v60, v60, v56);
+ // Compute 3x5, starting from 0,3
+ adwconv_vxv(v60, v35, cmds, FLT_0_0);
+ adwconv_vxv(v60, v38, cmds, FLT_0_1);
+ adwconv_vxv(v60, v41, cmds, FLT_0_2);
+ adwconv_vxv(v60, v44, cmds, FLT_0_3);
+ vdwconv_vxv(v60, v47, cmds, FLT_0_4);
+
+ // Compute 3x5, starting from 0,2
+ vld_w_x_m(v56, swizzled_bias_data);
+ adwinit_v(v56, v56);
+ adwconv_vxv(v56, v32, cmds, FLT_0_0);
+ adwconv_vxv(v56, v35, cmds, FLT_0_1);
+ adwconv_vxv(v56, v38, cmds, FLT_0_2);
+ adwconv_vxv(v56, v41, cmds, FLT_0_3);
+ vdwconv_vxv(v56, v44, cmds, FLT_0_4);
+
+ // Compute 3x5, starting from 0,1
+ vld_w_x_m(v52, swizzled_bias_data);
+ adwinit_v(v52, v52);
+ adwconv_vxv(v52, v29, cmds, FLT_0_0);
+ adwconv_vxv(v52, v32, cmds, FLT_0_1);
+ adwconv_vxv(v52, v35, cmds, FLT_0_2);
+ adwconv_vxv(v52, v38, cmds, FLT_0_3);
+ vdwconv_vxv(v52, v41, cmds, FLT_0_4);
+
+ // Compute 3x5, starting from 0,0
+ vld_w_x_m(v48, swizzled_bias_data);
+ adwinit_v(v48, v48);
+ adwconv_vxv(v48, v26, cmds, FLT_0_0);
+ adwconv_vxv(v48, v29, cmds, FLT_0_1);
+ adwconv_vxv(v48, v32, cmds, FLT_0_2);
+ adwconv_vxv(v48, v35, cmds, FLT_0_3);
+ vdwconv_vxv(v48, v38, cmds, FLT_0_4);
+
+ // Load bottom 2x8, row major
+ vld_b_sp_xx(v26, p_in_3, input_depth);
+ vld_b_sp_xx(v27, p_in_3, input_depth);
+ vld_b_sp_xx(v28, p_in_3, input_depth);
+ vld_b_sp_xx(v29, p_in_3, input_depth);
+ vld_b_sp_xx(v30, p_in_3, input_depth);
+ vld_b_sp_xx(v31, p_in_3, input_depth);
+ vld_b_sp_xx(v32, p_in_3, input_depth);
+ vld_b_sp_xx(v33, p_in_3, input_depth);
+ vld_b_sp_xx(v34, p_in_4, input_depth);
+ vld_b_sp_xx(v35, p_in_4, input_depth);
+ vld_b_sp_xx(v36, p_in_4, input_depth);
+ vld_b_sp_xx(v37, p_in_4, input_depth);
+ vld_b_sp_xx(v38, p_in_4, input_depth);
+ vld_b_sp_xx(v39, p_in_4, input_depth);
+ vld_b_sp_xx(v40, p_in_4, input_depth);
+ vld_b_sp_xx(v41, p_in_4, input_depth);
+
+ // Compute bottom 2x5, starting at 3,3
+ adwinit_v(v60, v60);
+ adwconv_vxv(v60, v29, cmds, FLT_3_0);
+ adwconv_vxv(v60, v32, cmds, FLT_3_3);
+ adwconv_vxv(v60, v36, cmds, FLT_HOLE);
+ vdwconv_vxv(v60, v39, cmds, FLT_4_2);
+
+ // Compute bottom 2x5, starting at 3,2
+ adwinit_v(v56, v56);
+ adwconv_vxv(v56, v28, cmds, FLT_3_0);
+ adwconv_vxv(v56, v31, cmds, FLT_3_3);
+ adwconv_vxv(v56, v35, cmds, FLT_HOLE);
+ vdwconv_vxv(v56, v38, cmds, FLT_4_2);
+
+ // Compute bottom 2x5, starting at 3,1
+ adwinit_v(v52, v52);
+ adwconv_vxv(v52, v27, cmds, FLT_3_0);
+ adwconv_vxv(v52, v30, cmds, FLT_3_3);
+ adwconv_vxv(v52, v34, cmds, FLT_HOLE);
+ vdwconv_vxv(v52, v37, cmds, FLT_4_2);
+
+ // Compute bottom 2x5, starting at 3,0
+ adwinit_v(v48, v48);
+ adwconv_vxv(v48, v26, cmds, FLT_3_0);
+ adwconv_vxv(v48, v29, cmds, FLT_3_3);
+ adwconv_vxv(v48, v33, cmds, FLT_HOLE);
+ vdwconv_vxv(v48, v36, cmds, FLT_4_2);
+
+ // Load output parameters
+ vld_w_x_m(v40, swizzled_output_multi);
+ vld_w_x_m(v44, swizzled_shift_multi);
+ vrsub_w_vx_m(v44, v44, 0);
+
+ // Compute final outputs, for all four 5x5 patches, and store.
+ // NB: We don't use the normal output pipeline macro here,
+ // as interleaving improves performance on hardware.
+ vdmulh_w_rn_vv_m(v60, v60, v40);
+ vdmulh_w_rn_vv_m(v56, v56, v40);
+ vdmulh_w_rn_vv_m(v52, v52, v40);
+ vdmulh_w_rn_vv_m(v48, v48, v40);
+ vsha_w_r_vv_m(v60, v60, v44);
+ vsha_w_r_vv_m(v56, v56, v44);
+ vsha_w_r_vv_m(v52, v52, v44);
+ vsha_w_r_vv_m(v48, v48, v44);
vadd_w_vx_m(v60, v60, output_offset);
+ vadd_w_vx_m(v56, v56, output_offset);
+ vadd_w_vx_m(v52, v52, output_offset);
+ vadd_w_vx_m(v48, v48, output_offset);
vmax_w_vx_m(v60, v60, output_activation_min);
+ vmax_w_vx_m(v56, v56, output_activation_min);
+ vmax_w_vx_m(v52, v52, output_activation_min);
+ vmax_w_vx_m(v48, v48, output_activation_min);
vmin_w_vx_m(v60, v60, output_activation_max);
+ vmin_w_vx_m(v56, v56, output_activation_max);
+ vmin_w_vx_m(v52, v52, output_activation_max);
+ vmin_w_vx_m(v48, v48, output_activation_max);
+ vsraqs_b_vx(v48, v48, 0);
+ vst_b_x(v48, p_output);
+ p_output += output_depth;
+ vsraqs_b_vx(v52, v52, 0);
+ vst_b_x(v52, p_output);
+ p_output += output_depth;
+ vsraqs_b_vx(v56, v56, 0);
+ vst_b_x(v56, p_output);
+ p_output += output_depth;
vsraqs_b_vx(v60, v60, 0);
- vst_b_x(v60, p_output + y_offset + (out_x * output_depth));
+ vst_b_x(v60, p_output);
+ p_output += output_depth;
+ }
+ // These were clobbered due to the different compute pattern
+ // in the previous loop, so re-load them.
+ vld_w_x_m(v56, swizzled_output_multi);
+ vld_w_x_m(v52, swizzled_shift_multi);
+ vrsub_w_vx_m(v52, v52, 0);
+ for (; out_x < output_width - pad_width; ++out_x) {
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_0_0, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_4, p_in_0, input_depth);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+
+ vld_b_sp_xx(INPUT_4_0, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_4, p_in_4, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ }
+ { // out_x == output_width - 2
+ INPUT_PTRS(0);
+ vld_b_sp_xx(INPUT_0_0, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vdup_b_x(INPUT_0_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vdup_b_x(INPUT_1_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vdup_b_x(INPUT_3_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_4_0, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_3, p_in_4, input_depth);
+ vdup_b_x(INPUT_4_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x == output_width - 1
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_0_0, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vdup_b_x(INPUT_0_3, -input_offset);
+ vdup_b_x(INPUT_0_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vdup_b_x(INPUT_1_3, -input_offset);
+ vdup_b_x(INPUT_1_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_3, -input_offset);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vdup_b_x(INPUT_3_3, -input_offset);
+ vdup_b_x(INPUT_3_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_4_0, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_1, p_in_4, input_depth);
+ vld_b_sp_xx(INPUT_4_2, p_in_4, input_depth);
+ vdup_b_x(INPUT_4_3, -input_offset);
+ vdup_b_x(INPUT_4_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
+ }
+ }
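+ // Next row block: rows INPUT_0_* .. INPUT_3_* are set per column; INPUT_4_* is set once to -input_offset via vdup_b_x.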
+ { // out_y == output_height - 2
+ int out_x = 0;
+ vdup_b_x(INPUT_4_0, -input_offset);
+ vdup_b_x(INPUT_4_1, -input_offset);
+ vdup_b_x(INPUT_4_2, -input_offset);
+ vdup_b_x(INPUT_4_3, -input_offset);
+ vdup_b_x(INPUT_4_4, -input_offset);
+ { // out_x == 0
+ INPUT_PTRS(2);
+
+ vdup_b_x(INPUT_0_0, -input_offset);
+ vdup_b_x(INPUT_0_1, -input_offset);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_4, p_in_0, input_depth);
+
+ vdup_b_x(INPUT_1_0, -input_offset);
+ vdup_b_x(INPUT_1_1, -input_offset);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vdup_b_x(INPUT_2_1, -input_offset);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ vdup_b_x(INPUT_3_0, -input_offset);
+ vdup_b_x(INPUT_3_1, -input_offset);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x == 1
+ INPUT_PTRS(1);
+
+ vdup_b_x(INPUT_0_0, -input_offset);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_4, p_in_0, input_depth);
+
+ vdup_b_x(INPUT_1_0, -input_offset);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ vdup_b_x(INPUT_3_0, -input_offset);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ for (; out_x < output_width - pad_width; ++out_x) {
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_0_0, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_4, p_in_0, input_depth);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_4, p_in_3, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ }
+ { // out_x == output_width - 2
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_0_0, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vdup_b_x(INPUT_0_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vdup_b_x(INPUT_1_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_3, p_in_3, input_depth);
+ vdup_b_x(INPUT_3_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x == output_width - 1
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_0_0, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vdup_b_x(INPUT_0_3, -input_offset);
+ vdup_b_x(INPUT_0_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vdup_b_x(INPUT_1_3, -input_offset);
+ vdup_b_x(INPUT_1_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_3, -input_offset);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_3_0, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_1, p_in_3, input_depth);
+ vld_b_sp_xx(INPUT_3_2, p_in_3, input_depth);
+ vdup_b_x(INPUT_3_3, -input_offset);
+ vdup_b_x(INPUT_3_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ ++out_y;
+ }
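+ // Last row block: rows INPUT_0_* .. INPUT_2_* are set per column; INPUT_3_* and INPUT_4_* are set once to -input_offset via vdup_b_x.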
+ { // out_y == output_height - 1
+ int out_x = 0;
+ vdup_b_x(INPUT_3_0, -input_offset);
+ vdup_b_x(INPUT_3_1, -input_offset);
+ vdup_b_x(INPUT_3_2, -input_offset);
+ vdup_b_x(INPUT_3_3, -input_offset);
+ vdup_b_x(INPUT_3_4, -input_offset);
+
+ vdup_b_x(INPUT_4_0, -input_offset);
+ vdup_b_x(INPUT_4_1, -input_offset);
+ vdup_b_x(INPUT_4_2, -input_offset);
+ vdup_b_x(INPUT_4_3, -input_offset);
+ vdup_b_x(INPUT_4_4, -input_offset);
+ { // out_x == 0
+ INPUT_PTRS(2);
+
+ vdup_b_x(INPUT_0_0, -input_offset);
+ vdup_b_x(INPUT_0_1, -input_offset);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_4, p_in_0, input_depth);
+
+ vdup_b_x(INPUT_1_0, -input_offset);
+ vdup_b_x(INPUT_1_1, -input_offset);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vdup_b_x(INPUT_2_1, -input_offset);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x == 1
+ INPUT_PTRS(1);
+
+ vdup_b_x(INPUT_0_0, -input_offset);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_4, p_in_0, input_depth);
+
+ vdup_b_x(INPUT_1_0, -input_offset);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vdup_b_x(INPUT_2_0, -input_offset);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ for (; out_x < output_width - pad_width; ++out_x) {
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_0_0, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_4, p_in_0, input_depth);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_4, p_in_1, input_depth);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_4, p_in_2, input_depth);
+
+ COMPUTE();
+ p_output += output_depth;
+ }
+ { // out_x == output_width - 2
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_0_0, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_3, p_in_0, input_depth);
+ vdup_b_x(INPUT_0_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_3, p_in_1, input_depth);
+ vdup_b_x(INPUT_1_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_3, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
+ ++out_x;
+ }
+ { // out_x == output_width - 1
+ INPUT_PTRS(0);
+
+ vld_b_sp_xx(INPUT_0_0, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_1, p_in_0, input_depth);
+ vld_b_sp_xx(INPUT_0_2, p_in_0, input_depth);
+ vdup_b_x(INPUT_0_3, -input_offset);
+ vdup_b_x(INPUT_0_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_1_0, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_1, p_in_1, input_depth);
+ vld_b_sp_xx(INPUT_1_2, p_in_1, input_depth);
+ vdup_b_x(INPUT_1_3, -input_offset);
+ vdup_b_x(INPUT_1_4, -input_offset);
+
+ vld_b_sp_xx(INPUT_2_0, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_1, p_in_2, input_depth);
+ vld_b_sp_xx(INPUT_2_2, p_in_2, input_depth);
+ vdup_b_x(INPUT_2_3, -input_offset);
+ vdup_b_x(INPUT_2_4, -input_offset);
+
+ COMPUTE();
+ p_output += output_depth;
}
}
}
}
+#undef INPUT_PTRS
+#undef COMPUTE
+#undef INPUT_0_0
+#undef INPUT_0_1
+#undef INPUT_0_2
+#undef INPUT_0_3
+#undef INPUT_0_4
+#undef INPUT_1_0
+#undef INPUT_1_1
+#undef INPUT_1_2
+#undef INPUT_1_3
+#undef INPUT_1_4
+#undef INPUT_2_0
+#undef INPUT_2_1
+#undef INPUT_2_2
+#undef INPUT_2_3
+#undef INPUT_2_4
+#undef INPUT_3_0
+#undef INPUT_3_1
+#undef INPUT_3_2
+#undef INPUT_3_3
+#undef INPUT_3_4
+#undef INPUT_4_0
+#undef INPUT_4_1
+#undef INPUT_4_2
+#undef INPUT_4_3
+#undef INPUT_4_4
+#undef INPUT_0_5
+#undef INPUT_1_5
+#undef INPUT_2_5
+#undef INPUT_3_5
+#undef INPUT_4_5
+#undef FLT_0_0
+#undef FLT_0_1
+#undef FLT_0_2
+#undef FLT_0_3
+#undef FLT_0_4
+#undef FLT_1_0
+#undef FLT_1_1
+#undef FLT_1_2
+#undef FLT_1_3
+#undef FLT_1_4
+#undef FLT_2_0
+#undef FLT_2_1
+#undef FLT_2_2
+#undef FLT_2_3
+#undef FLT_2_4
+#undef FLT_3_0
+#undef FLT_3_1
+#undef FLT_3_2
+#undef FLT_3_3
+#undef FLT_3_4
+#undef FLT_HOLE
+#undef FLT_4_0
+#undef FLT_4_1
+#undef FLT_4_2
+#undef FLT_4_3
+#undef FLT_4_4
}
// special case of input depth = 32n, filter shape of 5x5
@@ -1263,7 +2165,7 @@
// special case of output depth = 32n
if (output_depth % 32 == 0) {
if (filter_width == 5 && filter_height == 5) {
- if (stride_width <= 1 && stride_height <= 1) {
+ if (stride_width <= 1 && stride_height <= 1 && params.padding_type == tflite::PaddingType::kSame) {
RUN_KERNEL(DepthwiseConvS85x5D32_Stride1);
}
RUN_KERNEL(DepthwiseConvS85x5D32);