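Reduce register pressure in ConvS8W8D4.

Compute the swizzled filter pointer once and advance it after each
pair of filter loads, instead of recomputing the offset from filter_x
for every iteration.

Change-Id: I16ad976ad5c263a269bd2a876f30909048398c1f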
diff --git a/tflm/opt/conv_s8_d4.cc b/tflm/opt/conv_s8_d4.cc
index ba1b960..4ee4d33 100644
--- a/tflm/opt/conv_s8_d4.cc
+++ b/tflm/opt/conv_s8_d4.cc
@@ -592,15 +592,13 @@
vld_b_s_xx_m(v4, p_in_x0, stride);
p_in_x0 += 4 * stride;
- {
- size_t local_filter_offset = y_filter_offset +
- (filter_x * 8 * input_depth) +
- (in_channel * 8);
- int8_t* p_local_filter_start =
- p_swizzled_filter_data + local_filter_offset;
- vld_b_p_x_m(v8, p_local_filter_start);
- vld_b_x_m(v12, p_local_filter_start);
- }
+ const size_t local_filter_offset = y_filter_offset +
+ (filter_x * 8 * input_depth) + (in_channel * 8);
+ int8_t* p_local_filter_start =
+ p_swizzled_filter_data + local_filter_offset;
+ vld_b_x_m(v8, p_local_filter_start);
+ vld_b_x_m(v12, p_local_filter_start + 128);
+ p_local_filter_start += 8 * stride;
aconv_vxv(v48, v0, cmds, v8);
filter_x += stride_width;
@@ -618,13 +616,9 @@
vld_b_l_xx(v23, p_in_x0, in_channels_this_iter);
p_in_x0 += stride;
- size_t local_filter_offset0 = y_filter_offset +
- (filter_x * 8 * input_depth) +
- (in_channel * 8);
- int8_t* p_local_filter_start0 =
- p_swizzled_filter_data + local_filter_offset0;
- vld_b_x_m(v24, p_local_filter_start0);
- vld_b_x_m(v28, p_local_filter_start0 + 128);
+ vld_b_x_m(v24, p_local_filter_start);
+ vld_b_x_m(v28, p_local_filter_start + 128);
+ p_local_filter_start += 8 * stride;
aconv_vxv(v48, v16, cmds, v24);
@@ -639,19 +633,15 @@
vld_b_l_xx(v7, p_in_x0, in_channels_this_iter);
p_in_x0 += stride;
- size_t local_filter_offset1 = y_filter_offset +
- ((filter_x + stride_width) * 8 * input_depth) +
- (in_channel * 8);
- int8_t* p_local_filter_start1 =
- p_swizzled_filter_data + local_filter_offset1;
- vld_b_x_m(v8, p_local_filter_start1);
- vld_b_x_m(v12, p_local_filter_start1 + 128);
+ vld_b_x_m(v8, p_local_filter_start);
+ vld_b_x_m(v12, p_local_filter_start + 128);
+ p_local_filter_start += 8 * stride;
aconv_vxv(v48, v0, cmds, v8);
}
- for (; filter_x < filter_width; filter_x += stride_width) {
- // Iteration 1
+ if (filter_x < filter_width) {
+ // Final Iteration
vmv_v(v16, v1);
vmv_v(v17, v2);
vmv_v(v18, v3);
@@ -660,13 +650,7 @@
vmv_v(v21, v6);
vmv_v(v22, v7);
vld_b_l_xx(v23, p_in_x0, in_channels_this_iter);
- p_in_x0 += stride;
- size_t local_filter_offset = y_filter_offset +
- (filter_x * 8 * input_depth) +
- (in_channel * 8);
- int8_t* p_local_filter_start =
- p_swizzled_filter_data + local_filter_offset;
vld_b_x_m(v24, p_local_filter_start);
vld_b_x_m(v28, p_local_filter_start + 128);