Create and apply interleaved output pipeline macros
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE{2,3,4} -- interleaved variants
of the normal output pipeline macro, for handling multiple outputs.
Interleaving provides a tangible performance increase on hardware.
Change-Id: Iddea3b22ed71bfdfc1a17fba4fdde746783f6e6d
diff --git a/tflm/opt/conv_s8_1x1.cc b/tflm/opt/conv_s8_1x1.cc
index 66e449c..26a681b 100644
--- a/tflm/opt/conv_s8_1x1.cc
+++ b/tflm/opt/conv_s8_1x1.cc
@@ -144,11 +144,8 @@
}
vcget(v48);
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v48, v20, v24, output_activation_min, output_activation_max,
- output_offset);
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v52, v20, v24, output_activation_min, output_activation_max,
+ INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE2(
+ v48, v52, v20, v24, output_activation_min, output_activation_max,
output_offset);
vsraqs_b_vx(v48, v48, 0);
vsraqs_b_vx(v52, v52, 0);
@@ -262,11 +259,8 @@
aconv_vxv(v48, v0, cmds, v8);
vcget(v48);
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v48, v20, v24, output_activation_min, output_activation_max,
- output_offset);
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v52, v20, v24, output_activation_min, output_activation_max,
+ INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE2(
+ v48, v52, v20, v24, output_activation_min, output_activation_max,
output_offset);
vsraqs_b_vx(v48, v48, 0);
vsraqs_b_vx(v52, v52, 0);
@@ -307,11 +301,8 @@
aconv_vxv(v48, v0, cmds, v8);
vcget(v48);
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v48, v20, v24, output_activation_min, output_activation_max,
- output_offset);
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v52, v20, v24, output_activation_min, output_activation_max,
+ INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE2(
+ v48, v52, v20, v24, output_activation_min, output_activation_max,
output_offset);
vsraqs_b_vx(v48, v48, 0);
vsraqs_b_vx(v52, v52, 0);
diff --git a/tflm/opt/conv_util.h b/tflm/opt/conv_util.h
index 7c925c4..a142c6f 100644
--- a/tflm/opt/conv_util.h
+++ b/tflm/opt/conv_util.h
@@ -150,6 +150,73 @@
vmin_w_vx_m(result, result, output_activation_max); \
}
+// As above, but runs two results through the pipeline with every stage
+// interleaved (faster on hardware). Clamps to [output_min, output_max].
+#define INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE2( \
+    result0, result1, mult, shft, output_min, output_max, output_offset) \
+  { \
+    vdmulh_w_rn_vv_m(result0, result0, mult); \
+    vdmulh_w_rn_vv_m(result1, result1, mult); \
+    vsha_w_r_vv_m(result0, result0, shft); \
+    vsha_w_r_vv_m(result1, result1, shft); \
+    vadd_w_vx_m(result0, result0, output_offset); \
+    vadd_w_vx_m(result1, result1, output_offset); \
+    vmax_w_vx_m(result0, result0, output_min); \
+    vmax_w_vx_m(result1, result1, output_min); \
+    vmin_w_vx_m(result0, result0, output_max); \
+    vmin_w_vx_m(result1, result1, output_max); \
+  }
+
+// As above, but runs three results through the pipeline with every stage
+// interleaved (faster on hardware). The results are clamped to
+// [output_min, output_max] after output_offset is added.
+#define INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE3(result0, result1, result2, \
+    mult, shft, output_min, output_max, output_offset) \
+  { \
+    vdmulh_w_rn_vv_m(result0, result0, mult); \
+    vdmulh_w_rn_vv_m(result1, result1, mult); \
+    vdmulh_w_rn_vv_m(result2, result2, mult); \
+    vsha_w_r_vv_m(result0, result0, shft); \
+    vsha_w_r_vv_m(result1, result1, shft); \
+    vsha_w_r_vv_m(result2, result2, shft); \
+    vadd_w_vx_m(result0, result0, output_offset); \
+    vadd_w_vx_m(result1, result1, output_offset); \
+    vadd_w_vx_m(result2, result2, output_offset); \
+    vmax_w_vx_m(result0, result0, output_min); \
+    vmax_w_vx_m(result1, result1, output_min); \
+    vmax_w_vx_m(result2, result2, output_min); \
+    vmin_w_vx_m(result0, result0, output_max); \
+    vmin_w_vx_m(result1, result1, output_max); \
+    vmin_w_vx_m(result2, result2, output_max); \
+  }
+
+// As above, but runs four results through the pipeline with every stage
+// interleaved (faster on hardware). Clamps to [output_min, output_max].
+#define INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE4(result0, result1, result2, \
+    result3, mult, shft, output_min, output_max, output_offset) \
+  { \
+    vdmulh_w_rn_vv_m(result0, result0, mult); \
+    vdmulh_w_rn_vv_m(result1, result1, mult); \
+    vdmulh_w_rn_vv_m(result2, result2, mult); \
+    vdmulh_w_rn_vv_m(result3, result3, mult); \
+    vsha_w_r_vv_m(result0, result0, shft); \
+    vsha_w_r_vv_m(result1, result1, shft); \
+    vsha_w_r_vv_m(result2, result2, shft); \
+    vsha_w_r_vv_m(result3, result3, shft); \
+    vadd_w_vx_m(result0, result0, output_offset); \
+    vadd_w_vx_m(result1, result1, output_offset); \
+    vadd_w_vx_m(result2, result2, output_offset); \
+    vadd_w_vx_m(result3, result3, output_offset); \
+    vmax_w_vx_m(result0, result0, output_min); \
+    vmax_w_vx_m(result1, result1, output_min); \
+    vmax_w_vx_m(result2, result2, output_min); \
+    vmax_w_vx_m(result3, result3, output_min); \
+    vmin_w_vx_m(result0, result0, output_max); \
+    vmin_w_vx_m(result1, result1, output_max); \
+    vmin_w_vx_m(result2, result2, output_max); \
+    vmin_w_vx_m(result3, result3, output_max); \
+  }
+
// Run output pipeline on int32 accumulators in [v48-v55] and store results
// in v48 and v52. Clobbers [v48-v55].
#define INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_min, \
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc
index feb21cb..4f9440c 100644
--- a/tflm/opt/depthwise_conv_s8.cc
+++ b/tflm/opt/depthwise_conv_s8.cc
@@ -307,39 +307,25 @@
adwconv_vxv(v36, INPUT_0_4, cmds, FLT_0_1);
vdwconv_vxv(v36, INPUT_0_5, cmds, FLT_0_2);
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v48, v56, v60,
+ INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE4(
+ v48, v44, v40, v36, v56, v60,
output_activation_min,
output_activation_max,
output_offset);
vsraqs_b_vx(v48, v48, 0);
+ vsraqs_b_vx(v44, v44, 0);
+ vsraqs_b_vx(v40, v40, 0);
+ vsraqs_b_vx(v36, v36, 0);
+
vst_b_x(v48, p_output);
p_output += output_depth;
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v44, v56, v60,
- output_activation_min,
- output_activation_max,
- output_offset);
- vsraqs_b_vx(v44, v44, 0);
vst_b_x(v44, p_output);
p_output += output_depth;
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v40, v56, v60,
- output_activation_min,
- output_activation_max,
- output_offset);
- vsraqs_b_vx(v40, v40, 0);
vst_b_x(v40, p_output);
p_output += output_depth;
- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
- v36, v56, v60,
- output_activation_min,
- output_activation_max,
- output_offset);
- vsraqs_b_vx(v36, v36, 0);
vst_b_x(v36, p_output);
p_output += output_depth;
}
@@ -1336,28 +1322,7 @@
vrsub_w_vx_m(v44, v44, 0);
// Compute final outputs, for both 5x5 patches, and store.
- // NB: We don't use the normal output pipeline macro here,
- // as interleaving improves performance on hardware.
- vdmulh_w_rn_vv_m(v60, v60, v40);
- vdmulh_w_rn_vv_m(v56, v56, v40);
- vdmulh_w_rn_vv_m(v52, v52, v40);
- vdmulh_w_rn_vv_m(v48, v48, v40);
- vsha_w_r_vv_m(v60, v60, v44);
- vsha_w_r_vv_m(v56, v56, v44);
- vsha_w_r_vv_m(v52, v52, v44);
- vsha_w_r_vv_m(v48, v48, v44);
- vadd_w_vx_m(v60, v60, output_offset);
- vadd_w_vx_m(v56, v56, output_offset);
- vadd_w_vx_m(v52, v52, output_offset);
- vadd_w_vx_m(v48, v48, output_offset);
- vmax_w_vx_m(v60, v60, output_activation_min);
- vmax_w_vx_m(v56, v56, output_activation_min);
- vmax_w_vx_m(v52, v52, output_activation_min);
- vmax_w_vx_m(v48, v48, output_activation_min);
- vmin_w_vx_m(v60, v60, output_activation_max);
- vmin_w_vx_m(v56, v56, output_activation_max);
- vmin_w_vx_m(v52, v52, output_activation_max);
- vmin_w_vx_m(v48, v48, output_activation_max);
+ INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE4(v60, v56, v52, v48, v40, v44, output_activation_min, output_activation_max, output_offset);
vsraqs_b_vx(v48, v48, 0);
vst_b_x(v48, p_output);
p_output += output_depth;