Create and apply interleaved output pipeline macros

- INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE{2,3,4} -- interleaved variants
  of the normal output pipeline macro, for handling multiple outputs.
  Interleaving provides a tangible performance increase on hardware.

Change-Id: Iddea3b22ed71bfdfc1a17fba4fdde746783f6e6d
diff --git a/tflm/opt/conv_s8_1x1.cc b/tflm/opt/conv_s8_1x1.cc
index 66e449c..26a681b 100644
--- a/tflm/opt/conv_s8_1x1.cc
+++ b/tflm/opt/conv_s8_1x1.cc
@@ -144,11 +144,8 @@
       }
 
       vcget(v48);
-      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-          v48, v20, v24, output_activation_min, output_activation_max,
-          output_offset);
-      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-          v52, v20, v24, output_activation_min, output_activation_max,
+      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE2(
+          v48, v52, v20, v24, output_activation_min, output_activation_max,
           output_offset);
       vsraqs_b_vx(v48, v48, 0);
       vsraqs_b_vx(v52, v52, 0);
@@ -262,11 +259,8 @@
       aconv_vxv(v48, v0, cmds, v8);
 
       vcget(v48);
-      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-          v48, v20, v24, output_activation_min, output_activation_max,
-          output_offset);
-      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-          v52, v20, v24, output_activation_min, output_activation_max,
+      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE2(
+          v48, v52, v20, v24, output_activation_min, output_activation_max,
           output_offset);
       vsraqs_b_vx(v48, v48, 0);
       vsraqs_b_vx(v52, v52, 0);
@@ -307,11 +301,8 @@
       aconv_vxv(v48, v0, cmds, v8);
 
       vcget(v48);
-      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-          v48, v20, v24, output_activation_min, output_activation_max,
-          output_offset);
-      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-          v52, v20, v24, output_activation_min, output_activation_max,
+      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE2(
+          v48, v52, v20, v24, output_activation_min, output_activation_max,
           output_offset);
       vsraqs_b_vx(v48, v48, 0);
       vsraqs_b_vx(v52, v52, 0);
diff --git a/tflm/opt/conv_util.h b/tflm/opt/conv_util.h
index 7c925c4..a142c6f 100644
--- a/tflm/opt/conv_util.h
+++ b/tflm/opt/conv_util.h
@@ -150,6 +150,73 @@
     vmin_w_vx_m(result, result, output_activation_max);                       \
   }
 
+// As above, but processes 2 accumulators, interleaving each pipeline stage
+// across them (interleaving is faster on hardware than back-to-back calls).
+#define INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE2(                               \
+    result0, result1, mult, shft, output_min, output_max, output_offset)      \
+  {                                                                           \
+    vdmulh_w_rn_vv_m(result0, result0, mult);                                 \
+    vdmulh_w_rn_vv_m(result1, result1, mult);                                 \
+    vsha_w_r_vv_m(result0, result0, shft);                                    \
+    vsha_w_r_vv_m(result1, result1, shft);                                    \
+    vadd_w_vx_m(result0, result0, output_offset);                             \
+    vadd_w_vx_m(result1, result1, output_offset);                             \
+    vmax_w_vx_m(result0, result0, output_min);                                \
+    vmax_w_vx_m(result1, result1, output_min);                                \
+    vmin_w_vx_m(result0, result0, output_max);                                \
+    vmin_w_vx_m(result1, result1, output_max);                                \
+  }
+
+// As above, but processes 3 accumulators, interleaving each pipeline stage
+// across them (interleaving is faster on hardware than back-to-back calls).
+#define INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE3(                               \
+    result0, result1, result2, mult, shft, output_min, output_max,            \
+    output_offset)                                                            \
+  {                                                                           \
+    vdmulh_w_rn_vv_m(result0, result0, mult);                                 \
+    vdmulh_w_rn_vv_m(result1, result1, mult);                                 \
+    vdmulh_w_rn_vv_m(result2, result2, mult);                                 \
+    vsha_w_r_vv_m(result0, result0, shft);                                    \
+    vsha_w_r_vv_m(result1, result1, shft);                                    \
+    vsha_w_r_vv_m(result2, result2, shft);                                    \
+    vadd_w_vx_m(result0, result0, output_offset);                             \
+    vadd_w_vx_m(result1, result1, output_offset);                             \
+    vadd_w_vx_m(result2, result2, output_offset);                             \
+    vmax_w_vx_m(result0, result0, output_min);                                \
+    vmax_w_vx_m(result1, result1, output_min);                                \
+    vmax_w_vx_m(result2, result2, output_min);                                \
+    vmin_w_vx_m(result0, result0, output_max);                                \
+    vmin_w_vx_m(result1, result1, output_max);                                \
+    vmin_w_vx_m(result2, result2, output_max);                                \
+  }
+
+// As above, but interleaves the pipeline stages across 4 accumulators.
+#define INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE4(                               \
+    result0, result1, result2, result3, mult, shft, output_min, output_max,   \
+    output_offset)                                                            \
+  {                                                                           \
+    vdmulh_w_rn_vv_m(result0, result0, mult);                                 \
+    vdmulh_w_rn_vv_m(result1, result1, mult);                                 \
+    vdmulh_w_rn_vv_m(result2, result2, mult);                                 \
+    vdmulh_w_rn_vv_m(result3, result3, mult);                                 \
+    vsha_w_r_vv_m(result0, result0, shft);                                    \
+    vsha_w_r_vv_m(result1, result1, shft);                                    \
+    vsha_w_r_vv_m(result2, result2, shft);                                    \
+    vsha_w_r_vv_m(result3, result3, shft);                                    \
+    vadd_w_vx_m(result0, result0, output_offset);                             \
+    vadd_w_vx_m(result1, result1, output_offset);                             \
+    vadd_w_vx_m(result2, result2, output_offset);                             \
+    vadd_w_vx_m(result3, result3, output_offset);                             \
+    vmax_w_vx_m(result0, result0, output_min);                                \
+    vmax_w_vx_m(result1, result1, output_min);                                \
+    vmax_w_vx_m(result2, result2, output_min);                                \
+    vmax_w_vx_m(result3, result3, output_min);                                \
+    vmin_w_vx_m(result0, result0, output_max);                                \
+    vmin_w_vx_m(result1, result1, output_max);                                \
+    vmin_w_vx_m(result2, result2, output_max);                                \
+    vmin_w_vx_m(result3, result3, output_max);                                \
+  }
+
 // Run output pipeline on int32 accumulators in [v48-v55] and store results
 // in v48 and v52. Clobbers [v48-v55].
 #define INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_min,        \
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc
index feb21cb..4f9440c 100644
--- a/tflm/opt/depthwise_conv_s8.cc
+++ b/tflm/opt/depthwise_conv_s8.cc
@@ -307,39 +307,25 @@
           adwconv_vxv(v36, INPUT_0_4, cmds, FLT_0_1);
           vdwconv_vxv(v36, INPUT_0_5, cmds, FLT_0_2);
 
-          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-              v48, v56, v60,
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE4(
+              v48, v44, v40, v36, v56, v60,
               output_activation_min,
               output_activation_max,
               output_offset);
           vsraqs_b_vx(v48, v48, 0);
+          vsraqs_b_vx(v44, v44, 0);
+          vsraqs_b_vx(v40, v40, 0);
+          vsraqs_b_vx(v36, v36, 0);
+
           vst_b_x(v48, p_output);
           p_output += output_depth;
 
-          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-              v44, v56, v60,
-              output_activation_min,
-              output_activation_max,
-              output_offset);
-          vsraqs_b_vx(v44, v44, 0);
           vst_b_x(v44, p_output);
           p_output += output_depth;
 
-          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-              v40, v56, v60,
-              output_activation_min,
-              output_activation_max,
-              output_offset);
-          vsraqs_b_vx(v40, v40, 0);
           vst_b_x(v40, p_output);
           p_output += output_depth;
 
-          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
-              v36, v56, v60,
-              output_activation_min,
-              output_activation_max,
-              output_offset);
-          vsraqs_b_vx(v36, v36, 0);
           vst_b_x(v36, p_output);
           p_output += output_depth;
         }
@@ -1336,28 +1322,7 @@
           vrsub_w_vx_m(v44, v44, 0);
 
           // Compute final outputs, for both 5x5 patches, and store.
-          // NB: We don't use the normal output pipeline macro here,
-          // as interleaving improves performance on hardware.
-          vdmulh_w_rn_vv_m(v60, v60, v40);
-          vdmulh_w_rn_vv_m(v56, v56, v40);
-          vdmulh_w_rn_vv_m(v52, v52, v40);
-          vdmulh_w_rn_vv_m(v48, v48, v40);
-          vsha_w_r_vv_m(v60, v60, v44);
-          vsha_w_r_vv_m(v56, v56, v44);
-          vsha_w_r_vv_m(v52, v52, v44);
-          vsha_w_r_vv_m(v48, v48, v44);
-          vadd_w_vx_m(v60, v60, output_offset);
-          vadd_w_vx_m(v56, v56, output_offset);
-          vadd_w_vx_m(v52, v52, output_offset);
-          vadd_w_vx_m(v48, v48, output_offset);
-          vmax_w_vx_m(v60, v60, output_activation_min);
-          vmax_w_vx_m(v56, v56, output_activation_min);
-          vmax_w_vx_m(v52, v52, output_activation_min);
-          vmax_w_vx_m(v48, v48, output_activation_min);
-          vmin_w_vx_m(v60, v60, output_activation_max);
-          vmin_w_vx_m(v56, v56, output_activation_max);
-          vmin_w_vx_m(v52, v52, output_activation_max);
-          vmin_w_vx_m(v48, v48, output_activation_max);
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE4(v60, v56, v52, v48, v40, v44, output_activation_min, output_activation_max, output_offset);
           vsraqs_b_vx(v48, v48, 0);
           vst_b_x(v48, p_output);
           p_output += output_depth;