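Enable O2 optimizations.

Define a local "fastbuild" feature for riscv32 that compiles with -O2 -g and
use it in the rv32im feature set instead of the @crt one. Also step the filter
pointer incrementally in the conv_s8_d1 inner loop and factor the in-place
output pipeline into INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE.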

Change-Id: I264e9c56c92c80339fb3291fc4aba778d4387b05
diff --git a/platforms/riscv32/features/BUILD b/platforms/riscv32/features/BUILD
index 5f1e820..9ce5a4c 100644
--- a/platforms/riscv32/features/BUILD
+++ b/platforms/riscv32/features/BUILD
@@ -111,19 +111,38 @@
     ],
 )
 
+feature(
+    name = "fastbuild",  # Local feature; the rv32im set below lists ":fastbuild" in place of the @crt one.
+    enabled = False,  # Not enabled by default. NOTE(review): presumably switched on by the compilation mode - confirm.
+    flag_sets = [
+        flag_set(
+            actions = CPP_ALL_COMPILE_ACTIONS + C_ALL_COMPILE_ACTIONS,  # Flags apply to both the C++ and the C compile action lists.
+            flag_groups = [
+                flag_group(
+                    flags = [
+                        "-O2",  # Optimization level named in the commit subject.
+                        "-g",  # Emit debug information alongside the optimized code.
+                    ],
+                ),
+            ],
+        ),
+    ],
+    provides = ["compilation_mode"],  # NOTE(review): presumably makes this mutually exclusive with other compilation-mode features - confirm.
+)
+
 feature_set(
     name = "rv32im",
     feature = [
-        ":architecture",
-        ":sys_spec",
         ":all_warnings",
         ":all_warnings_as_errors",
+        ":architecture",
+        ":fastbuild",  # Local feature from this BUILD file; takes the place of @crt//platforms/riscv32/features:fastbuild.
+        ":sys_spec",
         "@crt//features/common:includes",
         "@crt//features/common:reproducible",
         "@crt//features/common:symbol_garbage_collection",
         "@crt//features/embedded:cc_constructor_destructor",
         "@crt//features/embedded:exceptions",
         "@crt//features/embedded:runtime_type_information",
-        "@crt//platforms/riscv32/features:fastbuild",
     ],
 )
diff --git a/tflm/opt/conv_s8_d1.cc b/tflm/opt/conv_s8_d1.cc
index 80f3a4f..b886f32 100644
--- a/tflm/opt/conv_s8_d1.cc
+++ b/tflm/opt/conv_s8_d1.cc
@@ -172,19 +172,24 @@
 
             const int8_t* local_input_data = input_data +
                 tflite::Offset(input_shape, batch, in_y, 0, 0);
-            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-              const int in_x = in_x_origin + dilation_width_factor * filter_x;
-              if ((in_x < 0) || (in_x >= input_width)) {
-                continue;
-              }
+            int filter_x = 0;
+            int in_x = in_x_origin;
+            const int8_t* local_filter_data = swizzled_filter_data.get() +
+                  (filter_y * filter_width * 32);
+            while (in_x < 0) {
+              filter_x++;
+              in_x += dilation_width_factor;
+              local_filter_data += 32;
+            }
+            for (; (filter_x < filter_width) && (in_x < input_width);
+                 ++filter_x, in_x += dilation_width_factor,
+                 local_filter_data += 32) {
 
               int16_t input_val = local_input_data[in_x];
               int16_t input_val16 = static_cast<int16_t>(
                   input_val + input_offset);
               vdup_h_x(v32, input_val16);
 
-              const int8_t* local_filter_data = swizzled_filter_data.get() +
-                  (filter_y * filter_width * 32) + (filter_x * 32);
               vld_b_l_xx(v0, local_filter_data, n_channels);
               vaddw_h_vx(v0, v0, 0);
 
@@ -198,11 +203,9 @@
           }
 
           // Output pipeline
-          vdmulh_w_rn_vv_m(v48, v48, v56);
-          vsha_w_r_vv_m(v48, v48, v60);
-          vadd_w_vx_m(v48, v48, output_offset);
-          vmin_w_vx_m(v48, v48, output_activation_max);
-          vmax_w_vx_m(v48, v48, output_activation_min);
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v48, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
           vsraqs_b_vx(v48, v48, 0);
           vst_b_l_xx(v48, output_data, n_channels);
           output_data += output_depth;
diff --git a/tflm/opt/conv_util.h b/tflm/opt/conv_util.h
index b9470aa..e552d52 100644
--- a/tflm/opt/conv_util.h
+++ b/tflm/opt/conv_util.h
@@ -108,6 +108,24 @@
   }
 }
 
+// Runs the strip-mined output pipeline (without bias addition) in place on
+// registers. Every step reads and writes `result`, in this order:
+//   1. vdmulh_w_rn_vv_m with `mult`
+//   2. vsha_w_r_vv_m with `shft`
+//   3. vadd_w_vx_m with `output_offset`
+//   4. vmax_w_vx_m with `output_min`, then vmin_w_vx_m with `output_max`
+// The bounds come from the macro arguments, so callers do not need locals
+// named output_activation_min / output_activation_max in scope.
+#define INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(result, mult, shft, output_min, \
+                                              output_max, output_offset)      \
+  {                                                                           \
+    vdmulh_w_rn_vv_m(result, result, mult);                                   \
+    vsha_w_r_vv_m(result, result, shft);                                      \
+    vadd_w_vx_m(result, result, output_offset);                               \
+    vmax_w_vx_m(result, result, output_min);                                  \
+    vmin_w_vx_m(result, result, output_max);                                  \
+  }
+
 // Run output pipeline on int32 accumulators in [v48-v55] and store results
 // in v48 and v52. Clobbers [v48-v55].
 #define INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_min,        \