Enable O2 optimizations.
Change-Id: I264e9c56c92c80339fb3291fc4aba778d4387b05
diff --git a/platforms/riscv32/features/BUILD b/platforms/riscv32/features/BUILD
index 5f1e820..9ce5a4c 100644
--- a/platforms/riscv32/features/BUILD
+++ b/platforms/riscv32/features/BUILD
@@ -111,19 +111,38 @@
],
)
+feature(  # Compile flags for the "fastbuild" feature: -O2 plus debug info.
+ name = "fastbuild",
+ enabled = False,  # Not enabled by default.
+ flag_sets = [
+ flag_set(
+ actions = CPP_ALL_COMPILE_ACTIONS + C_ALL_COMPILE_ACTIONS,  # Both C++ and C compile actions.
+ flag_groups = [
+ flag_group(
+ flags = [
+ "-O2",  # Optimization level 2 (the purpose of this change).
+ "-g",  # Emit debug information.
+ ],
+ ),
+ ],
+ ),
+ ],
+ provides = ["compilation_mode"],  # NOTE(review): presumably conflicts with any other feature providing "compilation_mode" - confirm.
+)
+
feature_set(
name = "rv32im",
feature = [
- ":architecture",
- ":sys_spec",
":all_warnings",
":all_warnings_as_errors",
+ ":architecture",
+ ":fastbuild",  # Package-local fastbuild feature (-O2, -g); used instead of the @crt riscv32 one.
+ ":sys_spec",
"@crt//features/common:includes",
"@crt//features/common:reproducible",
"@crt//features/common:symbol_garbage_collection",
"@crt//features/embedded:cc_constructor_destructor",
"@crt//features/embedded:exceptions",
"@crt//features/embedded:runtime_type_information",
- "@crt//platforms/riscv32/features:fastbuild",
],
)
diff --git a/tflm/opt/conv_s8_d1.cc b/tflm/opt/conv_s8_d1.cc
index 80f3a4f..b886f32 100644
--- a/tflm/opt/conv_s8_d1.cc
+++ b/tflm/opt/conv_s8_d1.cc
@@ -172,19 +172,24 @@
const int8_t* local_input_data = input_data +
tflite::Offset(input_shape, batch, in_y, 0, 0);
- for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
- const int in_x = in_x_origin + dilation_width_factor * filter_x;
- if ((in_x < 0) || (in_x >= input_width)) {
- continue;
- }
+ int filter_x = 0;
+ int in_x = in_x_origin;
+ const int8_t* local_filter_data = swizzled_filter_data.get() +
+ (filter_y * filter_width * 32);
+ while (in_x < 0) {
+ filter_x++;
+ in_x += dilation_width_factor;
+ local_filter_data += 32;
+ }
+ for (; (filter_x < filter_width) && (in_x < input_width);
+ ++filter_x, in_x += dilation_width_factor,
+ local_filter_data += 32) {
int16_t input_val = local_input_data[in_x];
int16_t input_val16 = static_cast<int16_t>(
input_val + input_offset);
vdup_h_x(v32, input_val16);
- const int8_t* local_filter_data = swizzled_filter_data.get() +
- (filter_y * filter_width * 32) + (filter_x * 32);
vld_b_l_xx(v0, local_filter_data, n_channels);
vaddw_h_vx(v0, v0, 0);
@@ -198,11 +203,9 @@
}
// Output pipeline
- vdmulh_w_rn_vv_m(v48, v48, v56);
- vsha_w_r_vv_m(v48, v48, v60);
- vadd_w_vx_m(v48, v48, output_offset);
- vmin_w_vx_m(v48, v48, output_activation_max);
- vmax_w_vx_m(v48, v48, output_activation_min);
+ INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+ v48, v56, v60, output_activation_min, output_activation_max,
+ output_offset);
vsraqs_b_vx(v48, v48, 0);
vst_b_l_xx(v48, output_data, n_channels);
output_data += output_depth;
diff --git a/tflm/opt/conv_util.h b/tflm/opt/conv_util.h
index b9470aa..e552d52 100644
--- a/tflm/opt/conv_util.h
+++ b/tflm/opt/conv_util.h
@@ -108,6 +108,18 @@
}
}
+// Runs strip-mined output pipeline (without bias addition) in place on
+// registers; `result` ends up clamped to [output_min, output_max].
+#define INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(result, mult, shft, output_min, \
+                                              output_max, output_offset)      \
+  {                                                                           \
+    vdmulh_w_rn_vv_m(result, result, mult);                                   \
+    vsha_w_r_vv_m(result, result, shft);                                      \
+    vadd_w_vx_m(result, result, output_offset);                               \
+    vmax_w_vx_m(result, result, output_min);                                  \
+    vmin_w_vx_m(result, result, output_max);                                  \
+  }
+
// Run output pipeline on int32 accumulators in [v48-v55] and store results
// in v48 and v52. Clobbers [v48-v55].
#define INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_min, \