Optimize dot product in audio MFCC pre-processing via RVV
Use RVV for the dot product operation in audio preprocessing. Yield ~30% of saving in term of # of instructions.
Results validated (within 1 code of difference due to quantization) and unit tests are passed.
Other optimization opportunies for audio preprocessing are not trivial nor straightforword, and will be put on hold due to priority shift.
Change-Id: Iea6f78bf488bacb8c617fbed0671dd607137e50f
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b54cb7e..b3d7f43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,6 @@
${CMAKE_CURRENT_LIST_DIR}/cmake/
)
-set(BUILD_INTERNAL_MODELS OFF CACHE BOOL "Build applications from internal models (default: OFF)")
set(IREE_SOURCE_DIR "$ENV{ROOTDIR}/toolchain/iree" CACHE PATH
"IREE source code path. (default: $ENV{ROOTDIR}/toolchain/iree)")
@@ -29,6 +28,7 @@
set(SPRINGBOK_LINKER_SCRIPT "$ENV{ROOTDIR}/sw/vec/springbok/springbok.ld" CACHE PATH "Springbok linker script path (default: springbok.ld)")
set(BUILD_WITH_SPRINGBOK ON CACHE BOOL "Build the target with springbok BSP (default: ON)")
set(BUILD_ISP_WITH_RVV ON CACHE BOOL "Build the ISP pipeline with RVV (default: ON)")
+set(BUILD_MFCC_WITH_RVV OFF CACHE BOOL "Build audio preprocessing with RVV (default: OFF)")
#-------------------------------------------------------------------------------
# IREE-specific settings
diff --git a/samples/audio_prep/CMakeLists.txt b/samples/audio_prep/CMakeLists.txt
index 6f3467b..c8085de 100644
--- a/samples/audio_prep/CMakeLists.txt
+++ b/samples/audio_prep/CMakeLists.txt
@@ -20,6 +20,10 @@
::util
)
+if (${BUILD_MFCC_WITH_RVV})
+ target_compile_definitions(samples_audio_prep_mfcc PUBLIC MFCC_WITH_RVV)
+endif()
+
springbok_test(
NAME
mfcc_test
diff --git a/samples/audio_prep/mfcc.c b/samples/audio_prep/mfcc.c
index bc01d8b..b0f6334 100644
--- a/samples/audio_prep/mfcc.c
+++ b/samples/audio_prep/mfcc.c
@@ -24,6 +24,33 @@
#include "samples/audio_prep/mfcc.h"
#include "samples/audio_prep/util.h"
+#ifdef MFCC_WITH_RVV
+#include <riscv_vector.h>
+
+static const uint8_t kWeightsFracBits = 8;
+static const uint8_t kSpectraFracBits = 7;
+
+// Calculate the dot product of two int vectors using RVV
+static uint32_t dot_product_rvv(uint32_t* u, uint32_t* w, int n) {
+ size_t vl;
+ // auxiliary variables
+ vuint32m8_t vx;
+ vuint32m8_t vu, vw;
+ vuint32m1_t v_sum;
+ uint32_t sum = 0;
+ for (size_t i = 0; i < n; i += vl) {
+ vl = vsetvl_e32m8(n - i);
+ vu = vle32_v_u32m8(u + i, vl); // load
+ vw = vle32_v_u32m8(w + i, vl); // load
+ vx = vmul(vu, vw, vl); // multiply
+ v_sum = vmv_s(v_sum, 0, vl); // init
+ v_sum = vredsum(v_sum, vx, v_sum, vl); // sum
+ sum += vmv_x(v_sum);
+ }
+ return sum;
+}
+#endif
+
// config struct
typedef struct {
MfccParams params;
@@ -77,7 +104,11 @@
// Calculate short-time Fourier transform magnitude for one frame
// output shape: num_spectra_bins
+#ifdef MFCC_WITH_RVV
+static void stft_magnitude(float* in, float* window, uint32_t* out) {
+#else
static void stft_magnitude(float* in, float* window, float* out) {
+#endif
float* frame = (float*)malloc(config.fft_len * sizeof(float));
memset(frame, 0, config.fft_len * sizeof(float));
memcpy(frame, in, config.win_len * sizeof(float));
@@ -91,13 +122,19 @@
rfft(frame, config.fft_order);
// compute STFT magnitude
- out[0] = frame[0] > 0 ? frame[0] : -frame[0];
- out[config.fft_len / 2] = frame[config.fft_len / 2] > 0
- ? frame[config.fft_len / 2]
- : -frame[config.fft_len / 2];
- for (int j = 1; j < config.fft_len / 2; j++) {
- out[j] = sqrtf(frame[j] * frame[j] +
+ float temp = 0.0;
+ for (int j = 0; j <= config.fft_len / 2; j++) {
+ if (j == 0 || j == config.fft_len / 2) {
+ temp = frame[j] > 0 ? frame[j] : -frame[j];
+ } else {
+ temp = sqrtf(frame[j] * frame[j] +
frame[config.fft_len - j] * frame[config.fft_len - j]);
+ }
+#ifdef MFCC_WITH_RVV
+ out[j] = (uint32_t)(temp * (1 << kSpectraFracBits));
+#else
+ out[j] = temp;
+#endif
}
free(frame);
@@ -105,7 +142,11 @@
// Return a matrix that can post-multiply spectrogram rows to make mel
// output shape: params.num_mel_bins * num_spectra_bins
+#ifdef MFCC_WITH_RVV
+static void spectra_to_mel_matrix(uint32_t* weights) {
+#else
static void spectra_to_mel_matrix(float* weights) {
+#endif
MfccParams* params = &config.params;
float nyquist_hz = params->audio_samp_rate / 2;
float* spectra_bins = (float*)malloc(config.num_spectra_bins * sizeof(float));
@@ -123,7 +164,7 @@
float lower_slope = 0.0, upper_slope = 0.0;
for (int i = 0; i < params->num_mel_bins; i++) {
// spectrogram DC bin
- weights[i * config.num_spectra_bins] = 0.0;
+ weights[i * config.num_spectra_bins] = 0;
lower = band_edges[i];
center = band_edges[i + 1];
@@ -133,7 +174,12 @@
upper_slope = (upper - spectra_bins[j]) / (upper - center);
float clamp = (lower_slope < upper_slope) ? lower_slope : upper_slope;
clamp = (clamp < 0) ? 0 : clamp;
+#ifdef MFCC_WITH_RVV
+ weights[i * config.num_spectra_bins + j] =
+ (uint32_t)(clamp * (1 << kWeightsFracBits));
+#else
weights[i * config.num_spectra_bins + j] = clamp;
+#endif
}
}
@@ -151,20 +197,28 @@
float* window = (float*)malloc(config.win_len * sizeof(float));
hanning(window);
- // Compute weights
+#ifdef MFCC_WITH_RVV
+ uint32_t* weights = (uint32_t*)malloc(
+ params->num_mel_bins * config.num_spectra_bins * sizeof(uint32_t));
+ uint32_t* spectra =
+ (uint32_t*)malloc(config.num_spectra_bins * sizeof(uint32_t));
+#else
float* weights = (float*)malloc(params->num_mel_bins *
config.num_spectra_bins * sizeof(float));
+ float* spectra = (float*)malloc(config.num_spectra_bins * sizeof(float));
+#endif
+
+ // Compute weights
spectra_to_mel_matrix(weights);
float* frame = (float*)malloc(config.win_len * sizeof(float));
memset(frame, 0, config.win_len * sizeof(float));
- float* spectra = (float*)malloc(config.num_spectra_bins * sizeof(float));
for (int i = 0; i < params->num_frames; i++) {
// update buffer
- for (int j = 0; j < config.win_len - config.hop_len; j++) {
- frame[j] = frame[j + config.hop_len];
- }
+ memmove(frame, frame + config.hop_len,
+ (config.win_len - config.hop_len) * sizeof(float));
+
// feed in new samples
for (int j = 0; j < config.hop_len; j++) {
int idx = i * config.hop_len + j;
@@ -177,12 +231,19 @@
// compute MFCC
for (int j = 0; j < params->num_mel_bins; j++) {
- float temp = dot_product(spectra, weights + j * config.num_spectra_bins,
- config.num_spectra_bins);
- if (temp < params->log_floor) temp = params->log_floor;
- temp = params->log_scaler * logf(temp);
- temp = temp < 0.0 ? 0.0 : (temp > 255.0 ? 255.0 : temp);
- out[i * params->num_mel_bins + j] = (uint8_t)temp;
+#ifdef MFCC_WITH_RVV
+ uint32_t temp =
+ dot_product_rvv(spectra, weights + j * config.num_spectra_bins,
+ config.num_spectra_bins);
+ float tempf = (float)temp / (1 << (kSpectraFracBits + kWeightsFracBits));
+#else
+ float tempf = dot_product(spectra, weights + j * config.num_spectra_bins,
+ config.num_spectra_bins);
+#endif
+ if (tempf < params->log_floor) tempf = params->log_floor;
+ tempf = params->log_scaler * logf(tempf);
+ tempf = tempf < 0.0 ? 0.0 : (tempf > 255.0 ? 255.0 : tempf);
+ out[i * params->num_mel_bins + j] = (uint8_t)tempf;
}
}
diff --git a/samples/audio_prep/mfcc_test.cc b/samples/audio_prep/mfcc_test.cc
index b801649..f39f68e 100644
--- a/samples/audio_prep/mfcc_test.cc
+++ b/samples/audio_prep/mfcc_test.cc
@@ -39,8 +39,14 @@
// extract MFCC
extract_mfcc(golden_input, out, sizeof(golden_input) / sizeof(int16_t));
+ int kTolerance = 0;
+#ifdef MFCC_WITH_RVV
+ kTolerance = 1;
+#endif
for (int i = 0; i < out_len; i++) {
- ASSERT_EQ(out[i], golden_output[i]);
+ int diff = out[i] - golden_output[i];
+ if (diff < 0) diff = -diff;
+ ASSERT_LE(diff, kTolerance);
}
free(out);
}