Optimize dot product in audio MFCC pre-processing via RVV. Use RVV for the dot product operation in audio preprocessing. This yields a ~30% saving in the number of instructions. Results are validated (within 1 code of difference due to quantization) and the unit tests pass. Other optimization opportunities for audio preprocessing are neither trivial nor straightforward, and will be put on hold due to a priority shift. Change-Id: Iea6f78bf488bacb8c617fbed0671dd607137e50f

commit: c4839b36b96d627acba585ff889c3901aa30b7bc [log] [tgz]
author: Lun Dong <lundong@google.com> Thu Jul 28 17:02:09 2022 -0700
committer: Lun Dong <lundong@google.com> Fri Jul 29 15:20:44 2022 -0700
tree: 2d069a85c6b8cdd4e7aa51a128653e97365ebb2d
parent: 530ea6bbb3d5bec61379b14ad8b245add5a828e0 [diff]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b54cb7e..b3d7f43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt

@@ -12,7 +12,6 @@
   ${CMAKE_CURRENT_LIST_DIR}/cmake/
 )
 
-set(BUILD_INTERNAL_MODELS OFF CACHE BOOL "Build applications from internal models (default: OFF)")
 set(IREE_SOURCE_DIR "$ENV{ROOTDIR}/toolchain/iree" CACHE PATH
     "IREE source code path. (default: $ENV{ROOTDIR}/toolchain/iree)")
 
@@ -29,6 +28,7 @@
 set(SPRINGBOK_LINKER_SCRIPT "$ENV{ROOTDIR}/sw/vec/springbok/springbok.ld" CACHE PATH "Springbok linker script path (default: springbok.ld)")
 set(BUILD_WITH_SPRINGBOK ON CACHE BOOL "Build the target with springbok BSP (default: ON)")
 set(BUILD_ISP_WITH_RVV ON CACHE BOOL "Build the ISP pipeline with RVV (default: ON)")
+set(BUILD_MFCC_WITH_RVV OFF CACHE BOOL "Build audio preprocessing with RVV (default: OFF)")
 
 #-------------------------------------------------------------------------------
 # IREE-specific settings

diff --git a/samples/audio_prep/CMakeLists.txt b/samples/audio_prep/CMakeLists.txt
index 6f3467b..c8085de 100644
--- a/samples/audio_prep/CMakeLists.txt
+++ b/samples/audio_prep/CMakeLists.txt

@@ -20,6 +20,10 @@
     ::util
 )
 
+if (${BUILD_MFCC_WITH_RVV})
+  target_compile_definitions(samples_audio_prep_mfcc PUBLIC MFCC_WITH_RVV)
+endif()
+
 springbok_test(
   NAME
     mfcc_test

diff --git a/samples/audio_prep/mfcc.c b/samples/audio_prep/mfcc.c
index bc01d8b..b0f6334 100644
--- a/samples/audio_prep/mfcc.c
+++ b/samples/audio_prep/mfcc.c

@@ -24,6 +24,33 @@
 #include "samples/audio_prep/mfcc.h"
 #include "samples/audio_prep/util.h"
 
+#ifdef MFCC_WITH_RVV
+#include <riscv_vector.h>
+
+static const uint8_t kWeightsFracBits = 8;
+static const uint8_t kSpectraFracBits = 7;
+
+// Calculate the dot product of two int vectors using RVV
+static uint32_t dot_product_rvv(uint32_t* u, uint32_t* w, int n) {
+  size_t vl;
+  // auxiliary variables
+  vuint32m8_t vx;
+  vuint32m8_t vu, vw;
+  vuint32m1_t v_sum;
+  uint32_t sum = 0;
+  for (size_t i = 0; i < n; i += vl) {
+    vl = vsetvl_e32m8(n - i);
+    vu = vle32_v_u32m8(u + i, vl);          // load
+    vw = vle32_v_u32m8(w + i, vl);          // load
+    vx = vmul(vu, vw, vl);                  // multiply
+    v_sum = vmv_s(v_sum, 0, vl);            // init
+    v_sum = vredsum(v_sum, vx, v_sum, vl);  // sum
+    sum += vmv_x(v_sum);
+  }
+  return sum;
+}
+#endif
+
 // config struct
 typedef struct {
   MfccParams params;
@@ -77,7 +104,11 @@
 
 // Calculate short-time Fourier transform magnitude for one frame
 // output shape: num_spectra_bins
+#ifdef MFCC_WITH_RVV
+static void stft_magnitude(float* in, float* window, uint32_t* out) {
+#else
 static void stft_magnitude(float* in, float* window, float* out) {
+#endif
   float* frame = (float*)malloc(config.fft_len * sizeof(float));
   memset(frame, 0, config.fft_len * sizeof(float));
   memcpy(frame, in, config.win_len * sizeof(float));
@@ -91,13 +122,19 @@
   rfft(frame, config.fft_order);
 
   // compute STFT magnitude
-  out[0] = frame[0] > 0 ? frame[0] : -frame[0];
-  out[config.fft_len / 2] = frame[config.fft_len / 2] > 0
-                                ? frame[config.fft_len / 2]
-                                : -frame[config.fft_len / 2];
-  for (int j = 1; j < config.fft_len / 2; j++) {
-    out[j] = sqrtf(frame[j] * frame[j] +
+  float temp = 0.0;
+  for (int j = 0; j <= config.fft_len / 2; j++) {
+    if (j == 0 || j == config.fft_len / 2) {
+      temp = frame[j] > 0 ? frame[j] : -frame[j];
+    } else {
+      temp = sqrtf(frame[j] * frame[j] +
                    frame[config.fft_len - j] * frame[config.fft_len - j]);
+    }
+#ifdef MFCC_WITH_RVV
+    out[j] = (uint32_t)(temp * (1 << kSpectraFracBits));
+#else
+    out[j] = temp;
+#endif
   }
 
   free(frame);
@@ -105,7 +142,11 @@
 
 // Return a matrix that can post-multiply spectrogram rows to make mel
 // output shape: params.num_mel_bins * num_spectra_bins
+#ifdef MFCC_WITH_RVV
+static void spectra_to_mel_matrix(uint32_t* weights) {
+#else
 static void spectra_to_mel_matrix(float* weights) {
+#endif
   MfccParams* params = &config.params;
   float nyquist_hz = params->audio_samp_rate / 2;
   float* spectra_bins = (float*)malloc(config.num_spectra_bins * sizeof(float));
@@ -123,7 +164,7 @@
   float lower_slope = 0.0, upper_slope = 0.0;
   for (int i = 0; i < params->num_mel_bins; i++) {
     // spectrogram DC bin
-    weights[i * config.num_spectra_bins] = 0.0;
+    weights[i * config.num_spectra_bins] = 0;
 
     lower = band_edges[i];
     center = band_edges[i + 1];
@@ -133,7 +174,12 @@
       upper_slope = (upper - spectra_bins[j]) / (upper - center);
       float clamp = (lower_slope < upper_slope) ? lower_slope : upper_slope;
       clamp = (clamp < 0) ? 0 : clamp;
+#ifdef MFCC_WITH_RVV
+      weights[i * config.num_spectra_bins + j] =
+          (uint32_t)(clamp * (1 << kWeightsFracBits));
+#else
       weights[i * config.num_spectra_bins + j] = clamp;
+#endif
     }
   }
 
@@ -151,20 +197,28 @@
   float* window = (float*)malloc(config.win_len * sizeof(float));
   hanning(window);
 
-  // Compute weights
+#ifdef MFCC_WITH_RVV
+  uint32_t* weights = (uint32_t*)malloc(
+      params->num_mel_bins * config.num_spectra_bins * sizeof(uint32_t));
+  uint32_t* spectra =
+      (uint32_t*)malloc(config.num_spectra_bins * sizeof(uint32_t));
+#else
   float* weights = (float*)malloc(params->num_mel_bins *
                                   config.num_spectra_bins * sizeof(float));
+  float* spectra = (float*)malloc(config.num_spectra_bins * sizeof(float));
+#endif
+
+  // Compute weights
   spectra_to_mel_matrix(weights);
 
   float* frame = (float*)malloc(config.win_len * sizeof(float));
   memset(frame, 0, config.win_len * sizeof(float));
-  float* spectra = (float*)malloc(config.num_spectra_bins * sizeof(float));
 
   for (int i = 0; i < params->num_frames; i++) {
     // update buffer
-    for (int j = 0; j < config.win_len - config.hop_len; j++) {
-      frame[j] = frame[j + config.hop_len];
-    }
+    memmove(frame, frame + config.hop_len,
+            (config.win_len - config.hop_len) * sizeof(float));
+
     // feed in new samples
     for (int j = 0; j < config.hop_len; j++) {
       int idx = i * config.hop_len + j;
@@ -177,12 +231,19 @@
 
     // compute MFCC
     for (int j = 0; j < params->num_mel_bins; j++) {
-      float temp = dot_product(spectra, weights + j * config.num_spectra_bins,
-                               config.num_spectra_bins);
-      if (temp < params->log_floor) temp = params->log_floor;
-      temp = params->log_scaler * logf(temp);
-      temp = temp < 0.0 ? 0.0 : (temp > 255.0 ? 255.0 : temp);
-      out[i * params->num_mel_bins + j] = (uint8_t)temp;
+#ifdef MFCC_WITH_RVV
+      uint32_t temp =
+          dot_product_rvv(spectra, weights + j * config.num_spectra_bins,
+                          config.num_spectra_bins);
+      float tempf = (float)temp / (1 << (kSpectraFracBits + kWeightsFracBits));
+#else
+      float tempf = dot_product(spectra, weights + j * config.num_spectra_bins,
+                                config.num_spectra_bins);
+#endif
+      if (tempf < params->log_floor) tempf = params->log_floor;
+      tempf = params->log_scaler * logf(tempf);
+      tempf = tempf < 0.0 ? 0.0 : (tempf > 255.0 ? 255.0 : tempf);
+      out[i * params->num_mel_bins + j] = (uint8_t)tempf;
     }
   }
 

diff --git a/samples/audio_prep/mfcc_test.cc b/samples/audio_prep/mfcc_test.cc
index b801649..f39f68e 100644
--- a/samples/audio_prep/mfcc_test.cc
+++ b/samples/audio_prep/mfcc_test.cc

@@ -39,8 +39,14 @@
   // extract MFCC
   extract_mfcc(golden_input, out, sizeof(golden_input) / sizeof(int16_t));
 
+  int kTolerance = 0;
+#ifdef MFCC_WITH_RVV
+  kTolerance = 1;
+#endif
   for (int i = 0; i < out_len; i++) {
-    ASSERT_EQ(out[i], golden_output[i]);
+    int diff = out[i] - golden_output[i];
+    if (diff < 0) diff = -diff;
+    ASSERT_LE(diff, kTolerance);
   }
   free(out);
 }
commit	c4839b36b96d627acba585ff889c3901aa30b7bc	[log] [tgz]
author	Lun Dong <lundong@google.com>	Thu Jul 28 17:02:09 2022 -0700
committer	Lun Dong <lundong@google.com>	Fri Jul 29 15:20:44 2022 -0700
tree	2d069a85c6b8cdd4e7aa51a128653e97365ebb2d
parent	530ea6bbb3d5bec61379b14ad8b245add5a828e0 [diff]