Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2022 Google LLC |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | // Audio preprocessing: MLCC feature extraction |
| 18 | |
Cindy Liu | 3c4d627 | 2022-08-04 18:54:12 -0700 | [diff] [blame] | 19 | #include "audio_prep/mfcc.h" |
| 20 | |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 21 | #include <math.h> |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 22 | #include <stdlib.h> |
| 23 | #include <string.h> |
| 24 | |
Cindy Liu | 3c4d627 | 2022-08-04 18:54:12 -0700 | [diff] [blame] | 25 | #include "audio_prep/util.h" |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 26 | |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 27 | #ifdef MFCC_WITH_RVV |
| 28 | #include <riscv_vector.h> |
| 29 | |
| 30 | static const uint8_t kWeightsFracBits = 8; |
| 31 | static const uint8_t kSpectraFracBits = 7; |
| 32 | |
| 33 | // Calculate the dot product of two int vectors using RVV |
| 34 | static uint32_t dot_product_rvv(uint32_t* u, uint32_t* w, int n) { |
| 35 | size_t vl; |
| 36 | // auxiliary variables |
| 37 | vuint32m8_t vx; |
| 38 | vuint32m8_t vu, vw; |
| 39 | vuint32m1_t v_sum; |
| 40 | uint32_t sum = 0; |
| 41 | for (size_t i = 0; i < n; i += vl) { |
| 42 | vl = vsetvl_e32m8(n - i); |
| 43 | vu = vle32_v_u32m8(u + i, vl); // load |
| 44 | vw = vle32_v_u32m8(w + i, vl); // load |
| 45 | vx = vmul(vu, vw, vl); // multiply |
| 46 | v_sum = vmv_s(v_sum, 0, vl); // init |
| 47 | v_sum = vredsum(v_sum, vx, v_sum, vl); // sum |
| 48 | sum += vmv_x(v_sum); |
| 49 | } |
| 50 | return sum; |
| 51 | } |
| 52 | #endif |
| 53 | |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 54 | // config struct |
| 55 | typedef struct { |
| 56 | MfccParams params; |
| 57 | int win_len; |
| 58 | int hop_len; |
| 59 | int fft_order; |
| 60 | int fft_len; |
| 61 | int num_spectra_bins; |
| 62 | } MfccConfig; |
| 63 | |
| 64 | static MfccConfig config = {.params.num_frames = 96, |
| 65 | .params.num_mel_bins = 64, |
| 66 | .params.audio_samp_rate = 16000, |
| 67 | .params.low_edge_hz = 125, |
| 68 | .params.upper_edge_hz = 7500, |
| 69 | .params.win_len_sec = 0.025, |
| 70 | .params.hop_len_sec = 0.010, |
| 71 | .params.log_floor = 0.01, |
| 72 | .params.log_scaler = 20, |
| 73 | .win_len = 400, |
| 74 | .hop_len = 160, |
| 75 | .fft_order = 9, |
| 76 | .fft_len = 512, |
| 77 | .num_spectra_bins = 257}; |
| 78 | |
| 79 | // set mfcc parameters |
| 80 | void set_mfcc_params(MfccParams* in_params) { |
| 81 | config.params = *in_params; |
| 82 | config.win_len = |
| 83 | (int)(config.params.audio_samp_rate * config.params.win_len_sec + 0.5); |
| 84 | config.hop_len = |
| 85 | (int)(config.params.audio_samp_rate * config.params.hop_len_sec + 0.5); |
| 86 | config.fft_order = ceilf(log2f(config.win_len)); |
| 87 | config.fft_len = 1 << config.fft_order; // 512 |
| 88 | config.num_spectra_bins = config.fft_len / 2 + 1; |
| 89 | } |
| 90 | |
| 91 | // Convert frequencies to mel scale using HTK formula |
| 92 | static float hz_to_mel(float freq_hz) { |
| 93 | const float kMelBreakFreqHz = 700.0; |
| 94 | const float kMelHighFreqQ = 1127.0; |
| 95 | return kMelHighFreqQ * logf(1.0 + (freq_hz / kMelBreakFreqHz)); |
| 96 | } |
| 97 | |
| 98 | // Compute Hanning window coefficients |
| 99 | static void hanning(float* window) { |
| 100 | for (int j = 0; j < config.win_len; j++) { |
| 101 | window[j] = 0.5 - 0.5 * cosf(2 * M_PI * j / config.win_len); |
| 102 | } |
| 103 | } |
| 104 | |
| 105 | // Calculate short-time Fourier transform magnitude for one frame |
| 106 | // output shape: num_spectra_bins |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 107 | #ifdef MFCC_WITH_RVV |
| 108 | static void stft_magnitude(float* in, float* window, uint32_t* out) { |
| 109 | #else |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 110 | static void stft_magnitude(float* in, float* window, float* out) { |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 111 | #endif |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 112 | float* frame = (float*)malloc(config.fft_len * sizeof(float)); |
| 113 | memset(frame, 0, config.fft_len * sizeof(float)); |
| 114 | memcpy(frame, in, config.win_len * sizeof(float)); |
| 115 | |
| 116 | // apply hanning window |
| 117 | for (int j = 0; j < config.win_len; j++) { |
| 118 | frame[j] *= window[j]; |
| 119 | } |
| 120 | |
| 121 | // real-valued FFT |
| 122 | rfft(frame, config.fft_order); |
| 123 | |
| 124 | // compute STFT magnitude |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 125 | float temp = 0.0; |
| 126 | for (int j = 0; j <= config.fft_len / 2; j++) { |
| 127 | if (j == 0 || j == config.fft_len / 2) { |
| 128 | temp = frame[j] > 0 ? frame[j] : -frame[j]; |
| 129 | } else { |
| 130 | temp = sqrtf(frame[j] * frame[j] + |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 131 | frame[config.fft_len - j] * frame[config.fft_len - j]); |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 132 | } |
| 133 | #ifdef MFCC_WITH_RVV |
| 134 | out[j] = (uint32_t)(temp * (1 << kSpectraFracBits)); |
| 135 | #else |
| 136 | out[j] = temp; |
| 137 | #endif |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 138 | } |
| 139 | |
| 140 | free(frame); |
| 141 | } |
| 142 | |
| 143 | // Return a matrix that can post-multiply spectrogram rows to make mel |
| 144 | // output shape: params.num_mel_bins * num_spectra_bins |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 145 | #ifdef MFCC_WITH_RVV |
| 146 | static void spectra_to_mel_matrix(uint32_t* weights) { |
| 147 | #else |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 148 | static void spectra_to_mel_matrix(float* weights) { |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 149 | #endif |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 150 | MfccParams* params = &config.params; |
| 151 | float nyquist_hz = params->audio_samp_rate / 2; |
| 152 | float* spectra_bins = (float*)malloc(config.num_spectra_bins * sizeof(float)); |
| 153 | linspace(spectra_bins, 0.0, nyquist_hz, config.num_spectra_bins); |
| 154 | for (int i = 0; i < config.num_spectra_bins; i++) { |
| 155 | spectra_bins[i] = hz_to_mel(spectra_bins[i]); |
| 156 | } |
| 157 | |
| 158 | float* band_edges = |
| 159 | (float*)malloc((params->num_mel_bins + 2) * sizeof(float)); |
| 160 | linspace(band_edges, hz_to_mel(params->low_edge_hz), |
| 161 | hz_to_mel(params->upper_edge_hz), params->num_mel_bins + 2); |
| 162 | |
| 163 | float lower = 0.0, center = 0.0, upper = 0.0; |
| 164 | float lower_slope = 0.0, upper_slope = 0.0; |
| 165 | for (int i = 0; i < params->num_mel_bins; i++) { |
| 166 | // spectrogram DC bin |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 167 | weights[i * config.num_spectra_bins] = 0; |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 168 | |
| 169 | lower = band_edges[i]; |
| 170 | center = band_edges[i + 1]; |
| 171 | upper = band_edges[i + 2]; |
| 172 | for (int j = 1; j < config.num_spectra_bins; j++) { |
| 173 | lower_slope = (spectra_bins[j] - lower) / (center - lower); |
| 174 | upper_slope = (upper - spectra_bins[j]) / (upper - center); |
| 175 | float clamp = (lower_slope < upper_slope) ? lower_slope : upper_slope; |
| 176 | clamp = (clamp < 0) ? 0 : clamp; |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 177 | #ifdef MFCC_WITH_RVV |
| 178 | weights[i * config.num_spectra_bins + j] = |
| 179 | (uint32_t)(clamp * (1 << kWeightsFracBits)); |
| 180 | #else |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 181 | weights[i * config.num_spectra_bins + j] = clamp; |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 182 | #endif |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 183 | } |
| 184 | } |
| 185 | |
| 186 | free(band_edges); |
| 187 | free(spectra_bins); |
| 188 | } |
| 189 | |
| 190 | // Convert waveform to a log magnitude mel-frequency spectrogram |
| 191 | // input: audio samples (int16) with params.num_frames * hop_len samples |
| 192 | // zero pre-padding win_len - hop_len samples |
| 193 | // output shape: params.num_frames * params.num_mel_bins (uint8) |
| 194 | void extract_mfcc(int16_t* in, uint8_t* out, int in_len) { |
| 195 | MfccParams* params = &config.params; |
| 196 | // Calculate a "periodic" Hann window |
| 197 | float* window = (float*)malloc(config.win_len * sizeof(float)); |
| 198 | hanning(window); |
| 199 | |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 200 | #ifdef MFCC_WITH_RVV |
| 201 | uint32_t* weights = (uint32_t*)malloc( |
| 202 | params->num_mel_bins * config.num_spectra_bins * sizeof(uint32_t)); |
| 203 | uint32_t* spectra = |
| 204 | (uint32_t*)malloc(config.num_spectra_bins * sizeof(uint32_t)); |
| 205 | #else |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 206 | float* weights = (float*)malloc(params->num_mel_bins * |
| 207 | config.num_spectra_bins * sizeof(float)); |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 208 | float* spectra = (float*)malloc(config.num_spectra_bins * sizeof(float)); |
| 209 | #endif |
| 210 | |
| 211 | // Compute weights |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 212 | spectra_to_mel_matrix(weights); |
| 213 | |
| 214 | float* frame = (float*)malloc(config.win_len * sizeof(float)); |
| 215 | memset(frame, 0, config.win_len * sizeof(float)); |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 216 | |
| 217 | for (int i = 0; i < params->num_frames; i++) { |
| 218 | // update buffer |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 219 | memmove(frame, frame + config.hop_len, |
| 220 | (config.win_len - config.hop_len) * sizeof(float)); |
| 221 | |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 222 | // feed in new samples |
| 223 | for (int j = 0; j < config.hop_len; j++) { |
| 224 | int idx = i * config.hop_len + j; |
| 225 | frame[config.win_len - config.hop_len + j] = |
| 226 | idx < in_len ? (float)in[idx] : 0.0; |
| 227 | } |
| 228 | |
| 229 | // compute STFT magnitude |
| 230 | stft_magnitude(frame, window, spectra); |
| 231 | |
| 232 | // compute MFCC |
| 233 | for (int j = 0; j < params->num_mel_bins; j++) { |
Lun Dong | c4839b3 | 2022-07-28 17:02:09 -0700 | [diff] [blame] | 234 | #ifdef MFCC_WITH_RVV |
| 235 | uint32_t temp = |
| 236 | dot_product_rvv(spectra, weights + j * config.num_spectra_bins, |
| 237 | config.num_spectra_bins); |
| 238 | float tempf = (float)temp / (1 << (kSpectraFracBits + kWeightsFracBits)); |
| 239 | #else |
| 240 | float tempf = dot_product(spectra, weights + j * config.num_spectra_bins, |
| 241 | config.num_spectra_bins); |
| 242 | #endif |
| 243 | if (tempf < params->log_floor) tempf = params->log_floor; |
| 244 | tempf = params->log_scaler * logf(tempf); |
| 245 | tempf = tempf < 0.0 ? 0.0 : (tempf > 255.0 ? 255.0 : tempf); |
| 246 | out[i * params->num_mel_bins + j] = (uint8_t)tempf; |
Lun Dong | a43730e | 2022-06-22 21:52:14 -0700 | [diff] [blame] | 247 | } |
| 248 | } |
| 249 | |
| 250 | free(window); |
| 251 | free(weights); |
| 252 | free(spectra); |
| 253 | free(frame); |
| 254 | } |