blob: 0a137e69b4a1cd1257173feaeef871c25a556beb [file] [log] [blame]
Lun Donga43730e2022-06-22 21:52:14 -07001/*
2 * Copyright 2022 Google LLC
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17// Audio preprocessing: MLCC feature extraction
18
Cindy Liu3c4d6272022-08-04 18:54:12 -070019#include "audio_prep/mfcc.h"
20
Lun Donga43730e2022-06-22 21:52:14 -070021#include <math.h>
Lun Donga43730e2022-06-22 21:52:14 -070022#include <stdlib.h>
23#include <string.h>
24
Cindy Liu3c4d6272022-08-04 18:54:12 -070025#include "audio_prep/util.h"
Lun Donga43730e2022-06-22 21:52:14 -070026
Lun Dongc4839b32022-07-28 17:02:09 -070027#ifdef MFCC_WITH_RVV
28#include <riscv_vector.h>
29
30static const uint8_t kWeightsFracBits = 8;
31static const uint8_t kSpectraFracBits = 7;
32
33// Calculate the dot product of two int vectors using RVV
34static uint32_t dot_product_rvv(uint32_t* u, uint32_t* w, int n) {
35 size_t vl;
36 // auxiliary variables
37 vuint32m8_t vx;
38 vuint32m8_t vu, vw;
39 vuint32m1_t v_sum;
40 uint32_t sum = 0;
41 for (size_t i = 0; i < n; i += vl) {
42 vl = vsetvl_e32m8(n - i);
43 vu = vle32_v_u32m8(u + i, vl); // load
44 vw = vle32_v_u32m8(w + i, vl); // load
45 vx = vmul(vu, vw, vl); // multiply
46 v_sum = vmv_s(v_sum, 0, vl); // init
47 v_sum = vredsum(v_sum, vx, v_sum, vl); // sum
48 sum += vmv_x(v_sum);
49 }
50 return sum;
51}
52#endif
53
Lun Donga43730e2022-06-22 21:52:14 -070054// config struct
55typedef struct {
56 MfccParams params;
57 int win_len;
58 int hop_len;
59 int fft_order;
60 int fft_len;
61 int num_spectra_bins;
62} MfccConfig;
63
64static MfccConfig config = {.params.num_frames = 96,
65 .params.num_mel_bins = 64,
66 .params.audio_samp_rate = 16000,
67 .params.low_edge_hz = 125,
68 .params.upper_edge_hz = 7500,
69 .params.win_len_sec = 0.025,
70 .params.hop_len_sec = 0.010,
71 .params.log_floor = 0.01,
72 .params.log_scaler = 20,
73 .win_len = 400,
74 .hop_len = 160,
75 .fft_order = 9,
76 .fft_len = 512,
77 .num_spectra_bins = 257};
78
79// set mfcc parameters
80void set_mfcc_params(MfccParams* in_params) {
81 config.params = *in_params;
82 config.win_len =
83 (int)(config.params.audio_samp_rate * config.params.win_len_sec + 0.5);
84 config.hop_len =
85 (int)(config.params.audio_samp_rate * config.params.hop_len_sec + 0.5);
86 config.fft_order = ceilf(log2f(config.win_len));
87 config.fft_len = 1 << config.fft_order; // 512
88 config.num_spectra_bins = config.fft_len / 2 + 1;
89}
90
91// Convert frequencies to mel scale using HTK formula
92static float hz_to_mel(float freq_hz) {
93 const float kMelBreakFreqHz = 700.0;
94 const float kMelHighFreqQ = 1127.0;
95 return kMelHighFreqQ * logf(1.0 + (freq_hz / kMelBreakFreqHz));
96}
97
98// Compute Hanning window coefficients
99static void hanning(float* window) {
100 for (int j = 0; j < config.win_len; j++) {
101 window[j] = 0.5 - 0.5 * cosf(2 * M_PI * j / config.win_len);
102 }
103}
104
105// Calculate short-time Fourier transform magnitude for one frame
106// output shape: num_spectra_bins
Lun Dongc4839b32022-07-28 17:02:09 -0700107#ifdef MFCC_WITH_RVV
108static void stft_magnitude(float* in, float* window, uint32_t* out) {
109#else
Lun Donga43730e2022-06-22 21:52:14 -0700110static void stft_magnitude(float* in, float* window, float* out) {
Lun Dongc4839b32022-07-28 17:02:09 -0700111#endif
Lun Donga43730e2022-06-22 21:52:14 -0700112 float* frame = (float*)malloc(config.fft_len * sizeof(float));
113 memset(frame, 0, config.fft_len * sizeof(float));
114 memcpy(frame, in, config.win_len * sizeof(float));
115
116 // apply hanning window
117 for (int j = 0; j < config.win_len; j++) {
118 frame[j] *= window[j];
119 }
120
121 // real-valued FFT
122 rfft(frame, config.fft_order);
123
124 // compute STFT magnitude
Lun Dongc4839b32022-07-28 17:02:09 -0700125 float temp = 0.0;
126 for (int j = 0; j <= config.fft_len / 2; j++) {
127 if (j == 0 || j == config.fft_len / 2) {
128 temp = frame[j] > 0 ? frame[j] : -frame[j];
129 } else {
130 temp = sqrtf(frame[j] * frame[j] +
Lun Donga43730e2022-06-22 21:52:14 -0700131 frame[config.fft_len - j] * frame[config.fft_len - j]);
Lun Dongc4839b32022-07-28 17:02:09 -0700132 }
133#ifdef MFCC_WITH_RVV
134 out[j] = (uint32_t)(temp * (1 << kSpectraFracBits));
135#else
136 out[j] = temp;
137#endif
Lun Donga43730e2022-06-22 21:52:14 -0700138 }
139
140 free(frame);
141}
142
143// Return a matrix that can post-multiply spectrogram rows to make mel
144// output shape: params.num_mel_bins * num_spectra_bins
Lun Dongc4839b32022-07-28 17:02:09 -0700145#ifdef MFCC_WITH_RVV
146static void spectra_to_mel_matrix(uint32_t* weights) {
147#else
Lun Donga43730e2022-06-22 21:52:14 -0700148static void spectra_to_mel_matrix(float* weights) {
Lun Dongc4839b32022-07-28 17:02:09 -0700149#endif
Lun Donga43730e2022-06-22 21:52:14 -0700150 MfccParams* params = &config.params;
151 float nyquist_hz = params->audio_samp_rate / 2;
152 float* spectra_bins = (float*)malloc(config.num_spectra_bins * sizeof(float));
153 linspace(spectra_bins, 0.0, nyquist_hz, config.num_spectra_bins);
154 for (int i = 0; i < config.num_spectra_bins; i++) {
155 spectra_bins[i] = hz_to_mel(spectra_bins[i]);
156 }
157
158 float* band_edges =
159 (float*)malloc((params->num_mel_bins + 2) * sizeof(float));
160 linspace(band_edges, hz_to_mel(params->low_edge_hz),
161 hz_to_mel(params->upper_edge_hz), params->num_mel_bins + 2);
162
163 float lower = 0.0, center = 0.0, upper = 0.0;
164 float lower_slope = 0.0, upper_slope = 0.0;
165 for (int i = 0; i < params->num_mel_bins; i++) {
166 // spectrogram DC bin
Lun Dongc4839b32022-07-28 17:02:09 -0700167 weights[i * config.num_spectra_bins] = 0;
Lun Donga43730e2022-06-22 21:52:14 -0700168
169 lower = band_edges[i];
170 center = band_edges[i + 1];
171 upper = band_edges[i + 2];
172 for (int j = 1; j < config.num_spectra_bins; j++) {
173 lower_slope = (spectra_bins[j] - lower) / (center - lower);
174 upper_slope = (upper - spectra_bins[j]) / (upper - center);
175 float clamp = (lower_slope < upper_slope) ? lower_slope : upper_slope;
176 clamp = (clamp < 0) ? 0 : clamp;
Lun Dongc4839b32022-07-28 17:02:09 -0700177#ifdef MFCC_WITH_RVV
178 weights[i * config.num_spectra_bins + j] =
179 (uint32_t)(clamp * (1 << kWeightsFracBits));
180#else
Lun Donga43730e2022-06-22 21:52:14 -0700181 weights[i * config.num_spectra_bins + j] = clamp;
Lun Dongc4839b32022-07-28 17:02:09 -0700182#endif
Lun Donga43730e2022-06-22 21:52:14 -0700183 }
184 }
185
186 free(band_edges);
187 free(spectra_bins);
188}
189
190// Convert waveform to a log magnitude mel-frequency spectrogram
191// input: audio samples (int16) with params.num_frames * hop_len samples
192// zero pre-padding win_len - hop_len samples
193// output shape: params.num_frames * params.num_mel_bins (uint8)
194void extract_mfcc(int16_t* in, uint8_t* out, int in_len) {
195 MfccParams* params = &config.params;
196 // Calculate a "periodic" Hann window
197 float* window = (float*)malloc(config.win_len * sizeof(float));
198 hanning(window);
199
Lun Dongc4839b32022-07-28 17:02:09 -0700200#ifdef MFCC_WITH_RVV
201 uint32_t* weights = (uint32_t*)malloc(
202 params->num_mel_bins * config.num_spectra_bins * sizeof(uint32_t));
203 uint32_t* spectra =
204 (uint32_t*)malloc(config.num_spectra_bins * sizeof(uint32_t));
205#else
Lun Donga43730e2022-06-22 21:52:14 -0700206 float* weights = (float*)malloc(params->num_mel_bins *
207 config.num_spectra_bins * sizeof(float));
Lun Dongc4839b32022-07-28 17:02:09 -0700208 float* spectra = (float*)malloc(config.num_spectra_bins * sizeof(float));
209#endif
210
211 // Compute weights
Lun Donga43730e2022-06-22 21:52:14 -0700212 spectra_to_mel_matrix(weights);
213
214 float* frame = (float*)malloc(config.win_len * sizeof(float));
215 memset(frame, 0, config.win_len * sizeof(float));
Lun Donga43730e2022-06-22 21:52:14 -0700216
217 for (int i = 0; i < params->num_frames; i++) {
218 // update buffer
Lun Dongc4839b32022-07-28 17:02:09 -0700219 memmove(frame, frame + config.hop_len,
220 (config.win_len - config.hop_len) * sizeof(float));
221
Lun Donga43730e2022-06-22 21:52:14 -0700222 // feed in new samples
223 for (int j = 0; j < config.hop_len; j++) {
224 int idx = i * config.hop_len + j;
225 frame[config.win_len - config.hop_len + j] =
226 idx < in_len ? (float)in[idx] : 0.0;
227 }
228
229 // compute STFT magnitude
230 stft_magnitude(frame, window, spectra);
231
232 // compute MFCC
233 for (int j = 0; j < params->num_mel_bins; j++) {
Lun Dongc4839b32022-07-28 17:02:09 -0700234#ifdef MFCC_WITH_RVV
235 uint32_t temp =
236 dot_product_rvv(spectra, weights + j * config.num_spectra_bins,
237 config.num_spectra_bins);
238 float tempf = (float)temp / (1 << (kSpectraFracBits + kWeightsFracBits));
239#else
240 float tempf = dot_product(spectra, weights + j * config.num_spectra_bins,
241 config.num_spectra_bins);
242#endif
243 if (tempf < params->log_floor) tempf = params->log_floor;
244 tempf = params->log_scaler * logf(tempf);
245 tempf = tempf < 0.0 ? 0.0 : (tempf > 255.0 ? 255.0 : tempf);
246 out[i * params->num_mel_bins + j] = (uint8_t)tempf;
Lun Donga43730e2022-06-22 21:52:14 -0700247 }
248 }
249
250 free(window);
251 free(weights);
252 free(spectra);
253 free(frame);
254}