blob: bc01d8b714ea8d8e408697ff4d0fb5982b5c0fbe [file] [log] [blame]
/*
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Audio preprocessing: MLCC feature extraction
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "samples/audio_prep/mfcc.h"
#include "samples/audio_prep/util.h"
// config struct
typedef struct {
MfccParams params;
int win_len;
int hop_len;
int fft_order;
int fft_len;
int num_spectra_bins;
} MfccConfig;
static MfccConfig config = {.params.num_frames = 96,
.params.num_mel_bins = 64,
.params.audio_samp_rate = 16000,
.params.low_edge_hz = 125,
.params.upper_edge_hz = 7500,
.params.win_len_sec = 0.025,
.params.hop_len_sec = 0.010,
.params.log_floor = 0.01,
.params.log_scaler = 20,
.win_len = 400,
.hop_len = 160,
.fft_order = 9,
.fft_len = 512,
.num_spectra_bins = 257};
// set mfcc parameters
void set_mfcc_params(MfccParams* in_params) {
config.params = *in_params;
config.win_len =
(int)(config.params.audio_samp_rate * config.params.win_len_sec + 0.5);
config.hop_len =
(int)(config.params.audio_samp_rate * config.params.hop_len_sec + 0.5);
config.fft_order = ceilf(log2f(config.win_len));
config.fft_len = 1 << config.fft_order; // 512
config.num_spectra_bins = config.fft_len / 2 + 1;
}
// Convert frequencies to mel scale using HTK formula
static float hz_to_mel(float freq_hz) {
const float kMelBreakFreqHz = 700.0;
const float kMelHighFreqQ = 1127.0;
return kMelHighFreqQ * logf(1.0 + (freq_hz / kMelBreakFreqHz));
}
// Compute Hanning window coefficients
static void hanning(float* window) {
for (int j = 0; j < config.win_len; j++) {
window[j] = 0.5 - 0.5 * cosf(2 * M_PI * j / config.win_len);
}
}
// Calculate short-time Fourier transform magnitude for one frame
// output shape: num_spectra_bins
static void stft_magnitude(float* in, float* window, float* out) {
float* frame = (float*)malloc(config.fft_len * sizeof(float));
memset(frame, 0, config.fft_len * sizeof(float));
memcpy(frame, in, config.win_len * sizeof(float));
// apply hanning window
for (int j = 0; j < config.win_len; j++) {
frame[j] *= window[j];
}
// real-valued FFT
rfft(frame, config.fft_order);
// compute STFT magnitude
out[0] = frame[0] > 0 ? frame[0] : -frame[0];
out[config.fft_len / 2] = frame[config.fft_len / 2] > 0
? frame[config.fft_len / 2]
: -frame[config.fft_len / 2];
for (int j = 1; j < config.fft_len / 2; j++) {
out[j] = sqrtf(frame[j] * frame[j] +
frame[config.fft_len - j] * frame[config.fft_len - j]);
}
free(frame);
}
// Return a matrix that can post-multiply spectrogram rows to make mel
// output shape: params.num_mel_bins * num_spectra_bins
static void spectra_to_mel_matrix(float* weights) {
MfccParams* params = &config.params;
float nyquist_hz = params->audio_samp_rate / 2;
float* spectra_bins = (float*)malloc(config.num_spectra_bins * sizeof(float));
linspace(spectra_bins, 0.0, nyquist_hz, config.num_spectra_bins);
for (int i = 0; i < config.num_spectra_bins; i++) {
spectra_bins[i] = hz_to_mel(spectra_bins[i]);
}
float* band_edges =
(float*)malloc((params->num_mel_bins + 2) * sizeof(float));
linspace(band_edges, hz_to_mel(params->low_edge_hz),
hz_to_mel(params->upper_edge_hz), params->num_mel_bins + 2);
float lower = 0.0, center = 0.0, upper = 0.0;
float lower_slope = 0.0, upper_slope = 0.0;
for (int i = 0; i < params->num_mel_bins; i++) {
// spectrogram DC bin
weights[i * config.num_spectra_bins] = 0.0;
lower = band_edges[i];
center = band_edges[i + 1];
upper = band_edges[i + 2];
for (int j = 1; j < config.num_spectra_bins; j++) {
lower_slope = (spectra_bins[j] - lower) / (center - lower);
upper_slope = (upper - spectra_bins[j]) / (upper - center);
float clamp = (lower_slope < upper_slope) ? lower_slope : upper_slope;
clamp = (clamp < 0) ? 0 : clamp;
weights[i * config.num_spectra_bins + j] = clamp;
}
}
free(band_edges);
free(spectra_bins);
}
// Convert waveform to a log magnitude mel-frequency spectrogram
// input: audio samples (int16) with params.num_frames * hop_len samples
// zero pre-padding win_len - hop_len samples
// output shape: params.num_frames * params.num_mel_bins (uint8)
void extract_mfcc(int16_t* in, uint8_t* out, int in_len) {
MfccParams* params = &config.params;
// Calculate a "periodic" Hann window
float* window = (float*)malloc(config.win_len * sizeof(float));
hanning(window);
// Compute weights
float* weights = (float*)malloc(params->num_mel_bins *
config.num_spectra_bins * sizeof(float));
spectra_to_mel_matrix(weights);
float* frame = (float*)malloc(config.win_len * sizeof(float));
memset(frame, 0, config.win_len * sizeof(float));
float* spectra = (float*)malloc(config.num_spectra_bins * sizeof(float));
for (int i = 0; i < params->num_frames; i++) {
// update buffer
for (int j = 0; j < config.win_len - config.hop_len; j++) {
frame[j] = frame[j + config.hop_len];
}
// feed in new samples
for (int j = 0; j < config.hop_len; j++) {
int idx = i * config.hop_len + j;
frame[config.win_len - config.hop_len + j] =
idx < in_len ? (float)in[idx] : 0.0;
}
// compute STFT magnitude
stft_magnitude(frame, window, spectra);
// compute MFCC
for (int j = 0; j < params->num_mel_bins; j++) {
float temp = dot_product(spectra, weights + j * config.num_spectra_bins,
config.num_spectra_bins);
if (temp < params->log_floor) temp = params->log_floor;
temp = params->log_scaler * logf(temp);
temp = temp < 0.0 ? 0.0 : (temp > 255.0 ? 255.0 : temp);
out[i * params->num_mel_bins + j] = (uint8_t)temp;
}
}
free(window);
free(weights);
free(spectra);
free(frame);
}