|
ailia
1.5.0.0
|
audio processing library More...
Go to the source code of this file.
Macros | |
| #define | AILIA_API |
| #define | AILIA_AUDIO_WIN_TYPE_HANN (1) |
| use a Hann window function More... | |
| #define | AILIA_AUDIO_WIN_TYPE_HAMMING (2) |
| use a Hamming window function More... | |
| #define | AILIA_AUDIO_STFT_CENTER_NONE (0) |
| for the STFT, do not insert padding before and after More... | |
| #define | AILIA_AUDIO_STFT_CENTER_ENABLE (1) |
| for the STFT, insert a padding (reflect) of fft_n/2 before and after the sample_n samples More... | |
| #define | AILIA_AUDIO_STFT_CENTER_SCIPY_DEFAULT (2) |
| for the STFT, insert a padding (zeros) of fft_n/2 before and after the sample_n samples, and also pad at the end with zeros to process in units of hop_n More... | |
| #define | AILIA_AUDIO_FFT_NORMALIZE_NONE (0) |
| Do not normalize the FFT output. More... | |
| #define | AILIA_AUDIO_FFT_NORMALIZE_LIBROSA_COMPAT (1) |
| Normalize the FFT output in a way compatible with librosa. More... | |
| #define | AILIA_AUDIO_FFT_NORMALIZE_PYTORCH_COMPAT (1) |
| Normalize the FFT output in a way compatible with PyTorch. More... | |
| #define | AILIA_AUDIO_FFT_NORMALIZE_SCIPY_COMPAT (2) |
| Normalize the FFT output in a way compatible with SciPy. More... | |
| #define | AILIA_AUDIO_MEL_NORMALIZE_NONE (0) |
| Do not normalize the output of the mel spectrogram. More... | |
| #define | AILIA_AUDIO_MEL_NORMALIZE_ENABLE (1) |
| Normalize the output of the mel spectrogram. More... | |
| #define | AILIA_AUDIO_MEL_SCALE_FORMULA_HTK (1) |
| Get the mel scale from the HTK formula (PyTorch compatible) More... | |
| #define | AILIA_AUDIO_MEL_SCALE_FORMULA_SLANYE (0) |
| Get the mel scale from Slaney's formula (compatible with the default of librosa) More... | |
| #define | AILIA_AUDIO_PHASE_FORM_COMPLEX (1) |
| Output the phase in complex format (compatible with the default of librosa) More... | |
| #define | AILIA_AUDIO_PHASE_FORM_REAL (0) |
| Output the phase in real format (compatible with the default of PyTorch) More... | |
| #define | AILIA_AUDIO_FILTFILT_PAD_NONE (0) |
| During zero-phase filtering, do not pad. More... | |
| #define | AILIA_AUDIO_FILTFILT_PAD_ODD (1) |
| During zero-phase filtering, pad with an odd reflection (subtract the reflected values from two times the edge value) More... | |
| #define | AILIA_AUDIO_FILTFILT_PAD_EVEN (2) |
| During zero-phase filtering, pad with an even reflection (normal reflection) More... | |
| #define | AILIA_AUDIO_FILTFILT_PAD_CONSTANT (3) |
| During zero-phase filtering, pad using the edge value. More... | |
Functions | |
| int AILIA_API | ailiaAudioLog1p (void *dst, const void *src, int src_n) |
| Convert the input values to a logarithmic scale. More... | |
| int AILIA_API | ailiaAudioConvertPowerToDB (void *dst, const void *src, int src_n, float top_db) |
| Convert non-negative input values to decibel scale. More... | |
| int AILIA_API | ailiaAudioGetFrameLen (int *frame_n, int sample_n, int fft_n, int hop_n, int center) |
| Get the number of frames generated by the STFT. More... | |
| int AILIA_API | ailiaAudioGetSampleLen (int *sample_n, int frame_n, int freq_n, int hop_n, int center) |
| Get the number of samples generated by the ISTFT. More... | |
| int AILIA_API | ailiaAudioGetWindow (void *dst, int window_n, int win_type) |
| Get the window function. More... | |
| int AILIA_API | ailiaAudioFFT (void *dst, const void *src, int fft_n) |
| Execute the FFT. More... | |
| int AILIA_API | ailiaAudioIFFT (void *dst, const void *src, int fft_n) |
| Execute the IFFT. More... | |
| int AILIA_API | ailiaAudioGetSpectrogram (void *dst, const void *src, int sample_n, int fft_n, int hop_n, int win_n, int win_type, int max_frame_n, int center, float power, int norm_type) |
| Generate the spectrogram from the audio signal. More... | |
| int AILIA_API | ailiaAudioGetInverseSpectrogram (void *dst, const void *src, int frame_n, int freq_n, int hop_n, int win_n, int win_type, int max_sample_n, int center, int norm_type) |
| Generate an audio signal from a complex spectrogram. More... | |
| int AILIA_API | ailiaAudioGetFBMatrix (void *dst, const int freq_n, float f_min, float f_max, int mel_n, int sample_rate, int mel_norm, int mel_formula) |
| Create a mel filter-bank. More... | |
| int AILIA_API | ailiaAudioGetMelSpectrogram (void *dst, const void *src, int sample_n, int sample_rate, int fft_n, int hop_n, int win_n, int win_type, int max_frame_n, int center, float power, int fft_norm_type, float f_min, float f_max, int mel_n, int mel_norm_type, int mel_formula) |
| Generate the mel spectrogram from the audio signal. More... | |
| int AILIA_API | ailiaAudioMagPhase (void *dst_mag, void *dst_phase, const void *src, int freq_n, int frame_n, float power, int phase_form) |
| Get the amplitude and the phase from the spectrogram. More... | |
| int AILIA_API | ailiaAudioStandardize (void *dst, const void *src, const int src_n) |
| Standardize a real signal. More... | |
| int AILIA_API | ailiaAudioComplexNorm (void *dst, const void *src, const int src_n, float power) |
| Get the norm of the complex signal. More... | |
| int AILIA_API | ailiaAudioConvertToMel (void *dst, const void *src, const void *fb_mtrx, int freq_n, int frame_n, int mel_n) |
| Convert the real output of the STFT to the mel scale. More... | |
| int AILIA_API | ailiaAudioFixFrameLen (void *dst, const void *src, int freq_n, int dst_frame_n, int src_frame_n, float pad_data) |
| Fix the number of time frames of a real-valued spectrogram/mel-spectrogram. More... | |
| int AILIA_API | ailiaAudioResample (void *dst, const void *src, int dst_sample_rate, int dst_n, int src_sample_rate, int src_n) |
| Resample the signal. More... | |
| int AILIA_API | ailiaAudioGetResampleLen (int *dst_sample_n, int dst_sample_rate, int src_sample_n, int src_sample_rate) |
| Get the number of samples after the resampling. More... | |
| int AILIA_API | ailiaAudioLinerFilter (void *dst, const void *src, const void *n_coef, const void *d_coef, void *zi, int dst_n, int src_n, int n_coef_n, int d_coef_n, int zi_n) |
| Apply a filter to the signal. More... | |
| int AILIA_API | ailiaAudioGetLinerFilterZiCoef (void *dst_zi, const void *n_coef, const void *d_coef, int dst_n, int n_coef_n, int d_coef_n) |
| Calculate the initial delay coefficients for filtering. More... | |
| int AILIA_API | ailiaAudioFilterFilter (void *dst, const void *src, const void *n_coef, const void *d_coef, int dst_n, int src_n, int n_coef_n, int d_coef_n, int pad_type, int pad_len) |
| Apply a zero-phase filter to the signal. More... | |
| int AILIA_API | ailiaAudioGetNonSilentPos (int *dst_start_pos, int *dst_length, const void *src, int sample_n, int win_n, int hop_n, float thr_db) |
| Find the region of the signal between the first and the last non-silence samples. Detects the area excluding the silent range before and after the signal input. More... | |
audio processing library
| #define AILIA_API |
| #define AILIA_AUDIO_FFT_NORMALIZE_LIBROSA_COMPAT (1) |
Normalize the FFT output in a way compatible with librosa.
| #define AILIA_AUDIO_FFT_NORMALIZE_NONE (0) |
Do not normalize the FFT output.
| #define AILIA_AUDIO_FFT_NORMALIZE_PYTORCH_COMPAT (1) |
Normalize the FFT output in a way compatible with PyTorch.
| #define AILIA_AUDIO_FFT_NORMALIZE_SCIPY_COMPAT (2) |
Normalize the FFT output in a way compatible with SciPy.
| #define AILIA_AUDIO_FILTFILT_PAD_CONSTANT (3) |
During zero-phase filtering, pad using the edge value.
| #define AILIA_AUDIO_FILTFILT_PAD_EVEN (2) |
During zero-phase filtering, pad with an even reflection (normal reflection)
| #define AILIA_AUDIO_FILTFILT_PAD_NONE (0) |
During zero-phase filtering, do not pad.
| #define AILIA_AUDIO_FILTFILT_PAD_ODD (1) |
During zero-phase filtering, pad with an odd reflection (subtract the reflected values from two times the edge value)
| #define AILIA_AUDIO_MEL_NORMALIZE_ENABLE (1) |
Normalize the output of the mel spectrogram.
| #define AILIA_AUDIO_MEL_NORMALIZE_NONE (0) |
Do not normalize the output of the mel spectrogram.
| #define AILIA_AUDIO_MEL_SCALE_FORMULA_HTK (1) |
Get the mel scale from the HTK formula (PyTorch compatible)
| #define AILIA_AUDIO_MEL_SCALE_FORMULA_SLANYE (0) |
Get the mel scale from Slaney's formula (compatible with the default of librosa)
| #define AILIA_AUDIO_PHASE_FORM_COMPLEX (1) |
Output the phase in complex format (compatible with the default of librosa)
| #define AILIA_AUDIO_PHASE_FORM_REAL (0) |
Output the phase in real format (compatible with the default of PyTorch)
| #define AILIA_AUDIO_STFT_CENTER_ENABLE (1) |
for the STFT, insert a padding (reflect) of fft_n/2 before and after the sample_n samples
| #define AILIA_AUDIO_STFT_CENTER_NONE (0) |
for the STFT, do not insert padding before and after
| #define AILIA_AUDIO_STFT_CENTER_SCIPY_DEFAULT (2) |
for the STFT, insert a padding (zeros) of fft_n/2 before and after the sample_n samples, and also pad at the end with zeros to process in units of hop_n
| #define AILIA_AUDIO_WIN_TYPE_HAMMING (2) |
use a Hamming window function
| #define AILIA_AUDIO_WIN_TYPE_HANN (1) |
use a Hann window function
| int AILIA_API ailiaAudioComplexNorm | ( | void * | dst, |
| const void * | src, | ||
| const int | src_n, | ||
| float | power | ||
| ) |
Get the norm of the complex signal.
| dst | pointer to the output data, of float format, and of length src_n |
| src | pointer to the input data, of float format, an array of length (2 * src_n) (sequence of complex pairs [real part, imaginary part]). (memory layout, using the row-major convention: (src_n, 2)) |
| src_n | length of the input data |
| power | exponent to apply to the spectrogram (> 0.0). 1.0: amplitude spectrogram |
Compute the norm of the input data. For each complex value src_cmp = src[0] + i * src[1]: tmp_dst = pow(src[0],2.0) + pow(src[1],2.0); dst[0] = pow(tmp_dst,0.5*power);
| int AILIA_API ailiaAudioConvertPowerToDB | ( | void * | dst, |
| const void * | src, | ||
| int | src_n, | ||
| float | top_db | ||
| ) |
Convert non-negative input values to decibel scale.
| dst | pointer to the output data, of float format, and of length src_n |
| src | pointer to the input data, of float format, and of length src_n |
| src_n | number of elements to be calculated |
| top_db | float >= 0.0 |
Output compatible with librosa.power_to_db. dst = trimlow( 10 * log10(src / ref) ) where ref is the max of 1e-10 and of positive values of src, and trimlow(), if top_db > 0, trims all values lower than (- top_db) and replaces them by (- top_db); otherwise, trimlow() does nothing.
| int AILIA_API ailiaAudioConvertToMel | ( | void * | dst, |
| const void * | src, | ||
| const void * | fb_mtrx, | ||
| int | freq_n, | ||
| int | frame_n, | ||
| int | mel_n | ||
| ) |
Convert the real output of the STFT to the mel scale.
| dst | pointer to the output data, of float format, of length (mel_n * frame_n), and of memory layout (in row-major convention) (mel_n, frame_n). |
| src | pointer to the input data, of float format, of length (freq_n * frame_n), and of memory layout (in row-major convention) (freq_n, frame_n). |
| fb_mtrx | the mel filter-bank, of float format, of length (mel_n * freq_n), and of memory layout (in row-major convention) (mel_n, freq_n). |
| freq_n | number of frequency indices |
| frame_n | number of time frames in the input data |
| mel_n | number of mel frequency indices |
Converts the real spectrogram given in input to the mel scale. The argument fb_mtrx can take the coefficients outputted by ailiaAudioGetFBMatrix() .
| int AILIA_API ailiaAudioFFT | ( | void * | dst, |
| const void * | src, | ||
| int | fft_n | ||
| ) |
Execute the FFT.
| dst | pointer to the output data, of float format, of length 2*fft_n, and which memory layout is a sequence of fft_n pairs [real part, imaginary part]. Memory layout, using the row-major convention: (fft_n, 2). |
| src | pointer to the input data, of float format, and of length fft_n |
| fft_n | count of FFT values (i.e. of frequency bins) |
If fft_n is a power of 2, this function uses a faster algorithm. As the output data alternates real and imaginary parts, its length is 2*fft_n.
| int AILIA_API ailiaAudioFilterFilter | ( | void * | dst, |
| const void * | src, | ||
| const void * | n_coef, | ||
| const void * | d_coef, | ||
| int | dst_n, | ||
| int | src_n, | ||
| int | n_coef_n, | ||
| int | d_coef_n, | ||
| int | pad_type, | ||
| int | pad_len | ||
| ) |
Apply a zero-phase filter to the signal.
| dst | pointer to the output data, of float format, and of length dst_n |
| src | pointer to the input data, of float format, and of length src_n |
| n_coef | pointer to the numerator coefficients of the filter, of float format, and length n_coef_n |
| d_coef | pointer to the denominator coefficients of the filter, of float format, and length d_coef_n |
| dst_n | length (in number of samples) reserved in the output buffer (dst_n >= src_n) |
| src_n | number of samples in the input signal |
| n_coef_n | number of numerator coefficients of the filter |
| d_coef_n | number of denominator coefficients of the filter |
| pad_type | type of padding to apply at the start and at the end of the input signal: any of the AILIA_AUDIO_FILTFILT_PAD_* constants |
| pad_len | length of the padding applied to the start and to the end of the input signal |
The number of values written to the output dst is min(dst_n,src_n). The largest of n_coef_n and d_coef_n is taken as reference and zeros are added for padding where necessary.
| int AILIA_API ailiaAudioFixFrameLen | ( | void * | dst, |
| const void * | src, | ||
| int | freq_n, | ||
| int | dst_frame_n, | ||
| int | src_frame_n, | ||
| float | pad_data | ||
| ) |
Fix the number of time frames of a real-valued spectrogram/mel-spectrogram.
| dst | pointer to the output data, of length (freq_n * dst_frame_n), and of memory layout (in row-major convention) (freq_n, dst_frame_n). |
| src | pointer to the input data, of length (freq_n * src_frame_n), and of memory layout (in row-major convention) (freq_n, src_frame_n). |
| freq_n | number of frequency indices |
| dst_frame_n | number of time frames in the output data |
| src_frame_n | number of time frames in the input data |
| pad_data | value inserted for padding (used when dst_frame_n > src_frame_n) |
dst_frame_n > src_frame_n : missing time frames are added and filled with the value pad_data. dst_frame_n <= src_frame_n : only keeps the first dst_frame_n data.
| int AILIA_API ailiaAudioGetFBMatrix | ( | void * | dst, |
| const int | freq_n, | ||
| float | f_min, | ||
| float | f_max, | ||
| int | mel_n, | ||
| int | sample_rate, | ||
| int | mel_norm, | ||
| int | mel_formula | ||
| ) |
Create a mel filter-bank.
| dst | pointer to the output data, of float format, and of length (mel_n * freq_n). (memory layout, using the row-major convention: (mel_n, freq_n)) |
| freq_n | number of frequency indices for the FFT (1+fft_n/2) |
| f_min | lowest frequency |
| f_max | highest frequency |
| mel_n | number of mel frequency bins in the output (< freq_n) |
| sample_rate | sampling rate for the signal that will be inputted to this filter |
| mel_norm | whether to normalize the output (and the type of the normalization): any of the AILIA_AUDIO_MEL_NORMALIZE_* constants |
| mel_formula | mel scale format: any of the AILIA_AUDIO_MEL_SCALE_FORMULA_* constants |
| int AILIA_API ailiaAudioGetFrameLen | ( | int * | frame_n, |
| int | sample_n, | ||
| int | fft_n, | ||
| int | hop_n, | ||
| int | center | ||
| ) |
Get the number of frames generated by the STFT.
| frame_n | pointer to the destination where to write the output (the number of frames) |
| sample_n | count of samples on which the STFT is performed |
| fft_n | size of the FFT at each frame (i.e. number of frequency bins at each frame) |
| hop_n | stride of each window shift (in number of samples). This is the quantum of time for the time axis of the STFT output. |
| center | any of the AILIA_AUDIO_STFT_CENTER_* constants |
Before executing the STFT, use this function to determine the space required for the output buffer. If AILIA_AUDIO_STFT_CENTER_NONE is used, the sample_n samples are cut in packets of size hop_n, and no padding occurs before the first sample nor after the last sample. If AILIA_AUDIO_STFT_CENTER_ENABLE is used, a reflection padding of length fft_n/2 is performed before the first sample and after the last sample. If AILIA_AUDIO_STFT_CENTER_SCIPY_DEFAULT is used, a zero padding of length fft_n/2 is performed before the first sample and after the last sample, and moreover an additional zero padding is performed to ensure that the total length is a multiple of hop_n.
| int AILIA_API ailiaAudioGetInverseSpectrogram | ( | void * | dst, |
| const void * | src, | ||
| int | frame_n, | ||
| int | freq_n, | ||
| int | hop_n, | ||
| int | win_n, | ||
| int | win_type, | ||
| int | max_sample_n, | ||
| int | center, | ||
| int | norm_type | ||
| ) |
Generate an audio signal from a complex spectrogram.
| dst | pointer to the output data, of float format, and of length sample_n |
| src | pointer to the input data, of float format, of length (2 * freq_n * frame_n), and which memory layout is a sequence of pairs [real part, imaginary part]. Memory layout, using the row-major convention: (freq_n, frame_n, 2). |
| frame_n | number of time frames in the input data |
| freq_n | number of frequencies bins for each time frame (freq_n = fft_n/2+1) |
| hop_n | step size of the time frame increment (expressed in number of samples) for the inputted spectrogram. |
| win_n | size of the window function |
| win_type | type of the window function: any of the AILIA_AUDIO_WIN_TYPE_* constants |
| max_sample_n | maximum value of the sample index in the outputted data |
| center | whether padding (before and after) was used or not (and its type) during the generation of the input data: any of the AILIA_AUDIO_STFT_CENTER_* constants |
| norm_type | normalization type that was used during the generation of the input data: any of the AILIA_AUDIO_FFT_NORMALIZE_* constants |
For each time frame the normalization is executed at the end of the IFFT. Only accepts a complex spectrogram in input.
| int AILIA_API ailiaAudioGetLinerFilterZiCoef | ( | void * | dst_zi, |
| const void * | n_coef, | ||
| const void * | d_coef, | ||
| int | dst_n, | ||
| int | n_coef_n, | ||
| int | d_coef_n | ||
| ) |
Calculate the initial delay coefficients for filtering.
| dst_zi | pointer to the output (initial delay coefficients), of float format, and of length dst_n (dst_n >= max(n_coef_n,d_coef_n)-1) |
| n_coef | pointer to the numerator coefficients of the filter, of float format, and length n_coef_n |
| d_coef | pointer to the denominator coefficients of the filter, of float format, and length d_coef_n |
| dst_n | size, in number of samples, reserved in the output buffer (dst_n >= max(n_coef_n,d_coef_n)-1) |
| n_coef_n | number of numerator coefficients of the filter |
| d_coef_n | number of denominator coefficients of the filter |
These initial delay coefficients dst_zi, once multiplied with the early values of the signal, can be passed as initial delayed values, the zi argument, to ailiaAudioLinerFilter() . Of the dst_n reserved length of the output buffer, the length used is max(n_coef_n,d_coef_n)-1. If dst_n is less than that, only the corresponding first values are output. If dst_n is larger, the remaining is filled with 0. The largest of n_coef_n and d_coef_n is taken as reference and zeros are added for padding where necessary.
| int AILIA_API ailiaAudioGetMelSpectrogram | ( | void * | dst, |
| const void * | src, | ||
| int | sample_n, | ||
| int | sample_rate, | ||
| int | fft_n, | ||
| int | hop_n, | ||
| int | win_n, | ||
| int | win_type, | ||
| int | max_frame_n, | ||
| int | center, | ||
| float | power, | ||
| int | fft_norm_type, | ||
| float | f_min, | ||
| float | f_max, | ||
| int | mel_n, | ||
| int | mel_norm_type, | ||
| int | mel_formula | ||
| ) |
Generate the mel spectrogram from the audio signal.
| dst | pointer to the output data, of float format, and of length (mel_n * frame_n) (with frame_n the number of time frames outputted). (memory layout, using the row-major convention: (mel_n, frame_n)) |
| src | pointer to the input data, of float format, monaural PCM audio data. |
| sample_n | count of samples in the input data |
| sample_rate | sampling rate of the input signal |
| fft_n | number of FFT components |
| hop_n | stride of each window shift (in number of samples). This is the size of the time increment for the spectrogram. |
| win_n | size of the window function (in number of samples) |
| win_type | type of the window function: any of the AILIA_AUDIO_WIN_TYPE_* constants |
| max_frame_n | maximum value of the time frame index in the outputted data |
| center | whether to pad or not (and the type of padding) before and after the input data: any of the AILIA_AUDIO_STFT_CENTER_* constants |
| power | exponent to apply to the spectrogram (> 0.0). 1.0: amplitude spectrogram, 2.0: power spectrogram, etc, any other positive exponent value is allowed. |
| fft_norm_type | normalization after the FFT: any of the AILIA_AUDIO_FFT_NORMALIZE_* constants |
| f_min | lowest frequency |
| f_max | highest frequency |
| mel_n | number of mel frequency bins in the output (< freq_n) |
| mel_norm_type | whether to normalize the mel spectrogram (and the type of the normalization): any of the AILIA_AUDIO_MEL_NORMALIZE_* constants |
| mel_formula | mel scale format: any of the AILIA_AUDIO_MEL_SCALE_FORMULA_* constants |
For each time frame, the operations are processed in this order: FFT(STFT) -> normalization -> power exponentiation -> get the mel filter-bank coefficients -> convert to the mel scale. The output is real values, and its length is mel_n*frame_n (with frame_n the number of time frames outputted).
| int AILIA_API ailiaAudioGetNonSilentPos | ( | int * | dst_start_pos, |
| int * | dst_length, | ||
| const void * | src, | ||
| int | sample_n, | ||
| int | win_n, | ||
| int | hop_n, | ||
| float | thr_db | ||
| ) |
Find the region of the signal between the first and the last non-silence samples. Detects the area excluding the silent range before and after the signal input.
| dst_start_pos | pointer to the destination where to write the outputted start position of the non-silence area, of int format |
| dst_length | pointer to the destination where to write the outputted length of the non-silence area, of int format |
| src | pointer to the input data, of float format, and of length sample_n |
| sample_n | count of samples in the input data |
| win_n | size of the window function |
| hop_n | stride of each window shift (in number of samples) |
| thr_db | threshold (in dB) above which the signal is considered non-silence (thr_db > 0) |
In case the whole signal is considered silence, the following happens: *dst_start_pos = -1, *dst_length = 0
| int AILIA_API ailiaAudioGetResampleLen | ( | int * | dst_sample_n, |
| int | dst_sample_rate, | ||
| int | src_sample_n, | ||
| int | src_sample_rate | ||
| ) |
Get the number of samples after the resampling.
| dst_sample_n | pointer to the destination where to write the output (the number of samples after resampling) |
| dst_sample_rate | sampling rate after the resampling |
| src_sample_n | number of samples in the input signal |
| src_sample_rate | sampling rate of the input signal |
| int AILIA_API ailiaAudioGetSampleLen | ( | int * | sample_n, |
| int | frame_n, | ||
| int | freq_n, | ||
| int | hop_n, | ||
| int | center | ||
| ) |
Get the number of samples generated by the ISTFT.
| sample_n | pointer to the destination where to write the output (the number of samples) |
| frame_n | length of the STFT data, expressed in number of frames |
| freq_n | number of frequency bins at each frame (freq_n = fft_n/2+1) |
| hop_n | stride of each window shift (in number of samples). This is the quantum of time for the time axis of the STFT output. |
| center | any of the AILIA_AUDIO_STFT_CENTER_* constants |
Before executing the ISTFT, use this function to determine the space required for the output buffer. If AILIA_AUDIO_STFT_CENTER_NONE is used, no truncation is performed at the beginning nor at the end. If AILIA_AUDIO_STFT_CENTER_NONE is not used, a truncation is performed at the beginning and at the end.
| int AILIA_API ailiaAudioGetSpectrogram | ( | void * | dst, |
| const void * | src, | ||
| int | sample_n, | ||
| int | fft_n, | ||
| int | hop_n, | ||
| int | win_n, | ||
| int | win_type, | ||
| int | max_frame_n, | ||
| int | center, | ||
| float | power, | ||
| int | norm_type | ||
| ) |
Generate the spectrogram from the audio signal.
| dst | pointer to the output data, of float format, of length (2 * freq_n * frame_n), and which memory layout is a sequence of pairs [real part, imaginary part]. (where freq_n = fft_n/2+1). Memory layout, using the row-major convention: (freq_n, frame_n, 2). |
| src | pointer to the input data, of float format, and of length sample_n |
| sample_n | count of samples in the input data |
| fft_n | size of the FFT at each frame (i.e. number of frequency bins at each frame) |
| hop_n | stride of each window shift (in number of samples). This is the size of the time increment for the spectrogram. |
| win_n | size of the window function |
| win_type | type of the window function: any of the AILIA_AUDIO_WIN_TYPE_* constants |
| max_frame_n | maximum value of the time frame index in the outputted data |
| center | whether to pad or not (and the type of padding) before and after the input data: any of the AILIA_AUDIO_STFT_CENTER_* constants |
| power | exponent to apply to the spectrogram (>= 0.0). A special case is for 0.0: complex spectrogram. For other cases the amplitude is just exponentiated accordingly: 1.0: amplitude spectrogram, 2.0: power spectrogram, etc, any other positive exponent value is allowed. |
| norm_type | normalization after the FFT: any of the AILIA_AUDIO_FFT_NORMALIZE_* constants |
For each time frame, the operations are processed in this order: FFT -> normalization -> power exponentiation. As the output data alternates real and imaginary parts, its length is 2*(fft_n/2+1)*frame_n (where frame_n is the number of time frames outputted). When the power argument is a non-zero value, all the imaginary parts are set to 0 in the output.
| int AILIA_API ailiaAudioGetWindow | ( | void * | dst, |
| int | window_n, | ||
| int | win_type | ||
| ) |
Get the window function.
| dst | pointer to the output data, of float format, and of length window_n |
| window_n | length of the window (in number of samples) |
| win_type | type of the window function: any of the AILIA_AUDIO_WIN_TYPE_* constants |
Only the Hann and the Hamming window functions are supported.
| int AILIA_API ailiaAudioIFFT | ( | void * | dst, |
| const void * | src, | ||
| int | fft_n | ||
| ) |
Execute the IFFT.
| dst | pointer to the output data, of float format, of length 2*fft_n, and which memory layout is a sequence of fft_n pairs [real part, imaginary part]. Memory layout, using the row-major convention: (fft_n, 2). |
| src | pointer to the input data, of float format, of length 2*fft_n, and which memory layout is a sequence of fft_n pairs [real part, imaginary part]. Memory layout, using the row-major convention: (fft_n, 2). |
| fft_n | count of FFT values (i.e. of frequency bins) |
If fft_n is a power of 2, this function uses a faster algorithm. As the output data alternates real and imaginary parts, its length is 2*fft_n.
| int AILIA_API ailiaAudioLinerFilter | ( | void * | dst, |
| const void * | src, | ||
| const void * | n_coef, | ||
| const void * | d_coef, | ||
| void * | zi, | ||
| int | dst_n, | ||
| int | src_n, | ||
| int | n_coef_n, | ||
| int | d_coef_n, | ||
| int | zi_n | ||
| ) |
Apply a filter to the signal.
| dst | pointer to the output data, of float format, and of length dst_n |
| src | pointer to the input data, of float format, and of length src_n |
| n_coef | pointer to the numerator coefficients of the filter, of float format, and length n_coef_n |
| d_coef | pointer to the denominator coefficients of the filter, of float format, and length d_coef_n |
| zi | pointer to the initial delayed values to be used, of float format, and of length zi_n (zi_n = max(n_coef_n,d_coef_n)-1). nullptr is allowed. |
| dst_n | size, in number of samples, reserved in the output buffer (dst_n >= src_n) |
| src_n | number of samples in the input signal |
| n_coef_n | number of numerator coefficients of the filter |
| d_coef_n | number of denominator coefficients of the filter |
| zi_n | number of initial delayed values provided (zi_n >= max(n_coef_n,d_coef_n)-1) |
The number of samples outputted to dst is min(dst_n,src_n). Use zi to provide the initial delayed values. During processing, this array is overwritten with the new delayed values. Out of the zi_n, the number of delayed values used is max(n_coef_n,d_coef_n)-1. If there are fewer than that, the remaining ones are assumed to be zeros, and the array zi is not updated with the new values. When zi is nullptr, zi_n is ignored, all the delayed values are assumed to be zero, and the new delayed values are not returned. The largest of n_coef_n and d_coef_n is taken as reference and zeros are added for padding where necessary.
| int AILIA_API ailiaAudioLog1p | ( | void * | dst, |
| const void * | src, | ||
| int | src_n | ||
| ) |
Convert the input values to a logarithmic scale.
| dst | pointer to the output data, of float format, and of length src_n |
| src | pointer to the input data, of float format, and of length src_n |
| src_n | number of elements to be calculated |
dst = log_e(1.0 + src)
| int AILIA_API ailiaAudioMagPhase | ( | void * | dst_mag, |
| void * | dst_phase, | ||
| const void * | src, | ||
| int | freq_n, | ||
| int | frame_n, | ||
| float | power, | ||
| int | phase_form | ||
| ) |
Get the amplitude and the phase from the spectrogram.
| dst_mag | pointer to the outputted amplitudes, an array of length (freq_n * frame_n). (memory layout, using the row-major convention: (freq_n, frame_n)) |
| dst_phase | pointer to the outputted phases, an array of length (2 * freq_n * frame_n) (sequence of complex pairs [real part, imaginary part]). (memory layout, using the row-major convention: (freq_n, frame_n, 2)) |
| src | pointer to the input data, of length (2 * frame_n * freq_n) (a sequence of complex pairs [real, imaginary]). (memory layout, using the row-major convention: (frame_n, freq_n, 2)) |
| freq_n | number of frequency indices |
| frame_n | number of time frames |
| power | exponent to apply to the spectrogram (> 0.0). 1.0: amplitude spectrogram, 2.0: power spectrogram, etc, any other positive exponent value is allowed. |
| phase_form | format of the outputted phase: any of the AILIA_AUDIO_PHASE_FORM_* constants |
To be compatible with librosa, use: phase_form = AILIA_AUDIO_PHASE_FORM_COMPLEX , power = 1.0 To be compatible with PyTorch, use: phase_form = AILIA_AUDIO_PHASE_FORM_REAL , power = 1.0 The dst_phase output depends on phase_form:
| int AILIA_API ailiaAudioResample | ( | void * | dst, |
| const void * | src, | ||
| int | dst_sample_rate, | ||
| int | dst_n, | ||
| int | src_sample_rate, | ||
| int | src_n | ||
| ) |
Resample the signal.
| dst | pointer to the output data, of float format, and of length dst_n |
| src | pointer to the input data, of float format, and of length src_n |
| dst_sample_rate | sampling rate after the resampling |
| dst_n | length (in number of samples) reserved in the output buffer(dst_n >= max_resample_n) |
| src_sample_rate | sampling rate of the input signal |
| src_n | number of samples in the input signal |
The max number of samples in the output, max_resample_n, can be obtained from ailiaAudioGetResampleLen() . If dst_n < max_resample_n : only the first dst_n samples are outputted. If dst_n >= max_resample_n : max_resample_n samples are outputted.
| int AILIA_API ailiaAudioStandardize | ( | void * | dst, |
| const void * | src, | ||
| const int | src_n | ||
| ) |
Standardize a real signal.
| dst | pointer to the output data, of float format, and of length src_n |
| src | pointer to the input data, of float format, and of length src_n |
| src_n | length of the input data |
Standardize the input data so that its average value becomes 0 and its variance 1. dst = (src - mean(src)) / std(src)