ailia
1.5.0.0
|
Public Member Functions | |
static int | ailiaAudioLog1p (float[] dst, float[] src, int src_n) |
Convert the input values to a logarithmic scale. More... | |
static int | ailiaAudioConvertPowerToDB (float[] dst, float[] src, int src_n, float top_db) |
Convert non-negative input values to decibel scale. More... | |
static int | ailiaAudioGetFrameLen (ref Int32 frame_n, int sample_n, int fft_n, int hop_n, int center) |
Get the number of frames generated by the STFT. More... | |
static int | ailiaAudioGetSampleLen (ref Int32 sample_n, int frame_n, int freq_n, int hop_n, int center) |
Get the number of samples generated by the ISTFT. More... | |
static int | ailiaAudioGetWindow (float[] dst, int window_n, int win_type) |
Get the window function. More... | |
static int | ailiaAudioFFT (float[] dst, float[] src, int fft_n) |
Execute the FFT. More... | |
static int | ailiaAudioIFFT (float[] dst, float[] src, int fft_n) |
Execute the IFFT. More... | |
static int | ailiaAudioGetSpectrogram (float[] dst, float[] src, int sample_n, int fft_n, int hop_n, int win_n, int win_type, int max_frame_n, int center, float power, int norm_type) |
Generate the spectrogram from the audio signal. More... | |
static int | ailiaAudioGetInverseSpectrogram (float[] dst, float[] src, int frame_n, int freq_n, int hop_n, int win_n, int win_type, int max_sample_n, int center, int norm_type) |
Generate an audio signal from a complex spectrogram. More... | |
static int | ailiaAudioGetFBMatrix (float[] dst, int freq_n, float f_min, float f_max, int mel_n, int sample_rate, int mel_norm, int mel_formula) |
Create a mel filter-bank. More... | |
static int | ailiaAudioGetMelSpectrogram (float[] dst, float[] src, int sample_n, int sample_rate, int fft_n, int hop_n, int win_n, int win_type, int max_frame_n, int center, float power, int fft_norm_type, float f_min, float f_max, int mel_n, int mel_norm_type, int mel_formula) |
Generate the mel spectrogram from the audio signal. More... | |
static int | ailiaAudioMagPhase (float[] dst_mag, float[] dst_phase, float[] src, int freq_n, int frame_n, float power, int phase_form) |
Get the amplitude and the phase from the spectrogram. More... | |
static int | ailiaAudioStandardize (float[] dst, float[] src, int src_n) |
Standardize a real signal. More... | |
static int | ailiaAudioComplexNorm (float[] dst, float[] src, int src_n, float power) |
Get the norm of the complex signal. More... | |
static int | ailiaAudioConvertToMel (float[] dst, float[] src, float[] fb_mtrx, int freq_n, int frame_n, int mel_n) |
Convert the real output of the STFT to the mel scale. More... | |
static int | ailiaAudioFixFrameLen (float[] dst, float[] src, int freq_n, int dst_frame_n, int src_frame_n, float pad_data) |
Fix the number of time frames of a real-valued spectrogram/mel-spectrogram. More... | |
static int | ailiaAudioResample (float[] dst, float[] src, int dst_sample_rate, int dst_n, int src_sample_rate, int src_n) |
Resample the signal. More... | |
static int | ailiaAudioGetResampleLen (ref Int32 dst_sample_n, int dst_sample_rate, int src_sample_n, int src_sample_rate) |
Get the number of samples after the resampling. More... | |
static int | ailiaAudioLinerFilter (float[] dst, float[] src, float[] n_coef, float[] d_coef, float[] zi, int dst_n, int src_n, int n_coef_n, int d_coef_n, int zi_n) |
Apply a filter to the signal. More... | |
static int | ailiaAudioGetLinerFilterZiCoef (float[] dst_zi, float[] n_coef, float[] d_coef, int dst_n, int n_coef_n, int d_coef_n) |
Calculate the initial delay coefficients for filtering. More... | |
static int | ailiaAudioFilterFilter (float[] dst, float[] src, float[] n_coef, float[] d_coef, int dst_n, int src_n, int n_coef_n, int d_coef_n, int pad_type, int pad_len) |
Apply a zero-phase filter to the signal. More... | |
static int | ailiaAudioGetNonSilentPos (ref Int32 dst_start_pos, ref Int32 dst_length, float[] src, int sample_n, int win_n, int hop_n, float thr_db) |
Find the region of the signal between the first and the last non-silence samples. Detects the area excluding the silent range before and after the signal input. More... | |
static int ailiaAudio.AiliaAudio.ailiaAudioComplexNorm | ( | float[] | dst, |
float[] | src, | ||
int | src_n, | ||
float | power | ||
) |
Get the norm of the complex signal.
dst | pointer to the output data, of float format, and of length src_n |
src | pointer to the input data, of float format, an array of length (2 * src_n) (sequence of complex pairs [real part, imaginary part]). (memory layout, using the row-major convention: (src_n, 2)) |
src_n | length of the input data |
power | exponent to apply to the spectrogram (> 0.0). 1.0: amplitude spectrogram |
Compute the norm of the input data. For each complex value src_cmp = src[0] + i * src[1]: tmp_dst = pow(src[0],2.0) + pow(src[1],2.0); dst[0] = pow(tmp_dst,0.5*power);
static int ailiaAudio.AiliaAudio.ailiaAudioConvertPowerToDB | ( | float[] | dst, |
float[] | src, | ||
int | src_n, | ||
float | top_db | ||
) |
Convert non-negative input values to decibel scale.
dst | pointer to the output data, of float format, and of length src_n |
src | pointer to the input data, of float format, and of length src_n |
src_n | number of elements to be calculated |
top_db | float >= 0.0 |
Output compatible with librosa.power_to_db. dst = trimlow( 10 * log10(src / ref) ), where ref is the max of 1e-10 and of positive values of src, and trimlow(), if top_db > 0, trims all values lower than (- top_db) and replaces them by (- top_db); otherwise, trimlow() does nothing.
static int ailiaAudio.AiliaAudio.ailiaAudioConvertToMel | ( | float[] | dst, |
float[] | src, | ||
float[] | fb_mtrx, | ||
int | freq_n, | ||
int | frame_n, | ||
int | mel_n | ||
) |
Convert the real output of the STFT to the mel scale.
dst | pointer to the output data, of float format, of length (mel_n * frame_n), and of memory layout (in row-major convention) (mel_n, frame_n). |
src | pointer to the input data, of float format, of length (freq_n * frame_n), and of memory layout (in row-major convention) (freq_n, frame_n). |
fb_mtrx | the mel filter-bank, of float format, of length (mel_n * freq_n), and of memory layout (in row-major convention) (mel_n, freq_n). |
freq_n | number of frequency indices |
frame_n | number of time frames in the input data |
mel_n | number of mel frequency indices |
Converts the real spectrogram given in input to the mel scale. The argument fb_mtrx can take the coefficients outputted by ailiaAudioGetFBMatrix() .
static int ailiaAudio.AiliaAudio.ailiaAudioFFT | ( | float[] | dst, |
float[] | src, | ||
int | fft_n | ||
) |
Execute the FFT.
dst | pointer to the output data, of float format, of length 2*fft_n, and which memory layout is a sequence of fft_n pairs [real part, imaginary part]. Memory layout, using the row-major convention: (fft_n, 2). |
src | pointer to the input data, of float format, and of length fft_n |
fft_n | count of FFT values (i.e. of frequency bins) |
If fft_n is a power of 2, this function uses a faster algorithm. As the output data alternates real and imaginary parts, its length is 2*fft_n.
static int ailiaAudio.AiliaAudio.ailiaAudioFilterFilter | ( | float[] | dst, |
float[] | src, | ||
float[] | n_coef, | ||
float[] | d_coef, | ||
int | dst_n, | ||
int | src_n, | ||
int | n_coef_n, | ||
int | d_coef_n, | ||
int | pad_type, | ||
int | pad_len | ||
) |
Apply a zero-phase filter to the signal.
dst | pointer to the output data, of float format, and of length dst_n |
src | pointer to the input data, of float format, and of length src_n |
n_coef | pointer to the numerator coefficients of the filter, of float format, and length n_coef_n |
d_coef | pointer to the denominator coefficients of the filter, of float format, and length d_coef_n |
dst_n | length (in number of samples) reserved in the output buffer (dst_n >= src_n) |
src_n | number of samples in the input signal |
n_coef_n | number of numerator coefficients of the filter |
d_coef_n | number of denominator coefficients of the filter |
pad_type | type of padding to apply at the start and at the end of the input signal: any of the AILIA_AUDIO_FILTFILT_PAD_* constants |
pad_len | length of the padding applied to the start and to the end of the input signal |
The number of values written to the output dst is min(dst_n,src_n). The largest of n_coef_n and d_coef_n is taken as reference and zeros are added for padding where necessary.
static int ailiaAudio.AiliaAudio.ailiaAudioFixFrameLen | ( | float[] | dst, |
float[] | src, | ||
int | freq_n, | ||
int | dst_frame_n, | ||
int | src_frame_n, | ||
float | pad_data | ||
) |
Fix the number of time frames of a real-valued spectrogram/mel-spectrogram.
dst | pointer to the output data, of length (freq_n * dst_frame_n), and of memory layout (in row-major convention) (freq_n, dst_frame_n). |
src | pointer to the input data, of length (freq_n * src_frame_n), and of memory layout (in row-major convention) (freq_n, src_frame_n). |
freq_n | number of frequency indices |
dst_frame_n | number of time frames in the output data |
src_frame_n | number of time frames in the input data |
pad_data | value inserted for padding (used when dst_frame_n > src_frame_n) |
dst_frame_n > src_frame_n : missing time frames are added and filled with the value pad_data. dst_frame_n <= src_frame_n : only keeps the first dst_frame_n data.
static int ailiaAudio.AiliaAudio.ailiaAudioGetFBMatrix | ( | float[] | dst, |
int | freq_n, | ||
float | f_min, | ||
float | f_max, | ||
int | mel_n, | ||
int | sample_rate, | ||
int | mel_norm, | ||
int | mel_formula | ||
) |
Create a mel filter-bank.
dst | pointer to the output data, of float format, and of length (mel_n * freq_n). (memory layout, using the row-major convention: (mel_n, freq_n)) |
freq_n | number of frequency indices for the FFT (1+fft_n/2) |
f_min | lowest frequency |
f_max | highest frequency |
mel_n | number of mel frequency bins in the output (< freq_n) |
sample_rate | sampling rate for the signal that will be inputted to this filter |
mel_norm | whether to normalize the output (and the type of the normalization): any of the AILIA_AUDIO_MEL_NORMALIZE_* constants |
mel_formula | mel scale format: any of the AILIA_AUDIO_MEL_SCALE_FORMULA_* constants |
static int ailiaAudio.AiliaAudio.ailiaAudioGetFrameLen | ( | ref Int32 | frame_n, |
int | sample_n, | ||
int | fft_n, | ||
int | hop_n, | ||
int | center | ||
) |
Get the number of frames generated by the STFT.
frame_n | pointer to the destination where to write the output (the number of frames) |
sample_n | count of samples on which the STFT is performed |
fft_n | size of the FFT at each frame (i.e. number of frequency bins at each frame) |
hop_n | stride of each window shift (in number of samples). This is the quantum of time for the time axis of the STFT output. |
center | any of the AILIA_AUDIO_STFT_CENTER_* constants |
Before executing the STFT, use this function to determine the space required for the output buffer. If AILIA_AUDIO_STFT_CENTER_NONE is used, the sample_n samples are cut in packets of size hop_n, and no padding occurs before the first sample nor after the last sample. If AILIA_AUDIO_STFT_CENTER_ENABLE is used, a reflection padding of length fft_n/2 is performed before the first sample and after the last sample. If AILIA_AUDIO_STFT_CENTER_SCIPY_LIKE is used, a zero padding of length fft_n/2 is performed before the first sample and after the last sample, and moreover an additional zero padding is performed to ensure that the total length is a multiple of hop_n.
static int ailiaAudio.AiliaAudio.ailiaAudioGetInverseSpectrogram | ( | float[] | dst, |
float[] | src, | ||
int | frame_n, | ||
int | freq_n, | ||
int | hop_n, | ||
int | win_n, | ||
int | win_type, | ||
int | max_sample_n, | ||
int | center, | ||
int | norm_type | ||
) |
Generate an audio signal from a complex spectrogram.
dst | pointer to the output data, of float format, and of length sample_n |
src | pointer to the input data, of float format, of length (2 * freq_n * frame_n), and which memory layout is a sequence of pairs [real part, imaginary part]. Memory layout, using the row-major convention: (freq_n, frame_n, 2). |
frame_n | number of time frames in the input data |
freq_n | number of frequencies bins for each time frame (freq_n = fft_n/2+1) |
hop_n | step size of the time frame increment (expressed in number of samples) for the inputted spectrogram. |
win_n | size of the window function |
win_type | type of the window function: any of the AILIA_AUDIO_WIN_TYPE_* constants |
max_sample_n | maximum value of the sample index in the outputted data |
center | whether padding (before and after) was used or not (and its type) during the generation of the input data: any of the AILIA_AUDIO_STFT_CENTER_* constants |
norm_type | normalization type that was used during the generation of the input data: any of the AILIA_AUDIO_FFT_NORMALIZE_* constants |
For each time frame the normalization is executed at the end of the IFFT. Only accepts a complex spectrogram in input.
static int ailiaAudio.AiliaAudio.ailiaAudioGetLinerFilterZiCoef | ( | float[] | dst_zi, |
float[] | n_coef, | ||
float[] | d_coef, | ||
int | dst_n, | ||
int | n_coef_n, | ||
int | d_coef_n | ||
) |
Calculate the initial delay coefficients for filtering.
dst_zi | pointer to the output (initial delay coefficients), of float format, and of length dst_n (dst_n >= max(n_coef_n,d_coef_n)-1) |
n_coef | pointer to the numerator coefficients of the filter, of float format, and length n_coef_n |
d_coef | pointer to the denominator coefficients of the filter, of float format, and length d_coef_n |
dst_n | size, in number of samples, reserved in the output buffer (dst_n >= max(n_coef_n,d_coef_n)-1) |
n_coef_n | number of numerator coefficients of the filter |
d_coef_n | number of denominator coefficients of the filter |
These initial delay coefficients dst_zi, once multiplied with the early values of the signal, can be passed as initial delayed values, the zi argument, to ailiaAudioLinerFilter() . Of the dst_n reserved length of the output buffer, the length used is max(n_coef_n,d_coef_n)-1. If dst_n is less than that, only the corresponding first values are output. If dst_n is larger, the remaining is filled with 0. The largest of n_coef_n and d_coef_n is taken as reference and zeros are added for padding where necessary.
static int ailiaAudio.AiliaAudio.ailiaAudioGetMelSpectrogram | ( | float[] | dst, |
float[] | src, | ||
int | sample_n, | ||
int | sample_rate, | ||
int | fft_n, | ||
int | hop_n, | ||
int | win_n, | ||
int | win_type, | ||
int | max_frame_n, | ||
int | center, | ||
float | power, | ||
int | fft_norm_type, | ||
float | f_min, | ||
float | f_max, | ||
int | mel_n, | ||
int | mel_norm_type, | ||
int | mel_formula | ||
) |
Generate the mel spectrogram from the audio signal.
dst | pointer to the output data, of float format, and of length (mel_n * frame_n) (with frame_n the number of time frames outputted). (memory layout, using the row-major convention: (mel_n, frame_n)) |
src | pointer to the input data, of float format, monaural PCM audio data. |
sample_n | count of samples in the input data |
sample_rate | sampling rate of the input signal |
fft_n | number of FFT components |
hop_n | stride of each window shift (in number of samples). This is the size of the time increment for the spectrogram. |
win_n | size of the window function (in number of samples) |
win_type | type of the window function: any of the AILIA_AUDIO_WIN_TYPE_* constants |
max_frame_n | maximum value of the time frame index in the outputted data |
center | whether to pad or not (and the type of padding) before and after the input data: any of the AILIA_AUDIO_STFT_CENTER_* constants |
power | exponent to apply to the spectrogram (> 0.0). 1.0: amplitude spectrogram, 2.0: power spectrogram, etc, any other positive exponent value is allowed. |
fft_norm_type | normalization after the FFT: any of the AILIA_AUDIO_FFT_NORMALIZE_* constants |
f_min | lowest frequency |
f_max | highest frequency |
mel_n | number of mel frequency bins in the output (< freq_n) |
mel_norm_type | whether to normalize the mel spectrogram (and the type of the normalization): any of the AILIA_AUDIO_MEL_NORMALIZE_* constants |
mel_formula | mel scale format: any of the AILIA_AUDIO_MEL_SCALE_FORMULA_* constants |
For each time frame, the operations are processed in this order: FFT(STFT) -> normalization -> power exponentiation -> get the mel filter-bank coefficients -> convert to the mel scale. The output is real values, and its length is mel_n*frame_n (with frame_n the number of time frames outputted).
static int ailiaAudio.AiliaAudio.ailiaAudioGetNonSilentPos | ( | ref Int32 | dst_start_pos, |
ref Int32 | dst_length, | ||
float[] | src, | ||
int | sample_n, | ||
int | win_n, | ||
int | hop_n, | ||
float | thr_db | ||
) |
Find the region of the signal between the first and the last non-silence samples. Detects the area excluding the silent range before and after the signal input.
dst_start_pos | pointer to the destination where to write the outputted start position of the non-silence area, of int format |
dst_length | pointer to the destination where to write the outputted length of the non-silence area, of int format |
src | pointer to the input data, of float format, and of length sample_n |
sample_n | count of samples in the input data |
win_n | size of the window function |
hop_n | stride of each window shift (in number of samples) |
thr_db | threshold (in dB) above which the signal is considered non-silence (thr_db > 0) |
In case the whole signal is considered silence, the following happens: *dst_start_pos = -1, *dst_length = 0
static int ailiaAudio.AiliaAudio.ailiaAudioGetResampleLen | ( | ref Int32 | dst_sample_n, |
int | dst_sample_rate, | ||
int | src_sample_n, | ||
int | src_sample_rate | ||
) |
Get the number of samples after the resampling.
dst_sample_n | pointer to the destination where to write the output (the number of samples after resampling) |
dst_sample_rate | sampling rate after the resampling |
src_sample_n | number of samples in the input signal |
src_sample_rate | sampling rate of the input signal |
static int ailiaAudio.AiliaAudio.ailiaAudioGetSampleLen | ( | ref Int32 | sample_n, |
int | frame_n, | ||
int | freq_n, | ||
int | hop_n, | ||
int | center | ||
) |
Get the number of samples generated by the ISTFT.
sample_n | pointer to the destination where to write the output (the number of samples) |
frame_n | length of the STFT data, expressed in number of frames |
freq_n | number of frequency bins at each frame (freq_n = fft_n/2+1) |
hop_n | stride of each window shift (in number of samples). This is the quantum of time for the time axis of the STFT output. |
center | any of the AILIA_AUDIO_STFT_CENTER_* constants |
Before executing the ISTFT, use this function to determine the space required for the output buffer. If AILIA_AUDIO_STFT_CENTER_NONE is used, no truncation is performed at the beginning nor at the end. If AILIA_AUDIO_STFT_CENTER_NONE is not used, a truncation is performed at the beginning and at the end.
static int ailiaAudio.AiliaAudio.ailiaAudioGetSpectrogram | ( | float[] | dst, |
float[] | src, | ||
int | sample_n, | ||
int | fft_n, | ||
int | hop_n, | ||
int | win_n, | ||
int | win_type, | ||
int | max_frame_n, | ||
int | center, | ||
float | power, | ||
int | norm_type | ||
) |
Generate the spectrogram from the audio signal.
dst | pointer to the output data, of float format, of length (2 * freq_n * frame_n), and which memory layout is a sequence of pairs [real part, imaginary part]. (where freq_n = fft_n/2+1). Memory layout, using the row-major convention: (freq_n, frame_n, 2). |
src | pointer to the input data, of float format, and of length sample_n |
sample_n | count of samples in the input data |
fft_n | size of the FFT at each frame (i.e. number of frequency bins at each frame) |
hop_n | stride of each window shift (in number of samples). This is the size of the time increment for the spectrogram. |
win_n | size of the window function |
win_type | type of the window function: any of the AILIA_AUDIO_WIN_TYPE_* constants |
max_frame_n | maximum value of the time frame index in the outputted data |
center | whether to pad or not (and the type of padding) before and after the input data: any of the AILIA_AUDIO_STFT_CENTER_* constants |
power | exponent to apply to the spectrogram (>= 0.0). A special case is for 0.0: complex spectrogram. For other cases the amplitude is just exponentiated accordingly: 1.0: amplitude spectrogram, 2.0: power spectrogram, etc, any other positive exponent value is allowed. |
norm_type | normalization after the FFT: any of the AILIA_AUDIO_FFT_NORMALIZE_* constants |
For each time frame, the operations are processed in this order: FFT -> normalization -> power exponentiation. As the output data alternates real and imaginary parts, its length is 2*(fft_n/2+1)*frame_n (where frame_n is the number of time frames outputted). When the power argument is a non-zero value, all the imaginary parts are set to 0 in the output.
static int ailiaAudio.AiliaAudio.ailiaAudioGetWindow | ( | float[] | dst, |
int | window_n, | ||
int | win_type | ||
) |
Get the window function.
dst | pointer to the output data, of float format, and of length window_n |
window_n | length of the window (in number of samples) |
win_type | type of the window function: any of the AILIA_AUDIO_WIN_TYPE_* constants |
Only the Hann and the Hamming window functions are supported.
static int ailiaAudio.AiliaAudio.ailiaAudioIFFT | ( | float[] | dst, |
float[] | src, | ||
int | fft_n | ||
) |
Execute the IFFT.
dst | pointer to the output data, of float format, of length 2*fft_n, and which memory layout is a sequence of fft_n pairs [real part, imaginary part]. Memory layout, using the row-major convention: (fft_n, 2). |
src | pointer to the input data, of float format, of length 2*fft_n, and which memory layout is a sequence of fft_n pairs [real part, imaginary part]. Memory layout, using the row-major convention: (fft_n, 2). |
fft_n | count of FFT values (i.e. of frequency bins) |
If fft_n is a power of 2, this function uses a faster algorithm. As the output data alternates real and imaginary parts, its length is 2*fft_n.
static int ailiaAudio.AiliaAudio.ailiaAudioLinerFilter | ( | float[] | dst, |
float[] | src, | ||
float[] | n_coef, | ||
float[] | d_coef, | ||
float[] | zi, | ||
int | dst_n, | ||
int | src_n, | ||
int | n_coef_n, | ||
int | d_coef_n, | ||
int | zi_n | ||
) |
Apply a filter to the signal.
dst | pointer to the output data, of float format, and of length dst_n |
src | pointer to the input data, of float format, and of length src_n |
n_coef | pointer to the numerator coefficients of the filter, of float format, and length n_coef_n |
d_coef | pointer to the denominator coefficients of the filter, of float format, and length d_coef_n |
zi | pointer to the initial delayed values to be used, of float format, and of length zi_n (zi_n = max(n_coef_n,d_coef_n)-1). nullptr is allowed. |
dst_n | size, in number of samples, reserved in the output buffer (dst_n >= src_n) |
src_n | number of samples in the input signal |
n_coef_n | number of numerator coefficients of the filter |
d_coef_n | number of denominator coefficients of the filter |
zi_n | number of initial delayed values provided (zi_n >= max(n_coef_n,d_coef_n)-1) |
The number of samples outputted to dst is min(dst_n,src_n). Use zi to provide the initial delayed values. During processing, this array is overwritten with the new delayed values. Out of the zi_n, the number of delayed values used is max(n_coef_n,d_coef_n)-1. If there are fewer than that, the remaining ones are assumed to be zeros, and the array zi is not updated with the new values. When zi is nullptr, zi_n is ignored, all the delayed values are assumed to be zero, and the new delayed values are not returned. The largest of n_coef_n and d_coef_n is taken as reference and zeros are added for padding where necessary.
static int ailiaAudio.AiliaAudio.ailiaAudioLog1p | ( | float[] | dst, |
float[] | src, | ||
int | src_n | ||
) |
Convert the input values to a logarithmic scale.
dst | pointer to the output data, of float format, and of length src_n |
src | pointer to the input data, of float format, and of length src_n |
src_n | number of elements to be calculated |
dst = log_e(1.0 + src)
static int ailiaAudio.AiliaAudio.ailiaAudioMagPhase | ( | float[] | dst_mag, |
float[] | dst_phase, | ||
float[] | src, | ||
int | freq_n, | ||
int | frame_n, | ||
float | power, | ||
int | phase_form | ||
) |
Get the amplitude and the phase from the spectrogram.
dst_mag | pointer to the outputted amplitudes, an array of length (freq_n * frame_n). (memory layout, using the row-major convention: (freq_n, frame_n)) |
dst_phase | pointer to the outputted phases, an array of length (2 * freq_n * frame_n) (sequence of complex pairs [real part, imaginary part]). (memory layout, using the row-major convention: (freq_n, frame_n, 2)) |
src | pointer to the input data, of length (2 * frame_n * freq_n) (a sequence of complex pairs [real, imaginary]). (memory layout, using the row-major convention: (frame_n, freq_n, 2)) |
freq_n | number of frequency indices |
frame_n | number of time frames |
power | exponent to apply to the spectrogram (> 0.0). 1.0: amplitude spectrogram, 2.0: power spectrogram, etc, any other positive exponent value is allowed. |
phase_form | format of the outputted phase: any of the AILIA_AUDIO_PHASE_FORM_* constants |
To be compatible with librosa, use: phase_form = AILIA_AUDIO_PHASE_FORM_COMPLEX , power = 1.0. To be compatible with PyTorch, use: phase_form = AILIA_AUDIO_PHASE_FORM_REAL , power = 1.0. The dst_phase output depends on phase_form:
static int ailiaAudio.AiliaAudio.ailiaAudioResample | ( | float[] | dst, |
float[] | src, | ||
int | dst_sample_rate, | ||
int | dst_n, | ||
int | src_sample_rate, | ||
int | src_n | ||
) |
Resample the signal.
dst | pointer to the output data, of float format, and of length dst_n |
src | pointer to the input data, of float format, and of length src_n |
dst_sample_rate | sampling rate after the resampling |
dst_n | length (in number of samples) reserved in the output buffer (dst_n >= max_resample_n) |
src_sample_rate | sampling rate of the input signal |
src_n | number of samples in the input signal |
The max number of samples in the output, max_resample_n, can be obtained from ailiaAudioGetResampleLen() . dst_n < max_resample_n : only the first dst_n samples are outputted; dst_n >= max_resample_n : max_resample_n samples are outputted.
static int ailiaAudio.AiliaAudio.ailiaAudioStandardize | ( | float[] | dst, |
float[] | src, | ||
int | src_n | ||
) |
Standardize a real signal.
dst | pointer to the output data, of float format, and of length src_n |
src | pointer to the input data, of float format, and of length src_n |
src_n | length of the input data |
Standardize the input data so that its average value becomes 0 and its variance 1. dst = (src - mean(src)) / std(src)
|
static |
Normalize the FFT output in a way compatible with librosa
|
static |
Do not normalize the FFT output
|
static |
Normalize the FFT output in a way compatible with PyTorch
|
static |
Normalize the FFT output in a way compatible with SciPy
|
static |
During zero-phase filtering, pad using the edge value
|
static |
During zero-phase filtering, pad with an even reflection (normal reflection)
|
static |
During zero-phase filtering, do not pad
|
static |
During zero-phase filtering, pad with an odd reflection (subtract the reflected values from two times the edge value)
|
static |
Normalize the output of the mel spectrogram
|
static |
Do not normalize the output of the mel spectrogram
|
static |
Get the mel scale from the HTK formula (PyTorch compatible)
|
static |
Get the mel scale from Slaney's formula (compatible with the default of librosa)
|
static |
Output the phase in complex format (compatible with the default of librosa)
|
static |
Output the phase in real format (compatible with the default of PyTorch)
|
static |
for the STFT, insert a padding (reflect) of fft_n/2 before and after the sample_n samples
|
static |
for the STFT, do not insert padding before and after
|
static |
for the STFT, insert a padding (zeros) of fft_n/2 before and after the sample_n samples, and also pad at the end with zeros to process in units of hop_n
|
static |
use a Hamming window function
|
static |
use a Hann window function
|
static |