mirror of https://github.com/coqui-ai/TTS.git
Fix configs
This commit is contained in:
parent
0f19f8c911
commit
94e8e0d416
|
@ -12,60 +12,89 @@ class BaseAudioConfig(Coqpit):
|
||||||
Args:
|
Args:
|
||||||
fft_size (int):
|
fft_size (int):
|
||||||
Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
|
Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
|
||||||
|
|
||||||
win_length (int):
|
win_length (int):
|
||||||
Each frame of audio is windowed by a window of length ```win_length``` and then padded with zeros to match
|
Each frame of audio is windowed by a window of length ```win_length``` and then padded with zeros to match
|
||||||
```fft_size```. Defaults to 1024.
|
```fft_size```. Defaults to 1024.
|
||||||
|
|
||||||
hop_length (int):
|
hop_length (int):
|
||||||
Number of audio samples between adjacent STFT columns. Defaults to 1024.
|
Number of audio samples between adjacent STFT columns. Defaults to 1024.
|
||||||
|
|
||||||
frame_shift_ms (int):
|
frame_shift_ms (int):
|
||||||
Set ```hop_length``` based on milliseconds and sampling rate.
|
Set ```hop_length``` based on milliseconds and sampling rate.
|
||||||
|
|
||||||
frame_length_ms (int):
|
frame_length_ms (int):
|
||||||
Set ```win_length``` based on milliseconds and sampling rate.
|
Set ```win_length``` based on milliseconds and sampling rate.
|
||||||
|
|
||||||
stft_pad_mode (str):
|
stft_pad_mode (str):
|
||||||
Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
|
Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
|
||||||
|
|
||||||
sample_rate (int):
|
sample_rate (int):
|
||||||
Audio sampling rate. Defaults to 22050.
|
Audio sampling rate. Defaults to 22050.
|
||||||
|
|
||||||
resample (bool):
|
resample (bool):
|
||||||
Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
|
Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
|
||||||
|
|
||||||
preemphasis (float):
|
preemphasis (float):
|
||||||
Preemphasis coefficient. Defaults to 0.0.
|
Preemphasis coefficient. Defaults to 0.0.
|
||||||
|
|
||||||
ref_level_db (int): 20
|
ref_level_db (int): 20
|
||||||
Reference dB level used to rebase the audio signal and ignore the levels below it. 20 dB is assumed to be the sound of air.
|
Reference dB level used to rebase the audio signal and ignore the levels below it. 20 dB is assumed to be the sound of air.
|
||||||
Defaults to 20.
|
Defaults to 20.
|
||||||
|
|
||||||
do_sound_norm (bool):
|
do_sound_norm (bool):
|
||||||
Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
|
Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
|
||||||
|
|
||||||
|
log_func (str):
|
||||||
|
Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
|
||||||
|
|
||||||
do_trim_silence (bool):
|
do_trim_silence (bool):
|
||||||
Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
|
Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
|
||||||
|
|
||||||
do_amp_to_db_linear (bool, optional):
|
do_amp_to_db_linear (bool, optional):
|
||||||
enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
|
enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
|
||||||
|
|
||||||
do_amp_to_db_mel (bool, optional):
|
do_amp_to_db_mel (bool, optional):
|
||||||
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
|
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
|
||||||
|
|
||||||
trim_db (int):
|
trim_db (int):
|
||||||
Silence threshold used for silence trimming. Defaults to 45.
|
Silence threshold used for silence trimming. Defaults to 45.
|
||||||
|
|
||||||
power (float):
|
power (float):
|
||||||
Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
|
Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
|
||||||
artifacts in the synthesized voice. Defaults to 1.5.
|
artifacts in the synthesized voice. Defaults to 1.5.
|
||||||
|
|
||||||
griffin_lim_iters (int):
|
griffin_lim_iters (int):
|
||||||
Number of Griffin-Lim iterations. Defaults to 60.
|
Number of Griffin-Lim iterations. Defaults to 60.
|
||||||
|
|
||||||
num_mels (int):
|
num_mels (int):
|
||||||
Number of mel-basis filters, which defines the size of each mel-spectrogram frame. Defaults to 80.
|
Number of mel-basis filters, which defines the size of each mel-spectrogram frame. Defaults to 80.
|
||||||
|
|
||||||
mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
|
mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
|
||||||
It needs to be adjusted for a dataset. Defaults to 0.
|
It needs to be adjusted for a dataset. Defaults to 0.
|
||||||
|
|
||||||
mel_fmax (float):
|
mel_fmax (float):
|
||||||
Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
|
Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
|
||||||
|
|
||||||
spec_gain (int):
|
spec_gain (int):
|
||||||
Gain applied when converting amplitude to DB. Defaults to 20.
|
Gain applied when converting amplitude to DB. Defaults to 20.
|
||||||
|
|
||||||
signal_norm (bool):
|
signal_norm (bool):
|
||||||
enable/disable signal normalization. Defaults to True.
|
enable/disable signal normalization. Defaults to True.
|
||||||
|
|
||||||
min_level_db (int):
|
min_level_db (int):
|
||||||
minimum db threshold for the computed melspectrograms. Defaults to -100.
|
minimum db threshold for the computed melspectrograms. Defaults to -100.
|
||||||
|
|
||||||
symmetric_norm (bool):
|
symmetric_norm (bool):
|
||||||
enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
|
enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
|
||||||
[0, k], Defaults to True.
|
[0, k], Defaults to True.
|
||||||
|
|
||||||
max_norm (float):
|
max_norm (float):
|
||||||
```k``` defining the normalization range. Defaults to 4.0.
|
```k``` defining the normalization range. Defaults to 4.0.
|
||||||
|
|
||||||
clip_norm (bool):
|
clip_norm (bool):
|
||||||
enable/disable clipping of the out-of-range values in the normalized audio signal. Defaults to True.
|
enable/disable clipping of the out-of-range values in the normalized audio signal. Defaults to True.
|
||||||
|
|
||||||
stats_path (str):
|
stats_path (str):
|
||||||
Path to the computed stats file. Defaults to None.
|
Path to the computed stats file. Defaults to None.
|
||||||
"""
|
"""
|
||||||
|
@ -298,7 +327,7 @@ class BaseTrainingConfig(Coqpit):
|
||||||
keep_all_best: bool = False
|
keep_all_best: bool = False
|
||||||
keep_after: int = 10000
|
keep_after: int = 10000
|
||||||
# dataloading
|
# dataloading
|
||||||
num_loader_workers: int = None
|
num_loader_workers: int = 0
|
||||||
num_eval_loader_workers: int = 0
|
num_eval_loader_workers: int = 0
|
||||||
use_noise_augment: bool = False
|
use_noise_augment: bool = False
|
||||||
# paths
|
# paths
|
||||||
|
|
Loading…
Reference in New Issue