mirror of https://github.com/coqui-ai/TTS.git
6.5 MiB
6.5 MiB
None
<html lang="en">
<head>
</head>
</html>
In [1]:
cd /home/erogol/projects/
/home/erogol/projects
In [2]:
%matplotlib inline from TTS.utils.audio import AudioProcessor from TTS.utils.visual import plot_spectrogram from TTS.utils.generic_utils import load_config import IPython.display as ipd
/home/erogol/miniconda3/lib/python3.7/site-packages/matplotlib/__init__.py:1405: UserWarning: This call to matplotlib.use() has no effect because the backend has already been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot, or matplotlib.backends is imported for the first time. warnings.warn(_use_error_msg)
In [3]:
# Folder holding the dataset wav files. Keep the trailing slash: file names
# are appended to it with plain string concatenation further down.
data_path = "/media/erogol/data_ssd/Data/TR/wavs/"

# config.json of the trained model, parsed into CONFIG.
config_path = "/media/erogol/data_ssd/Data/models/tr/TTS-phoneme-January-14-2019_06+52PM-4ad64a7/config.json"
CONFIG = load_config(config_path)
Setup Audio Processor¶
Tune the AudioProcessor (AP) parameters below until the re-synthesized speech in the following cells sounds good.
In [67]:
# Audio settings handed to AudioProcessor as keyword arguments.
# Tune these and listen to the re-synthesized audio in the cells below.
audio = dict(
    audio_processor='audio',
    num_mels=80,             # In general, you don't need to change it.
    num_freq=1025,           # In general, you don't need to change it.
    sample_rate=22050,       # Depends on the sample rate of the dataset.
    frame_length_ms=50,      # In general, you don't need to change it.
    frame_shift_ms=12.5,     # In general, you don't need to change it.
    # In general, 0 gives better voice recovery but makes training harder.
    # If your model does not train, try 0.97 - 0.99.
    preemphasis=0.99,
    min_level_db=-100,
    # The base dB level: raise it until the background noise disappears from
    # the spectrogram, then lower it until the speech below sounds better.
    ref_level_db=40,
    # Change this value and listen to the synthesized voice.
    # 1.2 - 1.5 are some reasonable values.
    power=1.1,
    griffin_lim_iters=30,    # Gives no improvement for values > 60.
    # The normalization options are more about your model; they do not change
    # the synthesis performance.
    signal_norm=True,
    symmetric_norm=False,    # Same as above.
    max_norm=1,              # Same as above.
    clip_norm=True,          # Same as above.
    # You can play with mel_fmin / mel_fmax and check the mel-spectrogram
    # based voice synthesis below.
    mel_fmin=None,
    mel_fmax=None,
    # If your dataset has some silence at the beginning or end, this trims it.
    # Check AP.load_wav() below to see whether it changes the loaded audio.
    do_trim_silence=True,
)

AP = AudioProcessor(**audio)
> Setting up Audio Processor... | > fft size: 2048, hop length: 275, win length: 1102 | > Audio Processor attributes. | > bits:None | > sample_rate:22050 | > num_mels:80 | > min_level_db:-100 | > frame_shift_ms:12.5 | > frame_length_ms:50 | > ref_level_db:40 | > num_freq:1025 | > power:1.1 | > preemphasis:0.99 | > griffin_lim_iters:30 | > signal_norm:True | > symmetric_norm:False | > mel_fmin:0 | > mel_fmax:None | > max_norm:1.0 | > clip_norm:True | > do_trim_silence:True | > n_fft:2048 | > hop_length:275 | > win_length:1102
Check audio loading¶
In [71]:
# Load one clip of the dataset through the processor and play it back, to
# check what the loading options (e.g. do_trim_silence) do to the audio.
wav = AP.load_wav(data_path + "5_5uIl38UFVh0.wav")
ipd.Audio(wav, rate=AP.sample_rate)
Out[71]:
Your browser does not support the audio element.
Generate Mel-Spectrogram and Re-synthesis with GL¶
In [69]:
# Mel-spectrogram of the loaded clip, plus its value range as a sanity check.
mel = AP.melspectrogram(wav)
for label, value in (("Max:", mel.max()), ("Min:", mel.min()), ("Mean:", mel.mean())):
    print(label, value)
plot_spectrogram(mel.T, AP);

# Invert the mel-spectrogram back to a waveform and listen to the result.
wav_gen = AP.inv_mel_spectrogram(mel)
ipd.Audio(wav_gen, rate=AP.sample_rate)
Max: 0.6203846939023735 Min: 0.0 Mean: 0.23149373486123542
Out[69]:
Your browser does not support the audio element.
Generate Linear-Spectrogram and Re-synthesis with GL¶
In [72]:
# Linear spectrogram of the loaded clip, plus its value range as a sanity check.
spec = AP.spectrogram(wav)
for label, value in (("Max:", spec.max()), ("Min:", spec.min()), ("Mean:", spec.mean())):
    print(label, value)
plot_spectrogram(spec.T, AP);

# Invert the linear spectrogram back to a waveform and listen to the result.
wav_gen = AP.inv_spectrogram(spec)
ipd.Audio(wav_gen, rate=AP.sample_rate)
Max: 0.8925319 Min: 0.0 Mean: 0.41698578
Out[72]:
Your browser does not support the audio element.
Compare values for a certain parameter¶
Optimize your parameters by comparing several candidate values for one parameter at a time.
In [54]:
# Rebuild the processor from the `audio` settings defined in the
# "Setup Audio Processor" cell rather than keeping a second, hand-copied dict
# here: the copy silently goes stale as soon as the settings above are tuned,
# so the comparisons below would run against outdated baseline values.
# A fresh instance also gives the comparison cells a known starting state.
AP = AudioProcessor(**audio)
> Setting up Audio Processor... | > fft size: 2048, hop length: 275, win length: 1102 | > Audio Processor attributes. | > bits:None | > sample_rate:22050 | > num_mels:80 | > min_level_db:-100 | > frame_shift_ms:12.5 | > frame_length_ms:50 | > ref_level_db:40 | > num_freq:1025 | > power:1.1 | > preemphasis:0.99 | > griffin_lim_iters:30 | > signal_norm:True | > symmetric_norm:False | > mel_fmin:0 | > mel_fmax:None | > max_norm:1.0 | > clip_norm:True | > do_trim_silence:True | > n_fft:2048 | > hop_length:275 | > win_length:1102
In [55]:
from librosa import display
from matplotlib import pylab as plt
import IPython

# Large canvas: compare_values draws one row of two panels per tested value.
plt.rcParams['figure.figsize'] = (20.0, 16.0)


def compare_values(attribute, values, file):
    """Plot and play Griffin-Lim style re-synthesis for several values of one AP attribute.

    For every value the global ``AP`` attribute is set, the file is reloaded,
    its linear spectrogram is computed and inverted back to audio. Each value
    gets one figure row: the de-normalized spectrogram on the left, the loaded
    and re-synthesized waveforms overlaid on the right. Afterwards the loaded
    ("ground-truth") clip and every re-synthesized clip are shown as audio
    players.

    Args:
        attribute (str): name of the ``AP`` attribute to test.
        values (list): values to compare for that attribute.
        file (str): file name, relative to ``data_path``, to run the test on.

    Side effects:
        Draws into the current matplotlib figure and displays audio widgets.
        ``AP`` is modified while the function runs, but the attribute is put
        back to its previous value (or removed if it did not exist) on exit,
        even if an error occurs, so later cells do not silently inherit the
        last tested value.
    """
    # `waveplot` was removed in librosa 0.10 in favour of `waveshow`;
    # use whichever one the installed version provides.
    plot_wave = getattr(display, "waveplot", None) or display.waveshow

    missing = object()  # marks "attribute was not set before the call"
    previous = getattr(AP, attribute, missing)
    n_rows = len(values)
    wavs = []
    try:
        for idx, val in enumerate(values):
            # setattr instead of exec() on a formatted string: works for any
            # value type (strings included) and never evaluates text as code.
            setattr(AP, attribute, val)

            # Reload inside the loop because loading itself may depend on the
            # attribute under test.
            wav = AP.load_wav(data_path + file)
            spec = AP.spectrogram(wav)
            spec_norm = AP._denormalize(spec.T)

            # Left panel: de-normalized spectrogram.
            plt.subplot(n_rows, 2, 2 * idx + 1)
            plt.imshow(spec_norm.T, aspect="auto", origin="lower")

            wav_gen = AP.inv_spectrogram(spec)
            wavs.append(wav_gen)

            # Right panel: loaded vs. re-synthesized waveform, drawn with the
            # processor's sample rate so the time axis is right for any dataset.
            plt.subplot(n_rows, 2, 2 * idx + 2)
            plot_wave(wav, sr=AP.sample_rate, alpha=0.5)
            plot_wave(wav_gen, sr=AP.sample_rate, alpha=0.25)
            plt.title("{}={}".format(attribute, val))

        # One layout pass once every panel exists.
        plt.tight_layout()

        # The reference clip is loaded with the last tested value still active.
        wav = AP.load_wav(data_path + file)
        print(" > Ground-truth")
        IPython.display.display(IPython.display.Audio(wav, rate=AP.sample_rate))
        for val, wav_gen in zip(values, wavs):
            print(" > {} = {}".format(attribute, val))
            IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))
    finally:
        # Undo the temporary change to the shared processor.
        if previous is missing:
            if hasattr(AP, attribute):
                delattr(AP, attribute)
        else:
            setattr(AP, attribute, previous)
In [56]:
# Listen to how the pre-emphasis coefficient changes the re-synthesis.
compare_values(
    attribute="preemphasis",
    values=[0, 0.97, 0.98, 0.99],
    file="5_5uIl38UFVh0.wav",
)
> Ground-truth
Your browser does not support the audio element.
> preemphasis = 0
Your browser does not support the audio element.
> preemphasis = 0.97
Your browser does not support the audio element.
> preemphasis = 0.98
Your browser does not support the audio element.
> preemphasis = 0.99
Your browser does not support the audio element.
In [53]:
# Sweep the reference dB level to hear its effect on the re-synthesis.
compare_values(
    attribute="ref_level_db",
    values=[10, 15, 20, 25, 30, 35, 40],
    file="5_5uIl38UFVh0.wav",
)
> Ground-truth
Your browser does not support the audio element.
> ref_level_db = 10
Your browser does not support the audio element.
> ref_level_db = 15
Your browser does not support the audio element.
> ref_level_db = 20
Your browser does not support the audio element.
> ref_level_db = 25
Your browser does not support the audio element.
> ref_level_db = 30
Your browser does not support the audio element.
> ref_level_db = 35
Your browser does not support the audio element.
> ref_level_db = 40
Your browser does not support the audio element.