coqui-tts/notebooks/dataset_analysis/CheckSpectrograms.ipynb

In [5]:
%matplotlib inline
import glob
import IPython.display as ipd  # needed for the audio playback cells below
from tts.utils.audio import AudioProcessor
from tts.tts.utils.visual import plot_spectrogram
from tts.utils.io import load_config
In [20]:
config_path = "/home/erogol/Projects/TTS/tts/tts/config_thorsten_de.json"
data_path = "/home/erogol/Data/thorsten-german/"
file_paths = glob.glob(data_path + "/**/*.wav", recursive=True)
CONFIG = load_config(config_path)

Setup Audio Processor

Play with the AP parameters until you find a good fit with the synthesized speech below; a quick override sketch follows the settings printout.

In [21]:
# audio={
#  'audio_processor': 'audio',
#  'num_mels': 80,          # In general, you don't need to change it.
#  'fft_size': 1024,        # In general, you don't need to change it.
#  'sample_rate': 22050,    # It depends on the sample rate of the dataset.
#  'hop_length': 256,       # In general, you don't need to change it.
#  'win_length': 1024,      # In general, you don't need to change it.
#  'preemphasis': 0.98,     # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.
#  'min_level_db': -100,
#  'ref_level_db': 20,      # The base dB level. Raise it until the background noise disappears from the spectrogram, then lower it until the synthesized speech below sounds better.
#  'power': 1.5,            # Change this value and listen to the synthesized voice. 1.2 - 1.5 are reasonable values.
#  'griffin_lim_iters': 60, # Values > 60 do not give any further improvement.
#  'signal_norm': True,     # This is more about your model; it does not change the synthesis performance.
#  'symmetric_norm': False, # Same as above.
#  'max_norm': 1,           # Same as above.
#  'clip_norm': True,       # Same as above.
#  'mel_fmin': 0.0,         # You can play with this and check the mel-spectrogram based voice synthesis below.
#  'mel_fmax': 8000.0,      # Same as above.
#  'do_trim_silence': True} # If your dataset has silence at the beginning or end, this trims it. Check AP.load_wav() below to see whether it changes the loaded audio.

AP = AudioProcessor(**CONFIG.audio);
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:True
 | > stats_path:None
 | > hop_length:256
 | > win_length:1024
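
If a parameter needs changing, you can override the field in the loaded config and rebuild the processor. A minimal sketch (the ref_level_db value is only an illustrative assumption, not a recommendation):

In [ ]:
# Sketch: override a single audio parameter and rebuild the processor.
# 25 is an example value; tune it by ear using the synthesis cells below.
CONFIG.audio['ref_level_db'] = 25
AP = AudioProcessor(**CONFIG.audio)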

Check audio loading

In [22]:
wav = AP.load_wav(file_paths[10])
ipd.Audio(data=wav, rate=AP.sample_rate) 
Out[22]:
[audio player]
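
Besides listening, a couple of numeric sanity checks on the loaded clip can catch resampling or trimming problems early. A small sketch, assuming wav is the float array loaded above:

In [ ]:
# Sketch: sanity-check the loaded clip.
print("Duration (s):", len(wav) / AP.sample_rate)
print("Amplitude range:", wav.min(), "to", wav.max())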

Generate a Mel-Spectrogram and Re-synthesize with GL

In [28]:
AP.power = 1.0
In [29]:
mel = AP.melspectrogram(wav)
print("Max:", mel.max())
print("Min:", mel.min())
print("Mean:", mel.mean())
plot_spectrogram(mel.T, AP);

wav_gen = AP.inv_melspectrogram(mel)
ipd.Audio(wav_gen, rate=AP.sample_rate)
Max: 2.4340844
Min: 2.0181823
Mean: 2.2137265
Out[29]:
[audio player]
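
If signal_norm is enabled, the statistics printed above should stay inside the configured normalization range. A quick sketch that checks the range over a few files (the slice of 10 files is an arbitrary choice):

In [ ]:
# Sketch: verify the normalized mel value range over a handful of files.
mels = [AP.melspectrogram(AP.load_wav(f)) for f in file_paths[:10]]
print("global min:", min(m.min() for m in mels))
print("global max:", max(m.max() for m in mels))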

Generate a Linear-Spectrogram and Re-synthesize with GL

In [18]:
spec = AP.spectrogram(wav)
print("Max:", spec.max())
print("Min:", spec.min())
print("Mean:", spec.mean())
plot_spectrogram(spec.T, AP);

wav_gen = AP.inv_spectrogram(spec)
ipd.Audio(wav_gen, rate=AP.sample_rate)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-18-91e8914b5c6a> in <module>
----> 1 spec = AP.spectrogram(wav)
      2 print("Max:", spec.max())
      3 print("Min:", spec.min())
      4 print("Mean:", spec.mean())
      5 plot_spectrogram(spec.T, AP);

~/Projects/TTS/tts/utils/audio.py in spectrogram(self, y)
    218             D = self._stft(y)
    219         S = self._amp_to_db(np.abs(D))
--> 220         return self.normalize(S)
    221 
    222     def melspectrogram(self, y):

~/Projects/TTS/tts/utils/audio.py in normalize(self, S)
    117                     return self.linear_scaler.transform(S.T).T
    118                 else:
--> 119                     raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
    120             # range normalization
    121             S -= self.ref_level_db  # discard certain range of DB assuming it is air noise

RuntimeError:  [!] Mean-Var stats does not match the given feature dimensions.
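
The error above is raised in normalize(): the mean-variance stats loaded through stats_path were computed for a different feature dimensionality (mel) than the linear spectrogram being normalized. One possible workaround, assuming range normalization is acceptable for this check (a sketch, not a definitive fix):

In [ ]:
# Sketch: drop the externally computed stats so range normalization is used.
CONFIG.audio['stats_path'] = None
AP = AudioProcessor(**CONFIG.audio)
spec = AP.spectrogram(wav)  # should now normalize without mean-var stats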

Compare values for a certain parameter

Optimize your parameters by comparing different values for one parameter at a time.

In [ ]:
audio={
 'audio_processor': 'audio',
 'num_mels': 80,          # In general, you don't need to change it.
 'num_freq': 1025,        # In general, you don't need to change it.
 'sample_rate': 22050,    # It depends on the sample rate of the dataset.
 'frame_length_ms': 50,   # In general, you don't need to change it.
 'frame_shift_ms': 12.5,  # In general, you don't need to change it.
 'preemphasis': 0.98,     # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.
 'min_level_db': -100,
 'ref_level_db': 20,      # The base dB level. Raise it until the background noise disappears from the spectrogram, then lower it until the synthesized speech below sounds better.
 'power': 1.5,            # Change this value and listen to the synthesized voice. 1.2 - 1.5 are reasonable values.
 'griffin_lim_iters': 60, # Values > 60 do not give any further improvement.
 'signal_norm': True,     # This is more about your model; it does not change the synthesis performance.
 'symmetric_norm': False, # Same as above.
 'max_norm': 1,           # Same as above.
 'clip_norm': True,       # Same as above.
 'mel_fmin': 0.0,         # You can play with this and check the mel-spectrogram based voice synthesis below.
 'mel_fmax': 8000.0,      # Same as above.
 'do_trim_silence': True} # If your dataset has silence at the beginning or end, this trims it. Check AP.load_wav() below to see whether it changes the loaded audio.

AP = AudioProcessor(**audio);
In [ ]:
import IPython
import matplotlib.pyplot as plt
from librosa import display

plt.rcParams['figure.figsize'] = (20.0, 16.0)

def compare_values(attribute, values, file):
    """
    attribute (str): name of the AudioProcessor attribute to test.
    values (list): list of values to compare.
    file (str): path of the audio file used for the tests.
    """
    wavs = []
    for idx, val in enumerate(values):
        setattr(AP, attribute, val)  # safer than building the assignment with exec()
        wav = AP.load_wav(file)
        spec = AP.spectrogram(wav)
        spec_norm = AP.denormalize(spec.T)
        plt.subplot(len(values), 2, 2 * idx + 1)
        plt.imshow(spec_norm.T, aspect="auto", origin="lower")
        plt.tight_layout()
        wav_gen = AP.inv_spectrogram(spec)
        wavs.append(wav_gen)
        plt.subplot(len(values), 2, 2 * idx + 2)
        display.waveplot(wav, alpha=0.5)       # note: renamed to waveshow in librosa >= 0.10
        display.waveplot(wav_gen, alpha=0.25)
        plt.title("{}={}".format(attribute, val))
        plt.tight_layout()

    wav = AP.load_wav(file)
    print(" > Ground-truth")
    IPython.display.display(IPython.display.Audio(wav, rate=AP.sample_rate))

    for idx, wav_gen in enumerate(wavs):
        val = values[idx]
        print(" > {} = {}".format(attribute, val))
        IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))
In [ ]:
compare_values("preemphasis", [0, 0.5, 0.97, 0.98, 0.99], file_paths[10])
In [ ]:
compare_values("ref_level_db", [10, 15, 20, 25, 30, 35, 40], file_paths[10])