Merge branch 'dev' of https://github.com/mozilla/TTS into dev

erogol 2020-03-09 10:25:13 +01:00
commit 0f78f5c277
13 changed files with 97 additions and 23 deletions

View File

@ -1,7 +1,7 @@
{
"model": "Tacotron2", // one of the model in models/
"run_name": "ljspeech-gravesv2",
"run_description": "tacotron2 wuth graves attention",
"run_name": "ljspeech-stft_params",
"run_description": "tacotron2 cosntant stf parameters",
// AUDIO PARAMETERS
"audio":{
@ -50,12 +50,11 @@
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":16,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"grad_accum": 2, // if N > 1, enable gradient accumulation for N iterations. It is useful for low memory GPUs.
// VALIDATION
"run_eval": true,
@ -110,7 +109,7 @@
"output_path": "/data4/rw/home/Trainings/",
// PHONEMES
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
"phoneme_cache_path": "mozilla_us_phonemes_2_1", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages

View File

@ -84,7 +84,7 @@ def mozilla_de(root_path, meta_file):
        for line in ttf:
            cols = line.strip().split('|')
            wav_file = cols[0].strip()
            text = cols[1].strip()
            text = cols[1].strip()
            folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL"
            wav_file = os.path.join(root_path, folder_name, wav_file)
            items.append([text, wav_file, speaker_name])

View File

@ -96,3 +96,32 @@ class AttentionEntropyLoss(nn.Module):
        entropy = torch.distributions.Categorical(probs=align).entropy()
        loss = (entropy / np.log(align.shape[1])).mean()
        return loss


class BCELossMasked(nn.Module):

    def __init__(self, pos_weight):
        super(BCELossMasked, self).__init__()
        self.pos_weight = pos_weight

    def forward(self, x, target, length):
        """
        Args:
            x: A Variable containing a FloatTensor of size
                (batch, max_len) which contains the
                unnormalized stop-token logits for each decoder step.
            target: A Variable containing a FloatTensor of size
                (batch, max_len) which contains the binary stop
                labels (1 at the stop frame, 0 elsewhere).
            length: A Variable containing a LongTensor of size (batch,)
                which contains the length of each data in a batch.
        Returns:
            loss: An average loss value masked by the length.
        """
        # mask: (batch, max_len)
        target.requires_grad = False
        mask = sequence_mask(sequence_length=length, max_len=target.size(1)).float()
        loss = functional.binary_cross_entropy_with_logits(
            x * mask, target * mask, pos_weight=self.pos_weight, reduction='sum')
        loss = loss / mask.sum()
        return loss
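
A minimal usage sketch of the new masked stop-token loss, with a stand-in sequence_mask (the real module imports the project's own helper); shapes and numbers are illustrative:

import torch
from torch.nn import functional

def sequence_mask(sequence_length, max_len=None):
    # stand-in for the project's helper: True for valid steps, False for padding
    max_len = max_len or sequence_length.max().item()
    steps = torch.arange(max_len, device=sequence_length.device)
    return steps.unsqueeze(0) < sequence_length.unsqueeze(1)

batch, max_len = 2, 5
logits = torch.randn(batch, max_len)        # stopnet outputs (unnormalized logits)
targets = torch.zeros(batch, max_len)       # 1.0 marks the stop frame
targets[0, 3] = 1.0
targets[1, 4] = 1.0
lengths = torch.tensor([4, 5])

mask = sequence_mask(lengths, max_len).float()
loss = functional.binary_cross_entropy_with_logits(
    logits * mask, targets * mask, pos_weight=torch.tensor(10.0), reduction='sum')
loss = loss / mask.sum()                    # average over valid steps only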

View File

@ -39,7 +39,7 @@ class Tacotron(nn.Module):
encoder_dim = 512 if num_speakers > 1 else 256
proj_speaker_dim = 80 if num_speakers > 1 else 0
# embedding layer
self.embedding = nn.Embedding(num_chars, 256)
self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
self.embedding.weight.data.normal_(0, 0.3)
# boilerplate model
self.encoder = Encoder(encoder_dim)

View File

@ -35,7 +35,7 @@ class Tacotron2(nn.Module):
encoder_dim = 512 if num_speakers > 1 else 512
proj_speaker_dim = 80 if num_speakers > 1 else 0
# embedding layer
self.embedding = nn.Embedding(num_chars, 512)
self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)
std = sqrt(2.0 / (num_chars + 512))
val = sqrt(3.0) * std # uniform bounds for std
self.embedding.weight.data.uniform_(-val, val)
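
Both embedding changes (Tacotron and Tacotron2) add padding_idx=0, which tells nn.Embedding to keep the gradient of the padding row at zero, so padded character positions never shift the embedding during training; the manual weight init that follows still overwrites the default zero row. A quick illustration:

import torch
import torch.nn as nn

emb = nn.Embedding(10, 4, padding_idx=0)
out = emb(torch.tensor([[0, 3, 0, 7]]))   # index 0 is the padding symbol
out.sum().backward()
print(emb.weight.grad[0])                 # all zeros: the pad row gets no gradient
print(emb.weight.grad[3])                 # non-zero: real characters update normally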

View File

@ -85,7 +85,10 @@
" if use_cuda:\n",
" waveform = waveform.cpu()\n",
" waveform = waveform.numpy()\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" print(waveform.shape)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" if figures: \n",
" visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec)) \n",
" IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n",

View File

@ -105,8 +105,8 @@ class Synthesizer(object):
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file)
        self.wavernn.load_state_dict(check['model'], map_location="cpu")
        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()
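
The fix here moves map_location from load_state_dict (which has no such argument) to torch.load, where it remaps storages while the checkpoint is deserialized, so a GPU-trained WaveRNN checkpoint can be loaded on a CPU-only machine. A self-contained sketch of the pattern (the checkpoint path and model are stand-ins):

import torch
import torch.nn as nn

model = nn.Linear(4, 4)                                   # stand-in for the WaveRNN model
torch.save({'model': model.state_dict()}, "checkpoint.pth")

check = torch.load("checkpoint.pth", map_location="cpu")  # remap CUDA storages to CPU here
model.load_state_dict(check['model'])                     # load_state_dict takes no map_location
if torch.cuda.is_available():
    model.cuda()
model.eval()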

View File

@ -13,7 +13,7 @@ from torch.utils.data import DataLoader
from TTS.datasets.TTSDataset import MyDataset
from distribute import (DistributedSampler, apply_gradient_allreduce,
init_distributed, reduce_tensor)
from TTS.layers.losses import L1LossMasked, MSELossMasked
from TTS.layers.losses import L1LossMasked, MSELossMasked, BCELossMasked
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (
NoamLR, check_update, count_parameters, create_experiment_folder,
@ -168,7 +168,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
# loss computation
stop_loss = criterion_st(stop_tokens,
stop_targets) if c.stopnet else torch.zeros(1)
stop_targets, mel_lengths) if c.stopnet else torch.zeros(1)
if c.loss_masking:
decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
if c.model in ["Tacotron", "TacotronGST"]:
@ -366,7 +366,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
# loss computation
stop_loss = criterion_st(
stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
stop_tokens, stop_targets, mel_lengths) if c.stopnet else torch.zeros(1)
if c.loss_masking:
decoder_loss = criterion(decoder_output, mel_input,
mel_lengths)
@ -494,7 +494,12 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
use_cuda,
ap,
speaker_id=speaker_id,
style_wav=style_wav)
style_wav=style_wav,
truncated=False,
enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
use_griffin_lim=True,
do_trim_silence=False)
file_path = os.path.join(AUDIO_PATH, str(global_step))
os.makedirs(file_path, exist_ok=True)
file_path = os.path.join(file_path,
@ -570,7 +575,7 @@ def main(args): # pylint: disable=redefined-outer-name
else:
criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"
] else nn.MSELoss()
criterion_st = nn.BCEWithLogitsLoss(
criterion_st = BCELossMasked(
pos_weight=torch.tensor(10)) if c.stopnet else None
if args.restore_path:

View File

@ -14,7 +14,7 @@ def prepare_data(inputs):
def _pad_tensor(x, length):
    _pad = 0
    _pad = 0.
    assert x.ndim == 2
    x = np.pad(
        x, [[0, 0], [0, length - x.shape[1]]],
@ -31,7 +31,7 @@ def prepare_tensor(inputs, out_steps):
def _pad_stop_target(x, length):
    _pad = 1.
    _pad = 0.
    assert x.ndim == 1
    return np.pad(
        x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
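
Padding stop targets with 0 instead of 1 matches the new masked loss: the mask zeroes out the padded region anyway, so the padding value no longer injects spurious positive stop labels beyond the real sequence length. For example:

import numpy as np

def _pad_stop_target_sketch(x, length, pad_value=0.):
    # pad a 1D stop-target vector out to `length` frames (sketch of the helper above)
    return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=pad_value)

stops = np.array([0., 0., 0., 1.])           # stop fires on the last real frame
print(_pad_stop_target_sketch(stops, 6))     # [0. 0. 0. 1. 0. 0.]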

View File

@ -391,7 +391,9 @@ class KeepAverage():
            self.update_value(key, value)


def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None):
def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None):
    if alternative in c.keys() and c[alternative] is not None:
        return
    if restricted:
        assert name in c.keys(), f' [!] {name} not defined in config.json'
    if name in c.keys():
@ -417,8 +419,8 @@ def check_config(c):
    _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
    _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
    _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
    _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000)
    _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000)
    _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
    _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
    _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1)
    _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10)
    _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000)
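
The new alternative parameter lets a restricted key pass the check when a named replacement key is present, which is what allows win_length/hop_length to stand in for frame_length_ms/frame_shift_ms. A simplified sketch of the behaviour (not the full checker):

def check_with_alternative(name, c, restricted=False, alternative=None):
    # simplified: skip the check entirely if the alternative key is set
    if alternative in c.keys() and c[alternative] is not None:
        return
    if restricted:
        assert name in c.keys(), f' [!] {name} not defined in config.json'

audio_cfg = {'win_length': 1024, 'hop_length': 256}
check_with_alternative('frame_length_ms', audio_cfg, restricted=True, alternative='win_length')  # passes
check_with_alternative('frame_shift_ms', audio_cfg, restricted=True, alternative='hop_length')   # passes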

View File

@ -70,6 +70,24 @@ def id_to_torch(speaker_id):
return speaker_id
# TODO: perform GL with pytorch for batching
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
    '''Apply Griffin-Lim to each sample, iterating through the first dimension.
    Args:
        inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
        input_lens (Tensor or np.Array): 1D array of sample lengths.
        CONFIG (Dict): TTS config.
        ap (AudioProcessor): TTS audio processor.
    '''
    wavs = []
    for idx, spec in enumerate(inputs):
        wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length  # inverse librosa padding
        wav = inv_spectrogram(spec, ap, CONFIG)
        # assert len(wav) == wav_len, f" [!] wav length: {len(wav)} vs expected: {wav_len}"
        wavs.append(wav[:wav_len])
    return wavs


def synthesis(model,
              text,
              CONFIG,
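
The trim at the end of apply_griffin_lim cuts the inverted waveform back to the expected length: number of frames times hop_length, minus one hop to undo librosa's padding, as the inline comment notes. With illustrative numbers:

hop_length = 256                              # example hop size
n_frames = 400                                # example spectrogram length
wav_len = (n_frames * hop_length) - hop_length
print(wav_len)                                # 102144 samples kept from the inverted wav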

View File

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import re
from packaging import version
import phonemizer
from phonemizer.phonemize import phonemize
from TTS.utils.text import cleaners
@ -28,7 +29,7 @@ def text2phone(text, language):
seperator = phonemizer.separator.Separator(' |', '', '|')
#try:
punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
if float(phonemizer.__version__) < 2.1:
if version.parse(phonemizer.__version__) < version.parse('2.1'):
ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language)
ph = ph[:-1].strip() # skip the last empty character
# phonemizer does not tackle punctuations. Here we do.
@ -42,7 +43,7 @@ def text2phone(text, language):
else:
for punct in punctuations:
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
elif float(phonemizer.__version__) > 2.1:
elif version.parse(phonemizer.__version__) >= version.parse('2.1'):
ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, preserve_punctuation=True)
# this is a simple fix for phonemizer.
# https://github.com/bootphon/phonemizer/issues/32
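
Comparing versions with float() breaks as soon as phonemizer reports a patch release (float('2.1.1') raises ValueError), and the old > 2.1 branch also skipped exactly 2.1; packaging.version handles both cases. For example:

from packaging import version

print(version.parse("2.1.1") >= version.parse("2.1"))   # True: patch releases take the new branch
print(version.parse("2.0.1") < version.parse("2.1"))    # True: older installs keep the legacy path
print(version.parse("2.1") >= version.parse("2.1"))     # True: 2.1 itself is no longer skipped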

View File

@ -63,6 +63,19 @@ def convert_to_ascii(text):
    return unidecode(text)


def remove_aux_symbols(text):
    text = re.sub(r'[\<\>\(\)\[\]\"]+', '', text)
    return text


def replace_symbols(text):
    text = text.replace(';', ',')
    text = text.replace('-', ' ')
    text = text.replace(':', ' ')
    text = text.replace('&', 'and')
    return text
def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)
@ -84,6 +97,8 @@ def english_cleaners(text):
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = replace_symbols(text)
    text = remove_aux_symbols(text)
    text = collapse_whitespace(text)
    return text
@ -93,5 +108,7 @@ def phoneme_cleaners(text):
    text = convert_to_ascii(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = replace_symbols(text)
    text = remove_aux_symbols(text)
    text = collapse_whitespace(text)
    return text
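
To see what the two new cleaning steps do together, here is a small self-contained example mirroring the functions above (with a minimal whitespace collapse standing in for the module's own collapse_whitespace):

import re

def replace_symbols(text):
    text = text.replace(';', ',')
    text = text.replace('-', ' ')
    text = text.replace(':', ' ')
    text = text.replace('&', 'and')
    return text

def remove_aux_symbols(text):
    return re.sub(r'[\<\>\(\)\[\]\"]+', '', text)

def collapse_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()   # minimal stand-in for the module helper

sample = 'He said: "wait - now & here (please)"'
print(collapse_whitespace(remove_aux_symbols(replace_symbols(sample))))
# -> He said wait now and here please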