From be77e24a39dcc65c842096d9c289b4091811ddcf Mon Sep 17 00:00:00 2001
From: Edresson
Date: Tue, 28 Jul 2020 17:11:32 -0300
Subject: [PATCH] Bugfix in DDC: DDC now works on Tacotron1

---
 mozilla_voice_tts/tts/configs/config.json    |  4 +-
 mozilla_voice_tts/tts/models/tacotron.py     | 18 +++----
 mozilla_voice_tts/tts/models/tacotron2.py    | 49 +++++-----------
 .../tts/models/tacotron_abstract.py          | 19 +++++--
 mozilla_voice_tts/tts/utils/generic_utils.py |  3 +++
 5 files changed, 40 insertions(+), 53 deletions(-)

diff --git a/mozilla_voice_tts/tts/configs/config.json b/mozilla_voice_tts/tts/configs/config.json
index 090540ab..70529fea 100644
--- a/mozilla_voice_tts/tts/configs/config.json
+++ b/mozilla_voice_tts/tts/configs/config.json
@@ -1,5 +1,5 @@
 {
-    "model": "Tacotron2",
+    "model": "Tacotron",
     "run_name": "ljspeech-ddc-bn",
     "run_description": "tacotron2 with ddc and batch-normalization",
 
@@ -114,7 +114,7 @@
     "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
 
     // DATA LOADING
-    "text_cleaner": "phoneme_cleaners",
+    "text_cleaner": "portuguese_cleaners",
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "num_val_loader_workers": 4,    // number of evaluation data loader processes.
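The config now selects the Tacotron (v1) model, which is what the rest of this patch targets. As a quick sanity check before training, the config can be loaded and validated; the sketch below assumes `load_config` lives at `mozilla_voice_tts.utils.io` (mirroring the import layout used elsewhere in this tree) and that the full file carries the DDC keys `double_decoder_consistency` and `ddc_r`, since only excerpts of the config appear in this diff:

```python
# A minimal smoke test for the edited config; the load_config import path and
# the DDC key names are assumptions based on the rest of this tree.
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.tts.utils.generic_utils import check_config

c = load_config("mozilla_voice_tts/tts/configs/config.json")
check_config(c)                     # raises on missing/mistyped keys, incl. the "gst" block
print(c["model"])                   # "Tacotron" -> DDC is exercised on the v1 model
print(c["double_decoder_consistency"], c["ddc_r"])  # DDC toggle and coarse-decoder reduction factor
```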
diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py
index 682d2b59..1395de97 100644
--- a/mozilla_voice_tts/tts/models/tacotron.py
+++ b/mozilla_voice_tts/tts/models/tacotron.py
@@ -42,19 +42,13 @@ class Tacotron(TacotronAbstract):
                          bidirectional_decoder, double_decoder_consistency,
                          ddc_r, gst)
 
+        # init layer dims
         decoder_in_features = 256
         encoder_in_features = 256
+        speaker_embedding_dim = 256
+        proj_speaker_dim = 80 if num_speakers > 1 else 0
 
-        if speaker_embedding_dim is None:
-            # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim
-            self.embeddings_per_sample = False
-            speaker_embedding_dim = 256
-        else:
-            # if speaker_embedding_dim is not None we need use speaker embedding per sample
-            self.embeddings_per_sample = True
-
-        # speaker and gst embeddings is concat in decoder input
         if num_speakers > 1:
             decoder_in_features = decoder_in_features + speaker_embedding_dim  # add speaker embedding dim
         if self.gst:
@@ -109,6 +103,9 @@ class Tacotron(TacotronAbstract):
         input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
         # B x T_in x embed_dim
         inputs = self.embedding(characters)
+        # B x speaker_embed_dim
+        if speaker_ids is not None:
+            self.compute_speaker_embedding(speaker_ids)
         # B x T_in x encoder_in_features
         encoder_outputs = self.encoder(inputs)
         # sequence masking
@@ -155,6 +152,9 @@ class Tacotron(TacotronAbstract):
     @torch.no_grad()
     def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None):
         inputs = self.embedding(characters)
+        self._init_states()
+        if speaker_ids is not None:
+            self.compute_speaker_embedding(speaker_ids)
         encoder_outputs = self.encoder(inputs)
         if self.gst:
             # B x gst_dim
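For context, the `compute_speaker_embedding(speaker_ids)` calls added above are what let DDC run multi-speaker on Tacotron1. A rough usage sketch, assuming the constructor signature `Tacotron(num_chars, num_speakers, r, ...)` and the four-tuple return of `inference` match the rest of this file (neither is shown in the diff):

```python
import torch
from mozilla_voice_tts.tts.models.tacotron import Tacotron

# Hypothetical sizes; constructor defaults are assumed for arguments not shown here.
model = Tacotron(num_chars=128, num_speakers=2, r=7)
model.eval()

characters = torch.randint(0, 128, (1, 50))   # B x T_in character ids
speaker_ids = torch.tensor([1])               # one speaker id per sample
decoder_outputs, postnet_outputs, alignments, stop_tokens = model.inference(
    characters, speaker_ids=speaker_ids)      # speaker embedding is now computed internally
```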
diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py
index 0aa237ff..47057c56 100644
--- a/mozilla_voice_tts/tts/models/tacotron2.py
+++ b/mozilla_voice_tts/tts/models/tacotron2.py
@@ -85,24 +85,6 @@ class Tacotron2(TacotronAbstract):
         mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
         return mel_outputs, mel_outputs_postnet, alignments
 
-    def compute_gst(self, inputs, style_input):
-        """ Compute global style token """
-        device = inputs.device
-        if isinstance(style_input, dict):
-            query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
-            _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
-            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
-            for k_token, v_amplifier in style_input.items():
-                key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
-                gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
-                gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
-        elif style_input is None:
-            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
-        else:
-            gst_outputs = self.gst_layer(style_input)  # pylint: disable=not-callable
-        embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1)
-        return inputs, embedded_gst
-
     def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
         # compute mask for padding
         # B x T_in_max (boolean)
@@ -112,20 +94,13 @@ class Tacotron2(TacotronAbstract):
         # B x T_in_max x D_en
         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
 
+        if self.gst:
+            # B x gst_dim
+            encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
+
         if self.num_speakers > 1:
             embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
-            embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
-            else:
-                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
-        else:
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
+            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
 
         encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
@@ -162,15 +137,14 @@ class Tacotron2(TacotronAbstract):
             embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
             else:
                 encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
         else:
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
 
         decoder_outputs, alignments, stop_tokens = self.decoder.inference(
             encoder_outputs)
@@ -192,15 +166,14 @@ class Tacotron2(TacotronAbstract):
             embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
             else:
                 encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
         else:
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
 
         mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
             encoder_outputs)
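`forward` and the inference paths above now delegate the tile-and-concat of the speaker vector to `_concat_speaker_embedding`, a helper on `TacotronAbstract` whose body is outside this diff. A minimal sketch of what the helper is presumed to do, reconstructed from its call sites rather than copied from the committed implementation:

```python
import torch

def _concat_speaker_embedding(outputs, embedded_speakers):
    # B x 1 x D speaker vector tiled to B x T x D, then concatenated onto the
    # B x T x C encoder features along the channel axis.
    embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
    return torch.cat([outputs, embedded_speakers_], dim=-1)

# e.g. encoder features (1, 50, 512) + speaker vector (1, 1, 256) -> (1, 50, 768)
out = _concat_speaker_embedding(torch.randn(1, 50, 512), torch.randn(1, 1, 256))
print(out.shape)  # torch.Size([1, 50, 768])
```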
diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py
index a4b8c227..6f3d32ad 100644
--- a/mozilla_voice_tts/tts/models/tacotron_abstract.py
+++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py
@@ -164,11 +164,22 @@ class TacotronAbstract(ABC, nn.Module):
             self.speaker_embeddings_projected = self.speaker_project_mel(
                 self.speaker_embeddings).squeeze(1)
 
-    def compute_gst(self, inputs, mel_specs):
+    def compute_gst(self, inputs, style_input):
         """ Compute global style token """
-        # pylint: disable=not-callable
-        gst_outputs = self.gst_layer(mel_specs)
-        inputs = self._add_speaker_embedding(inputs, gst_outputs)
+        device = inputs.device
+        if isinstance(style_input, dict):
+            query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
+            _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
+            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
+            for k_token, v_amplifier in style_input.items():
+                key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
+                gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
+                gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
+        elif style_input is None:
+            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
+        else:
+            gst_outputs = self.gst_layer(style_input)  # pylint: disable=not-callable
+        inputs = self._concat_speaker_embedding(inputs, gst_outputs)
         return inputs
 
     @staticmethod
diff --git a/mozilla_voice_tts/tts/utils/generic_utils.py b/mozilla_voice_tts/tts/utils/generic_utils.py
index 2d5044ef..fc35840d 100644
--- a/mozilla_voice_tts/tts/utils/generic_utils.py
+++ b/mozilla_voice_tts/tts/utils/generic_utils.py
@@ -265,6 +265,9 @@ def check_config(c):
     check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10)
     check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000)
+    check_argument('gst', c, restricted=True, val_type=dict)
+    check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict])
+    check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000)
 
     # datasets - checking only the first entry
     check_argument('datasets', c, restricted=True, val_type=list)
     for dataset_entry in c['datasets']:
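Since `compute_gst` is now shared through `TacotronAbstract`, the dict form of `style_input` works on both models: each entry maps a style-token index to an amplifier weight, and the resulting style vector is tiled over time and concatenated onto the encoder features. A self-contained rendering of that arithmetic with a stand-in token bank (in the model the per-token vector comes from the style-token attention layer, not a plain table, and `gst_embedding_dim` is whatever the config sets):

```python
import torch

gst_embedding_dim = 512                          # illustrative value, not from this diff
token_bank = torch.randn(10, gst_embedding_dim)  # stand-in for the learned style tokens
style_input = {"1": 0.3, "5": -0.1}              # token index -> amplifier, as in compute_gst

# Weighted sum over the requested tokens (the dict branch of compute_gst).
gst_outputs = torch.zeros(1, 1, gst_embedding_dim)
for k_token, v_amplifier in style_input.items():
    gst_outputs = gst_outputs + token_bank[int(k_token)].view(1, 1, -1) * v_amplifier

# Tile over time and concatenate onto the encoder features, as
# _concat_speaker_embedding does inside the models.
encoder_outputs = torch.randn(1, 50, 256)        # B x T_in x encoder dim
gst_outputs_ = gst_outputs.expand(encoder_outputs.size(0), encoder_outputs.size(1), -1)
conditioned = torch.cat([encoder_outputs, gst_outputs_], dim=-1)
print(conditioned.shape)                         # torch.Size([1, 50, 768])
```

Passing `style_input=None` takes the zero-vector branch instead, so unconditioned synthesis still produces features of the GST-augmented width the decoder expects.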