From 1c5d3b52cf8139ea20dc33c865a9eefb7900687e Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 23:10:11 +0100 Subject: [PATCH 01/61] test updates --- tests/test_layers.py | 2 +- tests/test_tacotron2_model.py | 2 +- tests/test_tacotron_model.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_layers.py b/tests/test_layers.py index 6e3c4b13..7d02b673 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -119,7 +119,7 @@ class EncoderTests(unittest.TestCase): class L1LossMaskedTests(unittest.TestCase): def test_in_out(self): # test input == target - layer = L1LossMasked() + layer = L1LossMasked(seq_len_norm=False) dummy_input = T.ones(4, 8, 128).float() dummy_target = T.ones(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index a26f1ddf..aa2869eb 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -38,7 +38,7 @@ class TacotronTrainTest(unittest.TestCase): stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - criterion = MSELossMasked().to(device) + criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device) model.train() diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 7e5e8daf..ac6712b0 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -44,7 +44,7 @@ class TacotronTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - criterion = L1LossMasked().to(device) + criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron( num_chars=32, @@ -106,7 +106,7 @@ class TacotronGSTTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - criterion = L1LossMasked().to(device) + criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron( num_chars=32, From e37503cb710bb229d8adc12a40d73338f1201351 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 16:38:57 +0100 Subject: [PATCH 02/61] stale.yml --- .github/stale.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/stale.yml diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 00000000..5bac63d3 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,19 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 60 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 7 +# Issues with these labels will never be considered stale +exemptLabels: + - pinned + - security +# Label to use when marking an issue as stale +staleLabel: wontfix +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. You might also look our discourse page for further help. + https://discourse.mozilla.org/c/tts +# Comment to post when closing a stale issue. 
Set to `false` to disable +closeComment: false + From 5c78816f5181a743dc46df3a0ee1746207a57da9 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:09:59 +0100 Subject: [PATCH 03/61] update server and synthesizer to handle ParallelWaveGAN --- server/server.py | 9 ++++++--- server/synthesizer.py | 46 ++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/server/server.py b/server/server.py index 3be66f9e..6af119bf 100644 --- a/server/server.py +++ b/server/server.py @@ -14,10 +14,13 @@ def create_argparser(): parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file') parser.add_argument('--tts_config', type=str, help='path to TTS config.json file') parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model') - parser.add_argument('--wavernn_lib_path', type=str, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--wavernn_file', type=str, help='path to WaveRNN checkpoint file.') - parser.add_argument('--wavernn_config', type=str, help='path to WaveRNN config file.') + parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') + parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') + parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. 
If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') diff --git a/server/synthesizer.py b/server/synthesizer.py index d8852a3e..b703c62e 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,17 +1,18 @@ import io import os +import re +import sys import numpy as np import torch -import sys +import yaml from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config, setup_model -from TTS.utils.text import phonemes, symbols from TTS.utils.speakers import load_speaker_mapping from TTS.utils.synthesis import * +from TTS.utils.text import phonemes, symbols -import re alphabets = r"([A-Za-z])" prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]" suffixes = r"(Inc|Ltd|Jr|Sr|Co)" @@ -23,6 +24,7 @@ websites = r"[.](com|net|org|io|gov)" class Synthesizer(object): def __init__(self, config): self.wavernn = None + self.pwgan = None self.config = config self.use_cuda = self.config.use_cuda if self.use_cuda: @@ -30,9 +32,11 @@ class Synthesizer(object): self.load_tts(self.config.tts_checkpoint, self.config.tts_config, self.config.use_cuda) if self.config.wavernn_lib_path: - self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_path, - self.config.wavernn_file, self.config.wavernn_config, - self.config.use_cuda) + self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file, + self.config.wavernn_config, self.config.use_cuda) + if self.config.pwgan_lib_path: + self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file, + self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): print(" > Loading TTS model ...") @@ -45,9 +49,9 @@ class Synthesizer(object): self.input_size = len(phonemes) else: self.input_size = len(symbols) - # load speakers + # TODO: fix this for multi-speaker model - load speakers if self.config.tts_speakers is not None: - self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers)) + self.tts_speakers = load_speaker_mapping(self.config.tts_speakers) num_speakers = len(self.tts_speakers) else: num_speakers = 0 @@ -63,16 +67,14 @@ class Synthesizer(object): if 'r' in cp: self.tts_model.decoder.set_r(cp['r']) - def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): + def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. 
sys.path.append(lib_path) # set this if TTS is not installed globally from WaveRNN.models.wavernn import Model - wavernn_config = os.path.join(model_path, model_config) - model_file = os.path.join(model_path, model_file) print(" > Loading WaveRNN model ...") - print(" | > model config: ", wavernn_config) + print(" | > model config: ", model_config) print(" | > model file: ", model_file) - self.wavernn_config = load_config(wavernn_config) + self.wavernn_config = load_config(model_config) self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -91,11 +93,27 @@ class Synthesizer(object): ).cuda() check = torch.load(model_file) - self.wavernn.load_state_dict(check['model']) + self.wavernn.load_state_dict(check['model'], map_location="cpu") if use_cuda: self.wavernn.cuda() self.wavernn.eval() + def load_pwgan(self, lib_path, model_file, model_config, use_cuda): + sys.path.append(lib_path) # set this if TTS is not installed globally + from parallel_wavegan.models import ParallelWaveGANGenerator + from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder + print(" > Loading PWGAN model ...") + print(" | > model config: ", model_config) + print(" | > model file: ", model_file) + with open(model_config) as f: + self.pwgan_config = yaml.load(f, Loader=yaml.Loader) + self.pwgan = ParallelWaveGANGenerator(**self.pwgan_config["generator_params"]) + self.pwgan.load_state_dict(torch.load(model_file, map_location="cpu")["model"]["generator"]) + self.pwgan.remove_weight_norm() + if use_cuda: + self.pwgan.cuda() + self.pwgan.eval() + def save_wav(self, wav, path): # wav *= 32767 / max(1e-8, np.max(np.abs(wav))) wav = np.array(wav) From 61bdb265540321889a3e959676a0995842833562 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:19:12 +0100 Subject: [PATCH 04/61] README update --- server/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/README.md b/server/README.md index 95297225..0563ef94 100644 --- a/server/README.md +++ b/server/README.md @@ -6,6 +6,10 @@ Instructions below are based on a Ubuntu 18.04 machine, but it should be simple #### Development server: +##### Using server.py +If you have the environment set already for TTS, then you can directly call ```setup.py```. + +##### Using .whl 1. apt-get install -y espeak libsndfile1 python3-venv 2. python3 -m venv /tmp/venv 3. 
source /tmp/venv/bin/activate From 2a6bce31cb41fb365c5d5f605bb1084ff49f1b5f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:31:02 +0100 Subject: [PATCH 05/61] update server test --- server/synthesizer.py | 2 -- tests/inputs/server_config.json | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index b703c62e..63f2080a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,5 +1,4 @@ import io -import os import re import sys @@ -101,7 +100,6 @@ class Synthesizer(object): def load_pwgan(self, lib_path, model_file, model_config, use_cuda): sys.path.append(lib_path) # set this if TTS is not installed globally from parallel_wavegan.models import ParallelWaveGANGenerator - from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index 3988db4c..7f5a60fb 100644 --- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -3,9 +3,11 @@ "tts_config":"dummy_model_config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. - "wavernn_path": null, // wavernn model root path "wavernn_file": null, // wavernn checkpoint file name "wavernn_config": null, // wavernn config file + "pwgan_lib_path": null, + "pwgan_file": null, + "pwgan_config": null, "is_wavernn_batched":true, "port": 5002, "use_cuda": false, From 451f7da6980301820402b82d502b29976fd6ca31 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 6 Feb 2020 15:16:29 +0100 Subject: [PATCH 06/61] pylint check --- server/synthesizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 63f2080a..75fd4e76 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -68,12 +68,15 @@ class Synthesizer(object): def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if WaveRNN is not installed globally + #pylint: disable=import-outside-toplevel from WaveRNN.models.wavernn import Model print(" > Loading WaveRNN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) self.wavernn_config = load_config(model_config) + # This is the default architecture we use for our models. 
+ # You might need to update it self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -98,7 +101,8 @@ class Synthesizer(object): self.wavernn.eval() def load_pwgan(self, lib_path, model_file, model_config, use_cuda): - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if ParallelWaveGAN is not installed globally + #pylint: disable=import-outside-toplevel from parallel_wavegan.models import ParallelWaveGANGenerator print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) From 631fbdcb8e158733b4ec1c9996c6c7cc105cd114 Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 11:08:21 +0100 Subject: [PATCH 07/61] Fix vocoder normalization when no vocoder is used When G&L is used, ap_vocoder is None and crashes --- synthesize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synthesize.py b/synthesize.py index cb0ee8af..eec022ab 100644 --- a/synthesize.py +++ b/synthesize.py @@ -31,8 +31,8 @@ def tts(model, postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models postnet_output = ap._denormalize(postnet_output) - postnet_output = ap_vocoder._normalize(postnet_output) if use_vocoder_model: + postnet_output = ap_vocoder._normalize(postnet_output) vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, From 3f54c39b0a4bb4678aec99a2e6b13b825387d712 Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 12:35:03 +0100 Subject: [PATCH 08/61] Pacify pylint --- synthesize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synthesize.py b/synthesize.py index eec022ab..47b409ef 100644 --- a/synthesize.py +++ b/synthesize.py @@ -30,9 +30,9 @@ def tts(model, if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models - postnet_output = ap._denormalize(postnet_output) + postnet_output = ap._denormalize(postnet_output) # pylint: disable=W021 if use_vocoder_model: - postnet_output = ap_vocoder._normalize(postnet_output) + postnet_output = ap_vocoder._normalize(postnet_output) # pylint: disable=W021 vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, From 8f37ea9b84c556440c0fca3c7682f101be03cb0a Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 12:58:58 +0100 Subject: [PATCH 09/61] Pacify pylint even more --- synthesize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/synthesize.py b/synthesize.py index 47b409ef..8312d78d 100644 --- a/synthesize.py +++ b/synthesize.py @@ -1,3 +1,4 @@ +# pylint: disable=redefined-outer-name, unused-argument import os import time import argparse @@ -30,9 +31,9 @@ def tts(model, if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models - postnet_output = ap._denormalize(postnet_output) # pylint: disable=W021 + postnet_output = ap._denormalize(postnet_output) # pylint: disable=protected-access if use_vocoder_model: - postnet_output = ap_vocoder._normalize(postnet_output) # pylint: disable=W021 + postnet_output = ap_vocoder._normalize(postnet_output) # pylint: disable=protected-access vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) 
waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, From 1d13bb5f8df82fe0ea13dcb0be33ece4e2d477fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 11 Feb 2020 16:52:06 +0100 Subject: [PATCH 10/61] Update README.md Contact and getting help --- README.md | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/README.md b/README.md index e98be3c4..7c2e4e3c 100644 --- a/README.md +++ b/README.md @@ -139,28 +139,7 @@ If you like to use TTS to try a new idea and like to share your experiments with - Share your results as you proceed. (Tensorboard log files, audio results, visuals etc.) - Use LJSpeech dataset (for English) if you like to compare results with the released models. (It is the most open scalable dataset for quick experimentation) -## Contact/Getting Help -- [Wiki](https://github.com/mozilla/TTS/wiki) - -- [Discourse Forums](https://discourse.mozilla.org/c/tts) - If your question is not addressed in the Wiki, the Discourse Forums is the next place to look. They contain conversations on General Topics, Using TTS, and TTS Development. - -- [Issues](https://github.com/mozilla/TTS/issues) - Finally, if all else fails, you can open an issue in our repo. - - +## [Contact/Getting Help](https://github.com/mozilla/TTS/wiki/Contact-and-Getting-Help) ## Major TODOs - [x] Implement the model. From 02e6d0538272f589d6c3c290b81575b7bd866991 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:49:46 +0100 Subject: [PATCH 11/61] Use PWGAN if available in Synthesizer.tts --- server/synthesizer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 75fd4e76..455bd332 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -168,9 +168,16 @@ class Synthesizer(object): postnet_output, decoder_output, _ = parse_outputs( postnet_output, decoder_output, alignments) + if self.pwgan: + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length) if self.wavernn: - postnet_output = postnet_output[0].data.cpu().numpy() - wav = self.wavernn.generate(torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550) + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550) else: wav = inv_spectrogram(postnet_output, self.ap, self.tts_config) # trim silence From b539ffafc0a0c185438bab262719f4259b6c8f9f Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:54:30 +0100 Subject: [PATCH 12/61] Load PWGAN/WaveRNN embedded files if present --- server/server.py | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/server/server.py b/server/server.py index 6af119bf..705937e2 100644 --- a/server/server.py +++ b/server/server.py @@ -18,9 +18,9 @@ def create_argparser(): parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') - 
parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') - parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') + parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') @@ -29,28 +29,35 @@ def create_argparser(): synthesizer = None -embedded_model_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -checkpoint_file = os.path.join(embedded_model_folder, 'checkpoint.pth.tar') -config_file = os.path.join(embedded_model_folder, 'config.json') +embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -# Default options with embedded model files -if os.path.isfile(checkpoint_file): - default_tts_checkpoint = checkpoint_file -else: - default_tts_checkpoint = None +embedded_tts_folder = os.path.join(embedded_models_folder, 'tts') +tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar') +tts_config_file = os.path.join(embedded_tts_folder, 'config.json') -if os.path.isfile(config_file): - default_tts_config = config_file -else: - default_tts_config = None +embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn') +wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar') +wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json') + +embedded_pwgan_folder = os.path.join(embedded_models_folder, 'pwgan') +pwgan_checkpoint_file = os.path.join(embedded_pwgan_folder, 'checkpoint.pkl') +pwgan_config_file = os.path.join(embedded_pwgan_folder, 'config.yml') args = create_argparser().parse_args() -# If these were not specified in the CLI args, use default values -if not args.tts_checkpoint: - args.tts_checkpoint = default_tts_checkpoint -if not args.tts_config: - args.tts_config = default_tts_config +# If these were not specified in the CLI args, use default values with embedded model files +if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file): + args.tts_checkpoint = tts_checkpoint_file +if not args.tts_config and os.path.isfile(tts_config_file): + args.tts_config = tts_config_file +if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file): + args.wavernn_file = wavernn_checkpoint_file +if not args.wavernn_config and os.path.isfile(wavernn_config_file): + args.wavernn_config = wavernn_config_file +if not args.pwgan_file and os.path.isfile(pwgan_checkpoint_file): + args.pwgan_file = pwgan_checkpoint_file +if not args.pwgan_config and os.path.isfile(pwgan_config_file): + args.pwgan_config = pwgan_config_file synthesizer = Synthesizer(args) From 995eb1bf074caae257a87f5ef54ae5f63617b227 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 16:03:30 +0100 
Subject: [PATCH 13/61] Fix bug where sometimes the second sentence disappears if it doesn't end with punctuation --- server/synthesizer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 455bd332..1082b73a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -122,7 +122,7 @@ class Synthesizer(object): self.ap.save_wav(wav, path) def split_into_sentences(self, text): - text = " " + text + " " + text = " " + text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1", text) text = re.sub(websites, "\\1", text) @@ -149,15 +149,13 @@ class Synthesizer(object): text = text.replace("", ".") sentences = text.split("") sentences = sentences[:-1] - sentences = [s.strip() for s in sentences] + sentences = list(filter(None, [s.strip() for s in sentences])) # remove empty sentences return sentences def tts(self, text): wavs = [] sens = self.split_into_sentences(text) print(sens) - if not sens: - sens = [text+'.'] for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) From df42a4a03ac886af4f2ef1bdb8ff25745f74d798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Feb 2020 16:53:16 +0100 Subject: [PATCH 14/61] Update README.md --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 7c2e4e3c..19d7fa24 100644 --- a/README.md +++ b/README.md @@ -115,10 +115,7 @@ In case of any error or intercepted execution, if there is no checkpoint yet und You can also enjoy Tensorboard, if you point Tensorboard argument```--logdir``` to the experiment folder. -## Testing -Best way to test your network is to use Notebooks under ```notebooks``` folder. - -There is also a good [CoLab](https://colab.research.google.com/github/tugstugi/dl-colab-notebooks/blob/master/notebooks/Mozilla_TTS_WaveRNN.ipynb) sample using pre-trained models (by @tugstugi). +## [Testing and Examples](https://github.com/mozilla/TTS/wiki/Examples-using-TTS) ## Contribution guidelines This repository is governed by Mozilla's code of conduct and etiquette guidelines. For more details, please read the [Mozilla Community Participation Guidelines.](https://www.mozilla.org/about/governance/policies/participation/) From 0e35fdc2a1c8a4bc669e3c6d755c551489ee221b Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 13 Feb 2020 17:23:37 +0100 Subject: [PATCH 15/61] fix linter problems and loader test --- tests/test_loader.py | 4 +--- tests/test_text_processing.py | 4 ++-- utils/text/__init__.py | 3 --- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/test_loader.py b/tests/test_loader.py index 751bc181..d8727895 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -137,9 +137,7 @@ class TestTTSDataset(unittest.TestCase): # NOTE: Below needs to check == 0 but due to an unknown reason # there is a slight difference between two matrices. # TODO: Check this assert cond more in detail. 
- assert abs((abs(mel.T) - - abs(mel_dl) - ).sum()) < 1e-5, (abs(mel.T) - abs(mel_dl)).sum() + assert abs(mel.T - mel_dl).max() < 1e-5, abs(mel.T - mel_dl).max() # check mel-spec correctness mel_spec = mel_input[0].cpu().numpy() diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 0ecb9962..aa17f694 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -71,5 +71,5 @@ def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" lang = "en-us" - phonemes = text2phone(text, lang) - assert gt == phonemes, f"\n{phonemes} \n vs \n{gt}" + ph = text2phone(text, lang) + assert gt == ph, f"\n{phonemes} \n vs \n{gt}" diff --git a/utils/text/__init__.py b/utils/text/__init__.py index e6842dfa..0e6684d2 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -38,10 +38,7 @@ def text2phone(text, language): if text[-1] == punctuations[-1]: for punct in punctuations[:-1]: ph = ph.replace('| |\n', '|'+punct+'| |', 1) - try: ph = ph + punctuations[-1] - except: - print(text) else: for punct in punctuations: ph = ph.replace('| |\n', '|'+punct+'| |', 1) From ffd00ce295e8b68e59dccda99bc467823a62940d Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 17:30:41 +0100 Subject: [PATCH 16/61] Fix linter and server package test --- server/synthesizer.py | 3 ++- setup.py | 7 ++++--- tests/test_server_package.sh | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 1082b73a..fcdc8787 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -121,7 +121,8 @@ class Synthesizer(object): wav = np.array(wav) self.ap.save_wav(wav, path) - def split_into_sentences(self, text): + @staticmethod + def split_into_sentences(text): text = " " + text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1", text) diff --git a/setup.py b/setup.py index 63782800..f92dac8a 100644 --- a/setup.py +++ b/setup.py @@ -61,10 +61,11 @@ package_data = ['server/templates/*'] if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config: print('Embedding model in wheel file...') model_dir = os.path.join('server', 'model') - os.makedirs(model_dir, exist_ok=True) - embedded_checkpoint_path = os.path.join(model_dir, 'checkpoint.pth.tar') + tts_dir = os.path.join(model_dir, 'tts') + os.makedirs(tts_dir, exist_ok=True) + embedded_checkpoint_path = os.path.join(tts_dir, 'checkpoint.pth.tar') shutil.copy(args.checkpoint, embedded_checkpoint_path) - embedded_config_path = os.path.join(model_dir, 'config.json') + embedded_config_path = os.path.join(tts_dir, 'config.json') shutil.copy(args.model_config, embedded_config_path) package_data.extend([embedded_checkpoint_path, embedded_config_path]) diff --git a/tests/test_server_package.sh b/tests/test_server_package.sh index 01e42843..9fe5e8b1 100755 --- a/tests/test_server_package.sh +++ b/tests/test_server_package.sh @@ -11,7 +11,7 @@ source /tmp/venv/bin/activate pip install --quiet --upgrade pip setuptools wheel rm -f 
dist/*.whl -python setup.py bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json +python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json pip install --quiet dist/TTS*.whl python -m TTS.server.server & From 9c5c68626825fdebd4af5d02f0bb792fb9f6fa44 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 13 Feb 2020 22:16:40 +0100 Subject: [PATCH 17/61] check config with a function --- config.json | 9 +-- train.py | 3 +- utils/generic_utils.py | 128 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 5 deletions(-) diff --git a/config.json b/config.json index 9e4fa906..c1a8158d 100644 --- a/config.json +++ b/config.json @@ -9,7 +9,7 @@ "num_mels": 80, // size of the mel spec frame. "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. + "frame_length_ms": 50.0, // stft window length in ms. "frame_shift_ms": 12.5, // stft window hop-lengh in ms. "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "min_level_db": -100, // normalization range @@ -19,7 +19,7 @@ // Normalization parameters "signal_norm": true, // normalize the spec values in range [0, 1] "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! @@ -36,11 +36,12 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":16, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. + "grad_accum": 2, // if N > 1, enable gradient accumulation for N iterations. It is useful for low memory GPUs. // VALIDATION "run_eval": true, @@ -49,7 +50,7 @@ // OPTIMIZER "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1, // upper limit for gradients for clipping. + "grad_clip": 1.0, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "wd": 0.000001, // Weight decay weight. 
diff --git a/train.py b/train.py index e8c240f3..7bfb8751 100644 --- a/train.py +++ b/train.py @@ -20,7 +20,7 @@ from TTS.utils.generic_utils import ( get_git_branch, load_config, remove_experiment_folder, save_best_model, save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file, setup_model, gradual_training_scheduler, KeepAverage, - set_weight_decay) + set_weight_decay, check_config) from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers @@ -687,6 +687,7 @@ if __name__ == '__main__': # setup output paths and read configs c = load_config(args.config_path) + check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path diff --git a/utils/generic_utils.py b/utils/generic_utils.py index cf1a83a6..7a5c2ac2 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -389,3 +389,131 @@ class KeepAverage(): def update_values(self, value_dict): for key, value in value_dict.items(): self.update_value(key, value) + + +def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None): + if restricted: + assert name in c.keys(), f' [!] {name} not defined in config.json' + if name in c.keys(): + if max_val: + assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}' + if min_val: + assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}' + if enum_list: + assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' + if val_type: + assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' + + + +def check_config(c): + _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) + _check_argument('run_name', c, restricted=True, val_type=str) + _check_argument('run_description', c, val_type=str) + + # AUDIO + _check_argument('audio', c, restricted=True, val_type=dict) + + # audio processing parameters + _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) + _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) + _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000) + _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000) + _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) + _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) + _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) + _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) + _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + + # normalization parameters + _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) + _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) + _check_argument('mel_fmax', 
c['audio'], restricted=True, val_type=float, min_val=500.0) + _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) + _check_argument('trim_db', c['audio'], restricted=True, val_type=int) + + # training parameters + _check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('r', c, restricted=True, val_type=int, min_val=1) + _check_argument('gradual_training', c, restricted=False, val_type=list) + _check_argument('loss_masking', c, restricted=True, val_type=bool) + _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + + # validation parameters + _check_argument('run_eval', c, restricted=True, val_type=bool) + _check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) + _check_argument('test_sentences_file', c, restricted=False, val_type=str) + + # optimizer + _check_argument('noam_schedule', c, restricted=False, val_type=bool) + _check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) + _check_argument('epochs', c, restricted=True, val_type=int, min_val=1) + _check_argument('lr', c, restricted=True, val_type=float, min_val=0) + _check_argument('wd', c, restricted=True, val_type=float, min_val=0) + _check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) + _check_argument('seq_len_norm', c, restricted=True, val_type=bool) + + # tacotron prenet + _check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1) + _check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn']) + _check_argument('prenet_dropout', c, restricted=True, val_type=bool) + + # attention + _check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original']) + _check_argument('attention_heads', c, restricted=True, val_type=int) + _check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax']) + _check_argument('windowing', c, restricted=True, val_type=bool) + _check_argument('use_forward_attn', c, restricted=True, val_type=bool) + _check_argument('forward_attn_mask', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('location_attn', c, restricted=True, val_type=bool) + _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) + + # stopnet + _check_argument('stopnet', c, restricted=True, val_type=bool) + _check_argument('separate_stopnet', c, restricted=True, val_type=bool) + + # tensorboard + _check_argument('print_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('save_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('checkpoint', c, restricted=True, val_type=bool) + _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) + + # dataloading + _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=['english_cleaners', 'phoneme_cleaners', 'transliteration_cleaners', 'basic_cleaners']) + _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) + _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) + 
_check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) + _check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) + + # paths + _check_argument('output_path', c, restricted=True, val_type=str) + + # multi-speaker gst + _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) + _check_argument('style_wav_for_test', c, restricted=True, val_type=str) + _check_argument('use_gst', c, restricted=True, val_type=bool) + + # datasets - checking only the first entry + _check_argument('datasets', c, restricted=True, val_type=list) + for dataset_entry in c['datasets']: + _check_argument('name', dataset_entry, restricted=True, val_type=str) + _check_argument('path', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) + + + + + + + + From 3331afa21932596ca791260e1c14e6942c1d6df2 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 17:47:33 +0100 Subject: [PATCH 18/61] remove grad_accum from config checker --- utils/generic_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 7a5c2ac2..942fedf9 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -405,7 +405,6 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) _check_argument('run_name', c, restricted=True, val_type=str) @@ -442,7 +441,7 @@ def check_config(c): _check_argument('r', c, restricted=True, val_type=int, min_val=1) _check_argument('gradual_training', c, restricted=False, val_type=list) _check_argument('loss_masking', c, restricted=True, val_type=bool) - _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + # _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) # validation parameters _check_argument('run_eval', c, restricted=True, val_type=bool) From c48b053cdee1a183d747c8151b96febdb102a291 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 18:00:15 +0100 Subject: [PATCH 19/61] linter fixes --- utils/generic_utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 942fedf9..a8de5bbb 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -402,8 +402,8 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric if enum_list: assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' if val_type: - assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - + assert isinstance(c[name], val_type) or c[name] is None, f' [!] 
{name} has wrong type - {type(c[name])} vs {val_type}' + def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) @@ -507,12 +507,4 @@ def check_config(c): _check_argument('name', dataset_entry, restricted=True, val_type=str) _check_argument('path', dataset_entry, restricted=True, val_type=str) _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) - _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) - - - - - - - - + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) \ No newline at end of file From 02df28c7d6059afa31d615a6f24eb27b7c017cff Mon Sep 17 00:00:00 2001 From: richardburleigh Date: Sat, 15 Feb 2020 14:47:50 +1100 Subject: [PATCH 20/61] Fix GL overriding PWGAN inference --- server/synthesizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index fcdc8787..347bef21 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -172,7 +172,7 @@ class Synthesizer(object): if self.use_cuda: vocoder_input.cuda() wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length) - if self.wavernn: + elif self.wavernn: vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) if self.use_cuda: vocoder_input.cuda() From 6977899d072a4705bd44e81c1632ff4698524bb1 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 19 Feb 2020 17:54:06 +0100 Subject: [PATCH 21/61] fix constant GL bug in synthesis --- utils/synthesis.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/utils/synthesis.py b/utils/synthesis.py index f066228a..79a17c78 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -78,6 +78,7 @@ def synthesis(model, style_wav=None, truncated=False, enable_eos_bos_chars=False, #pylint: disable=unused-argument + use_griffin_lim=False, do_trim_silence=False): """Synthesize voice for the given text. 
@@ -111,8 +112,10 @@ def synthesis(model, postnet_output, decoder_output, alignment = parse_outputs( postnet_output, decoder_output, alignments) # plot results - wav = inv_spectrogram(postnet_output, ap, CONFIG) - # trim silence - if do_trim_silence: - wav = trim_silence(wav, ap) + wav = None + if use_griffin_lim: + wav = inv_spectrogram(postnet_output, ap, CONFIG) + # trim silence + if do_trim_silence: + wav = trim_silence(wav, ap) return wav, alignment, decoder_output, postnet_output, stop_tokens From f8ebf6abcdf269fe6278246b2255e5c098ae5395 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 19 Feb 2020 18:17:10 +0100 Subject: [PATCH 22/61] fix the benchmark notebook after GL fix --- notebooks/Benchmark.ipynb | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index 7c528506..00ac7d16 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -65,6 +65,7 @@ "from TTS.utils.text import text_to_sequence\n", "from TTS.utils.synthesis import synthesis\n", "from TTS.utils.visual import visualize\n", + "from TTS.utils.text.symbols import symbols, phonemes\n", "\n", "import IPython\n", "from IPython.display import Audio\n", @@ -81,13 +82,15 @@ "source": [ "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None, \n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", + " use_griffin_lim=use_gl)\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n", " # coorect the normalization differences b/w TTS and the Vocoder.\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", - " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", " if not use_gl:\n", + " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", + " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=8000, overlap=400)\n", "\n", " print(\" > Run-time: {}\".format(time.time() - t_1))\n", @@ -108,7 +111,7 @@ "outputs": [], "source": [ "# Set constants\n", - "ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5099/'\n", + "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n", "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", @@ -116,7 +119,7 @@ "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar\"\n", "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json\"\n", "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n", - "use_cuda = False\n", + "use_cuda = True\n", "\n", "# Set some config fields manually for testing\n", "# CONFIG.windowing = False\n", @@ -127,7 +130,7 @@ "# CONFIG.stopnet = True\n", "\n", "# Set the vocoder\n", - "use_gl = False # use GL if True\n", + "use_gl = True # use GL if True\n", "batched_wavernn = True # use batched wavernn inference if True" ] }, @@ -138,8 
+141,6 @@ "outputs": [], "source": [ "# LOAD TTS MODEL\n", - "from utils.text.symbols import symbols, phonemes\n", - "\n", "# multi speaker \n", "if CONFIG.use_speaker_embedding:\n", " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", @@ -181,7 +182,7 @@ "metadata": {}, "outputs": [], "source": [ - "# LOAD WAVERNN\n", + "# LOAD WAVERNN - Make sure you downloaded the model and installed the module\n", "if use_gl == False:\n", " from WaveRNN.models.wavernn import Model\n", " from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n", @@ -533,7 +534,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" } }, "nbformat": 4, From e540a5495996e7fec9142b0c372f6c8b37356577 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 19 Feb 2020 18:24:02 +0100 Subject: [PATCH 23/61] fix synthesize.py --- synthesize.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/synthesize.py b/synthesize.py index cb0ee8af..a338f8b8 100644 --- a/synthesize.py +++ b/synthesize.py @@ -25,14 +25,16 @@ def tts(model, t_1 = time.time() use_vocoder_model = vocoder_model is not None waveform, alignment, _, postnet_output, stop_tokens = synthesis( - model, text, C, use_cuda, ap, speaker_id, False, - C.enable_eos_bos_chars) + model, text, C, use_cuda, ap, speaker_id, style_wav=False, + truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars, + use_griffin_lim=(not use_vocoder_model), do_trim_silence=True) + if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models - postnet_output = ap._denormalize(postnet_output) - postnet_output = ap_vocoder._normalize(postnet_output) if use_vocoder_model: + postnet_output = ap._denormalize(postnet_output) + postnet_output = ap_vocoder._normalize(postnet_output) vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, @@ -58,7 +60,7 @@ if __name__ == "__main__": parser.add_argument( 'out_path', type=str, - help='Path to save final wav file.', + help='Path to save final wav file. 
Wav file will be names as the text given.', ) parser.add_argument('--use_cuda', type=bool, From dc0e6c80197fa2e52e5abc6f2d7568637e04c968 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 9 Jan 2020 15:56:09 +0100 Subject: [PATCH 24/61] simpler gmm attention implementaiton --- config.json | 2 +- layers/common_layers.py | 78 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/config.json b/config.json index ef999fa9..0bf6c378 100644 --- a/config.json +++ b/config.json @@ -108,7 +108,7 @@ [ { "name": "ljspeech", - "path": "/data5/ro/shared/data/keithito/LJSpeech-1.1/", + "path": "/root/LJSpeech-1.1/", // "path": "/home/erogol/Data/LJSpeech-1.1", "meta_file_train": "metadata_train.csv", "meta_file_val": "metadata_val.csv" diff --git a/layers/common_layers.py b/layers/common_layers.py index c2b042b0..5365d605 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -110,6 +110,84 @@ class LocationLayer(nn.Module): return processed_attention +class GravesAttention(nn.Module): + """ Graves attention as described here: + - https://arxiv.org/abs/1910.10288 + """ + COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi)) + + def __init__(self, query_dim, K): + super(GravesAttention, self).__init__() + self._mask_value = 0.0 + self.K = K + # self.attention_alignment = 0.05 + self.eps = 1e-5 + self.J = None + self.N_a = nn.Sequential( + nn.Linear(query_dim, query_dim, bias=True), + nn.ReLU(), + nn.Linear(query_dim, 3*K, bias=True)) + self.attention_weights = None + self.mu_prev = None + self.init_layers() + + def init_layers(self): + torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) + torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) + + def init_states(self, inputs): + if self.J is None or inputs.shape[1] > self.J.shape[-1]: + self.J = torch.arange(0, inputs.shape[1]+1).to(inputs.device) + 0.5 + self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) + self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) + + # pylint: disable=R0201 + # pylint: disable=unused-argument + def preprocess_inputs(self, inputs): + return None + + def forward(self, query, inputs, processed_inputs, mask): + """ + shapes: + query: B x D_attention_rnn + inputs: B x T_in x D_encoder + processed_inputs: place_holder + mask: B x T_in + """ + gbk_t = self.N_a(query) + gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K) + + # attention model parameters + # each B x K + g_t = gbk_t[:, 0, :] + b_t = gbk_t[:, 1, :] + k_t = gbk_t[:, 2, :] + + # attention GMM parameters + sig_t = torch.nn.functional.softplus(b_t) + self.eps + + mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) + g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps + + j = self.J[:inputs.size(1)+1] + + # attention weights + phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2)) + + # discritize attention weights + alpha_t = self.COEF * torch.sum(phi_t, 1) + alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1] + + # apply masking + if mask is not None: + alpha_t.data.masked_fill_(~mask, self._mask_value) + + context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) + self.attention_weights = alpha_t + self.mu_prev = mu_t + return context + + class OriginalAttention(nn.Module): """Following the methods proposed here: - https://arxiv.org/abs/1712.05884 From cf7d968f575894e53434ee295eaa52e1f17b6b26 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 14 Jan 2020 13:22:23 +0100 Subject: [PATCH 25/61] 
graves attention as in melnet paper --- layers/common_layers.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 5365d605..a768e684 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -136,8 +136,8 @@ class GravesAttention(nn.Module): torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) def init_states(self, inputs): - if self.J is None or inputs.shape[1] > self.J.shape[-1]: - self.J = torch.arange(0, inputs.shape[1]+1).to(inputs.device) + 0.5 + if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]: + self.J = torch.arange(0, inputs.shape[1]+2).to(inputs.device) + 0.5 self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) @@ -165,24 +165,25 @@ class GravesAttention(nn.Module): # attention GMM parameters sig_t = torch.nn.functional.softplus(b_t) + self.eps - mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps - j = self.J[:inputs.size(1)+1] # attention weights - phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2)) + phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.exp((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) # discritize attention weights - alpha_t = self.COEF * torch.sum(phi_t, 1) + alpha_t = torch.sum(phi_t, 1) alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1] + alpha_t[alpha_t == 0] = 1e-8 # apply masking if mask is not None: alpha_t.data.masked_fill_(~mask, self._mask_value) context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) + # for better visualization + # self.attention_weights = torch.clamp(alpha_t, min=0) self.attention_weights = alpha_t self.mu_prev = mu_t return context @@ -355,7 +356,7 @@ class OriginalAttention(nn.Module): if self.forward_attn: alignment = self.apply_forward_attention(alignment) self.alpha = alignment - + context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) self.attention_weights = alignment From 72817438db4d805754d19dea818e6b4eb0ce425d Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 01:53:27 +0100 Subject: [PATCH 26/61] graves v2 --- config.json | 2 +- layers/common_layers.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config.json b/config.json index 0bf6c378..fc33d16a 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { "model": "Tacotron2", // one of the model in models/ - "run_name": "ljspeech-graves", + "run_name": "ljspeech-gravesv2", "run_description": "tacotron2 wuth graves attention", // AUDIO PARAMETERS diff --git a/layers/common_layers.py b/layers/common_layers.py index a768e684..f27ecf56 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -118,7 +118,7 @@ class GravesAttention(nn.Module): def __init__(self, query_dim, K): super(GravesAttention, self).__init__() - self._mask_value = 0.0 + self._mask_value = 1e-8 self.K = K # self.attention_alignment = 0.05 self.eps = 1e-5 @@ -165,12 +165,14 @@ class GravesAttention(nn.Module): # attention GMM parameters sig_t = torch.nn.functional.softplus(b_t) + self.eps + mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps + j = self.J[:inputs.size(1)+1] # attention weights - phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.exp((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) + phi_t = g_t.unsqueeze(-1) * (1 / (1 + 
torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) # discritize attention weights alpha_t = torch.sum(phi_t, 1) @@ -182,8 +184,6 @@ class GravesAttention(nn.Module): alpha_t.data.masked_fill_(~mask, self._mask_value) context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) - # for better visualization - # self.attention_weights = torch.clamp(alpha_t, min=0) self.attention_weights = alpha_t self.mu_prev = mu_t return context @@ -356,7 +356,7 @@ class OriginalAttention(nn.Module): if self.forward_attn: alignment = self.apply_forward_attention(alignment) self.alpha = alignment - + context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) self.attention_weights = alignment From 9921d682c325d6f7159c71969bbfdb228c685329 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 12:30:07 +0100 Subject: [PATCH 27/61] seq_len_norm for imbalanced datasets --- layers/losses.py | 40 ++++++++++++++++++++++++++++++++-------- train.py | 4 ++-- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/layers/losses.py b/layers/losses.py index e7ecff5f..b8b17c17 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -6,6 +6,11 @@ from TTS.utils.generic_utils import sequence_mask class L1LossMasked(nn.Module): + + def __init__(self, seq_len_norm): + super(L1LossMasked, self).__init__() + self.seq_len_norm = seq_len_norm + def forward(self, x, target, length): """ Args: @@ -24,14 +29,26 @@ class L1LossMasked(nn.Module): target.requires_grad = False mask = sequence_mask( sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() - mask = mask.expand_as(x) - loss = functional.l1_loss( - x * mask, target * mask, reduction="sum") - loss = loss / mask.sum() + if self.seq_len_norm: + norm_w = mask / mask.sum(dim=1, keepdim=True) + out_weights = norm_w.div(target.shape[0] * target.shape[2]) + mask = mask.expand_as(x) + loss = functional.l1_loss( + x * mask, target * mask, reduction='none') + loss = loss.mul(out_weights.cuda()).sum() + else: + loss = functional.l1_loss( + x * mask, target * mask, reduction='sum') + loss = loss / mask.sum() return loss class MSELossMasked(nn.Module): + + def __init__(self, seq_len_norm): + super(MSELossMasked, self).__init__() + self.seq_len_norm = seq_len_norm + def forward(self, x, target, length): """ Args: @@ -50,10 +67,17 @@ class MSELossMasked(nn.Module): target.requires_grad = False mask = sequence_mask( sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() - mask = mask.expand_as(x) - loss = functional.mse_loss( - x * mask, target * mask, reduction="sum") - loss = loss / mask.sum() + if self.seq_len_norm: + norm_w = mask / mask.sum(dim=1, keepdim=True) + out_weights = norm_w.div(target.shape[0] * target.shape[2]) + mask = mask.expand_as(x) + loss = functional.mse_loss( + x * mask, target * mask, reduction='none') + loss = loss.mul(out_weights.cuda()).sum() + else: + loss = functional.mse_loss( + x * mask, target * mask, reduction='sum') + loss = loss / mask.sum() return loss diff --git a/train.py b/train.py index 81bc2c72..f52d24c1 100644 --- a/train.py +++ b/train.py @@ -561,8 +561,8 @@ def main(args): # pylint: disable=redefined-outer-name optimizer_st = None if c.loss_masking: - criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST" - ] else MSELossMasked() + criterion = L1LossMasked(c.seq_len_norm) if c.model in ["Tacotron", "TacotronGST" + ] else MSELossMasked(c.seq_len_norm) else: criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST" ] else nn.MSELoss() From 
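For reference, a minimal standalone sketch of what the new seq_len_norm option in L1LossMasked / MSELossMasked does: each sample's masked error is weighted by the inverse of its own target length (and by batch size and channel count), so short and long utterances contribute equally to the batch loss. The function name, shapes, and values below are illustrative only, not taken from the repo.

import torch
import torch.nn.functional as F

def l1_loss_seq_len_norm(x, target, lengths):
    # x, target: [B, T, D]; lengths: [B] valid frame counts per sample
    mask = (torch.arange(target.size(1))[None, :] < lengths[:, None]).unsqueeze(2).float()
    norm_w = mask / mask.sum(dim=1, keepdim=True)                # 1 / L_i on valid frames
    out_weights = norm_w / (target.shape[0] * target.shape[2])   # spread over batch and channels
    mask = mask.expand_as(x)
    loss = F.l1_loss(x * mask, target * mask, reduction='none')
    return loss.mul(out_weights).sum()

x = torch.rand(4, 8, 80)
target = torch.zeros(4, 8, 80)
lengths = torch.tensor([8, 7, 6, 5])
print(l1_loss_seq_len_norm(x, target, lengths))  # scalar loss; every utterance weighted equally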
34d2e9438d36eface47643dc22e7871c6830de97 Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 15 Jan 2020 12:38:04 +0100
Subject: [PATCH 28/61] seq_len_norm set in config

---
 config.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/config.json b/config.json
index fc33d16a..71ba261e 100644
--- a/config.json
+++ b/config.json
@@ -53,6 +53,7 @@
     "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
     "wd": 0.000001, // Weight decay weight.
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false, // Normalize each sample loss by its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
 
     // TACOTRON PRENET
     "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.

From 678d56cdef7d671f6e3d2cd70629b50a246e5491 Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 15 Jan 2020 23:17:55 +0100
Subject: [PATCH 29/61] bug fix for losses

---
 layers/losses.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/layers/losses.py b/layers/losses.py
index b8b17c17..90d2ac80 100644
--- a/layers/losses.py
+++ b/layers/losses.py
@@ -37,6 +37,7 @@ class L1LossMasked(nn.Module):
                 x * mask, target * mask, reduction='none')
             loss = loss.mul(out_weights.cuda()).sum()
         else:
+            mask = mask.expand_as(x)
             loss = functional.l1_loss(
                 x * mask, target * mask, reduction='sum')
             loss = loss / mask.sum()
@@ -75,6 +76,7 @@ class MSELossMasked(nn.Module):
                 x * mask, target * mask, reduction='none')
             loss = loss.mul(out_weights.cuda()).sum()
         else:
+            mask = mask.expand_as(x)
             loss = functional.mse_loss(
                 x * mask, target * mask, reduction='sum')
             loss = loss / mask.sum()

From bb1117ff32d91a9ba32710810391e062596f62b7 Mon Sep 17 00:00:00 2001
From: root
Date: Sat, 18 Jan 2020 00:33:51 +0100
Subject: [PATCH 30/61] stop dividing g_t with sig_t and commenting

---
 layers/common_layers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/layers/common_layers.py b/layers/common_layers.py
index f27ecf56..023c7404 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -132,8 +132,8 @@ class GravesAttention(nn.Module):
         self.init_layers()
 
     def init_layers(self):
-        torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.)
-        torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10)
+        torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.)
# bias mean + torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std def init_states(self, inputs): if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]: @@ -167,7 +167,7 @@ class GravesAttention(nn.Module): sig_t = torch.nn.functional.softplus(b_t) + self.eps mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) - g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps + g_t = torch.softmax(g_t, dim=-1) + self.eps j = self.J[:inputs.size(1)+1] From 284daba116e2022d13573b89dcc6766fcfa2e342 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 15:42:56 +0100 Subject: [PATCH 31/61] bug fixes --- utils/audio.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/audio.py b/utils/audio.py index 708f0853..82e5aa47 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -66,12 +66,11 @@ class AudioProcessor(object): return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spec)) def _build_mel_basis(self, ): - n_fft = (self.num_freq - 1) * 2 if self.mel_fmax is not None: assert self.mel_fmax <= self.sample_rate // 2 return librosa.filters.mel( self.sample_rate, - n_fft, + self.n_fft, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax) @@ -197,6 +196,7 @@ class AudioProcessor(object): n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, + pad_mode='constant' ) def _istft(self, y): @@ -217,7 +217,7 @@ class AudioProcessor(object): margin = int(self.sample_rate * 0.01) wav = wav[margin:-margin] return librosa.effects.trim( - wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0] + wav, top_db=40, frame_length=self.win_length, hop_length=self.hop_length)[0] @staticmethod def mulaw_encode(wav, qc): From 0d17019d224c7db47c2370088e35986e2e8c69af Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 15:46:59 +0100 Subject: [PATCH 32/61] remove old graves --- layers/common_layers.py | 79 ++--------------------------------------- 1 file changed, 2 insertions(+), 77 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 023c7404..592f017c 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -111,8 +111,9 @@ class LocationLayer(nn.Module): class GravesAttention(nn.Module): - """ Graves attention as described here: + """ Discretized Graves attention: - https://arxiv.org/abs/1910.10288 + - https://arxiv.org/pdf/1906.01083.pdf """ COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi)) @@ -368,82 +369,6 @@ class OriginalAttention(nn.Module): return context -class GravesAttention(nn.Module): - """ Graves attention as described here: - - https://arxiv.org/abs/1910.10288 - """ - COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi)) - - def __init__(self, query_dim, K): - super(GravesAttention, self).__init__() - self._mask_value = 0.0 - self.K = K - # self.attention_alignment = 0.05 - self.eps = 1e-5 - self.J = None - self.N_a = nn.Sequential( - nn.Linear(query_dim, query_dim, bias=True), - nn.ReLU(), - nn.Linear(query_dim, 3*K, bias=True)) - self.attention_weights = None - self.mu_prev = None - self.init_layers() - - def init_layers(self): - torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) 
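For reference, a self-contained sketch of how the discretized Graves attention weights are formed; it mirrors the expressions in GravesAttention.forward above, with dummy tensors standing in for the network outputs (shapes and values are illustrative, not from the repo).

import torch

B, K, T_in = 2, 5, 11   # batch size, number of mixture components, encoder timesteps
eps = 1e-5

g_t = torch.softmax(torch.randn(B, K), dim=-1) + eps                   # mixture weights
sig_t = torch.nn.functional.softplus(torch.randn(B, K)) + eps          # mixture scales
mu_prev = torch.zeros(B, K)                                            # means from the previous decoder step
mu_t = mu_prev + torch.nn.functional.softplus(torch.randn(B, K))       # means only move forward

j = torch.arange(0, T_in + 2).float() + 0.5   # grid J built in init_states
j = j[:T_in + 1]

# per-mixture curve evaluated on the grid, as in the forward pass above
phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))

# discretize: sum over mixtures, then difference neighbouring grid points
alpha_t = torch.sum(phi_t, 1)
alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
alpha_t[alpha_t == 0] = 1e-8

print(alpha_t.shape)   # torch.Size([2, 11]) -> one attention weight per encoder step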
- torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) - - def init_states(self, inputs): - if self.J is None or inputs.shape[1] > self.J.shape[-1]: - self.J = torch.arange(0, inputs.shape[1]).to(inputs.device) - self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) - self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) - - # pylint: disable=R0201 - # pylint: disable=unused-argument - def preprocess_inputs(self, inputs): - return None - - def forward(self, query, inputs, processed_inputs, mask): - """ - shapes: - query: B x D_attention_rnn - inputs: B x T_in x D_encoder - processed_inputs: place_holder - mask: B x T_in - """ - gbk_t = self.N_a(query) - gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K) - - # attention model parameters - # each B x K - g_t = gbk_t[:, 0, :] - b_t = gbk_t[:, 1, :] - k_t = gbk_t[:, 2, :] - - # attention GMM parameters - sig_t = torch.nn.functional.softplus(b_t) + self.eps - - mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) - g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps - - # each B x K x T_in - j = self.J[:inputs.size(1)] - - # attention weights - phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2)) - alpha_t = self.COEF * torch.sum(phi_t, 1) - - # apply masking - if mask is not None: - alpha_t.data.masked_fill_(~mask, self._mask_value) - - context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) - self.attention_weights = alpha_t - self.mu_prev = mu_t - return context - - def init_attn(attn_type, query_dim, embedding_dim, attention_dim, location_attention, attention_location_n_filters, attention_location_kernel_size, windowing, norm, forward_attn, From ca33336ae0f34751f5ca393c7998f7ac85c16b79 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 16:02:34 +0100 Subject: [PATCH 33/61] testing seq_len_norm --- layers/losses.py | 4 ++-- tests/test_layers.py | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/layers/losses.py b/layers/losses.py index 90d2ac80..176e2f09 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -35,7 +35,7 @@ class L1LossMasked(nn.Module): mask = mask.expand_as(x) loss = functional.l1_loss( x * mask, target * mask, reduction='none') - loss = loss.mul(out_weights.cuda()).sum() + loss = loss.mul(out_weights.to(loss.device)).sum() else: mask = mask.expand_as(x) loss = functional.l1_loss( @@ -74,7 +74,7 @@ class MSELossMasked(nn.Module): mask = mask.expand_as(x) loss = functional.mse_loss( x * mask, target * mask, reduction='none') - loss = loss.mul(out_weights.cuda()).sum() + loss = loss.mul(out_weights.to(loss.device)).sum() else: mask = mask.expand_as(x) loss = functional.mse_loss( diff --git a/tests/test_layers.py b/tests/test_layers.py index 7d02b673..d7c8829f 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -131,7 +131,7 @@ class L1LossMaskedTests(unittest.TestCase): dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.data[0]) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -140,7 +140,7 @@ class L1LossMaskedTests(unittest.TestCase): mask = ( (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert 
output.item() == 1.0, "1.0 vs {}".format(output.data[0]) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() @@ -148,4 +148,37 @@ class L1LossMaskedTests(unittest.TestCase): mask = ( (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.data[0]) + assert output.item() == 0, "0 vs {}".format(output.item()) + + # seq_len_norm = True + # test input == target + layer = L1LossMasked(seq_len_norm=True) + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.ones(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.arange(5, 9)).long() + mask = ( + (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 8, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(5, 9)).long() + mask = ( + (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) From ffe9a32813c03400576fbea78029a6b869729b9b Mon Sep 17 00:00:00 2001 From: root Date: Mon, 3 Feb 2020 14:16:40 +0100 Subject: [PATCH 34/61] set silence trimming threshold in config --- config.json | 1 + utils/audio.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/config.json b/config.json index 71ba261e..89266a94 100644 --- a/config.json +++ b/config.json @@ -24,6 +24,7 @@ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. }, // DISTRIBUTED TRAINING diff --git a/utils/audio.py b/utils/audio.py index 82e5aa47..7b2c4834 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -24,6 +24,7 @@ class AudioProcessor(object): clip_norm=True, griffin_lim_iters=None, do_trim_silence=False, + trim_db=60, sound_norm=False, **_): @@ -46,6 +47,7 @@ class AudioProcessor(object): self.max_norm = 1.0 if max_norm is None else float(max_norm) self.clip_norm = clip_norm self.do_trim_silence = do_trim_silence + self.trim_db = trim_db self.sound_norm = sound_norm self.n_fft, self.hop_length, self.win_length = self._stft_parameters() assert min_level_db != 0.0, " [!] 
min_level_db is 0" @@ -217,7 +219,7 @@ class AudioProcessor(object): margin = int(self.sample_rate * 0.01) wav = wav[margin:-margin] return librosa.effects.trim( - wav, top_db=40, frame_length=self.win_length, hop_length=self.hop_length)[0] + wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0] @staticmethod def mulaw_encode(wav, qc): From 1ef6278d2d247ea904ad98bfa78ab376628678bc Mon Sep 17 00:00:00 2001 From: root Date: Mon, 3 Feb 2020 15:29:44 +0100 Subject: [PATCH 35/61] tacotron2 stop condition --- layers/tacotron2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 78bdd10d..c195b277 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -290,7 +290,7 @@ class Decoder(nn.Module): stop_tokens += [stop_token] alignments += [alignment] - if stop_token > 0.7: + if stop_token > 0.7 and t > inputs.shape[0] / 2: break if len(outputs) == self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") From fbe5310be01321220ad219efb48ea68d38f30267 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 4 Feb 2020 11:16:48 +0100 Subject: [PATCH 36/61] Only use embedded model files if they're not overriden by CLI flags --- server/server.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/server/server.py b/server/server.py index d40e2427..3be66f9e 100644 --- a/server/server.py +++ b/server/server.py @@ -24,20 +24,32 @@ def create_argparser(): return parser -config = None synthesizer = None embedded_model_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') checkpoint_file = os.path.join(embedded_model_folder, 'checkpoint.pth.tar') config_file = os.path.join(embedded_model_folder, 'config.json') -if os.path.isfile(checkpoint_file) and os.path.isfile(config_file): - # Use default config with embedded model files - config = create_argparser().parse_args([]) - config.tts_checkpoint = checkpoint_file - config.tts_config = config_file - synthesizer = Synthesizer(config) +# Default options with embedded model files +if os.path.isfile(checkpoint_file): + default_tts_checkpoint = checkpoint_file +else: + default_tts_checkpoint = None +if os.path.isfile(config_file): + default_tts_config = config_file +else: + default_tts_config = None + +args = create_argparser().parse_args() + +# If these were not specified in the CLI args, use default values +if not args.tts_checkpoint: + args.tts_checkpoint = default_tts_checkpoint +if not args.tts_config: + args.tts_config = default_tts_config + +synthesizer = Synthesizer(args) app = Flask(__name__) @@ -55,11 +67,4 @@ def tts(): if __name__ == '__main__': - args = create_argparser().parse_args() - - # Setup synthesizer from CLI args if they're specified or no embedded model - # is present. 
- if not config or not synthesizer or args.tts_checkpoint or args.tts_config: - synthesizer = Synthesizer(args) - - app.run(debug=config.debug, host='0.0.0.0', port=config.port) + app.run(debug=args.debug, host='0.0.0.0', port=args.port) From ed8a9fc82a383209518733ae4bdedfa986f9648d Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:09:59 +0100 Subject: [PATCH 37/61] update server and synthesizer to handle ParallelWaveGAN --- server/server.py | 9 ++++++--- server/synthesizer.py | 46 ++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/server/server.py b/server/server.py index 3be66f9e..6af119bf 100644 --- a/server/server.py +++ b/server/server.py @@ -14,10 +14,13 @@ def create_argparser(): parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file') parser.add_argument('--tts_config', type=str, help='path to TTS config.json file') parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model') - parser.add_argument('--wavernn_lib_path', type=str, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--wavernn_file', type=str, help='path to WaveRNN checkpoint file.') - parser.add_argument('--wavernn_config', type=str, help='path to WaveRNN config file.') + parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') + parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') + parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. 
If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') diff --git a/server/synthesizer.py b/server/synthesizer.py index d8852a3e..b703c62e 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,17 +1,18 @@ import io import os +import re +import sys import numpy as np import torch -import sys +import yaml from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config, setup_model -from TTS.utils.text import phonemes, symbols from TTS.utils.speakers import load_speaker_mapping from TTS.utils.synthesis import * +from TTS.utils.text import phonemes, symbols -import re alphabets = r"([A-Za-z])" prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]" suffixes = r"(Inc|Ltd|Jr|Sr|Co)" @@ -23,6 +24,7 @@ websites = r"[.](com|net|org|io|gov)" class Synthesizer(object): def __init__(self, config): self.wavernn = None + self.pwgan = None self.config = config self.use_cuda = self.config.use_cuda if self.use_cuda: @@ -30,9 +32,11 @@ class Synthesizer(object): self.load_tts(self.config.tts_checkpoint, self.config.tts_config, self.config.use_cuda) if self.config.wavernn_lib_path: - self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_path, - self.config.wavernn_file, self.config.wavernn_config, - self.config.use_cuda) + self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file, + self.config.wavernn_config, self.config.use_cuda) + if self.config.pwgan_lib_path: + self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file, + self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): print(" > Loading TTS model ...") @@ -45,9 +49,9 @@ class Synthesizer(object): self.input_size = len(phonemes) else: self.input_size = len(symbols) - # load speakers + # TODO: fix this for multi-speaker model - load speakers if self.config.tts_speakers is not None: - self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers)) + self.tts_speakers = load_speaker_mapping(self.config.tts_speakers) num_speakers = len(self.tts_speakers) else: num_speakers = 0 @@ -63,16 +67,14 @@ class Synthesizer(object): if 'r' in cp: self.tts_model.decoder.set_r(cp['r']) - def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): + def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. 
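        # Example (hypothetical placeholder paths, not from the repo): the new CLI options above let an
        # external vocoder be plugged in at serving time, e.g.
        #   python server/server.py \
        #       --tts_checkpoint /path/to/checkpoint.pth.tar --tts_config /path/to/config.json \
        #       --pwgan_lib_path /path/to/ParallelWaveGAN \
        #       --pwgan_file /path/to/pwgan_checkpoint.pkl --pwgan_config /path/to/pwgan_config.yml
        # If the --pwgan_* / --wavernn_* flags are omitted, the server falls back to Griffin-Lim.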
sys.path.append(lib_path) # set this if TTS is not installed globally from WaveRNN.models.wavernn import Model - wavernn_config = os.path.join(model_path, model_config) - model_file = os.path.join(model_path, model_file) print(" > Loading WaveRNN model ...") - print(" | > model config: ", wavernn_config) + print(" | > model config: ", model_config) print(" | > model file: ", model_file) - self.wavernn_config = load_config(wavernn_config) + self.wavernn_config = load_config(model_config) self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -91,11 +93,27 @@ class Synthesizer(object): ).cuda() check = torch.load(model_file) - self.wavernn.load_state_dict(check['model']) + self.wavernn.load_state_dict(check['model'], map_location="cpu") if use_cuda: self.wavernn.cuda() self.wavernn.eval() + def load_pwgan(self, lib_path, model_file, model_config, use_cuda): + sys.path.append(lib_path) # set this if TTS is not installed globally + from parallel_wavegan.models import ParallelWaveGANGenerator + from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder + print(" > Loading PWGAN model ...") + print(" | > model config: ", model_config) + print(" | > model file: ", model_file) + with open(model_config) as f: + self.pwgan_config = yaml.load(f, Loader=yaml.Loader) + self.pwgan = ParallelWaveGANGenerator(**self.pwgan_config["generator_params"]) + self.pwgan.load_state_dict(torch.load(model_file, map_location="cpu")["model"]["generator"]) + self.pwgan.remove_weight_norm() + if use_cuda: + self.pwgan.cuda() + self.pwgan.eval() + def save_wav(self, wav, path): # wav *= 32767 / max(1e-8, np.max(np.abs(wav))) wav = np.array(wav) From af0fa9f6da4f170de4f33d4e68d1a09d7518c13f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:19:12 +0100 Subject: [PATCH 38/61] README update --- server/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/README.md b/server/README.md index 95297225..0563ef94 100644 --- a/server/README.md +++ b/server/README.md @@ -6,6 +6,10 @@ Instructions below are based on a Ubuntu 18.04 machine, but it should be simple #### Development server: +##### Using server.py +If you have the environment set already for TTS, then you can directly call ```setup.py```. + +##### Using .whl 1. apt-get install -y espeak libsndfile1 python3-venv 2. python3 -m venv /tmp/venv 3. 
source /tmp/venv/bin/activate From c776526c45d4229940390c6f468c29842b992dba Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:31:02 +0100 Subject: [PATCH 39/61] update server test --- server/synthesizer.py | 2 -- tests/inputs/server_config.json | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index b703c62e..63f2080a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,5 +1,4 @@ import io -import os import re import sys @@ -101,7 +100,6 @@ class Synthesizer(object): def load_pwgan(self, lib_path, model_file, model_config, use_cuda): sys.path.append(lib_path) # set this if TTS is not installed globally from parallel_wavegan.models import ParallelWaveGANGenerator - from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index 3988db4c..7f5a60fb 100644 --- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -3,9 +3,11 @@ "tts_config":"dummy_model_config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. - "wavernn_path": null, // wavernn model root path "wavernn_file": null, // wavernn checkpoint file name "wavernn_config": null, // wavernn config file + "pwgan_lib_path": null, + "pwgan_file": null, + "pwgan_config": null, "is_wavernn_batched":true, "port": 5002, "use_cuda": false, From 532cf8160ccc68edc76ebcbcc2ba777ecd7a453e Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 6 Feb 2020 15:16:29 +0100 Subject: [PATCH 40/61] pylint check --- server/synthesizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 63f2080a..75fd4e76 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -68,12 +68,15 @@ class Synthesizer(object): def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if WaveRNN is not installed globally + #pylint: disable=import-outside-toplevel from WaveRNN.models.wavernn import Model print(" > Loading WaveRNN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) self.wavernn_config = load_config(model_config) + # This is the default architecture we use for our models. 
+ # You might need to update it self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -98,7 +101,8 @@ class Synthesizer(object): self.wavernn.eval() def load_pwgan(self, lib_path, model_file, model_config, use_cuda): - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if ParallelWaveGAN is not installed globally + #pylint: disable=import-outside-toplevel from parallel_wavegan.models import ParallelWaveGANGenerator print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) From 5daaadc9dc3d46ba2048401e3a681eae248eb68f Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 12:58:58 +0100 Subject: [PATCH 41/61] Pacify pylint even more --- synthesize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/synthesize.py b/synthesize.py index a338f8b8..bf85d7c9 100644 --- a/synthesize.py +++ b/synthesize.py @@ -1,3 +1,4 @@ +# pylint: disable=redefined-outer-name, unused-argument import os import time import argparse From 65b8b33d712df5c60ee7ebc39a455c86563c1a64 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 7 Feb 2020 13:00:04 +0100 Subject: [PATCH 42/61] config fixes and enable graves attention wq --- config.json | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/config.json b/config.json index 89266a94..9e4fa906 100644 --- a/config.json +++ b/config.json @@ -23,8 +23,8 @@ "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. }, // DISTRIBUTED TRAINING @@ -62,14 +62,14 @@ "prenet_dropout": true, // enable/disable dropout at prenet. // ATTENTION - "attention_type": "original", // 'original' or 'graves' - "attention_heads": 5, // number of attention heads (only for 'graves') + "attention_type": "graves", // 'original' or 'graves' + "attention_heads": 4, // number of attention heads (only for 'graves') "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "windowing": false, // Enables attention windowing. Used only in eval mode. "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. // STOPNET @@ -92,8 +92,8 @@ "max_seq_len": 150, // DATASET-RELATED: maximum text length // PATHS - "output_path": "/data5/rw/pit/keep/", // DATASET-RELATED: output path for all training outputs. 
- // "output_path": "/media/erogol/data_ssd/Models/runs/", + // "output_path": "/data5/rw/pit/keep/", // DATASET-RELATED: output path for all training outputs. + "output_path": "/home/erogol/Models/LJSpeech/", // PHONEMES "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. @@ -110,10 +110,10 @@ [ { "name": "ljspeech", - "path": "/root/LJSpeech-1.1/", + "path": "/home/erogol/Data/LJSpeech-1.1/", // "path": "/home/erogol/Data/LJSpeech-1.1", - "meta_file_train": "metadata_train.csv", - "meta_file_val": "metadata_val.csv" + "meta_file_train": "metadata.csv", + "meta_file_val": null } ] From abf8ea4633a70b150a575bf0ad269cd30481fbcf Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 7 Feb 2020 13:00:48 +0100 Subject: [PATCH 43/61] Notebook for PWGAN vocoder --- notebooks/Benchmark-PWGAN.ipynb | 578 ++++++++++++++++++++++++++++++++ 1 file changed, 578 insertions(+) create mode 100644 notebooks/Benchmark-PWGAN.ipynb diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb new file mode 100644 index 00000000..430d329f --- /dev/null +++ b/notebooks/Benchmark-PWGAN.ipynb @@ -0,0 +1,578 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is to test TTS models with benchmark sentences for speech synthesis.\n", + "\n", + "Before running this script please DON'T FORGET: \n", + "- to set file paths.\n", + "- to download related model files from TTS and PWGAN.\n", + "- download or clone related repos, linked below.\n", + "- setup the repositories. ```python setup.py install```\n", + "- to checkout right commit versions (given next to the model) of TTS and PWGAN.\n", + "- to set the right paths in the cell below.\n", + "\n", + "Repositories:\n", + "- TTS: https://github.com/mozilla/TTS\n", + "- PWGAN: https://github.com/erogol/ParallelWaveGAN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import os\n", + "import sys\n", + "import io\n", + "import torch \n", + "import time\n", + "import json\n", + "import yaml\n", + "import numpy as np\n", + "from collections import OrderedDict\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams[\"figure.figsize\"] = (16,5)\n", + "\n", + "import librosa\n", + "import librosa.display\n", + "\n", + "from TTS.models.tacotron import Tacotron \n", + "from TTS.layers import *\n", + "from TTS.utils.data import *\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.generic_utils import load_config, setup_model\n", + "from TTS.utils.text import text_to_sequence\n", + "from TTS.utils.synthesis import synthesis\n", + "from TTS.utils.visual import visualize\n", + "\n", + "import IPython\n", + "from IPython.display import Audio\n", + "\n", + "import os\n", + "\n", + "# you may need to change this depending on your system\n", + "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", + " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " # coorect the normalization differences b/w TTS and the Vocoder.\n", + " 
mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", + "# mel_postnet_spec = np.pad(mel_postnet_spec, pad_width=((2, 2), (0, 0)))\n", + " print(mel_postnet_spec.shape)\n", + " print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(ap_vocoder._normalize(mel_postnet_spec).T).unsqueeze(0), hop_size=ap_vocoder.hop_length)\n", + "# waveform = waveform / abs(waveform).max() * 0.9\n", + " if use_cuda:\n", + " waveform = waveform.cpu()\n", + " waveform = waveform.numpy()\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " if figures: \n", + " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec)) \n", + " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n", + " os.makedirs(OUT_FOLDER, exist_ok=True)\n", + " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", + " out_path = os.path.join(OUT_FOLDER, file_name)\n", + " ap.save_wav(waveform, out_path)\n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set constants\n", + "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n", + "MODEL_PATH = ROOT_PATH + '/checkpoint_670000.pth.tar'\n", + "CONFIG_PATH = ROOT_PATH + '/config.json'\n", + "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", + "CONFIG = load_config(CONFIG_PATH)\n", + "VOCODER_MODEL_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/checkpoint-400000steps.pkl\"\n", + "VOCODER_CONFIG_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/config.yml\"\n", + "\n", + "# load PWGAN config\n", + "with open(VOCODER_CONFIG_PATH) as f:\n", + " VOCODER_CONFIG = yaml.load(f, Loader=yaml.Loader)\n", + " \n", + "# Run FLAGs\n", + "use_cuda = False\n", + "# Set some config fields manually for testing\n", + "CONFIG.windowing = True\n", + "CONFIG.use_forward_attn = True \n", + "# Set the vocoder\n", + "use_gl = False # use GL if True\n", + "batched_wavernn = True # use batched wavernn inference if True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# LOAD TTS MODEL\n", + "from TTS.utils.text.symbols import symbols, phonemes\n", + "\n", + "# multi speaker \n", + "if CONFIG.use_speaker_embedding:\n", + " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", + " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", + "else:\n", + " speakers = []\n", + " speaker_id = None\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, len(speakers), CONFIG)\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**CONFIG.audio) \n", + "\n", + "\n", + "# load model state\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "\n", + "# load the model\n", + "model.load_state_dict(cp['model'])\n", + "if use_cuda:\n", + " model.cuda()\n", + "model.eval()\n", + "print(cp['step'])\n", + "print(cp['r'])\n", + "\n", + "# set model stepsize\n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# LOAD 
WAVERNN\n", + "if use_gl == False:\n", + " from parallel_wavegan.models import ParallelWaveGANGenerator\n", + " from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder\n", + " \n", + " vocoder_model = ParallelWaveGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n", + " vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n", + " vocoder_model.remove_weight_norm()\n", + " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG['audio']) \n", + " if use_cuda:\n", + " vocoder_model.cuda()\n", + " vocoder_model.eval();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparision with https://mycroft.ai/blog/available-voices/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "model.decoder.max_decoder_steps = 2000\n", + "model.decoder.prenet.eval()\n", + "speaker_id = None\n", + "sentence = '''A breeding jennet, lusty, young, and proud,'''\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### https://espnet.github.io/icassp2020-tts/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The Commission also recommends\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other examples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": 
[ + "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The human voice is the most perfect instrument of all.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"This cake is great. It's so delicious and moist.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparison with https://keithito.github.io/audio-samples/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \" He has read the whole thing.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"He reads books.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Thisss isrealy awhsome.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser, Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Eren, how are you?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hard Sentences" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Encouraged, he started with a minute a day.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . 
\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"If he decided to watch TV he really watched it.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for twb dataset\n", + "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 2cec58320bed7ec3c7070a8465b58e4f4c6de98a Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 7 Feb 2020 14:21:57 +0100 Subject: [PATCH 44/61] use decorater for torch.no_grad --- train.py | 224 +++++++++++++++++++++++++++---------------------------- 1 file changed, 112 insertions(+), 112 deletions(-) diff --git a/train.py b/train.py index f52d24c1..b9f5fefb 100644 --- a/train.py +++ b/train.py @@ -327,6 +327,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, return keep_avg['avg_postnet_loss'], global_step +@torch.no_grad() def evaluate(model, criterion, criterion_st, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=True) if c.use_speaker_embedding: @@ -346,125 +347,124 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): keep_avg.add_values(eval_values_dict) print("\n > Validation") - with torch.no_grad(): - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() + if data_loader is not None: + for num_iter, data in enumerate(data_loader): + start_time = time.time() - # format data - text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data) - assert mel_input.shape[1] % model.decoder.r == 0 + # format data + text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data) + assert mel_input.shape[1] % model.decoder.r == 0 - # forward pass model - if c.bidirectional_decoder: - decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) - else: - decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + # forward pass model + if c.bidirectional_decoder: + decoder_output, postnet_output, alignments, stop_tokens, 
decoder_backward_output, alignments_backward = model( + text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + else: + decoder_output, postnet_output, alignments, stop_tokens = model( + text_input, text_lengths, mel_input, speaker_ids=speaker_ids) - # loss computation - stop_loss = criterion_st( - stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) - if c.loss_masking: - decoder_loss = criterion(decoder_output, mel_input, - mel_lengths) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input, - mel_lengths) - else: - postnet_loss = criterion(postnet_output, mel_input, - mel_lengths) - else: - decoder_loss = criterion(decoder_output, mel_input) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input) - else: - postnet_loss = criterion(postnet_output, mel_input) - loss = decoder_loss + postnet_loss + stop_loss - - # backward decoder loss - if c.bidirectional_decoder: - if c.loss_masking: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) - else: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input) - decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output) - loss += decoder_backward_loss + decoder_c_loss - keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()}) - - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_score = alignment_diagonal_score(alignments) - keep_avg.update_value('avg_align_score', align_score) - - # aggregate losses from processes - if num_gpus > 1: - postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) - decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) - if c.stopnet: - stop_loss = reduce_tensor(stop_loss.data, num_gpus) - - keep_avg.update_values({ - 'avg_postnet_loss': - float(postnet_loss.item()), - 'avg_decoder_loss': - float(decoder_loss.item()), - 'avg_stop_loss': - float(stop_loss.item()), - }) - - if num_iter % c.print_step == 0: - print( - " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " - "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}" - .format(loss.item(), postnet_loss.item(), - keep_avg['avg_postnet_loss'], - decoder_loss.item(), - keep_avg['avg_decoder_loss'], stop_loss.item(), - keep_avg['avg_stop_loss'], align_score, - keep_avg['avg_align_score']), - flush=True) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_input.shape[0]) - const_spec = postnet_output[idx].data.cpu().numpy() - gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ - "Tacotron", "TacotronGST" - ] else mel_input[idx].data.cpu().numpy() - align_img = alignments[idx].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img) - } - - # Sample audio + # loss computation + stop_loss = criterion_st( + stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) + if c.loss_masking: + decoder_loss = criterion(decoder_output, mel_input, + mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: - eval_audio = ap.inv_spectrogram(const_spec.T) + postnet_loss = criterion(postnet_output, linear_input, + mel_lengths) else: - eval_audio = ap.inv_mel_spectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, 
{"ValAudio": eval_audio}, - c.audio["sample_rate"]) + postnet_loss = criterion(postnet_output, mel_input, + mel_lengths) + else: + decoder_loss = criterion(decoder_output, mel_input) + if c.model in ["Tacotron", "TacotronGST"]: + postnet_loss = criterion(postnet_output, linear_input) + else: + postnet_loss = criterion(postnet_output, mel_input) + loss = decoder_loss + postnet_loss + stop_loss - # Plot Validation Stats - epoch_stats = { - "loss_postnet": keep_avg['avg_postnet_loss'], - "loss_decoder": keep_avg['avg_decoder_loss'], - "stop_loss": keep_avg['avg_stop_loss'], - "alignment_score": keep_avg['avg_align_score'] - } + # backward decoder loss + if c.bidirectional_decoder: + if c.loss_masking: + decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) + else: + decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input) + decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output) + loss += decoder_backward_loss + decoder_c_loss + keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()}) - if c.bidirectional_decoder: - epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss'] - align_b_img = alignments_backward[idx].data.cpu().numpy() - eval_figures['alignment_backward'] = plot_alignment(align_b_img) - tb_logger.tb_eval_stats(global_step, epoch_stats) - tb_logger.tb_eval_figures(global_step, eval_figures) + step_time = time.time() - start_time + epoch_time += step_time + + # compute alignment score + align_score = alignment_diagonal_score(alignments) + keep_avg.update_value('avg_align_score', align_score) + + # aggregate losses from processes + if num_gpus > 1: + postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) + decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) + if c.stopnet: + stop_loss = reduce_tensor(stop_loss.data, num_gpus) + + keep_avg.update_values({ + 'avg_postnet_loss': + float(postnet_loss.item()), + 'avg_decoder_loss': + float(decoder_loss.item()), + 'avg_stop_loss': + float(stop_loss.item()), + }) + + if num_iter % c.print_step == 0: + print( + " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " + "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}" + .format(loss.item(), postnet_loss.item(), + keep_avg['avg_postnet_loss'], + decoder_loss.item(), + keep_avg['avg_decoder_loss'], stop_loss.item(), + keep_avg['avg_stop_loss'], align_score, + keep_avg['avg_align_score']), + flush=True) + + if args.rank == 0: + # Diagnostic visualizations + idx = np.random.randint(mel_input.shape[0]) + const_spec = postnet_output[idx].data.cpu().numpy() + gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ + "Tacotron", "TacotronGST" + ] else mel_input[idx].data.cpu().numpy() + align_img = alignments[idx].data.cpu().numpy() + + eval_figures = { + "prediction": plot_spectrogram(const_spec, ap), + "ground_truth": plot_spectrogram(gt_spec, ap), + "alignment": plot_alignment(align_img) + } + + # Sample audio + if c.model in ["Tacotron", "TacotronGST"]: + eval_audio = ap.inv_spectrogram(const_spec.T) + else: + eval_audio = ap.inv_mel_spectrogram(const_spec.T) + tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, + c.audio["sample_rate"]) + + # Plot Validation Stats + epoch_stats = { + "loss_postnet": keep_avg['avg_postnet_loss'], + "loss_decoder": keep_avg['avg_decoder_loss'], + "stop_loss": keep_avg['avg_stop_loss'], 
+ "alignment_score": keep_avg['avg_align_score'] + } + + if c.bidirectional_decoder: + epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss'] + align_b_img = alignments_backward[idx].data.cpu().numpy() + eval_figures['alignment_backward'] = plot_alignment(align_b_img) + tb_logger.tb_eval_stats(global_step, epoch_stats) + tb_logger.tb_eval_figures(global_step, eval_figures) if args.rank == 0 and epoch > c.test_delay_epochs: if c.test_sentences_file is None: From cf6e16254fc683a4fb487812d2d7653571a1bc2f Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 10:29:30 +0100 Subject: [PATCH 45/61] add torch.no_grad decorator for inference --- models/tacotron.py | 1 + models/tacotron2.py | 1 + 2 files changed, 2 insertions(+) diff --git a/models/tacotron.py b/models/tacotron.py index a2d9e1c4..04ecd573 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -132,6 +132,7 @@ class Tacotron(nn.Module): return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward return decoder_outputs, postnet_outputs, alignments, stop_tokens + @torch.no_grad() def inference(self, characters, speaker_ids=None, style_mel=None): inputs = self.embedding(characters) self._init_states() diff --git a/models/tacotron2.py b/models/tacotron2.py index 852b1886..3a3863de 100644 --- a/models/tacotron2.py +++ b/models/tacotron2.py @@ -82,6 +82,7 @@ class Tacotron2(nn.Module): return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward return decoder_outputs, postnet_outputs, alignments, stop_tokens + @torch.no_grad() def inference(self, text, speaker_ids=None): embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) From 9aacd2ee0ab181eb183800aca70ff9a16a3bd275 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 10:32:52 +0100 Subject: [PATCH 46/61] linter fix --- train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index b9f5fefb..e8c240f3 100644 --- a/train.py +++ b/train.py @@ -368,13 +368,13 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, - mel_lengths) + mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input, - mel_lengths) + mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, - mel_lengths) + mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: @@ -449,7 +449,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, - c.audio["sample_rate"]) + c.audio["sample_rate"]) # Plot Validation Stats epoch_stats = { From 60379271dc4531bd5f8b4ad4865b1cbbb1d87f4f Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 12:21:53 +0100 Subject: [PATCH 47/61] update for phonemizer 2.1 --- tests/test_text_processing.py | 4 ++-- utils/text/__init__.py | 43 +++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 8f8e6fab..0ecb9962 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -69,7 +69,7 @@ def test_phoneme_to_sequence(): def 
test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" - gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i|| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n||| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" + gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" lang = "en-us" phonemes = text2phone(text, lang) - assert gt == phonemes + assert gt == phonemes, f"\n{phonemes} \n vs \n{gt}" diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 1c5b98c3..e6842dfa 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -28,21 +28,34 @@ def text2phone(text, language): seperator = phonemizer.separator.Separator(' |', '', '|') #try: punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text) - ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language) - ph = ph[:-1].strip() # skip the last empty character - # Replace \n with matching punctuations. - if punctuations: - # if text ends with a punctuation. - if text[-1] == punctuations[-1]: - for punct in punctuations[:-1]: - ph = ph.replace('| |\n', '|'+punct+'| |', 1) - try: - ph = ph + punctuations[-1] - except: - print(text) - else: - for punct in punctuations: - ph = ph.replace('| |\n', '|'+punct+'| |', 1) + if float(phonemizer.__version__) < 2.1: + ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language) + ph = ph[:-1].strip() # skip the last empty character + # phonemizer does not tackle punctuations. Here we do. + # Replace \n with matching punctuations. + if punctuations: + # if text ends with a punctuation. + if text[-1] == punctuations[-1]: + for punct in punctuations[:-1]: + ph = ph.replace('| |\n', '|'+punct+'| |', 1) + try: + ph = ph + punctuations[-1] + except: + print(text) + else: + for punct in punctuations: + ph = ph.replace('| |\n', '|'+punct+'| |', 1) + elif float(phonemizer.__version__) == 2.1: + ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, preserve_punctuation=True) + # this is a simple fix for phonemizer. + # https://github.com/bootphon/phonemizer/issues/32 + if punctuations: + for punctuation in punctuations: + ph = ph.replace(f"| |{punctuation} ", f"|{punctuation}| |").replace(f"| |{punctuation}", f"|{punctuation}| |") + ph = ph[:-3] + else: + raise RuntimeError(" [!] 
Use 'phonemizer' version 2.1 or older.") + return ph From 3b57e88a66ba1f410be70dbd2ad2899b5b1bcb0e Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:49:46 +0100 Subject: [PATCH 48/61] Use PWGAN if available in Synthesizer.tts --- server/synthesizer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 75fd4e76..455bd332 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -168,9 +168,16 @@ class Synthesizer(object): postnet_output, decoder_output, _ = parse_outputs( postnet_output, decoder_output, alignments) + if self.pwgan: + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length) if self.wavernn: - postnet_output = postnet_output[0].data.cpu().numpy() - wav = self.wavernn.generate(torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550) + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550) else: wav = inv_spectrogram(postnet_output, self.ap, self.tts_config) # trim silence From bfd45a8ea900603f7a8e231b9c50ea4506bf9eb9 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:54:30 +0100 Subject: [PATCH 49/61] Load PWGAN/WaveRNN embedded files if present --- server/server.py | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/server/server.py b/server/server.py index 6af119bf..705937e2 100644 --- a/server/server.py +++ b/server/server.py @@ -18,9 +18,9 @@ def create_argparser(): parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') - parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') - parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') + parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. 
If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') @@ -29,28 +29,35 @@ def create_argparser(): synthesizer = None -embedded_model_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -checkpoint_file = os.path.join(embedded_model_folder, 'checkpoint.pth.tar') -config_file = os.path.join(embedded_model_folder, 'config.json') +embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -# Default options with embedded model files -if os.path.isfile(checkpoint_file): - default_tts_checkpoint = checkpoint_file -else: - default_tts_checkpoint = None +embedded_tts_folder = os.path.join(embedded_models_folder, 'tts') +tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar') +tts_config_file = os.path.join(embedded_tts_folder, 'config.json') -if os.path.isfile(config_file): - default_tts_config = config_file -else: - default_tts_config = None +embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn') +wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar') +wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json') + +embedded_pwgan_folder = os.path.join(embedded_models_folder, 'pwgan') +pwgan_checkpoint_file = os.path.join(embedded_pwgan_folder, 'checkpoint.pkl') +pwgan_config_file = os.path.join(embedded_pwgan_folder, 'config.yml') args = create_argparser().parse_args() -# If these were not specified in the CLI args, use default values -if not args.tts_checkpoint: - args.tts_checkpoint = default_tts_checkpoint -if not args.tts_config: - args.tts_config = default_tts_config +# If these were not specified in the CLI args, use default values with embedded model files +if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file): + args.tts_checkpoint = tts_checkpoint_file +if not args.tts_config and os.path.isfile(tts_config_file): + args.tts_config = tts_config_file +if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file): + args.wavernn_file = wavernn_checkpoint_file +if not args.wavernn_config and os.path.isfile(wavernn_config_file): + args.wavernn_config = wavernn_config_file +if not args.pwgan_file and os.path.isfile(pwgan_checkpoint_file): + args.pwgan_file = pwgan_checkpoint_file +if not args.pwgan_config and os.path.isfile(pwgan_config_file): + args.pwgan_config = pwgan_config_file synthesizer = Synthesizer(args) From 846d147a66e39bd6a817027a806459376936a60e Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 16:03:30 +0100 Subject: [PATCH 50/61] Fix bug where sometimes the second sentence disappears if it doesn't end with punctuation --- server/synthesizer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 455bd332..1082b73a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -122,7 +122,7 @@ class Synthesizer(object): self.ap.save_wav(wav, path) def split_into_sentences(self, text): - text = " " + text + " " + text = " " 
+ text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1", text) text = re.sub(websites, "\\1", text) @@ -149,15 +149,13 @@ class Synthesizer(object): text = text.replace("", ".") sentences = text.split("") sentences = sentences[:-1] - sentences = [s.strip() for s in sentences] + sentences = list(filter(None, [s.strip() for s in sentences])) # remove empty sentences return sentences def tts(self, text): wavs = [] sens = self.split_into_sentences(text) print(sens) - if not sens: - sens = [text+'.'] for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) From e40bc18c84ba16456ad9f3f7529f76ffc568b6b2 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 13 Feb 2020 17:23:37 +0100 Subject: [PATCH 51/61] fix linter problems and loader test --- tests/test_loader.py | 4 +--- tests/test_text_processing.py | 4 ++-- utils/text/__init__.py | 3 --- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/test_loader.py b/tests/test_loader.py index 751bc181..d8727895 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -137,9 +137,7 @@ class TestTTSDataset(unittest.TestCase): # NOTE: Below needs to check == 0 but due to an unknown reason # there is a slight difference between two matrices. # TODO: Check this assert cond more in detail. - assert abs((abs(mel.T) - - abs(mel_dl) - ).sum()) < 1e-5, (abs(mel.T) - abs(mel_dl)).sum() + assert abs(mel.T - mel_dl).max() < 1e-5, abs(mel.T - mel_dl).max() # check mel-spec correctness mel_spec = mel_input[0].cpu().numpy() diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 0ecb9962..aa17f694 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -71,5 +71,5 @@ def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" 
lang = "en-us" - phonemes = text2phone(text, lang) - assert gt == phonemes, f"\n{phonemes} \n vs \n{gt}" + ph = text2phone(text, lang) + assert gt == ph, f"\n{phonemes} \n vs \n{gt}" diff --git a/utils/text/__init__.py b/utils/text/__init__.py index e6842dfa..0e6684d2 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -38,10 +38,7 @@ def text2phone(text, language): if text[-1] == punctuations[-1]: for punct in punctuations[:-1]: ph = ph.replace('| |\n', '|'+punct+'| |', 1) - try: ph = ph + punctuations[-1] - except: - print(text) else: for punct in punctuations: ph = ph.replace('| |\n', '|'+punct+'| |', 1) From d97eb9f7839635c1063bbb0b6d854b582b98c6e8 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 17:30:41 +0100 Subject: [PATCH 52/61] Fix linter and server package test --- server/synthesizer.py | 3 ++- setup.py | 7 ++++--- tests/test_server_package.sh | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 1082b73a..fcdc8787 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -121,7 +121,8 @@ class Synthesizer(object): wav = np.array(wav) self.ap.save_wav(wav, path) - def split_into_sentences(self, text): + @staticmethod + def split_into_sentences(text): text = " " + text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1", text) diff --git a/setup.py b/setup.py index 63782800..f92dac8a 100644 --- a/setup.py +++ b/setup.py @@ -61,10 +61,11 @@ package_data = ['server/templates/*'] if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config: print('Embedding model in wheel file...') model_dir = os.path.join('server', 'model') - os.makedirs(model_dir, exist_ok=True) - embedded_checkpoint_path = os.path.join(model_dir, 'checkpoint.pth.tar') + tts_dir = os.path.join(model_dir, 'tts') + os.makedirs(tts_dir, exist_ok=True) + embedded_checkpoint_path = os.path.join(tts_dir, 'checkpoint.pth.tar') shutil.copy(args.checkpoint, embedded_checkpoint_path) - embedded_config_path = os.path.join(model_dir, 'config.json') + embedded_config_path = os.path.join(tts_dir, 'config.json') shutil.copy(args.model_config, embedded_config_path) package_data.extend([embedded_checkpoint_path, embedded_config_path]) diff --git a/tests/test_server_package.sh b/tests/test_server_package.sh index 01e42843..9fe5e8b1 100755 --- a/tests/test_server_package.sh +++ b/tests/test_server_package.sh @@ -11,7 +11,7 @@ source /tmp/venv/bin/activate pip install --quiet --upgrade pip setuptools wheel rm -f dist/*.whl -python setup.py bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json +python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json pip install --quiet dist/TTS*.whl python -m TTS.server.server & From 2079097183f6355b0394e85c811aec830f65686d Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 13 Feb 2020 22:16:40 +0100 Subject: [PATCH 53/61] check config with a function --- config.json | 9 +-- train.py | 3 +- utils/generic_utils.py | 128 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 5 deletions(-) diff --git a/config.json b/config.json index 9e4fa906..c1a8158d 100644 --- a/config.json +++ b/config.json @@ -9,7 +9,7 @@ "num_mels": 80, // size of the mel spec frame. "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. 
If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. + "frame_length_ms": 50.0, // stft window length in ms. "frame_shift_ms": 12.5, // stft window hop-lengh in ms. "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "min_level_db": -100, // normalization range @@ -19,7 +19,7 @@ // Normalization parameters "signal_norm": true, // normalize the spec values in range [0, 1] "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! @@ -36,11 +36,12 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":16, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. + "grad_accum": 2, // if N > 1, enable gradient accumulation for N iterations. It is useful for low memory GPUs. // VALIDATION "run_eval": true, @@ -49,7 +50,7 @@ // OPTIMIZER "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1, // upper limit for gradients for clipping. + "grad_clip": 1.0, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "wd": 0.000001, // Weight decay weight. 
diff --git a/train.py b/train.py index e8c240f3..7bfb8751 100644 --- a/train.py +++ b/train.py @@ -20,7 +20,7 @@ from TTS.utils.generic_utils import ( get_git_branch, load_config, remove_experiment_folder, save_best_model, save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file, setup_model, gradual_training_scheduler, KeepAverage, - set_weight_decay) + set_weight_decay, check_config) from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers @@ -687,6 +687,7 @@ if __name__ == '__main__': # setup output paths and read configs c = load_config(args.config_path) + check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path diff --git a/utils/generic_utils.py b/utils/generic_utils.py index cf1a83a6..7a5c2ac2 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -389,3 +389,131 @@ class KeepAverage(): def update_values(self, value_dict): for key, value in value_dict.items(): self.update_value(key, value) + + +def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None): + if restricted: + assert name in c.keys(), f' [!] {name} not defined in config.json' + if name in c.keys(): + if max_val: + assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}' + if min_val: + assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}' + if enum_list: + assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' + if val_type: + assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' + + + +def check_config(c): + _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) + _check_argument('run_name', c, restricted=True, val_type=str) + _check_argument('run_description', c, val_type=str) + + # AUDIO + _check_argument('audio', c, restricted=True, val_type=dict) + + # audio processing parameters + _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) + _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) + _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000) + _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000) + _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) + _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) + _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) + _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) + _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + + # normalization parameters + _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) + _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) + _check_argument('mel_fmax', 
c['audio'], restricted=True, val_type=float, min_val=500.0) + _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) + _check_argument('trim_db', c['audio'], restricted=True, val_type=int) + + # training parameters + _check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('r', c, restricted=True, val_type=int, min_val=1) + _check_argument('gradual_training', c, restricted=False, val_type=list) + _check_argument('loss_masking', c, restricted=True, val_type=bool) + _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + + # validation parameters + _check_argument('run_eval', c, restricted=True, val_type=bool) + _check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) + _check_argument('test_sentences_file', c, restricted=False, val_type=str) + + # optimizer + _check_argument('noam_schedule', c, restricted=False, val_type=bool) + _check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) + _check_argument('epochs', c, restricted=True, val_type=int, min_val=1) + _check_argument('lr', c, restricted=True, val_type=float, min_val=0) + _check_argument('wd', c, restricted=True, val_type=float, min_val=0) + _check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) + _check_argument('seq_len_norm', c, restricted=True, val_type=bool) + + # tacotron prenet + _check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1) + _check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn']) + _check_argument('prenet_dropout', c, restricted=True, val_type=bool) + + # attention + _check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original']) + _check_argument('attention_heads', c, restricted=True, val_type=int) + _check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax']) + _check_argument('windowing', c, restricted=True, val_type=bool) + _check_argument('use_forward_attn', c, restricted=True, val_type=bool) + _check_argument('forward_attn_mask', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('location_attn', c, restricted=True, val_type=bool) + _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) + + # stopnet + _check_argument('stopnet', c, restricted=True, val_type=bool) + _check_argument('separate_stopnet', c, restricted=True, val_type=bool) + + # tensorboard + _check_argument('print_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('save_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('checkpoint', c, restricted=True, val_type=bool) + _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) + + # dataloading + _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=['english_cleaners', 'phoneme_cleaners', 'transliteration_cleaners', 'basic_cleaners']) + _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) + _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) + 
_check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) + _check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) + + # paths + _check_argument('output_path', c, restricted=True, val_type=str) + + # multi-speaker gst + _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) + _check_argument('style_wav_for_test', c, restricted=True, val_type=str) + _check_argument('use_gst', c, restricted=True, val_type=bool) + + # datasets - checking only the first entry + _check_argument('datasets', c, restricted=True, val_type=list) + for dataset_entry in c['datasets']: + _check_argument('name', dataset_entry, restricted=True, val_type=str) + _check_argument('path', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) + + + + + + + + From 0c7c34c12c1ff05c1205a0299adef5c446322088 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 17:47:33 +0100 Subject: [PATCH 54/61] remove grad_accum from config checker --- utils/generic_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 7a5c2ac2..942fedf9 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -405,7 +405,6 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) _check_argument('run_name', c, restricted=True, val_type=str) @@ -442,7 +441,7 @@ def check_config(c): _check_argument('r', c, restricted=True, val_type=int, min_val=1) _check_argument('gradual_training', c, restricted=False, val_type=list) _check_argument('loss_masking', c, restricted=True, val_type=bool) - _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + # _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) # validation parameters _check_argument('run_eval', c, restricted=True, val_type=bool) From ecf84fa4ad6e13df623e0c746f28a431d2953724 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 18:00:15 +0100 Subject: [PATCH 55/61] linter fixes --- utils/generic_utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 942fedf9..a8de5bbb 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -402,8 +402,8 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric if enum_list: assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' if val_type: - assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - + assert isinstance(c[name], val_type) or c[name] is None, f' [!] 
{name} has wrong type - {type(c[name])} vs {val_type}' + def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) @@ -507,12 +507,4 @@ def check_config(c): _check_argument('name', dataset_entry, restricted=True, val_type=str) _check_argument('path', dataset_entry, restricted=True, val_type=str) _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) - _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) - - - - - - - - + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) \ No newline at end of file From 2b1fb6cb12684b726699b93c5fd9245b43641fcd Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 17 Feb 2020 16:05:05 +0100 Subject: [PATCH 56/61] add mozilla german --- datasets/preprocess.py | 32 ++++++++++++++++---------------- layers/tacotron2.py | 1 - 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index a78abab9..64efc665 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -60,22 +60,6 @@ def tweb(root_path, meta_file): # return {'text': texts, 'wavs': wavs} -def mozilla_old(root_path, meta_file): - """Normalizes Mozilla meta data files to TTS format""" - txt_file = os.path.join(root_path, meta_file) - items = [] - speaker_name = "mozilla_old" - with open(txt_file, 'r') as ttf: - for line in ttf: - cols = line.split('|') - batch_no = int(cols[1].strip().split("_")[0]) - wav_folder = "batch{}".format(batch_no) - wav_file = os.path.join(root_path, wav_folder, "wavs_no_processing", cols[1].strip()) - text = cols[0].strip() - items.append([text, wav_file, speaker_name]) - return items - - def mozilla(root_path, meta_file): """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) @@ -91,6 +75,22 @@ def mozilla(root_path, meta_file): return items +def mozilla_de(root_path, meta_file): + """Normalizes Mozilla meta data files to TTS format""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "mozilla" + with open(txt_file, 'r', encoding="ISO 8859-1") as ttf: + for line in ttf: + cols = line.strip().split('|') + wav_file = cols[0].strip() + text = cols[1].strip() + folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL" + wav_file = os.path.join(root_path, folder_name, wav_file) + items.append([text, wav_file, speaker_name]) + return items + + def mailabs(root_path, meta_files=None): """Normalizes M-AI-Labs meta data files to TTS format""" speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") diff --git a/layers/tacotron2.py b/layers/tacotron2.py index c195b277..fa76a6b2 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -64,7 +64,6 @@ class Encoder(nn.Module): def forward(self, x, input_lengths): x = self.convolutions(x) x = x.transpose(1, 2) - input_lengths = input_lengths.cpu().numpy() x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True) From 8feb326a60fce455ba439d4e1fb7bf0e66642bd4 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Sun, 1 Mar 2020 15:47:08 -0300 Subject: [PATCH 57/61] add text parameters in config.json --- config.json | 10 +++++++ datasets/TTSDataset.py | 9 ++++-- notebooks/Benchmark-PWGAN.ipynb | 6 +++- notebooks/Benchmark.ipynb | 6 +++- notebooks/ExtractTTSpectrogram.ipynb | 8 ++++-- notebooks/TestAttention.ipynb | 6 +++- server/synthesizer.py | 9 +++++- synthesize.py | 8 +++++- tests/test_demo_server.py | 5 +++- tests/test_loader.py | 1 + train.py | 
8 ++++-- utils/generic_utils.py | 9 ++++++ utils/synthesis.py | 5 ++-- utils/text/__init__.py | 41 +++++++++++++++++++++++----- utils/text/symbols.py | 21 +++++++++----- utils/visual.py | 5 ++-- 16 files changed, 126 insertions(+), 31 deletions(-) diff --git a/config.json b/config.json index c1a8158d..2a7c4551 100644 --- a/config.json +++ b/config.json @@ -27,6 +27,16 @@ "trim_db": 60 // threshold for timming silence. Set this according to your dataset. }, + // VOCABULARY PARAMETERS + "text":{ + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + "punctuations":"!'(),-.:;? ", + "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + }, + // DISTRIBUTED TRAINING "distributed":{ "backend": "nccl", diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index a45d77ff..cccd65a2 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -15,6 +15,7 @@ class MyDataset(Dataset): text_cleaner, ap, meta_data, + tp=None, batch_group_size=0, min_seq_len=0, max_seq_len=float("inf"), @@ -49,6 +50,7 @@ class MyDataset(Dataset): self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap + self.tp = tp self.use_phonemes = use_phonemes self.phoneme_cache_path = phoneme_cache_path self.phoneme_language = phoneme_language @@ -81,7 +83,8 @@ class MyDataset(Dataset): config option.""" phonemes = phoneme_to_sequence(text, [self.cleaners], language=self.phoneme_language, - enable_eos_bos=False) + enable_eos_bos=False, + tp=self.tp) phonemes = np.asarray(phonemes, dtype=np.int32) np.save(cache_path, phonemes) return phonemes @@ -101,7 +104,7 @@ class MyDataset(Dataset): phonemes = self._generate_and_cache_phoneme_sequence(text, cache_path) if self.enable_eos_bos: - phonemes = pad_with_eos_bos(phonemes) + phonemes = pad_with_eos_bos(phonemes, tp=self.tp) phonemes = np.asarray(phonemes, dtype=np.int32) return phonemes @@ -113,7 +116,7 @@ class MyDataset(Dataset): text = self._load_or_generate_phoneme_sequence(wav_file, text) else: text = np.asarray( - text_to_sequence(text, [self.cleaners]), dtype=np.int32) + text_to_sequence(text, [self.cleaners], tp=self.tp), dtype=np.int32) assert text.size > 0, self.items[idx][1] assert wav.size > 0, self.items[idx][1] diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb index 430d329f..4a2a21d7 100644 --- a/notebooks/Benchmark-PWGAN.ipynb +++ b/notebooks/Benchmark-PWGAN.ipynb @@ -132,7 +132,7 @@ "outputs": [], "source": [ "# LOAD TTS MODEL\n", - "from TTS.utils.text.symbols import symbols, phonemes\n", + "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "# multi speaker \n", "if CONFIG.use_speaker_embedding:\n", @@ -142,6 +142,10 @@ " speakers = []\n", " speaker_id = None\n", "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'text' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", "model = setup_model(num_chars, len(speakers), CONFIG)\n", diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index 00ac7d16..528d7a3b 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -65,7 +65,7 @@ "from TTS.utils.text import text_to_sequence\n", "from TTS.utils.synthesis import synthesis\n", "from TTS.utils.visual import visualize\n", - "from TTS.utils.text.symbols import symbols, 
phonemes\n", + "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "import IPython\n", "from IPython.display import Audio\n", @@ -149,6 +149,10 @@ " speakers = []\n", " speaker_id = None\n", "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'text' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", "model = setup_model(num_chars, len(speakers), CONFIG)\n", diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index 20038f78..2313e47e 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -37,7 +37,7 @@ "from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.visual import plot_spectrogram\n", "from TTS.utils.generic_utils import load_config, setup_model, sequence_mask\n", - "from TTS.utils.text.symbols import symbols, phonemes\n", + "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "%matplotlib inline\n", "\n", @@ -94,6 +94,10 @@ "metadata": {}, "outputs": [], "source": [ + "# if the vocabulary was passed, replace the default\n", + "if 'text' in C.keys():\n", + " symbols, phonemes = make_symbols(**C.text)\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", "# TODO: multiple speaker\n", @@ -116,7 +120,7 @@ "preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.text if 'text' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index a1867d13..5310fb92 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -100,7 +100,7 @@ "outputs": [], "source": [ "# LOAD TTS MODEL\n", - "from TTS.utils.text.symbols import symbols, phonemes\n", + "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "# multi speaker \n", "if CONFIG.use_speaker_embedding:\n", @@ -110,6 +110,10 @@ " speakers = []\n", " speaker_id = None\n", "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'text' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", "model = setup_model(num_chars, len(speakers), CONFIG)\n", diff --git a/server/synthesizer.py b/server/synthesizer.py index 347bef21..f001afcd 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -10,7 +10,7 @@ from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config, setup_model from TTS.utils.speakers import load_speaker_mapping from TTS.utils.synthesis import * -from TTS.utils.text import phonemes, symbols +from TTS.utils.text import make_symbols, phonemes, symbols alphabets = r"([A-Za-z])" prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]" @@ -38,12 
+38,19 @@ class Synthesizer(object): self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): + global symbols, phonemes + print(" > Loading TTS model ...") print(" | > model config: ", tts_config) print(" | > checkpoint file: ", tts_checkpoint) + self.tts_config = load_config(tts_config) self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) + + if 'text' in self.tts_config.keys(): + symbols, phonemes = make_symbols(**self.tts_config.text) + if self.use_phonemes: self.input_size = len(phonemes) else: diff --git a/synthesize.py b/synthesize.py index bf85d7c9..d294701f 100644 --- a/synthesize.py +++ b/synthesize.py @@ -8,7 +8,7 @@ import string from TTS.utils.synthesis import synthesis from TTS.utils.generic_utils import load_config, setup_model -from TTS.utils.text.symbols import symbols, phonemes +from TTS.utils.text.symbols import make_symbols, symbols, phonemes from TTS.utils.audio import AudioProcessor @@ -48,6 +48,8 @@ def tts(model, if __name__ == "__main__": + global symbols, phonemes + parser = argparse.ArgumentParser() parser.add_argument('text', type=str, help='Text to generate speech.') parser.add_argument('config_path', @@ -105,6 +107,10 @@ if __name__ == "__main__": # load the audio processor ap = AudioProcessor(**C.audio) + # if the vocabulary was passed, replace the default + if 'text' in C.keys(): + symbols, phonemes = make_symbols(**C.text) + # load speakers if args.speakers_json != '': speakers = json.load(open(args.speakers_json, 'r')) diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index c343a6a4..3e360e20 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -5,13 +5,16 @@ import torch as T from TTS.server.synthesizer import Synthesizer from TTS.tests import get_tests_input_path, get_tests_output_path -from TTS.utils.text.symbols import phonemes, symbols +from TTS.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model class DemoServerTest(unittest.TestCase): def _create_random_model(self): config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json')) + if 'text' in config.keys(): + symbols, phonemes = make_symbols(**config.text) + num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) output_path = os.path.join(get_tests_output_path()) diff --git a/tests/test_loader.py b/tests/test_loader.py index d8727895..5141fa85 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -38,6 +38,7 @@ class TestTTSDataset(unittest.TestCase): c.text_cleaner, ap=self.ap, meta_data=items, + tp=c.text if 'text' in c.keys() else None, batch_group_size=bgs, min_seq_len=c.min_seq_len, max_seq_len=float("inf"), diff --git a/train.py b/train.py index 7bfb8751..96c268f0 100644 --- a/train.py +++ b/train.py @@ -25,7 +25,7 @@ from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers from TTS.utils.synthesis import synthesis -from TTS.utils.text.symbols import phonemes, symbols +from TTS.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.visual import plot_alignment, plot_spectrogram from TTS.datasets.preprocess import load_meta_data from TTS.utils.radam import RAdam @@ -49,6 +49,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): c.text_cleaner, meta_data=meta_data_eval if is_val else meta_data_train, 
ap=ap, + tp=c.text if 'text' in c.keys() else None, batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, min_seq_len=c.min_seq_len, @@ -515,9 +516,12 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): # FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name - global meta_data_train, meta_data_eval + global meta_data_train, meta_data_eval, symbols, phonemes # Audio processor ap = AudioProcessor(**c.audio) + + if 'text' in c.keys(): + symbols, phonemes = make_symbols(**c.text) # DISTRUBUTED if num_gpus > 1: diff --git a/utils/generic_utils.py b/utils/generic_utils.py index a8de5bbb..6aecdc7d 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -425,6 +425,15 @@ def check_config(c): _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + # vocabulary parameters + _check_argument('text', c, restricted=False, val_type=dict) # parameter not mandatory + _check_argument('pad', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) # mandatory if "text parameters" else no mandatory + _check_argument('eos', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('bos', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('characters', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('phonemes', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('punctuations', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + # normalization parameters _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) diff --git a/utils/synthesis.py b/utils/synthesis.py index 79a17c78..c5ff2e70 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -9,10 +9,11 @@ def text_to_seqvec(text, CONFIG, use_cuda): if CONFIG.use_phonemes: seq = np.asarray( phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language, - CONFIG.enable_eos_bos_chars), + CONFIG.enable_eos_bos_chars, + tp=CONFIG.text if 'text' in CONFIG.keys() else None), dtype=np.int32) else: - seq = np.asarray(text_to_sequence(text, text_cleaner), dtype=np.int32) + seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.text if 'text' in CONFIG.keys() else None), dtype=np.int32) # torch tensor chars_var = torch.from_numpy(seq).unsqueeze(0) if use_cuda: diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 0e6684d2..fcb239b2 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -4,7 +4,7 @@ import re import phonemizer from phonemizer.phonemize import phonemize from TTS.utils.text import cleaners -from TTS.utils.text.symbols import symbols, phonemes, _phoneme_punctuations, _bos, \ +from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \ _eos # Mappings from symbol to numeric ID and vice versa: @@ -56,11 +56,23 @@ def text2phone(text, language): return ph -def pad_with_eos_bos(phoneme_sequence): +def pad_with_eos_bos(phoneme_sequence, tp=None): + global 
_PHONEMES_TO_ID, _bos, _eos + if tp: + _bos = tp['bos'] + _eos = tp['eos'] + _, phonemes = make_symbols(**tp) + _PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} + return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]] -def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False): +def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None): + global _PHONEMES_TO_ID + if tp: + _, phonemes = make_symbols(**tp) + _PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} + sequence = [] text = text.replace(":", "") clean_text = _clean_text(text, cleaner_names) @@ -72,13 +84,18 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False): sequence += _phoneme_to_sequence(phoneme) # Append EOS char if enable_eos_bos: - sequence = pad_with_eos_bos(sequence) + sequence = pad_with_eos_bos(sequence, tp=tp) return sequence -def sequence_to_phoneme(sequence): +def sequence_to_phoneme(sequence, tp=None): '''Converts a sequence of IDs back to a string''' + global _ID_TO_PHONEMES result = '' + if tp: + _, phonemes = make_symbols(**tp) + _ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)} + for symbol_id in sequence: if symbol_id in _ID_TO_PHONEMES: s = _ID_TO_PHONEMES[symbol_id] @@ -86,7 +103,7 @@ def sequence_to_phoneme(sequence): return result.replace('}{', ' ') -def text_to_sequence(text, cleaner_names): +def text_to_sequence(text, cleaner_names, tp=None): '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. The text can optionally have ARPAbet sequences enclosed in curly braces embedded @@ -99,6 +116,11 @@ def text_to_sequence(text, cleaner_names): Returns: List of integers corresponding to the symbols in the text ''' + global _SYMBOL_TO_ID + if tp: + symbols, _ = make_symbols(**tp) + _SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)} + sequence = [] # Check for curly braces and treat their contents as ARPAbet: while text: @@ -113,8 +135,13 @@ def text_to_sequence(text, cleaner_names): return sequence -def sequence_to_text(sequence): +def sequence_to_text(sequence, tp=None): '''Converts a sequence of IDs back to a string''' + global _ID_TO_SYMBOL + if tp: + symbols, _ = make_symbols(**tp) + _ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)} + result = '' for symbol_id in sequence: if symbol_id in _ID_TO_SYMBOL: diff --git a/utils/text/symbols.py b/utils/text/symbols.py index ee6fd2cf..e4a4b103 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -5,6 +5,18 @@ Defines the set of symbols used in text input to the model. The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' +def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? 
', pad='_', eos='~', bos='^'): + ''' Function to create symbols and phonemes ''' + _phonemes = sorted(list(phonemes)) + + # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): + _arpabet = ['@' + s for s in _phonemes] + + # Export all symbols: + symbols = [pad, eos, bos] + list(characters) + _arpabet + phonemes = [pad, eos, bos] + list(_phonemes) + list(punctuations) + + return symbols, phonemes _pad = '_' _eos = '~' @@ -20,14 +32,9 @@ _pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðsz _suprasegmentals = 'ˈˌːˑ' _other_symbols = 'ʍwɥʜʢʡɕʑɺɧ' _diacrilics = 'ɚ˞ɫ' -_phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics)) +_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics -# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): -_arpabet = ['@' + s for s in _phonemes] - -# Export all symbols: -symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet -phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations) +symbols, phonemes = make_symbols( _characters, _phonemes,_punctuations, _pad, _eos, _bos) # Generate ALIEN language # from random import shuffle diff --git a/utils/visual.py b/utils/visual.py index ab513666..2f93d812 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -54,9 +54,10 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON plt.xlabel("Decoder timestamp", fontsize=label_fontsize) plt.ylabel("Encoder timestamp", fontsize=label_fontsize) if CONFIG.use_phonemes: - seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars) - text = sequence_to_phoneme(seq) + seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.text if 'text' in CONFIG.keys() else None) + text = sequence_to_phoneme(seq, tp=CONFIG.text if 'text' in CONFIG.keys() else None) print(text) + plt.yticks(range(len(text)), list(text)) plt.colorbar() From 59e2752107162b7b6060c3096d24c16a3cbbd0b3 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Mon, 2 Mar 2020 11:46:00 -0300 Subject: [PATCH 58/61] fix travis unit test errors --- datasets/TTSDataset.py | 3 +-- train.py | 3 +-- utils/generic_utils.py | 14 +++++++------- utils/text/__init__.py | 20 ++++++++++---------- utils/text/symbols.py | 10 +++++----- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index cccd65a2..d649bf23 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -77,13 +77,12 @@ class MyDataset(Dataset): def _generate_and_cache_phoneme_sequence(self, text, cache_path): """generate a phoneme sequence from text. - since the usage is for subsequent caching, we never add bos and eos chars here. 
Instead we add those dynamically later; based on the config option.""" phonemes = phoneme_to_sequence(text, [self.cleaners], language=self.phoneme_language, - enable_eos_bos=False, + enable_eos_bos=False, tp=self.tp) phonemes = np.asarray(phonemes, dtype=np.int32) np.save(cache_path, phonemes) diff --git a/train.py b/train.py index 96c268f0..616d54ac 100644 --- a/train.py +++ b/train.py @@ -519,9 +519,8 @@ def main(args): # pylint: disable=redefined-outer-name global meta_data_train, meta_data_eval, symbols, phonemes # Audio processor ap = AudioProcessor(**c.audio) - if 'text' in c.keys(): - symbols, phonemes = make_symbols(**c.text) + symbols, phonemes = make_symbols(**c.text) # DISTRUBUTED if num_gpus > 1: diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 6aecdc7d..7c2f033a 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -426,13 +426,13 @@ def check_config(c): _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) # vocabulary parameters - _check_argument('text', c, restricted=False, val_type=dict) # parameter not mandatory - _check_argument('pad', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) # mandatory if "text parameters" else no mandatory - _check_argument('eos', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) - _check_argument('bos', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) - _check_argument('characters', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) - _check_argument('phonemes', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) - _check_argument('punctuations', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('text', c, restricted=False, val_type=dict) + _check_argument('pad', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('eos', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('bos', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('characters', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('phonemes', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('punctuations', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) # normalization parameters _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) diff --git a/utils/text/__init__.py b/utils/text/__init__.py index fcb239b2..ff21ffe0 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -61,8 +61,8 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): if tp: _bos = tp['bos'] _eos = tp['eos'] - _, phonemes = make_symbols(**tp) - _PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} + _, _phonemes = make_symbols(**tp) + _PHONEMES_TO_ID = {s: i for i, s in enumerate(_phonemes)} return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]] @@ -70,8 +70,8 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None): global _PHONEMES_TO_ID if tp: - _, 
phonemes = make_symbols(**tp) - _PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} + _, _phonemes = make_symbols(**tp) + _PHONEMES_TO_ID = {s: i for i, s in enumerate(_phonemes)} sequence = [] text = text.replace(":", "") @@ -93,8 +93,8 @@ def sequence_to_phoneme(sequence, tp=None): global _ID_TO_PHONEMES result = '' if tp: - _, phonemes = make_symbols(**tp) - _ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)} + _, _phonemes = make_symbols(**tp) + _ID_TO_PHONEMES = {i: s for i, s in enumerate(_phonemes)} for symbol_id in sequence: if symbol_id in _ID_TO_PHONEMES: @@ -118,8 +118,8 @@ def text_to_sequence(text, cleaner_names, tp=None): ''' global _SYMBOL_TO_ID if tp: - symbols, _ = make_symbols(**tp) - _SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)} + _symbols, _ = make_symbols(**tp) + _SYMBOL_TO_ID = {s: i for i, s in enumerate(_symbols)} sequence = [] # Check for curly braces and treat their contents as ARPAbet: @@ -139,8 +139,8 @@ def sequence_to_text(sequence, tp=None): '''Converts a sequence of IDs back to a string''' global _ID_TO_SYMBOL if tp: - symbols, _ = make_symbols(**tp) - _ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)} + _symbols, _ = make_symbols(**tp) + _ID_TO_SYMBOL = {i: s for i, s in enumerate(_symbols)} result = '' for symbol_id in sequence: diff --git a/utils/text/symbols.py b/utils/text/symbols.py index e4a4b103..db83cb29 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -5,16 +5,16 @@ Defines the set of symbols used in text input to the model. The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' -def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'): +def make_symbols(characters, phnms, punctuations='!\'(),-.:;? 
', pad='_', eos='~', bos='^'): ''' Function to create symbols and phonemes ''' - _phonemes = sorted(list(phonemes)) + _phonemes_sorted = sorted(list(phnms)) # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): - _arpabet = ['@' + s for s in _phonemes] + _arpabet = ['@' + s for s in _phonemes_sorted] # Export all symbols: - symbols = [pad, eos, bos] + list(characters) + _arpabet - phonemes = [pad, eos, bos] + list(_phonemes) + list(punctuations) + _symbols = [pad, eos, bos] + list(characters) + _arpabet + _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) return symbols, phonemes From 4e53896438b5365269e54dae999b6ddab837b0c4 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Mon, 2 Mar 2020 15:33:13 -0300 Subject: [PATCH 59/61] fix travis lint check --- datasets/TTSDataset.py | 2 +- notebooks/Benchmark-PWGAN.ipynb | 2 +- notebooks/Benchmark.ipynb | 2 +- notebooks/TestAttention.ipynb | 2 +- server/synthesizer.py | 12 ++++--- synthesize.py | 2 +- tests/test_demo_server.py | 5 ++- tests/test_loader.py | 2 +- train.py | 1 + utils/text/__init__.py | 55 ++++++++++++++++++--------------- utils/text/symbols.py | 4 +-- utils/visual.py | 2 +- 12 files changed, 52 insertions(+), 39 deletions(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index d649bf23..d3a6f486 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -195,7 +195,7 @@ class MyDataset(Dataset): mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] linear = [self.ap.spectrogram(w).astype('float32') for w in wav] - mel_lengths = [m.shape[1] for m in mel] + mel_lengths = [m.shape[1] for m in mel] # compute 'stop token' targets stop_targets = [ diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb index 4a2a21d7..19a1a79c 100644 --- a/notebooks/Benchmark-PWGAN.ipynb +++ b/notebooks/Benchmark-PWGAN.ipynb @@ -144,7 +144,7 @@ "\n", "# if the vocabulary was passed, replace the default\n", "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index 528d7a3b..bf6f2774 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -151,7 +151,7 @@ "\n", "# if the vocabulary was passed, replace the default\n", "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index 5310fb92..b0599d80 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -112,7 +112,7 @@ "\n", "# if the vocabulary was passed, replace the default\n", "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/server/synthesizer.py b/server/synthesizer.py index f001afcd..f0921513 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -9,7 +9,10 @@ import yaml from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config, setup_model from TTS.utils.speakers import 
load_speaker_mapping +# pylint: disable=unused-wildcard-import +# pylint: disable=wildcard-import from TTS.utils.synthesis import * + from TTS.utils.text import make_symbols, phonemes, symbols alphabets = r"([A-Za-z])" @@ -38,18 +41,19 @@ class Synthesizer(object): self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): + # pylint: disable=global-statement global symbols, phonemes print(" > Loading TTS model ...") print(" | > model config: ", tts_config) print(" | > checkpoint file: ", tts_checkpoint) - + self.tts_config = load_config(tts_config) self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) if 'text' in self.tts_config.keys(): - symbols, phonemes = make_symbols(**self.tts_config.text) + symbols, phonemes = make_symbols(**self.tts_config.text) if self.use_phonemes: self.input_size = len(phonemes) @@ -61,7 +65,7 @@ class Synthesizer(object): num_speakers = len(self.tts_speakers) else: num_speakers = 0 - self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) + self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) # load model state cp = torch.load(tts_checkpoint, map_location=torch.device('cpu')) # load the model @@ -91,7 +95,7 @@ class Synthesizer(object): mulaw=self.wavernn_config.mulaw, pad=self.wavernn_config.pad, use_aux_net=self.wavernn_config.use_aux_net, - use_upsample_net = self.wavernn_config.use_upsample_net, + use_upsample_net=self.wavernn_config.use_upsample_net, upsample_factors=self.wavernn_config.upsample_factors, feat_dims=80, compute_dims=128, diff --git a/synthesize.py b/synthesize.py index d294701f..6f3a235f 100644 --- a/synthesize.py +++ b/synthesize.py @@ -109,7 +109,7 @@ if __name__ == "__main__": # if the vocabulary was passed, replace the default if 'text' in C.keys(): - symbols, phonemes = make_symbols(**C.text) + symbols, phonemes = make_symbols(**C.text) # load speakers if args.speakers_json != '': diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 3e360e20..36848942 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -10,10 +10,13 @@ from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model class DemoServerTest(unittest.TestCase): + # pylint: disable=R0201 def _create_random_model(self): + # pylint: disable=global-statement + global symbols, phonemes config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json')) if 'text' in config.keys(): - symbols, phonemes = make_symbols(**config.text) + symbols, phonemes = make_symbols(**config.text) num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) diff --git a/tests/test_loader.py b/tests/test_loader.py index 5141fa85..eb23ed19 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -37,7 +37,7 @@ class TestTTSDataset(unittest.TestCase): r, c.text_cleaner, ap=self.ap, - meta_data=items, + meta_data=items, tp=c.text if 'text' in c.keys() else None, batch_group_size=bgs, min_seq_len=c.min_seq_len, diff --git a/train.py b/train.py index 616d54ac..bf5429e9 100644 --- a/train.py +++ b/train.py @@ -516,6 +516,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): # FIXME: move args definition/parsing inside of main? 
def main(args): # pylint: disable=redefined-outer-name + # pylint: disable=global-variable-undefined global meta_data_train, meta_data_eval, symbols, phonemes # Audio processor ap = AudioProcessor(**c.audio) diff --git a/utils/text/__init__.py b/utils/text/__init__.py index ff21ffe0..4361bc13 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -8,11 +8,11 @@ from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_pun _eos # Mappings from symbol to numeric ID and vice versa: -_SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)} -_ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)} +_symbol_to_id = {s: i for i, s in enumerate(symbols)} +_id_to_symbol = {i: s for i, s in enumerate(symbols)} -_PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} -_ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)} +_phonemes_to_id = {s: i for i, s in enumerate(phonemes)} +_id_to_phonemes = {i: s for i, s in enumerate(phonemes)} # Regular expression matching text enclosed in curly braces: _CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)') @@ -57,21 +57,23 @@ def text2phone(text, language): def pad_with_eos_bos(phoneme_sequence, tp=None): - global _PHONEMES_TO_ID, _bos, _eos + # pylint: disable=global-statement + global _phonemes_to_id, _bos, _eos if tp: _bos = tp['bos'] _eos = tp['eos'] _, _phonemes = make_symbols(**tp) - _PHONEMES_TO_ID = {s: i for i, s in enumerate(_phonemes)} - - return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]] + _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} + + return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]] def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None): - global _PHONEMES_TO_ID + # pylint: disable=global-statement + global _phonemes_to_id if tp: _, _phonemes = make_symbols(**tp) - _PHONEMES_TO_ID = {s: i for i, s in enumerate(_phonemes)} + _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} sequence = [] text = text.replace(":", "") @@ -89,16 +91,17 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= def sequence_to_phoneme(sequence, tp=None): + # pylint: disable=global-statement '''Converts a sequence of IDs back to a string''' - global _ID_TO_PHONEMES + global _id_to_phonemes result = '' if tp: _, _phonemes = make_symbols(**tp) - _ID_TO_PHONEMES = {i: s for i, s in enumerate(_phonemes)} - + _id_to_phonemes = {i: s for i, s in enumerate(_phonemes)} + for symbol_id in sequence: - if symbol_id in _ID_TO_PHONEMES: - s = _ID_TO_PHONEMES[symbol_id] + if symbol_id in _id_to_phonemes: + s = _id_to_phonemes[symbol_id] result += s return result.replace('}{', ' ') @@ -116,10 +119,11 @@ def text_to_sequence(text, cleaner_names, tp=None): Returns: List of integers corresponding to the symbols in the text ''' - global _SYMBOL_TO_ID + # pylint: disable=global-statement + global _symbol_to_id if tp: _symbols, _ = make_symbols(**tp) - _SYMBOL_TO_ID = {s: i for i, s in enumerate(_symbols)} + _symbol_to_id = {s: i for i, s in enumerate(_symbols)} sequence = [] # Check for curly braces and treat their contents as ARPAbet: @@ -137,15 +141,16 @@ def text_to_sequence(text, cleaner_names, tp=None): def sequence_to_text(sequence, tp=None): '''Converts a sequence of IDs back to a string''' - global _ID_TO_SYMBOL + # pylint: disable=global-statement + global _id_to_symbol if tp: _symbols, _ = make_symbols(**tp) - _ID_TO_SYMBOL = {i: s for i, s in enumerate(_symbols)} + _id_to_symbol = {i: s for i, s in 
enumerate(_symbols)} result = '' for symbol_id in sequence: - if symbol_id in _ID_TO_SYMBOL: - s = _ID_TO_SYMBOL[symbol_id] + if symbol_id in _id_to_symbol: + s = _id_to_symbol[symbol_id] # Enclose ARPAbet back in curly braces: if len(s) > 1 and s[0] == '@': s = '{%s}' % s[1:] @@ -163,11 +168,11 @@ def _clean_text(text, cleaner_names): def _symbols_to_sequence(syms): - return [_SYMBOL_TO_ID[s] for s in syms if _should_keep_symbol(s)] + return [_symbol_to_id[s] for s in syms if _should_keep_symbol(s)] def _phoneme_to_sequence(phons): - return [_PHONEMES_TO_ID[s] for s in list(phons) if _should_keep_phoneme(s)] + return [_phonemes_to_id[s] for s in list(phons) if _should_keep_phoneme(s)] def _arpabet_to_sequence(text): @@ -175,8 +180,8 @@ def _arpabet_to_sequence(text): def _should_keep_symbol(s): - return s in _SYMBOL_TO_ID and s not in ['~', '^', '_'] + return s in _symbol_to_id and s not in ['~', '^', '_'] def _should_keep_phoneme(p): - return p in _PHONEMES_TO_ID and p not in ['~', '^', '_'] + return p in _phonemes_to_id and p not in ['~', '^', '_'] diff --git a/utils/text/symbols.py b/utils/text/symbols.py index db83cb29..15862cbd 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -16,7 +16,7 @@ def make_symbols(characters, phnms, punctuations='!\'(),-.:;? ', pad='_', eos='~ _symbols = [pad, eos, bos] + list(characters) + _arpabet _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) - return symbols, phonemes + return _symbols, _phonemes _pad = '_' _eos = '~' @@ -34,7 +34,7 @@ _other_symbols = 'ʍwɥʜʢʡɕʑɺɧ' _diacrilics = 'ɚ˞ɫ' _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics -symbols, phonemes = make_symbols( _characters, _phonemes,_punctuations, _pad, _eos, _bos) +symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _eos, _bos) # Generate ALIEN language # from random import shuffle diff --git a/utils/visual.py b/utils/visual.py index 2f93d812..3b24364c 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -57,7 +57,7 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.text if 'text' in CONFIG.keys() else None) text = sequence_to_phoneme(seq, tp=CONFIG.text if 'text' in CONFIG.keys() else None) print(text) - + plt.yticks(range(len(text)), list(text)) plt.colorbar() From 36235c5e3fc0f47c56253a99941fc769d744469d Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 3 Mar 2020 09:17:56 -0300 Subject: [PATCH 60/61] rename text to characters in config.json --- config.json | 2 +- notebooks/Benchmark-PWGAN.ipynb | 4 ++-- notebooks/Benchmark.ipynb | 4 ++-- notebooks/ExtractTTSpectrogram.ipynb | 6 +++--- notebooks/TestAttention.ipynb | 4 ++-- server/synthesizer.py | 4 ++-- synthesize.py | 4 ++-- tests/test_demo_server.py | 4 ++-- tests/test_loader.py | 2 +- train.py | 6 +++--- utils/generic_utils.py | 14 +++++++------- utils/synthesis.py | 4 ++-- utils/visual.py | 4 ++-- 13 files changed, 31 insertions(+), 31 deletions(-) diff --git a/config.json b/config.json index 2a7c4551..3722de9d 100644 --- a/config.json +++ b/config.json @@ -28,7 +28,7 @@ }, // VOCABULARY PARAMETERS - "text":{ + "characters":{ "pad": "_", "eos": "~", "bos": "^", diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb index 19a1a79c..840da10e 100644 --- a/notebooks/Benchmark-PWGAN.ipynb +++ 
b/notebooks/Benchmark-PWGAN.ipynb @@ -143,8 +143,8 @@ " speaker_id = None\n", "\n", "# if the vocabulary was passed, replace the default\n", - "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "if 'characters' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.characters)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index bf6f2774..7d3a45cf 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -150,8 +150,8 @@ " speaker_id = None\n", "\n", "# if the vocabulary was passed, replace the default\n", - "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "if 'characters' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.characters)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index 2313e47e..b5a88611 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -95,8 +95,8 @@ "outputs": [], "source": [ "# if the vocabulary was passed, replace the default\n", - "if 'text' in C.keys():\n", - " symbols, phonemes = make_symbols(**C.text)\n", + "if 'characters' in C.keys():\n", + " symbols, phonemes = make_symbols(**C.characters)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", @@ -120,7 +120,7 @@ "preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.text if 'text' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index b0599d80..9d3e5e75 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -111,8 +111,8 @@ " speaker_id = None\n", "\n", "# if the vocabulary was passed, replace the default\n", - "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "if 'characters' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.characters)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/server/synthesizer.py b/server/synthesizer.py index f0921513..f73b73fc 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -52,8 +52,8 @@ class Synthesizer(object): self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) - if 'text' in self.tts_config.keys(): - symbols, phonemes = make_symbols(**self.tts_config.text) + if 'characters' in self.tts_config.keys(): + symbols, phonemes = make_symbols(**self.tts_config.characters) if self.use_phonemes: self.input_size = len(phonemes) diff --git a/synthesize.py 
b/synthesize.py index 6f3a235f..1f1ce36f 100644 --- a/synthesize.py +++ b/synthesize.py @@ -108,8 +108,8 @@ if __name__ == "__main__": ap = AudioProcessor(**C.audio) # if the vocabulary was passed, replace the default - if 'text' in C.keys(): - symbols, phonemes = make_symbols(**C.text) + if 'characters' in C.keys(): + symbols, phonemes = make_symbols(**C.characters) # load speakers if args.speakers_json != '': diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 36848942..a0837686 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -15,8 +15,8 @@ class DemoServerTest(unittest.TestCase): # pylint: disable=global-statement global symbols, phonemes config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json')) - if 'text' in config.keys(): - symbols, phonemes = make_symbols(**config.text) + if 'characters' in config.keys(): + symbols, phonemes = make_symbols(**config.characters) num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) diff --git a/tests/test_loader.py b/tests/test_loader.py index eb23ed19..d835c5d3 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -38,7 +38,7 @@ class TestTTSDataset(unittest.TestCase): c.text_cleaner, ap=self.ap, meta_data=items, - tp=c.text if 'text' in c.keys() else None, + tp=c.characters if 'characters' in c.keys() else None, batch_group_size=bgs, min_seq_len=c.min_seq_len, max_seq_len=float("inf"), diff --git a/train.py b/train.py index bf5429e9..4bb22a34 100644 --- a/train.py +++ b/train.py @@ -49,7 +49,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): c.text_cleaner, meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, - tp=c.text if 'text' in c.keys() else None, + tp=c.characters if 'characters' in c.keys() else None, batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, min_seq_len=c.min_seq_len, @@ -520,8 +520,8 @@ def main(args): # pylint: disable=redefined-outer-name global meta_data_train, meta_data_eval, symbols, phonemes # Audio processor ap = AudioProcessor(**c.audio) - if 'text' in c.keys(): - symbols, phonemes = make_symbols(**c.text) + if 'characters' in c.keys(): + symbols, phonemes = make_symbols(**c.characters) # DISTRUBUTED if num_gpus > 1: diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 7c2f033a..cf0a05b4 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -426,13 +426,13 @@ def check_config(c): _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) # vocabulary parameters - _check_argument('text', c, restricted=False, val_type=dict) - _check_argument('pad', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('eos', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('bos', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('characters', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('phonemes', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('punctuations', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('characters', c, restricted=False, val_type=dict) + _check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), 
val_type=str) + _check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) # normalization parameters _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) diff --git a/utils/synthesis.py b/utils/synthesis.py index c5ff2e70..42f0408c 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -10,10 +10,10 @@ def text_to_seqvec(text, CONFIG, use_cuda): seq = np.asarray( phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, - tp=CONFIG.text if 'text' in CONFIG.keys() else None), + tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32) else: - seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.text if 'text' in CONFIG.keys() else None), dtype=np.int32) + seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32) # torch tensor chars_var = torch.from_numpy(seq).unsqueeze(0) if use_cuda: diff --git a/utils/visual.py b/utils/visual.py index 3b24364c..1cb9ac5d 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -54,8 +54,8 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON plt.xlabel("Decoder timestamp", fontsize=label_fontsize) plt.ylabel("Encoder timestamp", fontsize=label_fontsize) if CONFIG.use_phonemes: - seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.text if 'text' in CONFIG.keys() else None) - text = sequence_to_phoneme(seq, tp=CONFIG.text if 'text' in CONFIG.keys() else None) + seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) + text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) print(text) plt.yticks(range(len(text)), list(text)) From 7ffc1025424e49e40a2b325892d359a3fe21b68c Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 5 Mar 2020 17:44:47 -0300 Subject: [PATCH 61/61] add unittest for vocabulary parameters --- tests/test_config.json | 10 +++++++++ tests/test_text_processing.py | 42 ++++++++++++++++++++++++++--------- utils/text/symbols.py | 4 ++-- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/tests/test_config.json b/tests/test_config.json index 0cd3d751..6d63e6ab 100644 --- a/tests/test_config.json +++ b/tests/test_config.json @@ -19,6 +19,16 @@ "mel_fmax": 7600, // maximum freq level for mel-spec. Tune for dataset!! "do_trim_silence": false }, + + "characters":{ + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + "punctuations":"!'(),-.:;? 
", + "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + }, + "hidden_size": 128, "embedding_size": 256, "text_cleaner": "english_cleaners", diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index aa17f694..6c0c7058 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -1,7 +1,14 @@ +import os +# pylint: disable=unused-wildcard-import +# pylint: disable=wildcard-import +# pylint: disable=unused-import import unittest -import torch as T - from TTS.utils.text import * +from TTS.tests import get_tests_path +from TTS.utils.generic_utils import load_config + +TESTS_PATH = get_tests_path() +conf = load_config(os.path.join(TESTS_PATH, 'test_config.json')) def test_phoneme_to_sequence(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" @@ -9,67 +16,80 @@ def test_phoneme_to_sequence(): lang = "en-us" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # multiple punctuations text = "Be a voice, not an! echo?" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # not ending with punctuation text = "Be a voice, not an! echo" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # original text = "Be a voice, not an echo!" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # extra space after the sentence text = "Be a voice, not an! echo. " sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # extra space after the sentence text = "Be a voice, not an! echo. 
" sequence = phoneme_to_sequence(text, text_cleaner, lang, True) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # padding char text = "_Be a _voice, not an! echo_" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" print(text_hat) print(len(sequence)) - assert text_hat == gt - + assert text_hat == text_hat_with_params == gt def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" lang = "en-us" ph = text2phone(text, lang) - assert gt == ph, f"\n{phonemes} \n vs \n{gt}" + assert gt == ph, f"\n{phonemes} \n vs \n{gt}" \ No newline at end of file diff --git a/utils/text/symbols.py b/utils/text/symbols.py index 15862cbd..544277c5 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -5,9 +5,9 @@ Defines the set of symbols used in text input to the model. The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' -def make_symbols(characters, phnms, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'): +def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'):# pylint: disable=redefined-outer-name ''' Function to create symbols and phonemes ''' - _phonemes_sorted = sorted(list(phnms)) + _phonemes_sorted = sorted(list(phonemes)) # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): _arpabet = ['@' + s for s in _phonemes_sorted]