From 1c5d3b52cf8139ea20dc33c865a9eefb7900687e Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 23:10:11 +0100 Subject: [PATCH 01/61] test updates --- tests/test_layers.py | 2 +- tests/test_tacotron2_model.py | 2 +- tests/test_tacotron_model.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_layers.py b/tests/test_layers.py index 6e3c4b13..7d02b673 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -119,7 +119,7 @@ class EncoderTests(unittest.TestCase): class L1LossMaskedTests(unittest.TestCase): def test_in_out(self): # test input == target - layer = L1LossMasked() + layer = L1LossMasked(seq_len_norm=False) dummy_input = T.ones(4, 8, 128).float() dummy_target = T.ones(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index a26f1ddf..aa2869eb 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -38,7 +38,7 @@ class TacotronTrainTest(unittest.TestCase): stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - criterion = MSELossMasked().to(device) + criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device) model.train() diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 7e5e8daf..ac6712b0 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -44,7 +44,7 @@ class TacotronTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - criterion = L1LossMasked().to(device) + criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron( num_chars=32, @@ -106,7 +106,7 @@ class TacotronGSTTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - criterion = L1LossMasked().to(device) + criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron( num_chars=32, From e37503cb710bb229d8adc12a40d73338f1201351 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 16:38:57 +0100 Subject: [PATCH 02/61] stale.yml --- .github/stale.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/stale.yml diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 00000000..5bac63d3 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,19 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 60 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 7 +# Issues with these labels will never be considered stale +exemptLabels: + - pinned + - security +# Label to use when marking an issue as stale +staleLabel: wontfix +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. You might also look our discourse page for further help. + https://discourse.mozilla.org/c/tts +# Comment to post when closing a stale issue. 
Set to `false` to disable +closeComment: false + From 5c78816f5181a743dc46df3a0ee1746207a57da9 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:09:59 +0100 Subject: [PATCH 03/61] update server and synthesizer to handle ParallelWaveGAN --- server/server.py | 9 ++++++--- server/synthesizer.py | 46 ++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/server/server.py b/server/server.py index 3be66f9e..6af119bf 100644 --- a/server/server.py +++ b/server/server.py @@ -14,10 +14,13 @@ def create_argparser(): parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file') parser.add_argument('--tts_config', type=str, help='path to TTS config.json file') parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model') - parser.add_argument('--wavernn_lib_path', type=str, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--wavernn_file', type=str, help='path to WaveRNN checkpoint file.') - parser.add_argument('--wavernn_config', type=str, help='path to WaveRNN config file.') + parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') + parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') + parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. 
If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') diff --git a/server/synthesizer.py b/server/synthesizer.py index d8852a3e..b703c62e 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,17 +1,18 @@ import io import os +import re +import sys import numpy as np import torch -import sys +import yaml from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config, setup_model -from TTS.utils.text import phonemes, symbols from TTS.utils.speakers import load_speaker_mapping from TTS.utils.synthesis import * +from TTS.utils.text import phonemes, symbols -import re alphabets = r"([A-Za-z])" prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]" suffixes = r"(Inc|Ltd|Jr|Sr|Co)" @@ -23,6 +24,7 @@ websites = r"[.](com|net|org|io|gov)" class Synthesizer(object): def __init__(self, config): self.wavernn = None + self.pwgan = None self.config = config self.use_cuda = self.config.use_cuda if self.use_cuda: @@ -30,9 +32,11 @@ class Synthesizer(object): self.load_tts(self.config.tts_checkpoint, self.config.tts_config, self.config.use_cuda) if self.config.wavernn_lib_path: - self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_path, - self.config.wavernn_file, self.config.wavernn_config, - self.config.use_cuda) + self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file, + self.config.wavernn_config, self.config.use_cuda) + if self.config.pwgan_lib_path: + self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file, + self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): print(" > Loading TTS model ...") @@ -45,9 +49,9 @@ class Synthesizer(object): self.input_size = len(phonemes) else: self.input_size = len(symbols) - # load speakers + # TODO: fix this for multi-speaker model - load speakers if self.config.tts_speakers is not None: - self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers)) + self.tts_speakers = load_speaker_mapping(self.config.tts_speakers) num_speakers = len(self.tts_speakers) else: num_speakers = 0 @@ -63,16 +67,14 @@ class Synthesizer(object): if 'r' in cp: self.tts_model.decoder.set_r(cp['r']) - def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): + def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. 
sys.path.append(lib_path) # set this if TTS is not installed globally from WaveRNN.models.wavernn import Model - wavernn_config = os.path.join(model_path, model_config) - model_file = os.path.join(model_path, model_file) print(" > Loading WaveRNN model ...") - print(" | > model config: ", wavernn_config) + print(" | > model config: ", model_config) print(" | > model file: ", model_file) - self.wavernn_config = load_config(wavernn_config) + self.wavernn_config = load_config(model_config) self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -91,11 +93,27 @@ class Synthesizer(object): ).cuda() check = torch.load(model_file) - self.wavernn.load_state_dict(check['model']) + self.wavernn.load_state_dict(check['model'], map_location="cpu") if use_cuda: self.wavernn.cuda() self.wavernn.eval() + def load_pwgan(self, lib_path, model_file, model_config, use_cuda): + sys.path.append(lib_path) # set this if TTS is not installed globally + from parallel_wavegan.models import ParallelWaveGANGenerator + from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder + print(" > Loading PWGAN model ...") + print(" | > model config: ", model_config) + print(" | > model file: ", model_file) + with open(model_config) as f: + self.pwgan_config = yaml.load(f, Loader=yaml.Loader) + self.pwgan = ParallelWaveGANGenerator(**self.pwgan_config["generator_params"]) + self.pwgan.load_state_dict(torch.load(model_file, map_location="cpu")["model"]["generator"]) + self.pwgan.remove_weight_norm() + if use_cuda: + self.pwgan.cuda() + self.pwgan.eval() + def save_wav(self, wav, path): # wav *= 32767 / max(1e-8, np.max(np.abs(wav))) wav = np.array(wav) From 61bdb265540321889a3e959676a0995842833562 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:19:12 +0100 Subject: [PATCH 04/61] README update --- server/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/README.md b/server/README.md index 95297225..0563ef94 100644 --- a/server/README.md +++ b/server/README.md @@ -6,6 +6,10 @@ Instructions below are based on a Ubuntu 18.04 machine, but it should be simple #### Development server: +##### Using server.py +If you have the environment set already for TTS, then you can directly call ```setup.py```. + +##### Using .whl 1. apt-get install -y espeak libsndfile1 python3-venv 2. python3 -m venv /tmp/venv 3. 
source /tmp/venv/bin/activate From 2a6bce31cb41fb365c5d5f605bb1084ff49f1b5f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:31:02 +0100 Subject: [PATCH 05/61] update server test --- server/synthesizer.py | 2 -- tests/inputs/server_config.json | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index b703c62e..63f2080a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,5 +1,4 @@ import io -import os import re import sys @@ -101,7 +100,6 @@ class Synthesizer(object): def load_pwgan(self, lib_path, model_file, model_config, use_cuda): sys.path.append(lib_path) # set this if TTS is not installed globally from parallel_wavegan.models import ParallelWaveGANGenerator - from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index 3988db4c..7f5a60fb 100644 --- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -3,9 +3,11 @@ "tts_config":"dummy_model_config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. - "wavernn_path": null, // wavernn model root path "wavernn_file": null, // wavernn checkpoint file name "wavernn_config": null, // wavernn config file + "pwgan_lib_path": null, + "pwgan_file": null, + "pwgan_config": null, "is_wavernn_batched":true, "port": 5002, "use_cuda": false, From 451f7da6980301820402b82d502b29976fd6ca31 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 6 Feb 2020 15:16:29 +0100 Subject: [PATCH 06/61] pylint check --- server/synthesizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 63f2080a..75fd4e76 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -68,12 +68,15 @@ class Synthesizer(object): def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if WaveRNN is not installed globally + #pylint: disable=import-outside-toplevel from WaveRNN.models.wavernn import Model print(" > Loading WaveRNN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) self.wavernn_config = load_config(model_config) + # This is the default architecture we use for our models. 
+ # You might need to update it self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -98,7 +101,8 @@ class Synthesizer(object): self.wavernn.eval() def load_pwgan(self, lib_path, model_file, model_config, use_cuda): - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if ParallelWaveGAN is not installed globally + #pylint: disable=import-outside-toplevel from parallel_wavegan.models import ParallelWaveGANGenerator print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) From 631fbdcb8e158733b4ec1c9996c6c7cc105cd114 Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 11:08:21 +0100 Subject: [PATCH 07/61] Fix vocoder normalization when no vocoder is used When G&L is used, ap_vocoder is None and crashes --- synthesize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synthesize.py b/synthesize.py index cb0ee8af..eec022ab 100644 --- a/synthesize.py +++ b/synthesize.py @@ -31,8 +31,8 @@ def tts(model, postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models postnet_output = ap._denormalize(postnet_output) - postnet_output = ap_vocoder._normalize(postnet_output) if use_vocoder_model: + postnet_output = ap_vocoder._normalize(postnet_output) vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, From 3f54c39b0a4bb4678aec99a2e6b13b825387d712 Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 12:35:03 +0100 Subject: [PATCH 08/61] Pacify pylint --- synthesize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synthesize.py b/synthesize.py index eec022ab..47b409ef 100644 --- a/synthesize.py +++ b/synthesize.py @@ -30,9 +30,9 @@ def tts(model, if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models - postnet_output = ap._denormalize(postnet_output) + postnet_output = ap._denormalize(postnet_output) # pylint: disable=W021 if use_vocoder_model: - postnet_output = ap_vocoder._normalize(postnet_output) + postnet_output = ap_vocoder._normalize(postnet_output) # pylint: disable=W021 vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, From 8f37ea9b84c556440c0fca3c7682f101be03cb0a Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 12:58:58 +0100 Subject: [PATCH 09/61] Pacify pylint even more --- synthesize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/synthesize.py b/synthesize.py index 47b409ef..8312d78d 100644 --- a/synthesize.py +++ b/synthesize.py @@ -1,3 +1,4 @@ +# pylint: disable=redefined-outer-name, unused-argument import os import time import argparse @@ -30,9 +31,9 @@ def tts(model, if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models - postnet_output = ap._denormalize(postnet_output) # pylint: disable=W021 + postnet_output = ap._denormalize(postnet_output) # pylint: disable=protected-access if use_vocoder_model: - postnet_output = ap_vocoder._normalize(postnet_output) # pylint: disable=W021 + postnet_output = ap_vocoder._normalize(postnet_output) # pylint: disable=protected-access vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) 
waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, From 1d13bb5f8df82fe0ea13dcb0be33ece4e2d477fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 11 Feb 2020 16:52:06 +0100 Subject: [PATCH 10/61] Update README.md Contact and getting help --- README.md | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/README.md b/README.md index e98be3c4..7c2e4e3c 100644 --- a/README.md +++ b/README.md @@ -139,28 +139,7 @@ If you like to use TTS to try a new idea and like to share your experiments with - Share your results as you proceed. (Tensorboard log files, audio results, visuals etc.) - Use LJSpeech dataset (for English) if you like to compare results with the released models. (It is the most open scalable dataset for quick experimentation) -## Contact/Getting Help -- [Wiki](https://github.com/mozilla/TTS/wiki) - -- [Discourse Forums](https://discourse.mozilla.org/c/tts) - If your question is not addressed in the Wiki, the Discourse Forums is the next place to look. They contain conversations on General Topics, Using TTS, and TTS Development. - -- [Issues](https://github.com/mozilla/TTS/issues) - Finally, if all else fails, you can open an issue in our repo. - - +## [Contact/Getting Help](https://github.com/mozilla/TTS/wiki/Contact-and-Getting-Help) ## Major TODOs - [x] Implement the model. From 02e6d0538272f589d6c3c290b81575b7bd866991 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:49:46 +0100 Subject: [PATCH 11/61] Use PWGAN if available in Synthesizer.tts --- server/synthesizer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 75fd4e76..455bd332 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -168,9 +168,16 @@ class Synthesizer(object): postnet_output, decoder_output, _ = parse_outputs( postnet_output, decoder_output, alignments) + if self.pwgan: + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length) if self.wavernn: - postnet_output = postnet_output[0].data.cpu().numpy() - wav = self.wavernn.generate(torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550) + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550) else: wav = inv_spectrogram(postnet_output, self.ap, self.tts_config) # trim silence From b539ffafc0a0c185438bab262719f4259b6c8f9f Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:54:30 +0100 Subject: [PATCH 12/61] Load PWGAN/WaveRNN embedded files if present --- server/server.py | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/server/server.py b/server/server.py index 6af119bf..705937e2 100644 --- a/server/server.py +++ b/server/server.py @@ -18,9 +18,9 @@ def create_argparser(): parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') - 
parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') - parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') + parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') @@ -29,28 +29,35 @@ def create_argparser(): synthesizer = None -embedded_model_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -checkpoint_file = os.path.join(embedded_model_folder, 'checkpoint.pth.tar') -config_file = os.path.join(embedded_model_folder, 'config.json') +embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -# Default options with embedded model files -if os.path.isfile(checkpoint_file): - default_tts_checkpoint = checkpoint_file -else: - default_tts_checkpoint = None +embedded_tts_folder = os.path.join(embedded_models_folder, 'tts') +tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar') +tts_config_file = os.path.join(embedded_tts_folder, 'config.json') -if os.path.isfile(config_file): - default_tts_config = config_file -else: - default_tts_config = None +embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn') +wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar') +wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json') + +embedded_pwgan_folder = os.path.join(embedded_models_folder, 'pwgan') +pwgan_checkpoint_file = os.path.join(embedded_pwgan_folder, 'checkpoint.pkl') +pwgan_config_file = os.path.join(embedded_pwgan_folder, 'config.yml') args = create_argparser().parse_args() -# If these were not specified in the CLI args, use default values -if not args.tts_checkpoint: - args.tts_checkpoint = default_tts_checkpoint -if not args.tts_config: - args.tts_config = default_tts_config +# If these were not specified in the CLI args, use default values with embedded model files +if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file): + args.tts_checkpoint = tts_checkpoint_file +if not args.tts_config and os.path.isfile(tts_config_file): + args.tts_config = tts_config_file +if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file): + args.wavernn_file = wavernn_checkpoint_file +if not args.wavernn_config and os.path.isfile(wavernn_config_file): + args.wavernn_config = wavernn_config_file +if not args.pwgan_file and os.path.isfile(pwgan_checkpoint_file): + args.pwgan_file = pwgan_checkpoint_file +if not args.pwgan_config and os.path.isfile(pwgan_config_file): + args.pwgan_config = pwgan_config_file synthesizer = Synthesizer(args) From 995eb1bf074caae257a87f5ef54ae5f63617b227 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 16:03:30 +0100 
Subject: [PATCH 13/61] Fix bug where sometimes the second sentence disappears if it doesn't end with punctuation --- server/synthesizer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 455bd332..1082b73a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -122,7 +122,7 @@ class Synthesizer(object): self.ap.save_wav(wav, path) def split_into_sentences(self, text): - text = " " + text + " " + text = " " + text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1", text) text = re.sub(websites, "\\1", text) @@ -149,15 +149,13 @@ class Synthesizer(object): text = text.replace("", ".") sentences = text.split("") sentences = sentences[:-1] - sentences = [s.strip() for s in sentences] + sentences = list(filter(None, [s.strip() for s in sentences])) # remove empty sentences return sentences def tts(self, text): wavs = [] sens = self.split_into_sentences(text) print(sens) - if not sens: - sens = [text+'.'] for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) From df42a4a03ac886af4f2ef1bdb8ff25745f74d798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Feb 2020 16:53:16 +0100 Subject: [PATCH 14/61] Update README.md --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 7c2e4e3c..19d7fa24 100644 --- a/README.md +++ b/README.md @@ -115,10 +115,7 @@ In case of any error or intercepted execution, if there is no checkpoint yet und You can also enjoy Tensorboard, if you point Tensorboard argument```--logdir``` to the experiment folder. -## Testing -Best way to test your network is to use Notebooks under ```notebooks``` folder. - -There is also a good [CoLab](https://colab.research.google.com/github/tugstugi/dl-colab-notebooks/blob/master/notebooks/Mozilla_TTS_WaveRNN.ipynb) sample using pre-trained models (by @tugstugi). +## [Testing and Examples](https://github.com/mozilla/TTS/wiki/Examples-using-TTS) ## Contribution guidelines This repository is governed by Mozilla's code of conduct and etiquette guidelines. For more details, please read the [Mozilla Community Participation Guidelines.](https://www.mozilla.org/about/governance/policies/participation/) From 0e35fdc2a1c8a4bc669e3c6d755c551489ee221b Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 13 Feb 2020 17:23:37 +0100 Subject: [PATCH 15/61] fix linter problems and loader test --- tests/test_loader.py | 4 +--- tests/test_text_processing.py | 4 ++-- utils/text/__init__.py | 3 --- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/test_loader.py b/tests/test_loader.py index 751bc181..d8727895 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -137,9 +137,7 @@ class TestTTSDataset(unittest.TestCase): # NOTE: Below needs to check == 0 but due to an unknown reason # there is a slight difference between two matrices. # TODO: Check this assert cond more in detail. 
- assert abs((abs(mel.T) - - abs(mel_dl) - ).sum()) < 1e-5, (abs(mel.T) - abs(mel_dl)).sum() + assert abs(mel.T - mel_dl).max() < 1e-5, abs(mel.T - mel_dl).max() # check mel-spec correctness mel_spec = mel_input[0].cpu().numpy() diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 0ecb9962..aa17f694 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -71,5 +71,5 @@ def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" lang = "en-us" - phonemes = text2phone(text, lang) - assert gt == phonemes, f"\n{phonemes} \n vs \n{gt}" + ph = text2phone(text, lang) + assert gt == ph, f"\n{phonemes} \n vs \n{gt}" diff --git a/utils/text/__init__.py b/utils/text/__init__.py index e6842dfa..0e6684d2 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -38,10 +38,7 @@ def text2phone(text, language): if text[-1] == punctuations[-1]: for punct in punctuations[:-1]: ph = ph.replace('| |\n', '|'+punct+'| |', 1) - try: ph = ph + punctuations[-1] - except: - print(text) else: for punct in punctuations: ph = ph.replace('| |\n', '|'+punct+'| |', 1) From ffd00ce295e8b68e59dccda99bc467823a62940d Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 17:30:41 +0100 Subject: [PATCH 16/61] Fix linter and server package test --- server/synthesizer.py | 3 ++- setup.py | 7 ++++--- tests/test_server_package.sh | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 1082b73a..fcdc8787 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -121,7 +121,8 @@ class Synthesizer(object): wav = np.array(wav) self.ap.save_wav(wav, path) - def split_into_sentences(self, text): + @staticmethod + def split_into_sentences(text): text = " " + text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1", text) diff --git a/setup.py b/setup.py index 63782800..f92dac8a 100644 --- a/setup.py +++ b/setup.py @@ -61,10 +61,11 @@ package_data = ['server/templates/*'] if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config: print('Embedding model in wheel file...') model_dir = os.path.join('server', 'model') - os.makedirs(model_dir, exist_ok=True) - embedded_checkpoint_path = os.path.join(model_dir, 'checkpoint.pth.tar') + tts_dir = os.path.join(model_dir, 'tts') + os.makedirs(tts_dir, exist_ok=True) + embedded_checkpoint_path = os.path.join(tts_dir, 'checkpoint.pth.tar') shutil.copy(args.checkpoint, embedded_checkpoint_path) - embedded_config_path = os.path.join(model_dir, 'config.json') + embedded_config_path = os.path.join(tts_dir, 'config.json') shutil.copy(args.model_config, embedded_config_path) package_data.extend([embedded_checkpoint_path, embedded_config_path]) diff --git a/tests/test_server_package.sh b/tests/test_server_package.sh index 01e42843..9fe5e8b1 100755 --- a/tests/test_server_package.sh +++ b/tests/test_server_package.sh @@ -11,7 +11,7 @@ source /tmp/venv/bin/activate pip install --quiet --upgrade pip setuptools wheel rm -f 
dist/*.whl -python setup.py bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json +python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json pip install --quiet dist/TTS*.whl python -m TTS.server.server & From 9c5c68626825fdebd4af5d02f0bb792fb9f6fa44 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 13 Feb 2020 22:16:40 +0100 Subject: [PATCH 17/61] check config with a function --- config.json | 9 +-- train.py | 3 +- utils/generic_utils.py | 128 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 5 deletions(-) diff --git a/config.json b/config.json index 9e4fa906..c1a8158d 100644 --- a/config.json +++ b/config.json @@ -9,7 +9,7 @@ "num_mels": 80, // size of the mel spec frame. "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. + "frame_length_ms": 50.0, // stft window length in ms. "frame_shift_ms": 12.5, // stft window hop-lengh in ms. "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "min_level_db": -100, // normalization range @@ -19,7 +19,7 @@ // Normalization parameters "signal_norm": true, // normalize the spec values in range [0, 1] "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! @@ -36,11 +36,12 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":16, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. + "grad_accum": 2, // if N > 1, enable gradient accumulation for N iterations. It is useful for low memory GPUs. // VALIDATION "run_eval": true, @@ -49,7 +50,7 @@ // OPTIMIZER "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1, // upper limit for gradients for clipping. + "grad_clip": 1.0, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "wd": 0.000001, // Weight decay weight. 
diff --git a/train.py b/train.py index e8c240f3..7bfb8751 100644 --- a/train.py +++ b/train.py @@ -20,7 +20,7 @@ from TTS.utils.generic_utils import ( get_git_branch, load_config, remove_experiment_folder, save_best_model, save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file, setup_model, gradual_training_scheduler, KeepAverage, - set_weight_decay) + set_weight_decay, check_config) from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers @@ -687,6 +687,7 @@ if __name__ == '__main__': # setup output paths and read configs c = load_config(args.config_path) + check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path diff --git a/utils/generic_utils.py b/utils/generic_utils.py index cf1a83a6..7a5c2ac2 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -389,3 +389,131 @@ class KeepAverage(): def update_values(self, value_dict): for key, value in value_dict.items(): self.update_value(key, value) + + +def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None): + if restricted: + assert name in c.keys(), f' [!] {name} not defined in config.json' + if name in c.keys(): + if max_val: + assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}' + if min_val: + assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}' + if enum_list: + assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' + if val_type: + assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' + + + +def check_config(c): + _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) + _check_argument('run_name', c, restricted=True, val_type=str) + _check_argument('run_description', c, val_type=str) + + # AUDIO + _check_argument('audio', c, restricted=True, val_type=dict) + + # audio processing parameters + _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) + _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) + _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000) + _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000) + _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) + _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) + _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) + _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) + _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + + # normalization parameters + _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) + _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) + _check_argument('mel_fmax', 
c['audio'], restricted=True, val_type=float, min_val=500.0) + _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) + _check_argument('trim_db', c['audio'], restricted=True, val_type=int) + + # training parameters + _check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('r', c, restricted=True, val_type=int, min_val=1) + _check_argument('gradual_training', c, restricted=False, val_type=list) + _check_argument('loss_masking', c, restricted=True, val_type=bool) + _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + + # validation parameters + _check_argument('run_eval', c, restricted=True, val_type=bool) + _check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) + _check_argument('test_sentences_file', c, restricted=False, val_type=str) + + # optimizer + _check_argument('noam_schedule', c, restricted=False, val_type=bool) + _check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) + _check_argument('epochs', c, restricted=True, val_type=int, min_val=1) + _check_argument('lr', c, restricted=True, val_type=float, min_val=0) + _check_argument('wd', c, restricted=True, val_type=float, min_val=0) + _check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) + _check_argument('seq_len_norm', c, restricted=True, val_type=bool) + + # tacotron prenet + _check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1) + _check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn']) + _check_argument('prenet_dropout', c, restricted=True, val_type=bool) + + # attention + _check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original']) + _check_argument('attention_heads', c, restricted=True, val_type=int) + _check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax']) + _check_argument('windowing', c, restricted=True, val_type=bool) + _check_argument('use_forward_attn', c, restricted=True, val_type=bool) + _check_argument('forward_attn_mask', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('location_attn', c, restricted=True, val_type=bool) + _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) + + # stopnet + _check_argument('stopnet', c, restricted=True, val_type=bool) + _check_argument('separate_stopnet', c, restricted=True, val_type=bool) + + # tensorboard + _check_argument('print_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('save_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('checkpoint', c, restricted=True, val_type=bool) + _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) + + # dataloading + _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=['english_cleaners', 'phoneme_cleaners', 'transliteration_cleaners', 'basic_cleaners']) + _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) + _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) + 
_check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) + _check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) + + # paths + _check_argument('output_path', c, restricted=True, val_type=str) + + # multi-speaker gst + _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) + _check_argument('style_wav_for_test', c, restricted=True, val_type=str) + _check_argument('use_gst', c, restricted=True, val_type=bool) + + # datasets - checking only the first entry + _check_argument('datasets', c, restricted=True, val_type=list) + for dataset_entry in c['datasets']: + _check_argument('name', dataset_entry, restricted=True, val_type=str) + _check_argument('path', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) + + + + + + + + From 3331afa21932596ca791260e1c14e6942c1d6df2 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 17:47:33 +0100 Subject: [PATCH 18/61] remove grad_accum from config checker --- utils/generic_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 7a5c2ac2..942fedf9 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -405,7 +405,6 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) _check_argument('run_name', c, restricted=True, val_type=str) @@ -442,7 +441,7 @@ def check_config(c): _check_argument('r', c, restricted=True, val_type=int, min_val=1) _check_argument('gradual_training', c, restricted=False, val_type=list) _check_argument('loss_masking', c, restricted=True, val_type=bool) - _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + # _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) # validation parameters _check_argument('run_eval', c, restricted=True, val_type=bool) From c48b053cdee1a183d747c8151b96febdb102a291 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 18:00:15 +0100 Subject: [PATCH 19/61] linter fixes --- utils/generic_utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 942fedf9..a8de5bbb 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -402,8 +402,8 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric if enum_list: assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' if val_type: - assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - + assert isinstance(c[name], val_type) or c[name] is None, f' [!] 
{name} has wrong type - {type(c[name])} vs {val_type}' + def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) @@ -507,12 +507,4 @@ def check_config(c): _check_argument('name', dataset_entry, restricted=True, val_type=str) _check_argument('path', dataset_entry, restricted=True, val_type=str) _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) - _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) - - - - - - - - + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) \ No newline at end of file From 02df28c7d6059afa31d615a6f24eb27b7c017cff Mon Sep 17 00:00:00 2001 From: richardburleigh Date: Sat, 15 Feb 2020 14:47:50 +1100 Subject: [PATCH 20/61] Fix GL overriding PWGAN inference --- server/synthesizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index fcdc8787..347bef21 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -172,7 +172,7 @@ class Synthesizer(object): if self.use_cuda: vocoder_input.cuda() wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length) - if self.wavernn: + elif self.wavernn: vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) if self.use_cuda: vocoder_input.cuda() From 6977899d072a4705bd44e81c1632ff4698524bb1 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 19 Feb 2020 17:54:06 +0100 Subject: [PATCH 21/61] fix constant GL bug in synthesis --- utils/synthesis.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/utils/synthesis.py b/utils/synthesis.py index f066228a..79a17c78 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -78,6 +78,7 @@ def synthesis(model, style_wav=None, truncated=False, enable_eos_bos_chars=False, #pylint: disable=unused-argument + use_griffin_lim=False, do_trim_silence=False): """Synthesize voice for the given text. 
@@ -111,8 +112,10 @@ def synthesis(model, postnet_output, decoder_output, alignment = parse_outputs( postnet_output, decoder_output, alignments) # plot results - wav = inv_spectrogram(postnet_output, ap, CONFIG) - # trim silence - if do_trim_silence: - wav = trim_silence(wav, ap) + wav = None + if use_griffin_lim: + wav = inv_spectrogram(postnet_output, ap, CONFIG) + # trim silence + if do_trim_silence: + wav = trim_silence(wav, ap) return wav, alignment, decoder_output, postnet_output, stop_tokens From f8ebf6abcdf269fe6278246b2255e5c098ae5395 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 19 Feb 2020 18:17:10 +0100 Subject: [PATCH 22/61] fix the benchmark notebook after GL fix --- notebooks/Benchmark.ipynb | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index 7c528506..00ac7d16 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -65,6 +65,7 @@ "from TTS.utils.text import text_to_sequence\n", "from TTS.utils.synthesis import synthesis\n", "from TTS.utils.visual import visualize\n", + "from TTS.utils.text.symbols import symbols, phonemes\n", "\n", "import IPython\n", "from IPython.display import Audio\n", @@ -81,13 +82,15 @@ "source": [ "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None, \n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", + " use_griffin_lim=use_gl)\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n", " # coorect the normalization differences b/w TTS and the Vocoder.\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", - " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", " if not use_gl:\n", + " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", + " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=8000, overlap=400)\n", "\n", " print(\" > Run-time: {}\".format(time.time() - t_1))\n", @@ -108,7 +111,7 @@ "outputs": [], "source": [ "# Set constants\n", - "ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5099/'\n", + "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n", "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", @@ -116,7 +119,7 @@ "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar\"\n", "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json\"\n", "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n", - "use_cuda = False\n", + "use_cuda = True\n", "\n", "# Set some config fields manually for testing\n", "# CONFIG.windowing = False\n", @@ -127,7 +130,7 @@ "# CONFIG.stopnet = True\n", "\n", "# Set the vocoder\n", - "use_gl = False # use GL if True\n", + "use_gl = True # use GL if True\n", "batched_wavernn = True # use batched wavernn inference if True" ] }, @@ -138,8 
+141,6 @@ "outputs": [], "source": [ "# LOAD TTS MODEL\n", - "from utils.text.symbols import symbols, phonemes\n", - "\n", "# multi speaker \n", "if CONFIG.use_speaker_embedding:\n", " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", @@ -181,7 +182,7 @@ "metadata": {}, "outputs": [], "source": [ - "# LOAD WAVERNN\n", + "# LOAD WAVERNN - Make sure you downloaded the model and installed the module\n", "if use_gl == False:\n", " from WaveRNN.models.wavernn import Model\n", " from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n", @@ -533,7 +534,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" } }, "nbformat": 4, From e540a5495996e7fec9142b0c372f6c8b37356577 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 19 Feb 2020 18:24:02 +0100 Subject: [PATCH 23/61] fix synthesize.py --- synthesize.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/synthesize.py b/synthesize.py index cb0ee8af..a338f8b8 100644 --- a/synthesize.py +++ b/synthesize.py @@ -25,14 +25,16 @@ def tts(model, t_1 = time.time() use_vocoder_model = vocoder_model is not None waveform, alignment, _, postnet_output, stop_tokens = synthesis( - model, text, C, use_cuda, ap, speaker_id, False, - C.enable_eos_bos_chars) + model, text, C, use_cuda, ap, speaker_id, style_wav=False, + truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars, + use_griffin_lim=(not use_vocoder_model), do_trim_silence=True) + if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T # correct if there is a scale difference b/w two models - postnet_output = ap._denormalize(postnet_output) - postnet_output = ap_vocoder._normalize(postnet_output) if use_vocoder_model: + postnet_output = ap._denormalize(postnet_output) + postnet_output = ap_vocoder._normalize(postnet_output) vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, @@ -58,7 +60,7 @@ if __name__ == "__main__": parser.add_argument( 'out_path', type=str, - help='Path to save final wav file.', + help='Path to save final wav file. 
Wav file will be names as the text given.', ) parser.add_argument('--use_cuda', type=bool, From dc0e6c80197fa2e52e5abc6f2d7568637e04c968 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 9 Jan 2020 15:56:09 +0100 Subject: [PATCH 24/61] simpler gmm attention implementaiton --- config.json | 2 +- layers/common_layers.py | 78 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/config.json b/config.json index ef999fa9..0bf6c378 100644 --- a/config.json +++ b/config.json @@ -108,7 +108,7 @@ [ { "name": "ljspeech", - "path": "/data5/ro/shared/data/keithito/LJSpeech-1.1/", + "path": "/root/LJSpeech-1.1/", // "path": "/home/erogol/Data/LJSpeech-1.1", "meta_file_train": "metadata_train.csv", "meta_file_val": "metadata_val.csv" diff --git a/layers/common_layers.py b/layers/common_layers.py index c2b042b0..5365d605 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -110,6 +110,84 @@ class LocationLayer(nn.Module): return processed_attention +class GravesAttention(nn.Module): + """ Graves attention as described here: + - https://arxiv.org/abs/1910.10288 + """ + COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi)) + + def __init__(self, query_dim, K): + super(GravesAttention, self).__init__() + self._mask_value = 0.0 + self.K = K + # self.attention_alignment = 0.05 + self.eps = 1e-5 + self.J = None + self.N_a = nn.Sequential( + nn.Linear(query_dim, query_dim, bias=True), + nn.ReLU(), + nn.Linear(query_dim, 3*K, bias=True)) + self.attention_weights = None + self.mu_prev = None + self.init_layers() + + def init_layers(self): + torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) + torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) + + def init_states(self, inputs): + if self.J is None or inputs.shape[1] > self.J.shape[-1]: + self.J = torch.arange(0, inputs.shape[1]+1).to(inputs.device) + 0.5 + self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) + self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) + + # pylint: disable=R0201 + # pylint: disable=unused-argument + def preprocess_inputs(self, inputs): + return None + + def forward(self, query, inputs, processed_inputs, mask): + """ + shapes: + query: B x D_attention_rnn + inputs: B x T_in x D_encoder + processed_inputs: place_holder + mask: B x T_in + """ + gbk_t = self.N_a(query) + gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K) + + # attention model parameters + # each B x K + g_t = gbk_t[:, 0, :] + b_t = gbk_t[:, 1, :] + k_t = gbk_t[:, 2, :] + + # attention GMM parameters + sig_t = torch.nn.functional.softplus(b_t) + self.eps + + mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) + g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps + + j = self.J[:inputs.size(1)+1] + + # attention weights + phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2)) + + # discritize attention weights + alpha_t = self.COEF * torch.sum(phi_t, 1) + alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1] + + # apply masking + if mask is not None: + alpha_t.data.masked_fill_(~mask, self._mask_value) + + context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) + self.attention_weights = alpha_t + self.mu_prev = mu_t + return context + + class OriginalAttention(nn.Module): """Following the methods proposed here: - https://arxiv.org/abs/1712.05884 From cf7d968f575894e53434ee295eaa52e1f17b6b26 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 14 Jan 2020 13:22:23 +0100 Subject: [PATCH 25/61] 
graves attention as in melnet paper --- layers/common_layers.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 5365d605..a768e684 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -136,8 +136,8 @@ class GravesAttention(nn.Module): torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) def init_states(self, inputs): - if self.J is None or inputs.shape[1] > self.J.shape[-1]: - self.J = torch.arange(0, inputs.shape[1]+1).to(inputs.device) + 0.5 + if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]: + self.J = torch.arange(0, inputs.shape[1]+2).to(inputs.device) + 0.5 self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) @@ -165,24 +165,25 @@ class GravesAttention(nn.Module): # attention GMM parameters sig_t = torch.nn.functional.softplus(b_t) + self.eps - mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps - j = self.J[:inputs.size(1)+1] # attention weights - phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2)) + phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.exp((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) # discritize attention weights - alpha_t = self.COEF * torch.sum(phi_t, 1) + alpha_t = torch.sum(phi_t, 1) alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1] + alpha_t[alpha_t == 0] = 1e-8 # apply masking if mask is not None: alpha_t.data.masked_fill_(~mask, self._mask_value) context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) + # for better visualization + # self.attention_weights = torch.clamp(alpha_t, min=0) self.attention_weights = alpha_t self.mu_prev = mu_t return context @@ -355,7 +356,7 @@ class OriginalAttention(nn.Module): if self.forward_attn: alignment = self.apply_forward_attention(alignment) self.alpha = alignment - + context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) self.attention_weights = alignment From 72817438db4d805754d19dea818e6b4eb0ce425d Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 01:53:27 +0100 Subject: [PATCH 26/61] graves v2 --- config.json | 2 +- layers/common_layers.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config.json b/config.json index 0bf6c378..fc33d16a 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { "model": "Tacotron2", // one of the model in models/ - "run_name": "ljspeech-graves", + "run_name": "ljspeech-gravesv2", "run_description": "tacotron2 wuth graves attention", // AUDIO PARAMETERS diff --git a/layers/common_layers.py b/layers/common_layers.py index a768e684..f27ecf56 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -118,7 +118,7 @@ class GravesAttention(nn.Module): def __init__(self, query_dim, K): super(GravesAttention, self).__init__() - self._mask_value = 0.0 + self._mask_value = 1e-8 self.K = K # self.attention_alignment = 0.05 self.eps = 1e-5 @@ -165,12 +165,14 @@ class GravesAttention(nn.Module): # attention GMM parameters sig_t = torch.nn.functional.softplus(b_t) + self.eps + mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps + j = self.J[:inputs.size(1)+1] # attention weights - phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.exp((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) + phi_t = g_t.unsqueeze(-1) * (1 / (1 + 
torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) # discritize attention weights alpha_t = torch.sum(phi_t, 1) @@ -182,8 +184,6 @@ class GravesAttention(nn.Module): alpha_t.data.masked_fill_(~mask, self._mask_value) context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) - # for better visualization - # self.attention_weights = torch.clamp(alpha_t, min=0) self.attention_weights = alpha_t self.mu_prev = mu_t return context @@ -356,7 +356,7 @@ class OriginalAttention(nn.Module): if self.forward_attn: alignment = self.apply_forward_attention(alignment) self.alpha = alignment - + context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) self.attention_weights = alignment From 9921d682c325d6f7159c71969bbfdb228c685329 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Jan 2020 12:30:07 +0100 Subject: [PATCH 27/61] seq_len_norm for imbalanced datasets --- layers/losses.py | 40 ++++++++++++++++++++++++++++++++-------- train.py | 4 ++-- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/layers/losses.py b/layers/losses.py index e7ecff5f..b8b17c17 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -6,6 +6,11 @@ from TTS.utils.generic_utils import sequence_mask class L1LossMasked(nn.Module): + + def __init__(self, seq_len_norm): + super(L1LossMasked, self).__init__() + self.seq_len_norm = seq_len_norm + def forward(self, x, target, length): """ Args: @@ -24,14 +29,26 @@ class L1LossMasked(nn.Module): target.requires_grad = False mask = sequence_mask( sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() - mask = mask.expand_as(x) - loss = functional.l1_loss( - x * mask, target * mask, reduction="sum") - loss = loss / mask.sum() + if self.seq_len_norm: + norm_w = mask / mask.sum(dim=1, keepdim=True) + out_weights = norm_w.div(target.shape[0] * target.shape[2]) + mask = mask.expand_as(x) + loss = functional.l1_loss( + x * mask, target * mask, reduction='none') + loss = loss.mul(out_weights.cuda()).sum() + else: + loss = functional.l1_loss( + x * mask, target * mask, reduction='sum') + loss = loss / mask.sum() return loss class MSELossMasked(nn.Module): + + def __init__(self, seq_len_norm): + super(MSELossMasked, self).__init__() + self.seq_len_norm = seq_len_norm + def forward(self, x, target, length): """ Args: @@ -50,10 +67,17 @@ class MSELossMasked(nn.Module): target.requires_grad = False mask = sequence_mask( sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() - mask = mask.expand_as(x) - loss = functional.mse_loss( - x * mask, target * mask, reduction="sum") - loss = loss / mask.sum() + if self.seq_len_norm: + norm_w = mask / mask.sum(dim=1, keepdim=True) + out_weights = norm_w.div(target.shape[0] * target.shape[2]) + mask = mask.expand_as(x) + loss = functional.mse_loss( + x * mask, target * mask, reduction='none') + loss = loss.mul(out_weights.cuda()).sum() + else: + loss = functional.mse_loss( + x * mask, target * mask, reduction='sum') + loss = loss / mask.sum() return loss diff --git a/train.py b/train.py index 81bc2c72..f52d24c1 100644 --- a/train.py +++ b/train.py @@ -561,8 +561,8 @@ def main(args): # pylint: disable=redefined-outer-name optimizer_st = None if c.loss_masking: - criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST" - ] else MSELossMasked() + criterion = L1LossMasked(c.seq_len_norm) if c.model in ["Tacotron", "TacotronGST" + ] else MSELossMasked(c.seq_len_norm) else: criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST" ] else nn.MSELoss() From 
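For reference, a minimal standalone sketch of what the new seq_len_norm option in L1LossMasked / MSELossMasked does: each sample's masked error is weighted by the inverse of its own target length (and by batch size and channel count), so short and long utterances contribute equally to the batch loss. The function name, shapes, and values below are illustrative only, not taken from the repo.

import torch
import torch.nn.functional as F

def l1_loss_seq_len_norm(x, target, lengths):
    # x, target: [B, T, D]; lengths: [B] valid frame counts per sample
    mask = (torch.arange(target.size(1))[None, :] < lengths[:, None]).unsqueeze(2).float()
    norm_w = mask / mask.sum(dim=1, keepdim=True)                # 1 / L_i on valid frames
    out_weights = norm_w / (target.shape[0] * target.shape[2])   # spread over batch and channels
    mask = mask.expand_as(x)
    loss = F.l1_loss(x * mask, target * mask, reduction='none')
    return loss.mul(out_weights).sum()

x = torch.rand(4, 8, 80)
target = torch.zeros(4, 8, 80)
lengths = torch.tensor([8, 7, 6, 5])
print(l1_loss_seq_len_norm(x, target, lengths))  # scalar loss; every utterance weighted equally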
34d2e9438d36eface47643dc22e7871c6830de97 Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 15 Jan 2020 12:38:04 +0100
Subject: [PATCH 28/61] seq_len_norm set in config

---
 config.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/config.json b/config.json
index fc33d16a..71ba261e 100644
--- a/config.json
+++ b/config.json
@@ -53,6 +53,7 @@
     "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
     "wd": 0.000001, // Weight decay weight.
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false, // Normalize each sample loss by its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
 
     // TACOTRON PRENET
     "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.

From 678d56cdef7d671f6e3d2cd70629b50a246e5491 Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 15 Jan 2020 23:17:55 +0100
Subject: [PATCH 29/61] bug fix for losses

---
 layers/losses.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/layers/losses.py b/layers/losses.py
index b8b17c17..90d2ac80 100644
--- a/layers/losses.py
+++ b/layers/losses.py
@@ -37,6 +37,7 @@ class L1LossMasked(nn.Module):
                 x * mask, target * mask, reduction='none')
             loss = loss.mul(out_weights.cuda()).sum()
         else:
+            mask = mask.expand_as(x)
             loss = functional.l1_loss(
                 x * mask, target * mask, reduction='sum')
             loss = loss / mask.sum()
@@ -75,6 +76,7 @@ class MSELossMasked(nn.Module):
                 x * mask, target * mask, reduction='none')
             loss = loss.mul(out_weights.cuda()).sum()
         else:
+            mask = mask.expand_as(x)
             loss = functional.mse_loss(
                 x * mask, target * mask, reduction='sum')
             loss = loss / mask.sum()

From bb1117ff32d91a9ba32710810391e062596f62b7 Mon Sep 17 00:00:00 2001
From: root
Date: Sat, 18 Jan 2020 00:33:51 +0100
Subject: [PATCH 30/61] stop dividing g_t with sig_t and commenting

---
 layers/common_layers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/layers/common_layers.py b/layers/common_layers.py
index f27ecf56..023c7404 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -132,8 +132,8 @@ class GravesAttention(nn.Module):
         self.init_layers()
 
     def init_layers(self):
-        torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.)
-        torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10)
+        torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.)
# bias mean + torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std def init_states(self, inputs): if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]: @@ -167,7 +167,7 @@ class GravesAttention(nn.Module): sig_t = torch.nn.functional.softplus(b_t) + self.eps mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) - g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps + g_t = torch.softmax(g_t, dim=-1) + self.eps j = self.J[:inputs.size(1)+1] From 284daba116e2022d13573b89dcc6766fcfa2e342 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 15:42:56 +0100 Subject: [PATCH 31/61] bug fixes --- utils/audio.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/audio.py b/utils/audio.py index 708f0853..82e5aa47 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -66,12 +66,11 @@ class AudioProcessor(object): return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spec)) def _build_mel_basis(self, ): - n_fft = (self.num_freq - 1) * 2 if self.mel_fmax is not None: assert self.mel_fmax <= self.sample_rate // 2 return librosa.filters.mel( self.sample_rate, - n_fft, + self.n_fft, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax) @@ -197,6 +196,7 @@ class AudioProcessor(object): n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, + pad_mode='constant' ) def _istft(self, y): @@ -217,7 +217,7 @@ class AudioProcessor(object): margin = int(self.sample_rate * 0.01) wav = wav[margin:-margin] return librosa.effects.trim( - wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0] + wav, top_db=40, frame_length=self.win_length, hop_length=self.hop_length)[0] @staticmethod def mulaw_encode(wav, qc): From 0d17019d224c7db47c2370088e35986e2e8c69af Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 15:46:59 +0100 Subject: [PATCH 32/61] remove old graves --- layers/common_layers.py | 79 ++--------------------------------------- 1 file changed, 2 insertions(+), 77 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 023c7404..592f017c 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -111,8 +111,9 @@ class LocationLayer(nn.Module): class GravesAttention(nn.Module): - """ Graves attention as described here: + """ Discretized Graves attention: - https://arxiv.org/abs/1910.10288 + - https://arxiv.org/pdf/1906.01083.pdf """ COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi)) @@ -368,82 +369,6 @@ class OriginalAttention(nn.Module): return context -class GravesAttention(nn.Module): - """ Graves attention as described here: - - https://arxiv.org/abs/1910.10288 - """ - COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi)) - - def __init__(self, query_dim, K): - super(GravesAttention, self).__init__() - self._mask_value = 0.0 - self.K = K - # self.attention_alignment = 0.05 - self.eps = 1e-5 - self.J = None - self.N_a = nn.Sequential( - nn.Linear(query_dim, query_dim, bias=True), - nn.ReLU(), - nn.Linear(query_dim, 3*K, bias=True)) - self.attention_weights = None - self.mu_prev = None - self.init_layers() - - def init_layers(self): - torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) 
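For reference, a self-contained sketch of how the discretized Graves attention weights are formed; it mirrors the expressions in GravesAttention.forward above, with dummy tensors standing in for the network outputs (shapes and values are illustrative, not from the repo).

import torch

B, K, T_in = 2, 5, 11   # batch size, number of mixture components, encoder timesteps
eps = 1e-5

g_t = torch.softmax(torch.randn(B, K), dim=-1) + eps                   # mixture weights
sig_t = torch.nn.functional.softplus(torch.randn(B, K)) + eps          # mixture scales
mu_prev = torch.zeros(B, K)                                            # means from the previous decoder step
mu_t = mu_prev + torch.nn.functional.softplus(torch.randn(B, K))       # means only move forward

j = torch.arange(0, T_in + 2).float() + 0.5   # grid J built in init_states
j = j[:T_in + 1]

# per-mixture curve evaluated on the grid, as in the forward pass above
phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))

# discretize: sum over mixtures, then difference neighbouring grid points
alpha_t = torch.sum(phi_t, 1)
alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
alpha_t[alpha_t == 0] = 1e-8

print(alpha_t.shape)   # torch.Size([2, 11]) -> one attention weight per encoder step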
- torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) - - def init_states(self, inputs): - if self.J is None or inputs.shape[1] > self.J.shape[-1]: - self.J = torch.arange(0, inputs.shape[1]).to(inputs.device) - self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) - self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) - - # pylint: disable=R0201 - # pylint: disable=unused-argument - def preprocess_inputs(self, inputs): - return None - - def forward(self, query, inputs, processed_inputs, mask): - """ - shapes: - query: B x D_attention_rnn - inputs: B x T_in x D_encoder - processed_inputs: place_holder - mask: B x T_in - """ - gbk_t = self.N_a(query) - gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K) - - # attention model parameters - # each B x K - g_t = gbk_t[:, 0, :] - b_t = gbk_t[:, 1, :] - k_t = gbk_t[:, 2, :] - - # attention GMM parameters - sig_t = torch.nn.functional.softplus(b_t) + self.eps - - mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) - g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps - - # each B x K x T_in - j = self.J[:inputs.size(1)] - - # attention weights - phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2)) - alpha_t = self.COEF * torch.sum(phi_t, 1) - - # apply masking - if mask is not None: - alpha_t.data.masked_fill_(~mask, self._mask_value) - - context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) - self.attention_weights = alpha_t - self.mu_prev = mu_t - return context - - def init_attn(attn_type, query_dim, embedding_dim, attention_dim, location_attention, attention_location_n_filters, attention_location_kernel_size, windowing, norm, forward_attn, From ca33336ae0f34751f5ca393c7998f7ac85c16b79 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Jan 2020 16:02:34 +0100 Subject: [PATCH 33/61] testing seq_len_norm --- layers/losses.py | 4 ++-- tests/test_layers.py | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/layers/losses.py b/layers/losses.py index 90d2ac80..176e2f09 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -35,7 +35,7 @@ class L1LossMasked(nn.Module): mask = mask.expand_as(x) loss = functional.l1_loss( x * mask, target * mask, reduction='none') - loss = loss.mul(out_weights.cuda()).sum() + loss = loss.mul(out_weights.to(loss.device)).sum() else: mask = mask.expand_as(x) loss = functional.l1_loss( @@ -74,7 +74,7 @@ class MSELossMasked(nn.Module): mask = mask.expand_as(x) loss = functional.mse_loss( x * mask, target * mask, reduction='none') - loss = loss.mul(out_weights.cuda()).sum() + loss = loss.mul(out_weights.to(loss.device)).sum() else: mask = mask.expand_as(x) loss = functional.mse_loss( diff --git a/tests/test_layers.py b/tests/test_layers.py index 7d02b673..d7c8829f 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -131,7 +131,7 @@ class L1LossMaskedTests(unittest.TestCase): dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.data[0]) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -140,7 +140,7 @@ class L1LossMaskedTests(unittest.TestCase): mask = ( (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert 
output.item() == 1.0, "1.0 vs {}".format(output.data[0]) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() @@ -148,4 +148,37 @@ class L1LossMaskedTests(unittest.TestCase): mask = ( (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.data[0]) + assert output.item() == 0, "0 vs {}".format(output.item()) + + # seq_len_norm = True + # test input == target + layer = L1LossMasked(seq_len_norm=True) + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.ones(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.arange(5, 9)).long() + mask = ( + (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 8, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(5, 9)).long() + mask = ( + (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) From ffe9a32813c03400576fbea78029a6b869729b9b Mon Sep 17 00:00:00 2001 From: root Date: Mon, 3 Feb 2020 14:16:40 +0100 Subject: [PATCH 34/61] set silence trimming threshold in config --- config.json | 1 + utils/audio.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/config.json b/config.json index 71ba261e..89266a94 100644 --- a/config.json +++ b/config.json @@ -24,6 +24,7 @@ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. }, // DISTRIBUTED TRAINING diff --git a/utils/audio.py b/utils/audio.py index 82e5aa47..7b2c4834 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -24,6 +24,7 @@ class AudioProcessor(object): clip_norm=True, griffin_lim_iters=None, do_trim_silence=False, + trim_db=60, sound_norm=False, **_): @@ -46,6 +47,7 @@ class AudioProcessor(object): self.max_norm = 1.0 if max_norm is None else float(max_norm) self.clip_norm = clip_norm self.do_trim_silence = do_trim_silence + self.trim_db = trim_db self.sound_norm = sound_norm self.n_fft, self.hop_length, self.win_length = self._stft_parameters() assert min_level_db != 0.0, " [!] 
min_level_db is 0" @@ -217,7 +219,7 @@ class AudioProcessor(object): margin = int(self.sample_rate * 0.01) wav = wav[margin:-margin] return librosa.effects.trim( - wav, top_db=40, frame_length=self.win_length, hop_length=self.hop_length)[0] + wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0] @staticmethod def mulaw_encode(wav, qc): From 1ef6278d2d247ea904ad98bfa78ab376628678bc Mon Sep 17 00:00:00 2001 From: root Date: Mon, 3 Feb 2020 15:29:44 +0100 Subject: [PATCH 35/61] tacotron2 stop condition --- layers/tacotron2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 78bdd10d..c195b277 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -290,7 +290,7 @@ class Decoder(nn.Module): stop_tokens += [stop_token] alignments += [alignment] - if stop_token > 0.7: + if stop_token > 0.7 and t > inputs.shape[0] / 2: break if len(outputs) == self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") From fbe5310be01321220ad219efb48ea68d38f30267 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 4 Feb 2020 11:16:48 +0100 Subject: [PATCH 36/61] Only use embedded model files if they're not overriden by CLI flags --- server/server.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/server/server.py b/server/server.py index d40e2427..3be66f9e 100644 --- a/server/server.py +++ b/server/server.py @@ -24,20 +24,32 @@ def create_argparser(): return parser -config = None synthesizer = None embedded_model_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') checkpoint_file = os.path.join(embedded_model_folder, 'checkpoint.pth.tar') config_file = os.path.join(embedded_model_folder, 'config.json') -if os.path.isfile(checkpoint_file) and os.path.isfile(config_file): - # Use default config with embedded model files - config = create_argparser().parse_args([]) - config.tts_checkpoint = checkpoint_file - config.tts_config = config_file - synthesizer = Synthesizer(config) +# Default options with embedded model files +if os.path.isfile(checkpoint_file): + default_tts_checkpoint = checkpoint_file +else: + default_tts_checkpoint = None +if os.path.isfile(config_file): + default_tts_config = config_file +else: + default_tts_config = None + +args = create_argparser().parse_args() + +# If these were not specified in the CLI args, use default values +if not args.tts_checkpoint: + args.tts_checkpoint = default_tts_checkpoint +if not args.tts_config: + args.tts_config = default_tts_config + +synthesizer = Synthesizer(args) app = Flask(__name__) @@ -55,11 +67,4 @@ def tts(): if __name__ == '__main__': - args = create_argparser().parse_args() - - # Setup synthesizer from CLI args if they're specified or no embedded model - # is present. 
- if not config or not synthesizer or args.tts_checkpoint or args.tts_config: - synthesizer = Synthesizer(args) - - app.run(debug=config.debug, host='0.0.0.0', port=config.port) + app.run(debug=args.debug, host='0.0.0.0', port=args.port) From ed8a9fc82a383209518733ae4bdedfa986f9648d Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:09:59 +0100 Subject: [PATCH 37/61] update server and synthesizer to handle ParallelWaveGAN --- server/server.py | 9 ++++++--- server/synthesizer.py | 46 ++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/server/server.py b/server/server.py index 3be66f9e..6af119bf 100644 --- a/server/server.py +++ b/server/server.py @@ -14,10 +14,13 @@ def create_argparser(): parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file') parser.add_argument('--tts_config', type=str, help='path to TTS config.json file') parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model') - parser.add_argument('--wavernn_lib_path', type=str, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--wavernn_file', type=str, help='path to WaveRNN checkpoint file.') - parser.add_argument('--wavernn_config', type=str, help='path to WaveRNN config file.') + parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') + parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') + parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. 
If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') diff --git a/server/synthesizer.py b/server/synthesizer.py index d8852a3e..b703c62e 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,17 +1,18 @@ import io import os +import re +import sys import numpy as np import torch -import sys +import yaml from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config, setup_model -from TTS.utils.text import phonemes, symbols from TTS.utils.speakers import load_speaker_mapping from TTS.utils.synthesis import * +from TTS.utils.text import phonemes, symbols -import re alphabets = r"([A-Za-z])" prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]" suffixes = r"(Inc|Ltd|Jr|Sr|Co)" @@ -23,6 +24,7 @@ websites = r"[.](com|net|org|io|gov)" class Synthesizer(object): def __init__(self, config): self.wavernn = None + self.pwgan = None self.config = config self.use_cuda = self.config.use_cuda if self.use_cuda: @@ -30,9 +32,11 @@ class Synthesizer(object): self.load_tts(self.config.tts_checkpoint, self.config.tts_config, self.config.use_cuda) if self.config.wavernn_lib_path: - self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_path, - self.config.wavernn_file, self.config.wavernn_config, - self.config.use_cuda) + self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file, + self.config.wavernn_config, self.config.use_cuda) + if self.config.pwgan_lib_path: + self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file, + self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): print(" > Loading TTS model ...") @@ -45,9 +49,9 @@ class Synthesizer(object): self.input_size = len(phonemes) else: self.input_size = len(symbols) - # load speakers + # TODO: fix this for multi-speaker model - load speakers if self.config.tts_speakers is not None: - self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers)) + self.tts_speakers = load_speaker_mapping(self.config.tts_speakers) num_speakers = len(self.tts_speakers) else: num_speakers = 0 @@ -63,16 +67,14 @@ class Synthesizer(object): if 'r' in cp: self.tts_model.decoder.set_r(cp['r']) - def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): + def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. 
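        # Example (hypothetical placeholder paths, not from the repo): the new CLI options above let an
        # external vocoder be plugged in at serving time, e.g.
        #   python server/server.py \
        #       --tts_checkpoint /path/to/checkpoint.pth.tar --tts_config /path/to/config.json \
        #       --pwgan_lib_path /path/to/ParallelWaveGAN \
        #       --pwgan_file /path/to/pwgan_checkpoint.pkl --pwgan_config /path/to/pwgan_config.yml
        # If the --pwgan_* / --wavernn_* flags are omitted, the server falls back to Griffin-Lim.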
sys.path.append(lib_path) # set this if TTS is not installed globally from WaveRNN.models.wavernn import Model - wavernn_config = os.path.join(model_path, model_config) - model_file = os.path.join(model_path, model_file) print(" > Loading WaveRNN model ...") - print(" | > model config: ", wavernn_config) + print(" | > model config: ", model_config) print(" | > model file: ", model_file) - self.wavernn_config = load_config(wavernn_config) + self.wavernn_config = load_config(model_config) self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -91,11 +93,27 @@ class Synthesizer(object): ).cuda() check = torch.load(model_file) - self.wavernn.load_state_dict(check['model']) + self.wavernn.load_state_dict(check['model'], map_location="cpu") if use_cuda: self.wavernn.cuda() self.wavernn.eval() + def load_pwgan(self, lib_path, model_file, model_config, use_cuda): + sys.path.append(lib_path) # set this if TTS is not installed globally + from parallel_wavegan.models import ParallelWaveGANGenerator + from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder + print(" > Loading PWGAN model ...") + print(" | > model config: ", model_config) + print(" | > model file: ", model_file) + with open(model_config) as f: + self.pwgan_config = yaml.load(f, Loader=yaml.Loader) + self.pwgan = ParallelWaveGANGenerator(**self.pwgan_config["generator_params"]) + self.pwgan.load_state_dict(torch.load(model_file, map_location="cpu")["model"]["generator"]) + self.pwgan.remove_weight_norm() + if use_cuda: + self.pwgan.cuda() + self.pwgan.eval() + def save_wav(self, wav, path): # wav *= 32767 / max(1e-8, np.max(np.abs(wav))) wav = np.array(wav) From af0fa9f6da4f170de4f33d4e68d1a09d7518c13f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:19:12 +0100 Subject: [PATCH 38/61] README update --- server/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/README.md b/server/README.md index 95297225..0563ef94 100644 --- a/server/README.md +++ b/server/README.md @@ -6,6 +6,10 @@ Instructions below are based on a Ubuntu 18.04 machine, but it should be simple #### Development server: +##### Using server.py +If you have the environment set already for TTS, then you can directly call ```setup.py```. + +##### Using .whl 1. apt-get install -y espeak libsndfile1 python3-venv 2. python3 -m venv /tmp/venv 3. 
source /tmp/venv/bin/activate From c776526c45d4229940390c6f468c29842b992dba Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Feb 2020 17:31:02 +0100 Subject: [PATCH 39/61] update server test --- server/synthesizer.py | 2 -- tests/inputs/server_config.json | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index b703c62e..63f2080a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -1,5 +1,4 @@ import io -import os import re import sys @@ -101,7 +100,6 @@ class Synthesizer(object): def load_pwgan(self, lib_path, model_file, model_config, use_cuda): sys.path.append(lib_path) # set this if TTS is not installed globally from parallel_wavegan.models import ParallelWaveGANGenerator - from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index 3988db4c..7f5a60fb 100644 --- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -3,9 +3,11 @@ "tts_config":"dummy_model_config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. - "wavernn_path": null, // wavernn model root path "wavernn_file": null, // wavernn checkpoint file name "wavernn_config": null, // wavernn config file + "pwgan_lib_path": null, + "pwgan_file": null, + "pwgan_config": null, "is_wavernn_batched":true, "port": 5002, "use_cuda": false, From 532cf8160ccc68edc76ebcbcc2ba777ecd7a453e Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 6 Feb 2020 15:16:29 +0100 Subject: [PATCH 40/61] pylint check --- server/synthesizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 63f2080a..75fd4e76 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -68,12 +68,15 @@ class Synthesizer(object): def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if WaveRNN is not installed globally + #pylint: disable=import-outside-toplevel from WaveRNN.models.wavernn import Model print(" > Loading WaveRNN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) self.wavernn_config = load_config(model_config) + # This is the default architecture we use for our models. 
+ # You might need to update it self.wavernn = Model( rnn_dims=512, fc_dims=512, @@ -98,7 +101,8 @@ class Synthesizer(object): self.wavernn.eval() def load_pwgan(self, lib_path, model_file, model_config, use_cuda): - sys.path.append(lib_path) # set this if TTS is not installed globally + sys.path.append(lib_path) # set this if ParallelWaveGAN is not installed globally + #pylint: disable=import-outside-toplevel from parallel_wavegan.models import ParallelWaveGANGenerator print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) From 5daaadc9dc3d46ba2048401e3a681eae248eb68f Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Fri, 7 Feb 2020 12:58:58 +0100 Subject: [PATCH 41/61] Pacify pylint even more --- synthesize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/synthesize.py b/synthesize.py index a338f8b8..bf85d7c9 100644 --- a/synthesize.py +++ b/synthesize.py @@ -1,3 +1,4 @@ +# pylint: disable=redefined-outer-name, unused-argument import os import time import argparse From 65b8b33d712df5c60ee7ebc39a455c86563c1a64 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 7 Feb 2020 13:00:04 +0100 Subject: [PATCH 42/61] config fixes and enable graves attention wq --- config.json | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/config.json b/config.json index 89266a94..9e4fa906 100644 --- a/config.json +++ b/config.json @@ -23,8 +23,8 @@ "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. }, // DISTRIBUTED TRAINING @@ -62,14 +62,14 @@ "prenet_dropout": true, // enable/disable dropout at prenet. // ATTENTION - "attention_type": "original", // 'original' or 'graves' - "attention_heads": 5, // number of attention heads (only for 'graves') + "attention_type": "graves", // 'original' or 'graves' + "attention_heads": 4, // number of attention heads (only for 'graves') "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "windowing": false, // Enables attention windowing. Used only in eval mode. "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. // STOPNET @@ -92,8 +92,8 @@ "max_seq_len": 150, // DATASET-RELATED: maximum text length // PATHS - "output_path": "/data5/rw/pit/keep/", // DATASET-RELATED: output path for all training outputs. 
- // "output_path": "/media/erogol/data_ssd/Models/runs/", + // "output_path": "/data5/rw/pit/keep/", // DATASET-RELATED: output path for all training outputs. + "output_path": "/home/erogol/Models/LJSpeech/", // PHONEMES "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. @@ -110,10 +110,10 @@ [ { "name": "ljspeech", - "path": "/root/LJSpeech-1.1/", + "path": "/home/erogol/Data/LJSpeech-1.1/", // "path": "/home/erogol/Data/LJSpeech-1.1", - "meta_file_train": "metadata_train.csv", - "meta_file_val": "metadata_val.csv" + "meta_file_train": "metadata.csv", + "meta_file_val": null } ] From abf8ea4633a70b150a575bf0ad269cd30481fbcf Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 7 Feb 2020 13:00:48 +0100 Subject: [PATCH 43/61] Notebook for PWGAN vocoder --- notebooks/Benchmark-PWGAN.ipynb | 578 ++++++++++++++++++++++++++++++++ 1 file changed, 578 insertions(+) create mode 100644 notebooks/Benchmark-PWGAN.ipynb diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb new file mode 100644 index 00000000..430d329f --- /dev/null +++ b/notebooks/Benchmark-PWGAN.ipynb @@ -0,0 +1,578 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is to test TTS models with benchmark sentences for speech synthesis.\n", + "\n", + "Before running this script please DON'T FORGET: \n", + "- to set file paths.\n", + "- to download related model files from TTS and PWGAN.\n", + "- download or clone related repos, linked below.\n", + "- setup the repositories. ```python setup.py install```\n", + "- to checkout right commit versions (given next to the model) of TTS and PWGAN.\n", + "- to set the right paths in the cell below.\n", + "\n", + "Repositories:\n", + "- TTS: https://github.com/mozilla/TTS\n", + "- PWGAN: https://github.com/erogol/ParallelWaveGAN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import os\n", + "import sys\n", + "import io\n", + "import torch \n", + "import time\n", + "import json\n", + "import yaml\n", + "import numpy as np\n", + "from collections import OrderedDict\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams[\"figure.figsize\"] = (16,5)\n", + "\n", + "import librosa\n", + "import librosa.display\n", + "\n", + "from TTS.models.tacotron import Tacotron \n", + "from TTS.layers import *\n", + "from TTS.utils.data import *\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.generic_utils import load_config, setup_model\n", + "from TTS.utils.text import text_to_sequence\n", + "from TTS.utils.synthesis import synthesis\n", + "from TTS.utils.visual import visualize\n", + "\n", + "import IPython\n", + "from IPython.display import Audio\n", + "\n", + "import os\n", + "\n", + "# you may need to change this depending on your system\n", + "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", + " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " # coorect the normalization differences b/w TTS and the Vocoder.\n", + " 
mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", + "# mel_postnet_spec = np.pad(mel_postnet_spec, pad_width=((2, 2), (0, 0)))\n", + " print(mel_postnet_spec.shape)\n", + " print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(ap_vocoder._normalize(mel_postnet_spec).T).unsqueeze(0), hop_size=ap_vocoder.hop_length)\n", + "# waveform = waveform / abs(waveform).max() * 0.9\n", + " if use_cuda:\n", + " waveform = waveform.cpu()\n", + " waveform = waveform.numpy()\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " if figures: \n", + " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec)) \n", + " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n", + " os.makedirs(OUT_FOLDER, exist_ok=True)\n", + " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", + " out_path = os.path.join(OUT_FOLDER, file_name)\n", + " ap.save_wav(waveform, out_path)\n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set constants\n", + "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n", + "MODEL_PATH = ROOT_PATH + '/checkpoint_670000.pth.tar'\n", + "CONFIG_PATH = ROOT_PATH + '/config.json'\n", + "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", + "CONFIG = load_config(CONFIG_PATH)\n", + "VOCODER_MODEL_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/checkpoint-400000steps.pkl\"\n", + "VOCODER_CONFIG_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/config.yml\"\n", + "\n", + "# load PWGAN config\n", + "with open(VOCODER_CONFIG_PATH) as f:\n", + " VOCODER_CONFIG = yaml.load(f, Loader=yaml.Loader)\n", + " \n", + "# Run FLAGs\n", + "use_cuda = False\n", + "# Set some config fields manually for testing\n", + "CONFIG.windowing = True\n", + "CONFIG.use_forward_attn = True \n", + "# Set the vocoder\n", + "use_gl = False # use GL if True\n", + "batched_wavernn = True # use batched wavernn inference if True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# LOAD TTS MODEL\n", + "from TTS.utils.text.symbols import symbols, phonemes\n", + "\n", + "# multi speaker \n", + "if CONFIG.use_speaker_embedding:\n", + " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", + " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", + "else:\n", + " speakers = []\n", + " speaker_id = None\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, len(speakers), CONFIG)\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**CONFIG.audio) \n", + "\n", + "\n", + "# load model state\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "\n", + "# load the model\n", + "model.load_state_dict(cp['model'])\n", + "if use_cuda:\n", + " model.cuda()\n", + "model.eval()\n", + "print(cp['step'])\n", + "print(cp['r'])\n", + "\n", + "# set model stepsize\n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# LOAD 
WAVERNN\n", + "if use_gl == False:\n", + " from parallel_wavegan.models import ParallelWaveGANGenerator\n", + " from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder\n", + " \n", + " vocoder_model = ParallelWaveGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n", + " vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n", + " vocoder_model.remove_weight_norm()\n", + " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG['audio']) \n", + " if use_cuda:\n", + " vocoder_model.cuda()\n", + " vocoder_model.eval();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparision with https://mycroft.ai/blog/available-voices/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "model.decoder.max_decoder_steps = 2000\n", + "model.decoder.prenet.eval()\n", + "speaker_id = None\n", + "sentence = '''A breeding jennet, lusty, young, and proud,'''\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### https://espnet.github.io/icassp2020-tts/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The Commission also recommends\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other examples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": 
[ + "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The human voice is the most perfect instrument of all.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"This cake is great. It's so delicious and moist.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparison with https://keithito.github.io/audio-samples/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \" He has read the whole thing.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"He reads books.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Thisss isrealy awhsome.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser, Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Eren, how are you?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hard Sentences" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Encouraged, he started with a minute a day.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . 
\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"If he decided to watch TV he really watched it.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for twb dataset\n", + "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 2cec58320bed7ec3c7070a8465b58e4f4c6de98a Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 7 Feb 2020 14:21:57 +0100 Subject: [PATCH 44/61] use decorater for torch.no_grad --- train.py | 224 +++++++++++++++++++++++++++---------------------------- 1 file changed, 112 insertions(+), 112 deletions(-) diff --git a/train.py b/train.py index f52d24c1..b9f5fefb 100644 --- a/train.py +++ b/train.py @@ -327,6 +327,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, return keep_avg['avg_postnet_loss'], global_step +@torch.no_grad() def evaluate(model, criterion, criterion_st, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=True) if c.use_speaker_embedding: @@ -346,125 +347,124 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): keep_avg.add_values(eval_values_dict) print("\n > Validation") - with torch.no_grad(): - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() + if data_loader is not None: + for num_iter, data in enumerate(data_loader): + start_time = time.time() - # format data - text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data) - assert mel_input.shape[1] % model.decoder.r == 0 + # format data + text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data) + assert mel_input.shape[1] % model.decoder.r == 0 - # forward pass model - if c.bidirectional_decoder: - decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) - else: - decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + # forward pass model + if c.bidirectional_decoder: + decoder_output, postnet_output, alignments, stop_tokens, 
decoder_backward_output, alignments_backward = model( + text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + else: + decoder_output, postnet_output, alignments, stop_tokens = model( + text_input, text_lengths, mel_input, speaker_ids=speaker_ids) - # loss computation - stop_loss = criterion_st( - stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) - if c.loss_masking: - decoder_loss = criterion(decoder_output, mel_input, - mel_lengths) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input, - mel_lengths) - else: - postnet_loss = criterion(postnet_output, mel_input, - mel_lengths) - else: - decoder_loss = criterion(decoder_output, mel_input) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input) - else: - postnet_loss = criterion(postnet_output, mel_input) - loss = decoder_loss + postnet_loss + stop_loss - - # backward decoder loss - if c.bidirectional_decoder: - if c.loss_masking: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) - else: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input) - decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output) - loss += decoder_backward_loss + decoder_c_loss - keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()}) - - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_score = alignment_diagonal_score(alignments) - keep_avg.update_value('avg_align_score', align_score) - - # aggregate losses from processes - if num_gpus > 1: - postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) - decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) - if c.stopnet: - stop_loss = reduce_tensor(stop_loss.data, num_gpus) - - keep_avg.update_values({ - 'avg_postnet_loss': - float(postnet_loss.item()), - 'avg_decoder_loss': - float(decoder_loss.item()), - 'avg_stop_loss': - float(stop_loss.item()), - }) - - if num_iter % c.print_step == 0: - print( - " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " - "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}" - .format(loss.item(), postnet_loss.item(), - keep_avg['avg_postnet_loss'], - decoder_loss.item(), - keep_avg['avg_decoder_loss'], stop_loss.item(), - keep_avg['avg_stop_loss'], align_score, - keep_avg['avg_align_score']), - flush=True) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_input.shape[0]) - const_spec = postnet_output[idx].data.cpu().numpy() - gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ - "Tacotron", "TacotronGST" - ] else mel_input[idx].data.cpu().numpy() - align_img = alignments[idx].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img) - } - - # Sample audio + # loss computation + stop_loss = criterion_st( + stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) + if c.loss_masking: + decoder_loss = criterion(decoder_output, mel_input, + mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: - eval_audio = ap.inv_spectrogram(const_spec.T) + postnet_loss = criterion(postnet_output, linear_input, + mel_lengths) else: - eval_audio = ap.inv_mel_spectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, 
{"ValAudio": eval_audio}, - c.audio["sample_rate"]) + postnet_loss = criterion(postnet_output, mel_input, + mel_lengths) + else: + decoder_loss = criterion(decoder_output, mel_input) + if c.model in ["Tacotron", "TacotronGST"]: + postnet_loss = criterion(postnet_output, linear_input) + else: + postnet_loss = criterion(postnet_output, mel_input) + loss = decoder_loss + postnet_loss + stop_loss - # Plot Validation Stats - epoch_stats = { - "loss_postnet": keep_avg['avg_postnet_loss'], - "loss_decoder": keep_avg['avg_decoder_loss'], - "stop_loss": keep_avg['avg_stop_loss'], - "alignment_score": keep_avg['avg_align_score'] - } + # backward decoder loss + if c.bidirectional_decoder: + if c.loss_masking: + decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) + else: + decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input) + decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output) + loss += decoder_backward_loss + decoder_c_loss + keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()}) - if c.bidirectional_decoder: - epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss'] - align_b_img = alignments_backward[idx].data.cpu().numpy() - eval_figures['alignment_backward'] = plot_alignment(align_b_img) - tb_logger.tb_eval_stats(global_step, epoch_stats) - tb_logger.tb_eval_figures(global_step, eval_figures) + step_time = time.time() - start_time + epoch_time += step_time + + # compute alignment score + align_score = alignment_diagonal_score(alignments) + keep_avg.update_value('avg_align_score', align_score) + + # aggregate losses from processes + if num_gpus > 1: + postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) + decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) + if c.stopnet: + stop_loss = reduce_tensor(stop_loss.data, num_gpus) + + keep_avg.update_values({ + 'avg_postnet_loss': + float(postnet_loss.item()), + 'avg_decoder_loss': + float(decoder_loss.item()), + 'avg_stop_loss': + float(stop_loss.item()), + }) + + if num_iter % c.print_step == 0: + print( + " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " + "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}" + .format(loss.item(), postnet_loss.item(), + keep_avg['avg_postnet_loss'], + decoder_loss.item(), + keep_avg['avg_decoder_loss'], stop_loss.item(), + keep_avg['avg_stop_loss'], align_score, + keep_avg['avg_align_score']), + flush=True) + + if args.rank == 0: + # Diagnostic visualizations + idx = np.random.randint(mel_input.shape[0]) + const_spec = postnet_output[idx].data.cpu().numpy() + gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ + "Tacotron", "TacotronGST" + ] else mel_input[idx].data.cpu().numpy() + align_img = alignments[idx].data.cpu().numpy() + + eval_figures = { + "prediction": plot_spectrogram(const_spec, ap), + "ground_truth": plot_spectrogram(gt_spec, ap), + "alignment": plot_alignment(align_img) + } + + # Sample audio + if c.model in ["Tacotron", "TacotronGST"]: + eval_audio = ap.inv_spectrogram(const_spec.T) + else: + eval_audio = ap.inv_mel_spectrogram(const_spec.T) + tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, + c.audio["sample_rate"]) + + # Plot Validation Stats + epoch_stats = { + "loss_postnet": keep_avg['avg_postnet_loss'], + "loss_decoder": keep_avg['avg_decoder_loss'], + "stop_loss": keep_avg['avg_stop_loss'], 
+ "alignment_score": keep_avg['avg_align_score'] + } + + if c.bidirectional_decoder: + epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss'] + align_b_img = alignments_backward[idx].data.cpu().numpy() + eval_figures['alignment_backward'] = plot_alignment(align_b_img) + tb_logger.tb_eval_stats(global_step, epoch_stats) + tb_logger.tb_eval_figures(global_step, eval_figures) if args.rank == 0 and epoch > c.test_delay_epochs: if c.test_sentences_file is None: From cf6e16254fc683a4fb487812d2d7653571a1bc2f Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 10:29:30 +0100 Subject: [PATCH 45/61] add torch.no_grad decorator for inference --- models/tacotron.py | 1 + models/tacotron2.py | 1 + 2 files changed, 2 insertions(+) diff --git a/models/tacotron.py b/models/tacotron.py index a2d9e1c4..04ecd573 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -132,6 +132,7 @@ class Tacotron(nn.Module): return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward return decoder_outputs, postnet_outputs, alignments, stop_tokens + @torch.no_grad() def inference(self, characters, speaker_ids=None, style_mel=None): inputs = self.embedding(characters) self._init_states() diff --git a/models/tacotron2.py b/models/tacotron2.py index 852b1886..3a3863de 100644 --- a/models/tacotron2.py +++ b/models/tacotron2.py @@ -82,6 +82,7 @@ class Tacotron2(nn.Module): return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward return decoder_outputs, postnet_outputs, alignments, stop_tokens + @torch.no_grad() def inference(self, text, speaker_ids=None): embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) From 9aacd2ee0ab181eb183800aca70ff9a16a3bd275 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 10:32:52 +0100 Subject: [PATCH 46/61] linter fix --- train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index b9f5fefb..e8c240f3 100644 --- a/train.py +++ b/train.py @@ -368,13 +368,13 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, - mel_lengths) + mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input, - mel_lengths) + mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, - mel_lengths) + mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: @@ -449,7 +449,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, - c.audio["sample_rate"]) + c.audio["sample_rate"]) # Plot Validation Stats epoch_stats = { From 60379271dc4531bd5f8b4ad4865b1cbbb1d87f4f Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 12 Feb 2020 12:21:53 +0100 Subject: [PATCH 47/61] update for phonemizer 2.1 --- tests/test_text_processing.py | 4 ++-- utils/text/__init__.py | 43 +++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 8f8e6fab..0ecb9962 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -69,7 +69,7 @@ def test_phoneme_to_sequence(): def 
test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" - gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i|| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n||| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" + gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" lang = "en-us" phonemes = text2phone(text, lang) - assert gt == phonemes + assert gt == phonemes, f"\n{phonemes} \n vs \n{gt}" diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 1c5b98c3..e6842dfa 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -28,21 +28,34 @@ def text2phone(text, language): seperator = phonemizer.separator.Separator(' |', '', '|') #try: punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text) - ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language) - ph = ph[:-1].strip() # skip the last empty character - # Replace \n with matching punctuations. - if punctuations: - # if text ends with a punctuation. - if text[-1] == punctuations[-1]: - for punct in punctuations[:-1]: - ph = ph.replace('| |\n', '|'+punct+'| |', 1) - try: - ph = ph + punctuations[-1] - except: - print(text) - else: - for punct in punctuations: - ph = ph.replace('| |\n', '|'+punct+'| |', 1) + if float(phonemizer.__version__) < 2.1: + ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language) + ph = ph[:-1].strip() # skip the last empty character + # phonemizer does not tackle punctuations. Here we do. + # Replace \n with matching punctuations. + if punctuations: + # if text ends with a punctuation. + if text[-1] == punctuations[-1]: + for punct in punctuations[:-1]: + ph = ph.replace('| |\n', '|'+punct+'| |', 1) + try: + ph = ph + punctuations[-1] + except: + print(text) + else: + for punct in punctuations: + ph = ph.replace('| |\n', '|'+punct+'| |', 1) + elif float(phonemizer.__version__) == 2.1: + ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, preserve_punctuation=True) + # this is a simple fix for phonemizer. + # https://github.com/bootphon/phonemizer/issues/32 + if punctuations: + for punctuation in punctuations: + ph = ph.replace(f"| |{punctuation} ", f"|{punctuation}| |").replace(f"| |{punctuation}", f"|{punctuation}| |") + ph = ph[:-3] + else: + raise RuntimeError(" [!] 
Use 'phonemizer' version 2.1 or older.") + return ph From 3b57e88a66ba1f410be70dbd2ad2899b5b1bcb0e Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:49:46 +0100 Subject: [PATCH 48/61] Use PWGAN if available in Synthesizer.tts --- server/synthesizer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 75fd4e76..455bd332 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -168,9 +168,16 @@ class Synthesizer(object): postnet_output, decoder_output, _ = parse_outputs( postnet_output, decoder_output, alignments) + if self.pwgan: + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length) if self.wavernn: - postnet_output = postnet_output[0].data.cpu().numpy() - wav = self.wavernn.generate(torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550) + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + if self.use_cuda: + vocoder_input.cuda() + wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550) else: wav = inv_spectrogram(postnet_output, self.ap, self.tts_config) # trim silence From bfd45a8ea900603f7a8e231b9c50ea4506bf9eb9 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 15:54:30 +0100 Subject: [PATCH 49/61] Load PWGAN/WaveRNN embedded files if present --- server/server.py | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/server/server.py b/server/server.py index 6af119bf..705937e2 100644 --- a/server/server.py +++ b/server/server.py @@ -18,9 +18,9 @@ def create_argparser(): parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.') parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') - parser.add_argument('--pwgan_lib_path', type=str, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--pwgan_file', type=str, help='path to ParallelWaveGAN checkpoint file.') - parser.add_argument('--pwgan_config', type=str, help='path to ParallelWaveGAN config file.') + parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. 
If this is not passed, model uses Griffin-Lim for synthesis.') + parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.') + parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.') parser.add_argument('--port', type=int, default=5002, help='port to listen on.') parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') @@ -29,28 +29,35 @@ def create_argparser(): synthesizer = None -embedded_model_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -checkpoint_file = os.path.join(embedded_model_folder, 'checkpoint.pth.tar') -config_file = os.path.join(embedded_model_folder, 'config.json') +embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') -# Default options with embedded model files -if os.path.isfile(checkpoint_file): - default_tts_checkpoint = checkpoint_file -else: - default_tts_checkpoint = None +embedded_tts_folder = os.path.join(embedded_models_folder, 'tts') +tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar') +tts_config_file = os.path.join(embedded_tts_folder, 'config.json') -if os.path.isfile(config_file): - default_tts_config = config_file -else: - default_tts_config = None +embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn') +wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar') +wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json') + +embedded_pwgan_folder = os.path.join(embedded_models_folder, 'pwgan') +pwgan_checkpoint_file = os.path.join(embedded_pwgan_folder, 'checkpoint.pkl') +pwgan_config_file = os.path.join(embedded_pwgan_folder, 'config.yml') args = create_argparser().parse_args() -# If these were not specified in the CLI args, use default values -if not args.tts_checkpoint: - args.tts_checkpoint = default_tts_checkpoint -if not args.tts_config: - args.tts_config = default_tts_config +# If these were not specified in the CLI args, use default values with embedded model files +if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file): + args.tts_checkpoint = tts_checkpoint_file +if not args.tts_config and os.path.isfile(tts_config_file): + args.tts_config = tts_config_file +if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file): + args.wavernn_file = wavernn_checkpoint_file +if not args.wavernn_config and os.path.isfile(wavernn_config_file): + args.wavernn_config = wavernn_config_file +if not args.pwgan_file and os.path.isfile(pwgan_checkpoint_file): + args.pwgan_file = pwgan_checkpoint_file +if not args.pwgan_config and os.path.isfile(pwgan_config_file): + args.pwgan_config = pwgan_config_file synthesizer = Synthesizer(args) From 846d147a66e39bd6a817027a806459376936a60e Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 16:03:30 +0100 Subject: [PATCH 50/61] Fix bug where sometimes the second sentence disappears if it doesn't end with punctuation --- server/synthesizer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 455bd332..1082b73a 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -122,7 +122,7 @@ class Synthesizer(object): self.ap.save_wav(wav, path) def split_into_sentences(self, text): - text = " " + text + " " + text = " " 
+ text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1", text) text = re.sub(websites, "\\1", text) @@ -149,15 +149,13 @@ class Synthesizer(object): text = text.replace("", ".") sentences = text.split("") sentences = sentences[:-1] - sentences = [s.strip() for s in sentences] + sentences = list(filter(None, [s.strip() for s in sentences])) # remove empty sentences return sentences def tts(self, text): wavs = [] sens = self.split_into_sentences(text) print(sens) - if not sens: - sens = [text+'.'] for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) From e40bc18c84ba16456ad9f3f7529f76ffc568b6b2 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 13 Feb 2020 17:23:37 +0100 Subject: [PATCH 51/61] fix linter problems and loader test --- tests/test_loader.py | 4 +--- tests/test_text_processing.py | 4 ++-- utils/text/__init__.py | 3 --- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/test_loader.py b/tests/test_loader.py index 751bc181..d8727895 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -137,9 +137,7 @@ class TestTTSDataset(unittest.TestCase): # NOTE: Below needs to check == 0 but due to an unknown reason # there is a slight difference between two matrices. # TODO: Check this assert cond more in detail. - assert abs((abs(mel.T) - - abs(mel_dl) - ).sum()) < 1e-5, (abs(mel.T) - abs(mel_dl)).sum() + assert abs(mel.T - mel_dl).max() < 1e-5, abs(mel.T - mel_dl).max() # check mel-spec correctness mel_spec = mel_input[0].cpu().numpy() diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 0ecb9962..aa17f694 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -71,5 +71,5 @@ def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" 
lang = "en-us" - phonemes = text2phone(text, lang) - assert gt == phonemes, f"\n{phonemes} \n vs \n{gt}" + ph = text2phone(text, lang) + assert gt == ph, f"\n{phonemes} \n vs \n{gt}" diff --git a/utils/text/__init__.py b/utils/text/__init__.py index e6842dfa..0e6684d2 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -38,10 +38,7 @@ def text2phone(text, language): if text[-1] == punctuations[-1]: for punct in punctuations[:-1]: ph = ph.replace('| |\n', '|'+punct+'| |', 1) - try: ph = ph + punctuations[-1] - except: - print(text) else: for punct in punctuations: ph = ph.replace('| |\n', '|'+punct+'| |', 1) From d97eb9f7839635c1063bbb0b6d854b582b98c6e8 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 13 Feb 2020 17:30:41 +0100 Subject: [PATCH 52/61] Fix linter and server package test --- server/synthesizer.py | 3 ++- setup.py | 7 ++++--- tests/test_server_package.sh | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 1082b73a..fcdc8787 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -121,7 +121,8 @@ class Synthesizer(object): wav = np.array(wav) self.ap.save_wav(wav, path) - def split_into_sentences(self, text): + @staticmethod + def split_into_sentences(text): text = " " + text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1", text) diff --git a/setup.py b/setup.py index 63782800..f92dac8a 100644 --- a/setup.py +++ b/setup.py @@ -61,10 +61,11 @@ package_data = ['server/templates/*'] if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config: print('Embedding model in wheel file...') model_dir = os.path.join('server', 'model') - os.makedirs(model_dir, exist_ok=True) - embedded_checkpoint_path = os.path.join(model_dir, 'checkpoint.pth.tar') + tts_dir = os.path.join(model_dir, 'tts') + os.makedirs(tts_dir, exist_ok=True) + embedded_checkpoint_path = os.path.join(tts_dir, 'checkpoint.pth.tar') shutil.copy(args.checkpoint, embedded_checkpoint_path) - embedded_config_path = os.path.join(model_dir, 'config.json') + embedded_config_path = os.path.join(tts_dir, 'config.json') shutil.copy(args.model_config, embedded_config_path) package_data.extend([embedded_checkpoint_path, embedded_config_path]) diff --git a/tests/test_server_package.sh b/tests/test_server_package.sh index 01e42843..9fe5e8b1 100755 --- a/tests/test_server_package.sh +++ b/tests/test_server_package.sh @@ -11,7 +11,7 @@ source /tmp/venv/bin/activate pip install --quiet --upgrade pip setuptools wheel rm -f dist/*.whl -python setup.py bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json +python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json pip install --quiet dist/TTS*.whl python -m TTS.server.server & From 2079097183f6355b0394e85c811aec830f65686d Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 13 Feb 2020 22:16:40 +0100 Subject: [PATCH 53/61] check config with a function --- config.json | 9 +-- train.py | 3 +- utils/generic_utils.py | 128 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 5 deletions(-) diff --git a/config.json b/config.json index 9e4fa906..c1a8158d 100644 --- a/config.json +++ b/config.json @@ -9,7 +9,7 @@ "num_mels": 80, // size of the mel spec frame. "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. 
If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. + "frame_length_ms": 50.0, // stft window length in ms. "frame_shift_ms": 12.5, // stft window hop-lengh in ms. "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "min_level_db": -100, // normalization range @@ -19,7 +19,7 @@ // Normalization parameters "signal_norm": true, // normalize the spec values in range [0, 1] "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! @@ -36,11 +36,12 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":16, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. + "grad_accum": 2, // if N > 1, enable gradient accumulation for N iterations. It is useful for low memory GPUs. // VALIDATION "run_eval": true, @@ -49,7 +50,7 @@ // OPTIMIZER "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1, // upper limit for gradients for clipping. + "grad_clip": 1.0, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "wd": 0.000001, // Weight decay weight. 
diff --git a/train.py b/train.py index e8c240f3..7bfb8751 100644 --- a/train.py +++ b/train.py @@ -20,7 +20,7 @@ from TTS.utils.generic_utils import ( get_git_branch, load_config, remove_experiment_folder, save_best_model, save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file, setup_model, gradual_training_scheduler, KeepAverage, - set_weight_decay) + set_weight_decay, check_config) from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers @@ -687,6 +687,7 @@ if __name__ == '__main__': # setup output paths and read configs c = load_config(args.config_path) + check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path diff --git a/utils/generic_utils.py b/utils/generic_utils.py index cf1a83a6..7a5c2ac2 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -389,3 +389,131 @@ class KeepAverage(): def update_values(self, value_dict): for key, value in value_dict.items(): self.update_value(key, value) + + +def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None): + if restricted: + assert name in c.keys(), f' [!] {name} not defined in config.json' + if name in c.keys(): + if max_val: + assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}' + if min_val: + assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}' + if enum_list: + assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' + if val_type: + assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' + + + +def check_config(c): + _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) + _check_argument('run_name', c, restricted=True, val_type=str) + _check_argument('run_description', c, val_type=str) + + # AUDIO + _check_argument('audio', c, restricted=True, val_type=dict) + + # audio processing parameters + _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) + _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) + _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000) + _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000) + _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) + _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) + _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) + _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) + _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + + # normalization parameters + _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) + _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) + _check_argument('mel_fmax', 
c['audio'], restricted=True, val_type=float, min_val=500.0) + _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) + _check_argument('trim_db', c['audio'], restricted=True, val_type=int) + + # training parameters + _check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('r', c, restricted=True, val_type=int, min_val=1) + _check_argument('gradual_training', c, restricted=False, val_type=list) + _check_argument('loss_masking', c, restricted=True, val_type=bool) + _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + + # validation parameters + _check_argument('run_eval', c, restricted=True, val_type=bool) + _check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) + _check_argument('test_sentences_file', c, restricted=False, val_type=str) + + # optimizer + _check_argument('noam_schedule', c, restricted=False, val_type=bool) + _check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) + _check_argument('epochs', c, restricted=True, val_type=int, min_val=1) + _check_argument('lr', c, restricted=True, val_type=float, min_val=0) + _check_argument('wd', c, restricted=True, val_type=float, min_val=0) + _check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) + _check_argument('seq_len_norm', c, restricted=True, val_type=bool) + + # tacotron prenet + _check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1) + _check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn']) + _check_argument('prenet_dropout', c, restricted=True, val_type=bool) + + # attention + _check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original']) + _check_argument('attention_heads', c, restricted=True, val_type=int) + _check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax']) + _check_argument('windowing', c, restricted=True, val_type=bool) + _check_argument('use_forward_attn', c, restricted=True, val_type=bool) + _check_argument('forward_attn_mask', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('location_attn', c, restricted=True, val_type=bool) + _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) + + # stopnet + _check_argument('stopnet', c, restricted=True, val_type=bool) + _check_argument('separate_stopnet', c, restricted=True, val_type=bool) + + # tensorboard + _check_argument('print_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('save_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('checkpoint', c, restricted=True, val_type=bool) + _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) + + # dataloading + _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=['english_cleaners', 'phoneme_cleaners', 'transliteration_cleaners', 'basic_cleaners']) + _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) + _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) + 
_check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) + _check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) + + # paths + _check_argument('output_path', c, restricted=True, val_type=str) + + # multi-speaker gst + _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) + _check_argument('style_wav_for_test', c, restricted=True, val_type=str) + _check_argument('use_gst', c, restricted=True, val_type=bool) + + # datasets - checking only the first entry + _check_argument('datasets', c, restricted=True, val_type=list) + for dataset_entry in c['datasets']: + _check_argument('name', dataset_entry, restricted=True, val_type=str) + _check_argument('path', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) + + + + + + + + From 0c7c34c12c1ff05c1205a0299adef5c446322088 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 17:47:33 +0100 Subject: [PATCH 54/61] remove grad_accum from config checker --- utils/generic_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 7a5c2ac2..942fedf9 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -405,7 +405,6 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) _check_argument('run_name', c, restricted=True, val_type=str) @@ -442,7 +441,7 @@ def check_config(c): _check_argument('r', c, restricted=True, val_type=int, min_val=1) _check_argument('gradual_training', c, restricted=False, val_type=list) _check_argument('loss_masking', c, restricted=True, val_type=bool) - _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + # _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) # validation parameters _check_argument('run_eval', c, restricted=True, val_type=bool) From ecf84fa4ad6e13df623e0c746f28a431d2953724 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 14 Feb 2020 18:00:15 +0100 Subject: [PATCH 55/61] linter fixes --- utils/generic_utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 942fedf9..a8de5bbb 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -402,8 +402,8 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric if enum_list: assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' if val_type: - assert type(c[name]) is val_type or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - + assert isinstance(c[name], val_type) or c[name] is None, f' [!] 
{name} has wrong type - {type(c[name])} vs {val_type}' + def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) @@ -507,12 +507,4 @@ def check_config(c): _check_argument('name', dataset_entry, restricted=True, val_type=str) _check_argument('path', dataset_entry, restricted=True, val_type=str) _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) - _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) - - - - - - - - + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) \ No newline at end of file From 2b1fb6cb12684b726699b93c5fd9245b43641fcd Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 17 Feb 2020 16:05:05 +0100 Subject: [PATCH 56/61] add mozilla german --- datasets/preprocess.py | 32 ++++++++++++++++---------------- layers/tacotron2.py | 1 - 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index a78abab9..64efc665 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -60,22 +60,6 @@ def tweb(root_path, meta_file): # return {'text': texts, 'wavs': wavs} -def mozilla_old(root_path, meta_file): - """Normalizes Mozilla meta data files to TTS format""" - txt_file = os.path.join(root_path, meta_file) - items = [] - speaker_name = "mozilla_old" - with open(txt_file, 'r') as ttf: - for line in ttf: - cols = line.split('|') - batch_no = int(cols[1].strip().split("_")[0]) - wav_folder = "batch{}".format(batch_no) - wav_file = os.path.join(root_path, wav_folder, "wavs_no_processing", cols[1].strip()) - text = cols[0].strip() - items.append([text, wav_file, speaker_name]) - return items - - def mozilla(root_path, meta_file): """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) @@ -91,6 +75,22 @@ def mozilla(root_path, meta_file): return items +def mozilla_de(root_path, meta_file): + """Normalizes Mozilla meta data files to TTS format""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "mozilla" + with open(txt_file, 'r', encoding="ISO 8859-1") as ttf: + for line in ttf: + cols = line.strip().split('|') + wav_file = cols[0].strip() + text = cols[1].strip() + folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL" + wav_file = os.path.join(root_path, folder_name, wav_file) + items.append([text, wav_file, speaker_name]) + return items + + def mailabs(root_path, meta_files=None): """Normalizes M-AI-Labs meta data files to TTS format""" speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") diff --git a/layers/tacotron2.py b/layers/tacotron2.py index c195b277..fa76a6b2 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -64,7 +64,6 @@ class Encoder(nn.Module): def forward(self, x, input_lengths): x = self.convolutions(x) x = x.transpose(1, 2) - input_lengths = input_lengths.cpu().numpy() x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True) From 8feb326a60fce455ba439d4e1fb7bf0e66642bd4 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Sun, 1 Mar 2020 15:47:08 -0300 Subject: [PATCH 57/61] add text parameters in config.json --- config.json | 10 +++++++ datasets/TTSDataset.py | 9 ++++-- notebooks/Benchmark-PWGAN.ipynb | 6 +++- notebooks/Benchmark.ipynb | 6 +++- notebooks/ExtractTTSpectrogram.ipynb | 8 ++++-- notebooks/TestAttention.ipynb | 6 +++- server/synthesizer.py | 9 +++++- synthesize.py | 8 +++++- tests/test_demo_server.py | 5 +++- tests/test_loader.py | 1 + train.py | 
8 ++++-- utils/generic_utils.py | 9 ++++++ utils/synthesis.py | 5 ++-- utils/text/__init__.py | 41 +++++++++++++++++++++++----- utils/text/symbols.py | 21 +++++++++----- utils/visual.py | 5 ++-- 16 files changed, 126 insertions(+), 31 deletions(-) diff --git a/config.json b/config.json index c1a8158d..2a7c4551 100644 --- a/config.json +++ b/config.json @@ -27,6 +27,16 @@ "trim_db": 60 // threshold for timming silence. Set this according to your dataset. }, + // VOCABULARY PARAMETERS + "text":{ + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + "punctuations":"!'(),-.:;? ", + "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + }, + // DISTRIBUTED TRAINING "distributed":{ "backend": "nccl", diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index a45d77ff..cccd65a2 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -15,6 +15,7 @@ class MyDataset(Dataset): text_cleaner, ap, meta_data, + tp=None, batch_group_size=0, min_seq_len=0, max_seq_len=float("inf"), @@ -49,6 +50,7 @@ class MyDataset(Dataset): self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap + self.tp = tp self.use_phonemes = use_phonemes self.phoneme_cache_path = phoneme_cache_path self.phoneme_language = phoneme_language @@ -81,7 +83,8 @@ class MyDataset(Dataset): config option.""" phonemes = phoneme_to_sequence(text, [self.cleaners], language=self.phoneme_language, - enable_eos_bos=False) + enable_eos_bos=False, + tp=self.tp) phonemes = np.asarray(phonemes, dtype=np.int32) np.save(cache_path, phonemes) return phonemes @@ -101,7 +104,7 @@ class MyDataset(Dataset): phonemes = self._generate_and_cache_phoneme_sequence(text, cache_path) if self.enable_eos_bos: - phonemes = pad_with_eos_bos(phonemes) + phonemes = pad_with_eos_bos(phonemes, tp=self.tp) phonemes = np.asarray(phonemes, dtype=np.int32) return phonemes @@ -113,7 +116,7 @@ class MyDataset(Dataset): text = self._load_or_generate_phoneme_sequence(wav_file, text) else: text = np.asarray( - text_to_sequence(text, [self.cleaners]), dtype=np.int32) + text_to_sequence(text, [self.cleaners], tp=self.tp), dtype=np.int32) assert text.size > 0, self.items[idx][1] assert wav.size > 0, self.items[idx][1] diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb index 430d329f..4a2a21d7 100644 --- a/notebooks/Benchmark-PWGAN.ipynb +++ b/notebooks/Benchmark-PWGAN.ipynb @@ -132,7 +132,7 @@ "outputs": [], "source": [ "# LOAD TTS MODEL\n", - "from TTS.utils.text.symbols import symbols, phonemes\n", + "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "# multi speaker \n", "if CONFIG.use_speaker_embedding:\n", @@ -142,6 +142,10 @@ " speakers = []\n", " speaker_id = None\n", "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'text' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", "model = setup_model(num_chars, len(speakers), CONFIG)\n", diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index 00ac7d16..528d7a3b 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -65,7 +65,7 @@ "from TTS.utils.text import text_to_sequence\n", "from TTS.utils.synthesis import synthesis\n", "from TTS.utils.visual import visualize\n", - "from TTS.utils.text.symbols import symbols, 
phonemes\n", + "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "import IPython\n", "from IPython.display import Audio\n", @@ -149,6 +149,10 @@ " speakers = []\n", " speaker_id = None\n", "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'text' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", "model = setup_model(num_chars, len(speakers), CONFIG)\n", diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index 20038f78..2313e47e 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -37,7 +37,7 @@ "from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.visual import plot_spectrogram\n", "from TTS.utils.generic_utils import load_config, setup_model, sequence_mask\n", - "from TTS.utils.text.symbols import symbols, phonemes\n", + "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "%matplotlib inline\n", "\n", @@ -94,6 +94,10 @@ "metadata": {}, "outputs": [], "source": [ + "# if the vocabulary was passed, replace the default\n", + "if 'text' in C.keys():\n", + " symbols, phonemes = make_symbols(**C.text)\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", "# TODO: multiple speaker\n", @@ -116,7 +120,7 @@ "preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.text if 'text' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index a1867d13..5310fb92 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -100,7 +100,7 @@ "outputs": [], "source": [ "# LOAD TTS MODEL\n", - "from TTS.utils.text.symbols import symbols, phonemes\n", + "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "# multi speaker \n", "if CONFIG.use_speaker_embedding:\n", @@ -110,6 +110,10 @@ " speakers = []\n", " speaker_id = None\n", "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'text' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", "model = setup_model(num_chars, len(speakers), CONFIG)\n", diff --git a/server/synthesizer.py b/server/synthesizer.py index 347bef21..f001afcd 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -10,7 +10,7 @@ from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config, setup_model from TTS.utils.speakers import load_speaker_mapping from TTS.utils.synthesis import * -from TTS.utils.text import phonemes, symbols +from TTS.utils.text import make_symbols, phonemes, symbols alphabets = r"([A-Za-z])" prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]" @@ -38,12 
+38,19 @@ class Synthesizer(object): self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): + global symbols, phonemes + print(" > Loading TTS model ...") print(" | > model config: ", tts_config) print(" | > checkpoint file: ", tts_checkpoint) + self.tts_config = load_config(tts_config) self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) + + if 'text' in self.tts_config.keys(): + symbols, phonemes = make_symbols(**self.tts_config.text) + if self.use_phonemes: self.input_size = len(phonemes) else: diff --git a/synthesize.py b/synthesize.py index bf85d7c9..d294701f 100644 --- a/synthesize.py +++ b/synthesize.py @@ -8,7 +8,7 @@ import string from TTS.utils.synthesis import synthesis from TTS.utils.generic_utils import load_config, setup_model -from TTS.utils.text.symbols import symbols, phonemes +from TTS.utils.text.symbols import make_symbols, symbols, phonemes from TTS.utils.audio import AudioProcessor @@ -48,6 +48,8 @@ def tts(model, if __name__ == "__main__": + global symbols, phonemes + parser = argparse.ArgumentParser() parser.add_argument('text', type=str, help='Text to generate speech.') parser.add_argument('config_path', @@ -105,6 +107,10 @@ if __name__ == "__main__": # load the audio processor ap = AudioProcessor(**C.audio) + # if the vocabulary was passed, replace the default + if 'text' in C.keys(): + symbols, phonemes = make_symbols(**C.text) + # load speakers if args.speakers_json != '': speakers = json.load(open(args.speakers_json, 'r')) diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index c343a6a4..3e360e20 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -5,13 +5,16 @@ import torch as T from TTS.server.synthesizer import Synthesizer from TTS.tests import get_tests_input_path, get_tests_output_path -from TTS.utils.text.symbols import phonemes, symbols +from TTS.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model class DemoServerTest(unittest.TestCase): def _create_random_model(self): config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json')) + if 'text' in config.keys(): + symbols, phonemes = make_symbols(**config.text) + num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) output_path = os.path.join(get_tests_output_path()) diff --git a/tests/test_loader.py b/tests/test_loader.py index d8727895..5141fa85 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -38,6 +38,7 @@ class TestTTSDataset(unittest.TestCase): c.text_cleaner, ap=self.ap, meta_data=items, + tp=c.text if 'text' in c.keys() else None, batch_group_size=bgs, min_seq_len=c.min_seq_len, max_seq_len=float("inf"), diff --git a/train.py b/train.py index 7bfb8751..96c268f0 100644 --- a/train.py +++ b/train.py @@ -25,7 +25,7 @@ from TTS.utils.logger import Logger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers from TTS.utils.synthesis import synthesis -from TTS.utils.text.symbols import phonemes, symbols +from TTS.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.visual import plot_alignment, plot_spectrogram from TTS.datasets.preprocess import load_meta_data from TTS.utils.radam import RAdam @@ -49,6 +49,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): c.text_cleaner, meta_data=meta_data_eval if is_val else meta_data_train, 
ap=ap, + tp=c.text if 'text' in c.keys() else None, batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, min_seq_len=c.min_seq_len, @@ -515,9 +516,12 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): # FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name - global meta_data_train, meta_data_eval + global meta_data_train, meta_data_eval, symbols, phonemes # Audio processor ap = AudioProcessor(**c.audio) + + if 'text' in c.keys(): + symbols, phonemes = make_symbols(**c.text) # DISTRUBUTED if num_gpus > 1: diff --git a/utils/generic_utils.py b/utils/generic_utils.py index a8de5bbb..6aecdc7d 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -425,6 +425,15 @@ def check_config(c): _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + # vocabulary parameters + _check_argument('text', c, restricted=False, val_type=dict) # parameter not mandatory + _check_argument('pad', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) # mandatory if "text parameters" else no mandatory + _check_argument('eos', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('bos', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('characters', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('phonemes', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('punctuations', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + # normalization parameters _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) diff --git a/utils/synthesis.py b/utils/synthesis.py index 79a17c78..c5ff2e70 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -9,10 +9,11 @@ def text_to_seqvec(text, CONFIG, use_cuda): if CONFIG.use_phonemes: seq = np.asarray( phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language, - CONFIG.enable_eos_bos_chars), + CONFIG.enable_eos_bos_chars, + tp=CONFIG.text if 'text' in CONFIG.keys() else None), dtype=np.int32) else: - seq = np.asarray(text_to_sequence(text, text_cleaner), dtype=np.int32) + seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.text if 'text' in CONFIG.keys() else None), dtype=np.int32) # torch tensor chars_var = torch.from_numpy(seq).unsqueeze(0) if use_cuda: diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 0e6684d2..fcb239b2 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -4,7 +4,7 @@ import re import phonemizer from phonemizer.phonemize import phonemize from TTS.utils.text import cleaners -from TTS.utils.text.symbols import symbols, phonemes, _phoneme_punctuations, _bos, \ +from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \ _eos # Mappings from symbol to numeric ID and vice versa: @@ -56,11 +56,23 @@ def text2phone(text, language): return ph -def pad_with_eos_bos(phoneme_sequence): +def pad_with_eos_bos(phoneme_sequence, tp=None): + global 
_PHONEMES_TO_ID, _bos, _eos + if tp: + _bos = tp['bos'] + _eos = tp['eos'] + _, phonemes = make_symbols(**tp) + _PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} + return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]] -def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False): +def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None): + global _PHONEMES_TO_ID + if tp: + _, phonemes = make_symbols(**tp) + _PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} + sequence = [] text = text.replace(":", "") clean_text = _clean_text(text, cleaner_names) @@ -72,13 +84,18 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False): sequence += _phoneme_to_sequence(phoneme) # Append EOS char if enable_eos_bos: - sequence = pad_with_eos_bos(sequence) + sequence = pad_with_eos_bos(sequence, tp=tp) return sequence -def sequence_to_phoneme(sequence): +def sequence_to_phoneme(sequence, tp=None): '''Converts a sequence of IDs back to a string''' + global _ID_TO_PHONEMES result = '' + if tp: + _, phonemes = make_symbols(**tp) + _ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)} + for symbol_id in sequence: if symbol_id in _ID_TO_PHONEMES: s = _ID_TO_PHONEMES[symbol_id] @@ -86,7 +103,7 @@ def sequence_to_phoneme(sequence): return result.replace('}{', ' ') -def text_to_sequence(text, cleaner_names): +def text_to_sequence(text, cleaner_names, tp=None): '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. The text can optionally have ARPAbet sequences enclosed in curly braces embedded @@ -99,6 +116,11 @@ def text_to_sequence(text, cleaner_names): Returns: List of integers corresponding to the symbols in the text ''' + global _SYMBOL_TO_ID + if tp: + symbols, _ = make_symbols(**tp) + _SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)} + sequence = [] # Check for curly braces and treat their contents as ARPAbet: while text: @@ -113,8 +135,13 @@ def text_to_sequence(text, cleaner_names): return sequence -def sequence_to_text(sequence): +def sequence_to_text(sequence, tp=None): '''Converts a sequence of IDs back to a string''' + global _ID_TO_SYMBOL + if tp: + symbols, _ = make_symbols(**tp) + _ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)} + result = '' for symbol_id in sequence: if symbol_id in _ID_TO_SYMBOL: diff --git a/utils/text/symbols.py b/utils/text/symbols.py index ee6fd2cf..e4a4b103 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -5,6 +5,18 @@ Defines the set of symbols used in text input to the model. The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' +def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? 
', pad='_', eos='~', bos='^'): + ''' Function to create symbols and phonemes ''' + _phonemes = sorted(list(phonemes)) + + # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): + _arpabet = ['@' + s for s in _phonemes] + + # Export all symbols: + symbols = [pad, eos, bos] + list(characters) + _arpabet + phonemes = [pad, eos, bos] + list(_phonemes) + list(punctuations) + + return symbols, phonemes _pad = '_' _eos = '~' @@ -20,14 +32,9 @@ _pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðsz _suprasegmentals = 'ˈˌːˑ' _other_symbols = 'ʍwɥʜʢʡɕʑɺɧ' _diacrilics = 'ɚ˞ɫ' -_phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics)) +_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics -# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): -_arpabet = ['@' + s for s in _phonemes] - -# Export all symbols: -symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet -phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations) +symbols, phonemes = make_symbols( _characters, _phonemes,_punctuations, _pad, _eos, _bos) # Generate ALIEN language # from random import shuffle diff --git a/utils/visual.py b/utils/visual.py index ab513666..2f93d812 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -54,9 +54,10 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON plt.xlabel("Decoder timestamp", fontsize=label_fontsize) plt.ylabel("Encoder timestamp", fontsize=label_fontsize) if CONFIG.use_phonemes: - seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars) - text = sequence_to_phoneme(seq) + seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.text if 'text' in CONFIG.keys() else None) + text = sequence_to_phoneme(seq, tp=CONFIG.text if 'text' in CONFIG.keys() else None) print(text) + plt.yticks(range(len(text)), list(text)) plt.colorbar() From 59e2752107162b7b6060c3096d24c16a3cbbd0b3 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Mon, 2 Mar 2020 11:46:00 -0300 Subject: [PATCH 58/61] fix travis unit test errors --- datasets/TTSDataset.py | 3 +-- train.py | 3 +-- utils/generic_utils.py | 14 +++++++------- utils/text/__init__.py | 20 ++++++++++---------- utils/text/symbols.py | 10 +++++----- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index cccd65a2..d649bf23 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -77,13 +77,12 @@ class MyDataset(Dataset): def _generate_and_cache_phoneme_sequence(self, text, cache_path): """generate a phoneme sequence from text. - since the usage is for subsequent caching, we never add bos and eos chars here. 
Instead we add those dynamically later; based on the config option.""" phonemes = phoneme_to_sequence(text, [self.cleaners], language=self.phoneme_language, - enable_eos_bos=False, + enable_eos_bos=False, tp=self.tp) phonemes = np.asarray(phonemes, dtype=np.int32) np.save(cache_path, phonemes) diff --git a/train.py b/train.py index 96c268f0..616d54ac 100644 --- a/train.py +++ b/train.py @@ -519,9 +519,8 @@ def main(args): # pylint: disable=redefined-outer-name global meta_data_train, meta_data_eval, symbols, phonemes # Audio processor ap = AudioProcessor(**c.audio) - if 'text' in c.keys(): - symbols, phonemes = make_symbols(**c.text) + symbols, phonemes = make_symbols(**c.text) # DISTRUBUTED if num_gpus > 1: diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 6aecdc7d..7c2f033a 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -426,13 +426,13 @@ def check_config(c): _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) # vocabulary parameters - _check_argument('text', c, restricted=False, val_type=dict) # parameter not mandatory - _check_argument('pad', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) # mandatory if "text parameters" else no mandatory - _check_argument('eos', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) - _check_argument('bos', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) - _check_argument('characters', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) - _check_argument('phonemes', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) - _check_argument('punctuations', c['text'] if 'text' in c.keys() else {}, restricted=True if 'text' in c.keys() else False, val_type=str) + _check_argument('text', c, restricted=False, val_type=dict) + _check_argument('pad', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('eos', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('bos', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('characters', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('phonemes', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('punctuations', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) # normalization parameters _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) diff --git a/utils/text/__init__.py b/utils/text/__init__.py index fcb239b2..ff21ffe0 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -61,8 +61,8 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): if tp: _bos = tp['bos'] _eos = tp['eos'] - _, phonemes = make_symbols(**tp) - _PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} + _, _phonemes = make_symbols(**tp) + _PHONEMES_TO_ID = {s: i for i, s in enumerate(_phonemes)} return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]] @@ -70,8 +70,8 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None): global _PHONEMES_TO_ID if tp: - _, 
phonemes = make_symbols(**tp) - _PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} + _, _phonemes = make_symbols(**tp) + _PHONEMES_TO_ID = {s: i for i, s in enumerate(_phonemes)} sequence = [] text = text.replace(":", "") @@ -93,8 +93,8 @@ def sequence_to_phoneme(sequence, tp=None): global _ID_TO_PHONEMES result = '' if tp: - _, phonemes = make_symbols(**tp) - _ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)} + _, _phonemes = make_symbols(**tp) + _ID_TO_PHONEMES = {i: s for i, s in enumerate(_phonemes)} for symbol_id in sequence: if symbol_id in _ID_TO_PHONEMES: @@ -118,8 +118,8 @@ def text_to_sequence(text, cleaner_names, tp=None): ''' global _SYMBOL_TO_ID if tp: - symbols, _ = make_symbols(**tp) - _SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)} + _symbols, _ = make_symbols(**tp) + _SYMBOL_TO_ID = {s: i for i, s in enumerate(_symbols)} sequence = [] # Check for curly braces and treat their contents as ARPAbet: @@ -139,8 +139,8 @@ def sequence_to_text(sequence, tp=None): '''Converts a sequence of IDs back to a string''' global _ID_TO_SYMBOL if tp: - symbols, _ = make_symbols(**tp) - _ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)} + _symbols, _ = make_symbols(**tp) + _ID_TO_SYMBOL = {i: s for i, s in enumerate(_symbols)} result = '' for symbol_id in sequence: diff --git a/utils/text/symbols.py b/utils/text/symbols.py index e4a4b103..db83cb29 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -5,16 +5,16 @@ Defines the set of symbols used in text input to the model. The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' -def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'): +def make_symbols(characters, phnms, punctuations='!\'(),-.:;? 
', pad='_', eos='~', bos='^'): ''' Function to create symbols and phonemes ''' - _phonemes = sorted(list(phonemes)) + _phonemes_sorted = sorted(list(phnms)) # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): - _arpabet = ['@' + s for s in _phonemes] + _arpabet = ['@' + s for s in _phonemes_sorted] # Export all symbols: - symbols = [pad, eos, bos] + list(characters) + _arpabet - phonemes = [pad, eos, bos] + list(_phonemes) + list(punctuations) + _symbols = [pad, eos, bos] + list(characters) + _arpabet + _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) return symbols, phonemes From 4e53896438b5365269e54dae999b6ddab837b0c4 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Mon, 2 Mar 2020 15:33:13 -0300 Subject: [PATCH 59/61] fix travis lint check --- datasets/TTSDataset.py | 2 +- notebooks/Benchmark-PWGAN.ipynb | 2 +- notebooks/Benchmark.ipynb | 2 +- notebooks/TestAttention.ipynb | 2 +- server/synthesizer.py | 12 ++++--- synthesize.py | 2 +- tests/test_demo_server.py | 5 ++- tests/test_loader.py | 2 +- train.py | 1 + utils/text/__init__.py | 55 ++++++++++++++++++--------------- utils/text/symbols.py | 4 +-- utils/visual.py | 2 +- 12 files changed, 52 insertions(+), 39 deletions(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index d649bf23..d3a6f486 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -195,7 +195,7 @@ class MyDataset(Dataset): mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] linear = [self.ap.spectrogram(w).astype('float32') for w in wav] - mel_lengths = [m.shape[1] for m in mel] + mel_lengths = [m.shape[1] for m in mel] # compute 'stop token' targets stop_targets = [ diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb index 4a2a21d7..19a1a79c 100644 --- a/notebooks/Benchmark-PWGAN.ipynb +++ b/notebooks/Benchmark-PWGAN.ipynb @@ -144,7 +144,7 @@ "\n", "# if the vocabulary was passed, replace the default\n", "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index 528d7a3b..bf6f2774 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -151,7 +151,7 @@ "\n", "# if the vocabulary was passed, replace the default\n", "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index 5310fb92..b0599d80 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -112,7 +112,7 @@ "\n", "# if the vocabulary was passed, replace the default\n", "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + " symbols, phonemes = make_symbols(**CONFIG.text)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/server/synthesizer.py b/server/synthesizer.py index f001afcd..f0921513 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -9,7 +9,10 @@ import yaml from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import load_config, setup_model from TTS.utils.speakers import 
load_speaker_mapping +# pylint: disable=unused-wildcard-import +# pylint: disable=wildcard-import from TTS.utils.synthesis import * + from TTS.utils.text import make_symbols, phonemes, symbols alphabets = r"([A-Za-z])" @@ -38,18 +41,19 @@ class Synthesizer(object): self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): + # pylint: disable=global-statement global symbols, phonemes print(" > Loading TTS model ...") print(" | > model config: ", tts_config) print(" | > checkpoint file: ", tts_checkpoint) - + self.tts_config = load_config(tts_config) self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) if 'text' in self.tts_config.keys(): - symbols, phonemes = make_symbols(**self.tts_config.text) + symbols, phonemes = make_symbols(**self.tts_config.text) if self.use_phonemes: self.input_size = len(phonemes) @@ -61,7 +65,7 @@ class Synthesizer(object): num_speakers = len(self.tts_speakers) else: num_speakers = 0 - self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) + self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) # load model state cp = torch.load(tts_checkpoint, map_location=torch.device('cpu')) # load the model @@ -91,7 +95,7 @@ class Synthesizer(object): mulaw=self.wavernn_config.mulaw, pad=self.wavernn_config.pad, use_aux_net=self.wavernn_config.use_aux_net, - use_upsample_net = self.wavernn_config.use_upsample_net, + use_upsample_net=self.wavernn_config.use_upsample_net, upsample_factors=self.wavernn_config.upsample_factors, feat_dims=80, compute_dims=128, diff --git a/synthesize.py b/synthesize.py index d294701f..6f3a235f 100644 --- a/synthesize.py +++ b/synthesize.py @@ -109,7 +109,7 @@ if __name__ == "__main__": # if the vocabulary was passed, replace the default if 'text' in C.keys(): - symbols, phonemes = make_symbols(**C.text) + symbols, phonemes = make_symbols(**C.text) # load speakers if args.speakers_json != '': diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 3e360e20..36848942 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -10,10 +10,13 @@ from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model class DemoServerTest(unittest.TestCase): + # pylint: disable=R0201 def _create_random_model(self): + # pylint: disable=global-statement + global symbols, phonemes config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json')) if 'text' in config.keys(): - symbols, phonemes = make_symbols(**config.text) + symbols, phonemes = make_symbols(**config.text) num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) diff --git a/tests/test_loader.py b/tests/test_loader.py index 5141fa85..eb23ed19 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -37,7 +37,7 @@ class TestTTSDataset(unittest.TestCase): r, c.text_cleaner, ap=self.ap, - meta_data=items, + meta_data=items, tp=c.text if 'text' in c.keys() else None, batch_group_size=bgs, min_seq_len=c.min_seq_len, diff --git a/train.py b/train.py index 616d54ac..bf5429e9 100644 --- a/train.py +++ b/train.py @@ -516,6 +516,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): # FIXME: move args definition/parsing inside of main? 
def main(args): # pylint: disable=redefined-outer-name + # pylint: disable=global-variable-undefined global meta_data_train, meta_data_eval, symbols, phonemes # Audio processor ap = AudioProcessor(**c.audio) diff --git a/utils/text/__init__.py b/utils/text/__init__.py index ff21ffe0..4361bc13 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -8,11 +8,11 @@ from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_pun _eos # Mappings from symbol to numeric ID and vice versa: -_SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)} -_ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)} +_symbol_to_id = {s: i for i, s in enumerate(symbols)} +_id_to_symbol = {i: s for i, s in enumerate(symbols)} -_PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} -_ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)} +_phonemes_to_id = {s: i for i, s in enumerate(phonemes)} +_id_to_phonemes = {i: s for i, s in enumerate(phonemes)} # Regular expression matching text enclosed in curly braces: _CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)') @@ -57,21 +57,23 @@ def text2phone(text, language): def pad_with_eos_bos(phoneme_sequence, tp=None): - global _PHONEMES_TO_ID, _bos, _eos + # pylint: disable=global-statement + global _phonemes_to_id, _bos, _eos if tp: _bos = tp['bos'] _eos = tp['eos'] _, _phonemes = make_symbols(**tp) - _PHONEMES_TO_ID = {s: i for i, s in enumerate(_phonemes)} - - return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]] + _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} + + return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]] def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None): - global _PHONEMES_TO_ID + # pylint: disable=global-statement + global _phonemes_to_id if tp: _, _phonemes = make_symbols(**tp) - _PHONEMES_TO_ID = {s: i for i, s in enumerate(_phonemes)} + _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} sequence = [] text = text.replace(":", "") @@ -89,16 +91,17 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= def sequence_to_phoneme(sequence, tp=None): + # pylint: disable=global-statement '''Converts a sequence of IDs back to a string''' - global _ID_TO_PHONEMES + global _id_to_phonemes result = '' if tp: _, _phonemes = make_symbols(**tp) - _ID_TO_PHONEMES = {i: s for i, s in enumerate(_phonemes)} - + _id_to_phonemes = {i: s for i, s in enumerate(_phonemes)} + for symbol_id in sequence: - if symbol_id in _ID_TO_PHONEMES: - s = _ID_TO_PHONEMES[symbol_id] + if symbol_id in _id_to_phonemes: + s = _id_to_phonemes[symbol_id] result += s return result.replace('}{', ' ') @@ -116,10 +119,11 @@ def text_to_sequence(text, cleaner_names, tp=None): Returns: List of integers corresponding to the symbols in the text ''' - global _SYMBOL_TO_ID + # pylint: disable=global-statement + global _symbol_to_id if tp: _symbols, _ = make_symbols(**tp) - _SYMBOL_TO_ID = {s: i for i, s in enumerate(_symbols)} + _symbol_to_id = {s: i for i, s in enumerate(_symbols)} sequence = [] # Check for curly braces and treat their contents as ARPAbet: @@ -137,15 +141,16 @@ def text_to_sequence(text, cleaner_names, tp=None): def sequence_to_text(sequence, tp=None): '''Converts a sequence of IDs back to a string''' - global _ID_TO_SYMBOL + # pylint: disable=global-statement + global _id_to_symbol if tp: _symbols, _ = make_symbols(**tp) - _ID_TO_SYMBOL = {i: s for i, s in enumerate(_symbols)} + _id_to_symbol = {i: s for i, s in 
enumerate(_symbols)} result = '' for symbol_id in sequence: - if symbol_id in _ID_TO_SYMBOL: - s = _ID_TO_SYMBOL[symbol_id] + if symbol_id in _id_to_symbol: + s = _id_to_symbol[symbol_id] # Enclose ARPAbet back in curly braces: if len(s) > 1 and s[0] == '@': s = '{%s}' % s[1:] @@ -163,11 +168,11 @@ def _clean_text(text, cleaner_names): def _symbols_to_sequence(syms): - return [_SYMBOL_TO_ID[s] for s in syms if _should_keep_symbol(s)] + return [_symbol_to_id[s] for s in syms if _should_keep_symbol(s)] def _phoneme_to_sequence(phons): - return [_PHONEMES_TO_ID[s] for s in list(phons) if _should_keep_phoneme(s)] + return [_phonemes_to_id[s] for s in list(phons) if _should_keep_phoneme(s)] def _arpabet_to_sequence(text): @@ -175,8 +180,8 @@ def _arpabet_to_sequence(text): def _should_keep_symbol(s): - return s in _SYMBOL_TO_ID and s not in ['~', '^', '_'] + return s in _symbol_to_id and s not in ['~', '^', '_'] def _should_keep_phoneme(p): - return p in _PHONEMES_TO_ID and p not in ['~', '^', '_'] + return p in _phonemes_to_id and p not in ['~', '^', '_'] diff --git a/utils/text/symbols.py b/utils/text/symbols.py index db83cb29..15862cbd 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -16,7 +16,7 @@ def make_symbols(characters, phnms, punctuations='!\'(),-.:;? ', pad='_', eos='~ _symbols = [pad, eos, bos] + list(characters) + _arpabet _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) - return symbols, phonemes + return _symbols, _phonemes _pad = '_' _eos = '~' @@ -34,7 +34,7 @@ _other_symbols = 'ʍwɥʜʢʡɕʑɺɧ' _diacrilics = 'ɚ˞ɫ' _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics -symbols, phonemes = make_symbols( _characters, _phonemes,_punctuations, _pad, _eos, _bos) +symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _eos, _bos) # Generate ALIEN language # from random import shuffle diff --git a/utils/visual.py b/utils/visual.py index 2f93d812..3b24364c 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -57,7 +57,7 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.text if 'text' in CONFIG.keys() else None) text = sequence_to_phoneme(seq, tp=CONFIG.text if 'text' in CONFIG.keys() else None) print(text) - + plt.yticks(range(len(text)), list(text)) plt.colorbar() From 36235c5e3fc0f47c56253a99941fc769d744469d Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 3 Mar 2020 09:17:56 -0300 Subject: [PATCH 60/61] rename text to characters in config.json --- config.json | 2 +- notebooks/Benchmark-PWGAN.ipynb | 4 ++-- notebooks/Benchmark.ipynb | 4 ++-- notebooks/ExtractTTSpectrogram.ipynb | 6 +++--- notebooks/TestAttention.ipynb | 4 ++-- server/synthesizer.py | 4 ++-- synthesize.py | 4 ++-- tests/test_demo_server.py | 4 ++-- tests/test_loader.py | 2 +- train.py | 6 +++--- utils/generic_utils.py | 14 +++++++------- utils/synthesis.py | 4 ++-- utils/visual.py | 4 ++-- 13 files changed, 31 insertions(+), 31 deletions(-) diff --git a/config.json b/config.json index 2a7c4551..3722de9d 100644 --- a/config.json +++ b/config.json @@ -28,7 +28,7 @@ }, // VOCABULARY PARAMETERS - "text":{ + "characters":{ "pad": "_", "eos": "~", "bos": "^", diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb index 19a1a79c..840da10e 100644 --- a/notebooks/Benchmark-PWGAN.ipynb +++ 
b/notebooks/Benchmark-PWGAN.ipynb @@ -143,8 +143,8 @@ " speaker_id = None\n", "\n", "# if the vocabulary was passed, replace the default\n", - "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "if 'characters' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.characters)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index bf6f2774..7d3a45cf 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -150,8 +150,8 @@ " speaker_id = None\n", "\n", "# if the vocabulary was passed, replace the default\n", - "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "if 'characters' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.characters)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index 2313e47e..b5a88611 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -95,8 +95,8 @@ "outputs": [], "source": [ "# if the vocabulary was passed, replace the default\n", - "if 'text' in C.keys():\n", - " symbols, phonemes = make_symbols(**C.text)\n", + "if 'characters' in C.keys():\n", + " symbols, phonemes = make_symbols(**C.characters)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", @@ -120,7 +120,7 @@ "preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.text if 'text' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index b0599d80..9d3e5e75 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -111,8 +111,8 @@ " speaker_id = None\n", "\n", "# if the vocabulary was passed, replace the default\n", - "if 'text' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.text)\n", + "if 'characters' in CONFIG.keys():\n", + " symbols, phonemes = make_symbols(**CONFIG.characters)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", diff --git a/server/synthesizer.py b/server/synthesizer.py index f0921513..f73b73fc 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -52,8 +52,8 @@ class Synthesizer(object): self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) - if 'text' in self.tts_config.keys(): - symbols, phonemes = make_symbols(**self.tts_config.text) + if 'characters' in self.tts_config.keys(): + symbols, phonemes = make_symbols(**self.tts_config.characters) if self.use_phonemes: self.input_size = len(phonemes) diff --git a/synthesize.py 
b/synthesize.py index 6f3a235f..1f1ce36f 100644 --- a/synthesize.py +++ b/synthesize.py @@ -108,8 +108,8 @@ if __name__ == "__main__": ap = AudioProcessor(**C.audio) # if the vocabulary was passed, replace the default - if 'text' in C.keys(): - symbols, phonemes = make_symbols(**C.text) + if 'characters' in C.keys(): + symbols, phonemes = make_symbols(**C.characters) # load speakers if args.speakers_json != '': diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 36848942..a0837686 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -15,8 +15,8 @@ class DemoServerTest(unittest.TestCase): # pylint: disable=global-statement global symbols, phonemes config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json')) - if 'text' in config.keys(): - symbols, phonemes = make_symbols(**config.text) + if 'characters' in config.keys(): + symbols, phonemes = make_symbols(**config.characters) num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) diff --git a/tests/test_loader.py b/tests/test_loader.py index eb23ed19..d835c5d3 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -38,7 +38,7 @@ class TestTTSDataset(unittest.TestCase): c.text_cleaner, ap=self.ap, meta_data=items, - tp=c.text if 'text' in c.keys() else None, + tp=c.characters if 'characters' in c.keys() else None, batch_group_size=bgs, min_seq_len=c.min_seq_len, max_seq_len=float("inf"), diff --git a/train.py b/train.py index bf5429e9..4bb22a34 100644 --- a/train.py +++ b/train.py @@ -49,7 +49,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): c.text_cleaner, meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, - tp=c.text if 'text' in c.keys() else None, + tp=c.characters if 'characters' in c.keys() else None, batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, min_seq_len=c.min_seq_len, @@ -520,8 +520,8 @@ def main(args): # pylint: disable=redefined-outer-name global meta_data_train, meta_data_eval, symbols, phonemes # Audio processor ap = AudioProcessor(**c.audio) - if 'text' in c.keys(): - symbols, phonemes = make_symbols(**c.text) + if 'characters' in c.keys(): + symbols, phonemes = make_symbols(**c.characters) # DISTRUBUTED if num_gpus > 1: diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 7c2f033a..cf0a05b4 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -426,13 +426,13 @@ def check_config(c): _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) # vocabulary parameters - _check_argument('text', c, restricted=False, val_type=dict) - _check_argument('pad', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('eos', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('bos', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('characters', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('phonemes', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) - _check_argument('punctuations', c['text'] if 'text' in c.keys() else {}, restricted='text' in c.keys(), val_type=str) + _check_argument('characters', c, restricted=False, val_type=dict) + _check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), 
val_type=str) + _check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) # normalization parameters _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) diff --git a/utils/synthesis.py b/utils/synthesis.py index c5ff2e70..42f0408c 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -10,10 +10,10 @@ def text_to_seqvec(text, CONFIG, use_cuda): seq = np.asarray( phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, - tp=CONFIG.text if 'text' in CONFIG.keys() else None), + tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32) else: - seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.text if 'text' in CONFIG.keys() else None), dtype=np.int32) + seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32) # torch tensor chars_var = torch.from_numpy(seq).unsqueeze(0) if use_cuda: diff --git a/utils/visual.py b/utils/visual.py index 3b24364c..1cb9ac5d 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -54,8 +54,8 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON plt.xlabel("Decoder timestamp", fontsize=label_fontsize) plt.ylabel("Encoder timestamp", fontsize=label_fontsize) if CONFIG.use_phonemes: - seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.text if 'text' in CONFIG.keys() else None) - text = sequence_to_phoneme(seq, tp=CONFIG.text if 'text' in CONFIG.keys() else None) + seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) + text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) print(text) plt.yticks(range(len(text)), list(text)) From 7ffc1025424e49e40a2b325892d359a3fe21b68c Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 5 Mar 2020 17:44:47 -0300 Subject: [PATCH 61/61] add unittest for vocabulary parameters --- tests/test_config.json | 10 +++++++++ tests/test_text_processing.py | 42 ++++++++++++++++++++++++++--------- utils/text/symbols.py | 4 ++-- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/tests/test_config.json b/tests/test_config.json index 0cd3d751..6d63e6ab 100644 --- a/tests/test_config.json +++ b/tests/test_config.json @@ -19,6 +19,16 @@ "mel_fmax": 7600, // maximum freq level for mel-spec. Tune for dataset!! "do_trim_silence": false }, + + "characters":{ + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + "punctuations":"!'(),-.:;? 
", + "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + }, + "hidden_size": 128, "embedding_size": 256, "text_cleaner": "english_cleaners", diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index aa17f694..6c0c7058 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -1,7 +1,14 @@ +import os +# pylint: disable=unused-wildcard-import +# pylint: disable=wildcard-import +# pylint: disable=unused-import import unittest -import torch as T - from TTS.utils.text import * +from TTS.tests import get_tests_path +from TTS.utils.generic_utils import load_config + +TESTS_PATH = get_tests_path() +conf = load_config(os.path.join(TESTS_PATH, 'test_config.json')) def test_phoneme_to_sequence(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" @@ -9,67 +16,80 @@ def test_phoneme_to_sequence(): lang = "en-us" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # multiple punctuations text = "Be a voice, not an! echo?" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # not ending with punctuation text = "Be a voice, not an! echo" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # original text = "Be a voice, not an echo!" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # extra space after the sentence text = "Be a voice, not an! echo. " sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # extra space after the sentence text = "Be a voice, not an! echo. 
" sequence = phoneme_to_sequence(text, text_cleaner, lang, True) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" print(text_hat) print(len(sequence)) - assert text_hat == gt + assert text_hat == text_hat_with_params == gt # padding char text = "_Be a _voice, not an! echo_" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) + sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" print(text_hat) print(len(sequence)) - assert text_hat == gt - + assert text_hat == text_hat_with_params == gt def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" lang = "en-us" ph = text2phone(text, lang) - assert gt == ph, f"\n{phonemes} \n vs \n{gt}" + assert gt == ph, f"\n{phonemes} \n vs \n{gt}" \ No newline at end of file diff --git a/utils/text/symbols.py b/utils/text/symbols.py index 15862cbd..544277c5 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -5,9 +5,9 @@ Defines the set of symbols used in text input to the model. The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' -def make_symbols(characters, phnms, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'): +def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'):# pylint: disable=redefined-outer-name ''' Function to create symbols and phonemes ''' - _phonemes_sorted = sorted(list(phnms)) + _phonemes_sorted = sorted(list(phonemes)) # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): _arpabet = ['@' + s for s in _phonemes_sorted]