From e1f0ea5487782365e28c1e281ef476625e0d31d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 8 Jul 2021 01:30:21 +0200
Subject: [PATCH 1/5] Fix #618

---
 hubconf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hubconf.py b/hubconf.py
index 96f12b5f..0c9c5930 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,5 +1,5 @@
 dependencies = [
-    'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite`
+    'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite'
 ]
 import torch

From 377b379f1e168f4e56e41c8e1dd1229abc5694dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 8 Jul 2021 01:55:02 +0200
Subject: [PATCH 2/5] Update dataset URL

---
 docs/source/tts_datasets.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/tts_datasets.md b/docs/source/tts_datasets.md
index 6075bc95..852ccd37 100644
--- a/docs/source/tts_datasets.md
+++ b/docs/source/tts_datasets.md
@@ -11,6 +11,6 @@ Some of the known public datasets that we successfully applied 🐸TTS:
 - [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01
 - [German - Thorsten OGVD](https://github.com/thorstenMueller/deep-learning-german-tts)
 - [Japanese - Kokoro](https://www.kaggle.com/kaiida/kokoro-speech-dataset-v11-small/version/1)
-- [Chinese](https://www.data-baker.com/open_source.html)
+- [Chinese](https://www.data-baker.com/data/index/source/)
 
 Let us know if you use 🐸TTS on a different dataset.
\ No newline at end of file

From d7a99653898e0c254b584fcc51927f8443ed39e7 Mon Sep 17 00:00:00 2001
From: ravi maithrey
Date: Wed, 14 Jul 2021 18:16:27 +0530
Subject: [PATCH 3/5] added information to ask for model contributions

---
 CONTRIBUTING.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 831eddd5..89138e47 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -6,6 +6,7 @@ This repository is governed by [the Contributor Covenant Code of Conduct](https:
 ## Where to start.
 We welcome everyone who likes to contribute to 🐸TTS.
+
 You can contribute not only with code but with bug reports, comments, questions, answers, or just a simple tweet to spread the word.
 
 If you like to contribute code, squash a bug but if you don't know where to start, here are some pointers.
@@ -25,6 +26,16 @@ If you like to contribute code, squash a bug but if you don't know where to star
 We list all the target improvements for the next version. You can pick one of them and start contributing.
 - Also feel free to suggest new features, ideas and models. We're always open for new things.
+##### Call for sharing language models
+If possible, please consider sharing your pre-trained models in any language (if the licences allow you to do so). We will include them in our model catalogue for public use and give proper attribution, whether that is your name, company, website, or any other source you specify.
+
+Your model can be shared in two ways:
+1. Share the model files with us and we will serve them with the next 🐸TTS release.
+2. Upload your models to GDrive and share the link.
+
+Models are listed in the `.models.json` file and are accessible through the TTS CLI or the server endpoints.
+
+Whichever way you choose, please make sure you post the models [here](https://github.com/coqui-ai/TTS/issues/380).
 
## Sending a ✨**PR**✨ If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨. From fc0c4600bdf6e69e090bfcb9befe811df18985bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 20 Jul 2021 17:34:42 +0200 Subject: [PATCH 4/5] Fix stopnet training --- TTS/tts/layers/losses.py | 15 ++++++++------- TTS/tts/models/base_tts.py | 4 +++- TTS/tts/models/tacotron.py | 2 ++ TTS/tts/models/tacotron2.py | 2 ++ TTS/tts/utils/data.py | 15 ++++++++++++--- tests/data_tests/test_loader.py | 2 +- 6 files changed, 28 insertions(+), 12 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 86d34c30..07b58974 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -246,9 +246,9 @@ class Huber(nn.Module): class TacotronLoss(torch.nn.Module): """Collection of Tacotron set-up based on provided config.""" - def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4): + def __init__(self, c, ga_sigma=0.4): super().__init__() - self.stopnet_pos_weight = stopnet_pos_weight + self.stopnet_pos_weight = c.stopnet_pos_weight self.ga_alpha = c.ga_alpha self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha @@ -274,7 +274,7 @@ class TacotronLoss(torch.nn.Module): self.criterion_ssim = SSIMLoss() # stopnet loss # pylint: disable=not-callable - self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None + self.criterion_st = BCELossMasked(pos_weight=torch.tensor(self.stopnet_pos_weight)) if c.stopnet else None def forward( self, @@ -284,6 +284,7 @@ class TacotronLoss(torch.nn.Module): linear_input, stopnet_output, stopnet_target, + stop_target_length, output_lens, decoder_b_output, alignments, @@ -315,12 +316,12 @@ class TacotronLoss(torch.nn.Module): return_dict["decoder_loss"] = decoder_loss return_dict["postnet_loss"] = postnet_loss - # stopnet loss stop_loss = ( - self.criterion_st(stopnet_output, stopnet_target, output_lens) if self.config.stopnet else torch.zeros(1) + self.criterion_st(stopnet_output, stopnet_target, stop_target_length) + if self.config.stopnet + else torch.zeros(1) ) - if not self.config.separate_stopnet and self.config.stopnet: - loss += stop_loss + loss += stop_loss return_dict["stopnet_loss"] = stop_loss # backward decoder loss (if enabled) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 561b76fb..b36ed106 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -119,9 +119,10 @@ class BaseTTS(BaseModel): ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" durations[idx, : text_lengths[idx]] = dur - # set stop targets view, we predict a single stop token per iteration. 
+ # set stop targets wrt reduction factor stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) + stop_target_lengths = torch.divide(mel_lengths, self.config.r).ceil_() return { "text_input": text_input, @@ -131,6 +132,7 @@ class BaseTTS(BaseModel): "mel_lengths": mel_lengths, "linear_input": linear_input, "stop_targets": stop_targets, + "stop_target_lengths": stop_target_lengths, "attn_mask": attn_mask, "durations": durations, "speaker_ids": speaker_ids, diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 95b4a358..7949ddf9 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -219,6 +219,7 @@ class Tacotron(BaseTacotron): mel_lengths = batch["mel_lengths"] linear_input = batch["linear_input"] stop_targets = batch["stop_targets"] + stop_target_lengths = batch["stop_target_lengths"] speaker_ids = batch["speaker_ids"] d_vectors = batch["d_vectors"] @@ -250,6 +251,7 @@ class Tacotron(BaseTacotron): linear_input, outputs["stop_tokens"], stop_targets, + stop_target_lengths, mel_lengths, outputs["decoder_outputs_backward"], outputs["alignments"], diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index eaca3ff8..19619662 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -224,6 +224,7 @@ class Tacotron2(BaseTacotron): mel_lengths = batch["mel_lengths"] linear_input = batch["linear_input"] stop_targets = batch["stop_targets"] + stop_target_lengths = batch["stop_target_lengths"] speaker_ids = batch["speaker_ids"] d_vectors = batch["d_vectors"] @@ -255,6 +256,7 @@ class Tacotron2(BaseTacotron): linear_input, outputs["stop_tokens"], stop_targets, + stop_target_lengths, mel_lengths, outputs["decoder_outputs_backward"], outputs["alignments"], diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 3ff52195..887f4376 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -27,10 +27,19 @@ def prepare_tensor(inputs, out_steps): return np.stack([_pad_tensor(x, pad_len) for x in inputs]) -def _pad_stop_target(x, length): - _pad = 0.0 +def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray: + """Pad stop target array. + + Args: + x (np.ndarray): Stop target array. + length (int): Length after padding. + pad_val (int, optional): Padding value. Defaults to 1. + + Returns: + np.ndarray: Padded stop target array. 
+ """ assert x.ndim == 1 - return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad) + return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val) def prepare_stop_target(inputs, out_steps): diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 9bc70ddd..3fd3eaef 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -207,7 +207,7 @@ class TestTTSDataset(unittest.TestCase): assert linear_input[1 - idx, -1].sum() == 0 assert mel_input[1 - idx, -1].sum() == 0 assert stop_target[1, mel_lengths[1] - 1] == 1 - assert stop_target[1, mel_lengths[1] :].sum() == 0 + assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1] assert len(mel_lengths.shape) == 1 # check batch zero-frame conditions (zero-frame disabled) From d435fd7981f096692c02ae33b106767cac1a6ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 24 Jul 2021 11:27:44 +0200 Subject: [PATCH 5/5] Update `max_decoder_steps` in tacotron recipes --- recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json | 2 +- recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json b/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json index c5b6fa52..73bb8ae3 100644 --- a/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json +++ b/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json @@ -50,7 +50,7 @@ "stopnet_pos_weight": 15.0, "run_eval": true, "test_delay_epochs": 10, - "max_decoder_steps": 50, + "max_decoder_steps": 1000, "noam_schedule": true, "grad_clip": 0.05, "epochs": 1000, diff --git a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json index d787c138..339e65b8 100644 --- a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json @@ -56,7 +56,7 @@ "run_eval": true, "test_delay_epochs": 10, "test_sentences_file": null, - "max_decoder_steps": 50, + "max_decoder_steps": 1000, "noam_schedule": true, "grad_clip": 0.05, "epochs": 1000,
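
For context on PATCH 4/5: the stopnet fix combines three coordinated changes. Stop targets are grouped by the reduction factor `r` (one stop flag per decoder step), `_pad_stop_target` now pads with 1 instead of 0, and the stopnet loss receives `stop_target_lengths = ceil(mel_lengths / r)` so padded positions are masked out of the loss rather than relying on a particular padding value. The sketch below is a minimal, self-contained illustration of that interaction in plain PyTorch; `sequence_mask` and `masked_stop_loss` are illustrative stand-ins rather than the repo's `BCELossMasked`, and the `pos_weight` default is only an example value.

# Minimal sketch of the masked stop-token loss introduced by "Fix stopnet training".
# Not the exact TTS implementation; names and defaults here are illustrative.
import torch
import torch.nn.functional as F


def sequence_mask(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    # Boolean mask [B, T] that is True for valid decoder steps.
    return torch.arange(max_len, device=lengths.device)[None, :] < lengths[:, None]


def masked_stop_loss(stop_logits, stop_targets, stop_target_lengths, pos_weight=10.0):
    # Padded positions of `stop_targets` may hold any value (the patch pads with 1);
    # the mask removes them before the reduction, so only valid steps contribute.
    mask = sequence_mask(stop_target_lengths, stop_targets.size(1)).float()
    loss = F.binary_cross_entropy_with_logits(
        stop_logits, stop_targets, pos_weight=torch.tensor(pos_weight), reduction="none"
    )
    return (loss * mask).sum() / mask.sum()


if __name__ == "__main__":
    r = 2                                # reduction factor (config.r)
    mel_lengths = torch.tensor([10, 7])  # spectrogram frames per sample
    # One stop flag per decoder step, i.e. per r frames, as in base_tts.py.
    stop_target_lengths = torch.divide(mel_lengths, r).ceil_().long()  # -> [5, 4]
    max_steps = int(stop_target_lengths.max())
    stop_targets = torch.ones(2, max_steps)               # padded tail stays 1
    stop_targets[0, :-1] = 0                               # stop only at the last valid step
    stop_targets[1, : int(stop_target_lengths[1]) - 1] = 0
    stop_logits = torch.randn(2, max_steps)                # stand-in for the stopnet output
    print(masked_stop_loss(stop_logits, stop_targets, stop_target_lengths))

Padding with 1 is also why the assertion in tests/data_tests/test_loader.py changes: the padded tail of a stop target now sums to `stop_target.shape[1] - mel_lengths[1]` rather than 0.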