From 69f080eb47ea2e7cf0ebf1d31cfecb045829b7c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <erogol@hotmail.com>
Date: Mon, 31 Jul 2023 13:52:45 +0200
Subject: [PATCH] Fix DelightfulTTS (#2823)

* Fix tests

* Make style
---
 TTS/tts/configs/delightful_tts_config.py      | 12 ++---
 TTS/tts/datasets/formatters.py                |  1 +
 TTS/tts/models/delightful_tts.py              | 15 +++---
 TTS/tts/models/vits.py                        |  6 +--
 TTS/tts/utils/text/cleaners.py                |  3 +-
 recipes/bel-alex73/dump_config.py             |  5 +-
 recipes/bel-alex73/train_glowtts.py           | 50 +++++++++----------
 recipes/bel-alex73/train_hifigan.py           | 10 ++--
 .../delightful_tts/train_delightful_tts.py    | 10 ++--
 .../test_delightful_tts_d-vectors_train.py    |  4 +-
 .../tts_tests2/test_delightful_tts_emb_spk.py |  4 +-
 tests/tts_tests2/test_delightful_tts_train.py |  2 +-
 12 files changed, 64 insertions(+), 58 deletions(-)

diff --git a/TTS/tts/configs/delightful_tts_config.py b/TTS/tts/configs/delightful_tts_config.py
index 50ab60af..805d9953 100644
--- a/TTS/tts/configs/delightful_tts_config.py
+++ b/TTS/tts/configs/delightful_tts_config.py
@@ -140,13 +140,13 @@ class DelightfulTTSConfig(BaseTTSConfig):
     d_vector_dim: int = None
 
     # testing
-    test_sentences: List[str] = field(
+    test_sentences: List[List[str]] = field(
         default_factory=lambda: [
-            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
-            "Be a voice, not an echo.",
-            "I'm sorry Dave. I'm afraid I can't do that.",
-            "This cake is great. It's so delicious and moist.",
-            "Prior to November 22, 1963.",
+            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+            ["Be a voice, not an echo."],
+            ["I'm sorry Dave. I'm afraid I can't do that."],
+            ["This cake is great. It's so delicious and moist."],
+            ["Prior to November 22, 1963."],
         ]
     )
 
diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index 12b4f929..0eac29c8 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -602,6 +602,7 @@ def kss(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
             items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
     return items
 
+
 def bel_tts_formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
     txt_file = os.path.join(root_path, meta_file)
     items = []
diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py
index a832e23b..c0a00c66 100644
--- a/TTS/tts/models/delightful_tts.py
+++ b/TTS/tts/models/delightful_tts.py
@@ -49,7 +49,7 @@ def id_to_torch(aux_id, cuda=False):
 def embedding_to_torch(d_vector, cuda=False):
     if d_vector is not None:
         d_vector = np.asarray(d_vector)
-        d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor)
+        d_vector = torch.from_numpy(d_vector).float()
         d_vector = d_vector.squeeze().unsqueeze(0)
     if cuda:
         return d_vector.cuda()
@@ -1151,7 +1151,7 @@ class DelightfulTTS(BaseTTSE2E):
                 if speaker_name is None:
                     speaker_id = self.speaker_manager.get_random_id()
                 else:
-                    speaker_id = self.speaker_manager.ids[speaker_name]
+                    speaker_id = self.speaker_manager.name_to_id[speaker_name]
 
         return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector}
 
@@ -1208,17 +1208,16 @@ class DelightfulTTS(BaseTTSE2E):
 
         # set speaker inputs
         _speaker_id = None
-        if speaker_id is not None and (self.args.use_speaker_embedding or self.args.use_d_vector_file):
+        if speaker_id is not None and self.args.use_speaker_embedding:
             if isinstance(speaker_id, str) and self.args.use_speaker_embedding:
                 # get the speaker id for the speaker embedding layer
                 _speaker_id = self.speaker_manager.name_to_id[speaker_id]
                 _speaker_id = id_to_torch(_speaker_id, cuda=is_cuda)
-            else:
-                # get the average d_vector for the speaker
-                d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False)
 
-        if d_vector is not None and self.args.use_d_vector_file:
-            d_vector = embedding_to_torch(d_vector, cuda=is_cuda)
+        if speaker_id is not None and self.args.use_d_vector_file:
+            # get the average d_vector for the speaker
+            d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False)
+        d_vector = None if d_vector is None else embedding_to_torch(d_vector, cuda=is_cuda)  # None has no .cuda()
 
         text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda)
         text_inputs = text_inputs.unsqueeze(0)
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 58769dde..bc16ea63 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1814,7 +1814,7 @@ class Vits(BaseTTS):
         # rollback values
         _forward = self.forward
         disc = None
-        if hasattr(self, 'disc'):
+        if hasattr(self, "disc"):
             disc = self.disc
         training = self.training
 
@@ -1908,7 +1908,7 @@ class Vits(BaseTTS):
             [self.inference_noise_scale, self.length_scale, self.inference_noise_scale_dp],
             dtype=np.float32,
         )
-		
+
         audio = self.onnx_sess.run(
             ["output"],
             {
@@ -1916,7 +1916,7 @@ class Vits(BaseTTS):
                 "input_lengths": x_lengths,
                 "scales": scales,
                 "sid": None if speaker_id is None else torch.tensor([speaker_id]).cpu().numpy(),
-				"langid": None if language_id is None else torch.tensor([language_id]).cpu().numpy()
+                "langid": None if language_id is None else torch.tensor([language_id]).cpu().numpy(),
             },
         )
         return audio[0][0]
diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index fa3de984..74d3910b 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -164,7 +164,8 @@ def multilingual_cleaners(text):
     text = collapse_whitespace(text)
     return text
 
+
 def no_cleaners(text):
     # remove newline characters
     text = text.replace("\n", "")
-    return text
\ No newline at end of file
+    return text
diff --git a/recipes/bel-alex73/dump_config.py b/recipes/bel-alex73/dump_config.py
index cf9e3c25..c4d30723 100644
--- a/recipes/bel-alex73/dump_config.py
+++ b/recipes/bel-alex73/dump_config.py
@@ -1,7 +1,8 @@
-from train_glowtts import config
 import json
 import re
 
+from train_glowtts import config
+
 s = json.dumps(config, default=vars, indent=2)
-s = re.sub(r'"test_sentences":\s*\[\],', '', s)
+s = re.sub(r'"test_sentences":\s*\[\],', "", s)
 print(s)
diff --git a/recipes/bel-alex73/train_glowtts.py b/recipes/bel-alex73/train_glowtts.py
index 99f1da74..e0827cdc 100644
--- a/recipes/bel-alex73/train_glowtts.py
+++ b/recipes/bel-alex73/train_glowtts.py
@@ -3,30 +3,31 @@ import os
 # Trainer: Where the ✨️ happens.
 # TrainingArgs: Defines the set of arguments of the Trainer.
 from trainer import Trainer, TrainerArgs
-from TTS.tts.configs.shared_configs import BaseAudioConfig
 
 # GlowTTSConfig: all model related values for training, validating and testing.
 from TTS.tts.configs.glow_tts_config import GlowTTSConfig
 
 # BaseDatasetConfig: defines name, formatter and path of the dataset.
-from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
+from TTS.tts.configs.shared_configs import BaseAudioConfig, BaseDatasetConfig, CharactersConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.glow_tts import GlowTTS
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 
 # we use the same path as this script as our training folder.
-output_path = '/storage/output-glowtts/'
+output_path = "/storage/output-glowtts/"
 
 
 # DEFINE DATASET CONFIG
 # Set LJSpeech as our target dataset and define its path.
 # You can also use a simple Dict to define the dataset and pass it to your custom formatter.
 dataset_config = BaseDatasetConfig(
-    formatter="bel_tts_formatter", meta_file_train="ipa_final_dataset.csv", path=os.path.join(output_path, "/storage/filtered_dataset/")
+    formatter="bel_tts_formatter",
+    meta_file_train="ipa_final_dataset.csv",
+    path=os.path.join(output_path, "/storage/filtered_dataset/"),
 )
 
-characters=CharactersConfig(
+characters = CharactersConfig(
     characters_class="TTS.tts.utils.text.characters.Graphemes",
     pad="_",
     eos="~",
@@ -71,22 +72,21 @@ config = GlowTTSConfig(
 )
 
 if __name__ == "__main__":
-
-# INITIALIZE THE AUDIO PROCESSOR
-# Audio processor is used for feature extraction and audio I/O.
-# It mainly serves to the dataloader and the training loggers.
+    # INITIALIZE THE AUDIO PROCESSOR
+    # Audio processor is used for feature extraction and audio I/O.
+    # It mainly serves to the dataloader and the training loggers.
     ap = AudioProcessor.init_from_config(config)
 
-# INITIALIZE THE TOKENIZER
-# Tokenizer is used to convert text to sequences of token IDs.
-# If characters are not defined in the config, default characters are passed to the config
+    # INITIALIZE THE TOKENIZER
+    # Tokenizer is used to convert text to sequences of token IDs.
+    # If characters are not defined in the config, default characters are passed to the config
     tokenizer, config = TTSTokenizer.init_from_config(config)
 
-# LOAD DATA SAMPLES
-# Each sample is a list of ```[text, audio_file_path, speaker_name]```
-# You can define your custom sample loader returning the list of samples.
-# Or define your custom formatter and pass it to the `load_tts_samples`.
-# Check `TTS.tts.datasets.load_tts_samples` for more details.
+    # LOAD DATA SAMPLES
+    # Each sample is a list of ```[text, audio_file_path, speaker_name]```
+    # You can define your custom sample loader returning the list of samples.
+    # Or define your custom formatter and pass it to the `load_tts_samples`.
+    # Check `TTS.tts.datasets.load_tts_samples` for more details.
     train_samples, eval_samples = load_tts_samples(
         dataset_config,
         eval_split=True,
@@ -94,18 +94,18 @@ if __name__ == "__main__":
         eval_split_size=config.eval_split_size,
     )
 
-# INITIALIZE THE MODEL
-# Models take a config object and a speaker manager as input
-# Config defines the details of the model like the number of layers, the size of the embedding, etc.
-# Speaker manager is used by multi-speaker models.
+    # INITIALIZE THE MODEL
+    # Models take a config object and a speaker manager as input
+    # Config defines the details of the model like the number of layers, the size of the embedding, etc.
+    # Speaker manager is used by multi-speaker models.
     model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
 
-# INITIALIZE THE TRAINER
-# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
-# distributed training, etc.
+    # INITIALIZE THE TRAINER
+    # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
+    # distributed training, etc.
     trainer = Trainer(
         TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
     )
 
-# AND... 3,2,1... 🚀
+    # AND... 3,2,1... 🚀
     trainer.fit()
diff --git a/recipes/bel-alex73/train_hifigan.py b/recipes/bel-alex73/train_hifigan.py
index 04425f48..3e740b2f 100644
--- a/recipes/bel-alex73/train_hifigan.py
+++ b/recipes/bel-alex73/train_hifigan.py
@@ -1,15 +1,15 @@
 import os
 
-from trainer import Trainer, TrainerArgs
-from TTS.tts.configs.shared_configs import BaseAudioConfig
 from coqpit import Coqpit
+from trainer import Trainer, TrainerArgs
 
+from TTS.tts.configs.shared_configs import BaseAudioConfig
 from TTS.utils.audio import AudioProcessor
-from TTS.vocoder.configs.hifigan_config import *;
+from TTS.vocoder.configs.hifigan_config import *
 from TTS.vocoder.datasets.preprocess import load_wav_data
 from TTS.vocoder.models.gan import GAN
 
-output_path = '/storage/output-hifigan/'
+output_path = "/storage/output-hifigan/"
 
 audio_config = BaseAudioConfig(
     mel_fmin=50,
@@ -57,4 +57,4 @@ model = GAN(config, ap)
 trainer = Trainer(
     TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
 )
-trainer.fit()
\ No newline at end of file
+trainer.fit()
diff --git a/recipes/vctk/delightful_tts/train_delightful_tts.py b/recipes/vctk/delightful_tts/train_delightful_tts.py
index e03ed2b7..eebf408b 100644
--- a/recipes/vctk/delightful_tts/train_delightful_tts.py
+++ b/recipes/vctk/delightful_tts/train_delightful_tts.py
@@ -5,7 +5,7 @@ from trainer import Trainer, TrainerArgs
 from TTS.config.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTTS, VocoderConfig
+from TTS.tts.models.delightful_tts import DelightfulTTS, DelightfulTtsArgs, VocoderConfig
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio.processor import AudioProcessor
@@ -14,7 +14,9 @@ data_path = "/raid/datasets/vctk_v092_48khz_removed_silence_silero_vad"
 output_path = os.path.dirname(os.path.abspath(__file__))
 
 
-dataset_config = BaseDatasetConfig(dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us")
+dataset_config = BaseDatasetConfig(
+    dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us"
+)
 
 audio_config = DelightfulTtsAudioConfig()
 
@@ -73,9 +75,7 @@ speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speak
 config.model_args.num_speakers = speaker_manager.num_speakers
 
 
-model = DelightfulTTS(
-    ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None
-)
+model = DelightfulTTS(ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None)
 
 trainer = Trainer(
     TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
diff --git a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py
index e6d04747..8fc4ea7e 100644
--- a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py
+++ b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py
@@ -39,7 +39,9 @@ config = DelightfulTTSConfig(
     print_eval=True,
     binary_align_loss_alpha=0.0,
     use_attn_priors=False,
-    test_sentences=["Be a voice, not an echo."],
+    test_sentences=[
+        ["Be a voice, not an echo.", "ljspeech-0"],
+    ],
     output_path=output_path,
     use_speaker_embedding=False,
     use_d_vector_file=True,
diff --git a/tests/tts_tests2/test_delightful_tts_emb_spk.py b/tests/tts_tests2/test_delightful_tts_emb_spk.py
index d72536d8..6fb70c5f 100644
--- a/tests/tts_tests2/test_delightful_tts_emb_spk.py
+++ b/tests/tts_tests2/test_delightful_tts_emb_spk.py
@@ -37,7 +37,9 @@ config = DelightfulTTSConfig(
     print_eval=True,
     binary_align_loss_alpha=0.0,
     use_attn_priors=False,
-    test_sentences=["Be a voice, not an echo."],
+    test_sentences=[
+        ["Be a voice, not an echo.", "ljspeech"],
+    ],
     output_path=output_path,
     num_speakers=4,
     use_speaker_embedding=True,
diff --git a/tests/tts_tests2/test_delightful_tts_train.py b/tests/tts_tests2/test_delightful_tts_train.py
index cef65745..a917d776 100644
--- a/tests/tts_tests2/test_delightful_tts_train.py
+++ b/tests/tts_tests2/test_delightful_tts_train.py
@@ -51,7 +51,7 @@ config = DelightfulTTSConfig(
     use_attn_priors=False,
     print_eval=True,
     test_sentences=[
-        "Be a voice, not an echo.",
+        ["Be a voice, not an echo."],
     ],
     use_speaker_embedding=False,
 )