From 69f080eb47ea2e7cf0ebf1d31cfecb045829b7c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 Jul 2023 13:52:45 +0200 Subject: [PATCH] Fix DelightfulTTS (#2823) * Fix tests * Make style --- TTS/tts/configs/delightful_tts_config.py | 12 ++--- TTS/tts/datasets/formatters.py | 1 + TTS/tts/models/delightful_tts.py | 15 +++--- TTS/tts/models/vits.py | 6 +-- TTS/tts/utils/text/cleaners.py | 3 +- recipes/bel-alex73/dump_config.py | 5 +- recipes/bel-alex73/train_glowtts.py | 50 +++++++++---------- recipes/bel-alex73/train_hifigan.py | 10 ++-- .../delightful_tts/train_delightful_tts.py | 10 ++-- .../test_delightful_tts_d-vectors_train.py | 4 +- .../tts_tests2/test_delightful_tts_emb_spk.py | 4 +- tests/tts_tests2/test_delightful_tts_train.py | 2 +- 12 files changed, 64 insertions(+), 58 deletions(-) diff --git a/TTS/tts/configs/delightful_tts_config.py b/TTS/tts/configs/delightful_tts_config.py index 50ab60af..805d9953 100644 --- a/TTS/tts/configs/delightful_tts_config.py +++ b/TTS/tts/configs/delightful_tts_config.py @@ -140,13 +140,13 @@ class DelightfulTTSConfig(BaseTTSConfig): d_vector_dim: int = None # testing - test_sentences: List[str] = field( + test_sentences: List[List[str]] = field( default_factory=lambda: [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963.", + ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], + ["Be a voice, not an echo."], + ["I'm sorry Dave. I'm afraid I can't do that."], + ["This cake is great. It's so delicious and moist."], + ["Prior to November 22, 1963."], ] ) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 12b4f929..0eac29c8 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -602,6 +602,7 @@ def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items + def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index a832e23b..c0a00c66 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -49,7 +49,7 @@ def id_to_torch(aux_id, cuda=False): def embedding_to_torch(d_vector, cuda=False): if d_vector is not None: d_vector = np.asarray(d_vector) - d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) + d_vector = torch.from_numpy(d_vector).float() d_vector = d_vector.squeeze().unsqueeze(0) if cuda: return d_vector.cuda() @@ -1151,7 +1151,7 @@ class DelightfulTTS(BaseTTSE2E): if speaker_name is None: speaker_id = self.speaker_manager.get_random_id() else: - speaker_id = self.speaker_manager.ids[speaker_name] + speaker_id = self.speaker_manager.name_to_id[speaker_name] return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} @@ -1208,17 +1208,16 @@ class DelightfulTTS(BaseTTSE2E): # set speaker inputs _speaker_id = None - if speaker_id is not None and (self.args.use_speaker_embedding or self.args.use_d_vector_file): + if speaker_id is not None and self.args.use_speaker_embedding: if isinstance(speaker_id, str) and self.args.use_speaker_embedding: # get the speaker id for the speaker embedding layer _speaker_id = self.speaker_manager.name_to_id[speaker_id] _speaker_id = id_to_torch(_speaker_id, cuda=is_cuda) - else: - # get the average d_vector for the speaker - d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False) - if d_vector is not None and self.args.use_d_vector_file: - d_vector = embedding_to_torch(d_vector, cuda=is_cuda) + if speaker_id is not None and self.args.use_d_vector_file: + # get the average d_vector for the speaker + d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False) + d_vector = embedding_to_torch(d_vector, cuda=is_cuda) text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda) text_inputs = text_inputs.unsqueeze(0) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 58769dde..bc16ea63 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1814,7 +1814,7 @@ class Vits(BaseTTS): # rollback values _forward = self.forward disc = None - if hasattr(self, 'disc'): + if hasattr(self, "disc"): disc = self.disc training = self.training @@ -1908,7 +1908,7 @@ class Vits(BaseTTS): [self.inference_noise_scale, self.length_scale, self.inference_noise_scale_dp], dtype=np.float32, ) - + audio = self.onnx_sess.run( ["output"], { @@ -1916,7 +1916,7 @@ class Vits(BaseTTS): "input_lengths": x_lengths, "scales": scales, "sid": None if speaker_id is None else torch.tensor([speaker_id]).cpu().numpy(), - "langid": None if language_id is None else torch.tensor([language_id]).cpu().numpy() + "langid": None if language_id is None else torch.tensor([language_id]).cpu().numpy(), }, ) return audio[0][0] diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index fa3de984..74d3910b 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -164,7 +164,8 @@ def multilingual_cleaners(text): text = collapse_whitespace(text) return text + def no_cleaners(text): # remove newline characters text = text.replace("\n", "") - return text \ No newline at end of file + return text diff --git a/recipes/bel-alex73/dump_config.py b/recipes/bel-alex73/dump_config.py index cf9e3c25..c4d30723 100644 --- a/recipes/bel-alex73/dump_config.py +++ b/recipes/bel-alex73/dump_config.py @@ -1,7 +1,8 @@ -from train_glowtts import config import json import re +from train_glowtts import config + s = json.dumps(config, default=vars, indent=2) -s = re.sub(r'"test_sentences":\s*\[\],', '', s) +s = re.sub(r'"test_sentences":\s*\[\],', "", s) print(s) diff --git a/recipes/bel-alex73/train_glowtts.py b/recipes/bel-alex73/train_glowtts.py index 99f1da74..e0827cdc 100644 --- a/recipes/bel-alex73/train_glowtts.py +++ b/recipes/bel-alex73/train_glowtts.py @@ -3,30 +3,31 @@ import os # Trainer: Where the ✨️ happens. # TrainingArgs: Defines the set of arguments of the Trainer. from trainer import Trainer, TrainerArgs -from TTS.tts.configs.shared_configs import BaseAudioConfig # GlowTTSConfig: all model related values for training, validating and testing. from TTS.tts.configs.glow_tts_config import GlowTTSConfig # BaseDatasetConfig: defines name, formatter and path of the dataset. -from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig +from TTS.tts.configs.shared_configs import BaseAudioConfig, BaseDatasetConfig, CharactersConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor # we use the same path as this script as our training folder. -output_path = '/storage/output-glowtts/' +output_path = "/storage/output-glowtts/" # DEFINE DATASET CONFIG # Set LJSpeech as our target dataset and define its path. # You can also use a simple Dict to define the dataset and pass it to your custom formatter. dataset_config = BaseDatasetConfig( - formatter="bel_tts_formatter", meta_file_train="ipa_final_dataset.csv", path=os.path.join(output_path, "/storage/filtered_dataset/") + formatter="bel_tts_formatter", + meta_file_train="ipa_final_dataset.csv", + path=os.path.join(output_path, "/storage/filtered_dataset/"), ) -characters=CharactersConfig( +characters = CharactersConfig( characters_class="TTS.tts.utils.text.characters.Graphemes", pad="_", eos="~", @@ -71,22 +72,21 @@ config = GlowTTSConfig( ) if __name__ == "__main__": - -# INITIALIZE THE AUDIO PROCESSOR -# Audio processor is used for feature extraction and audio I/O. -# It mainly serves to the dataloader and the training loggers. + # INITIALIZE THE AUDIO PROCESSOR + # Audio processor is used for feature extraction and audio I/O. + # It mainly serves to the dataloader and the training loggers. ap = AudioProcessor.init_from_config(config) -# INITIALIZE THE TOKENIZER -# Tokenizer is used to convert text to sequences of token IDs. -# If characters are not defined in the config, default characters are passed to the config + # INITIALIZE THE TOKENIZER + # Tokenizer is used to convert text to sequences of token IDs. + # If characters are not defined in the config, default characters are passed to the config tokenizer, config = TTSTokenizer.init_from_config(config) -# LOAD DATA SAMPLES -# Each sample is a list of ```[text, audio_file_path, speaker_name]``` -# You can define your custom sample loader returning the list of samples. -# Or define your custom formatter and pass it to the `load_tts_samples`. -# Check `TTS.tts.datasets.load_tts_samples` for more details. + # LOAD DATA SAMPLES + # Each sample is a list of ```[text, audio_file_path, speaker_name]``` + # You can define your custom sample loader returning the list of samples. + # Or define your custom formatter and pass it to the `load_tts_samples`. + # Check `TTS.tts.datasets.load_tts_samples` for more details. train_samples, eval_samples = load_tts_samples( dataset_config, eval_split=True, @@ -94,18 +94,18 @@ if __name__ == "__main__": eval_split_size=config.eval_split_size, ) -# INITIALIZE THE MODEL -# Models take a config object and a speaker manager as input -# Config defines the details of the model like the number of layers, the size of the embedding, etc. -# Speaker manager is used by multi-speaker models. + # INITIALIZE THE MODEL + # Models take a config object and a speaker manager as input + # Config defines the details of the model like the number of layers, the size of the embedding, etc. + # Speaker manager is used by multi-speaker models. model = GlowTTS(config, ap, tokenizer, speaker_manager=None) -# INITIALIZE THE TRAINER -# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, -# distributed training, etc. + # INITIALIZE THE TRAINER + # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, + # distributed training, etc. trainer = Trainer( TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) -# AND... 3,2,1... 🚀 + # AND... 3,2,1... 🚀 trainer.fit() diff --git a/recipes/bel-alex73/train_hifigan.py b/recipes/bel-alex73/train_hifigan.py index 04425f48..3e740b2f 100644 --- a/recipes/bel-alex73/train_hifigan.py +++ b/recipes/bel-alex73/train_hifigan.py @@ -1,15 +1,15 @@ import os -from trainer import Trainer, TrainerArgs -from TTS.tts.configs.shared_configs import BaseAudioConfig from coqpit import Coqpit +from trainer import Trainer, TrainerArgs +from TTS.tts.configs.shared_configs import BaseAudioConfig from TTS.utils.audio import AudioProcessor -from TTS.vocoder.configs.hifigan_config import *; +from TTS.vocoder.configs.hifigan_config import * from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.models.gan import GAN -output_path = '/storage/output-hifigan/' +output_path = "/storage/output-hifigan/" audio_config = BaseAudioConfig( mel_fmin=50, @@ -57,4 +57,4 @@ model = GAN(config, ap) trainer = Trainer( TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) -trainer.fit() \ No newline at end of file +trainer.fit() diff --git a/recipes/vctk/delightful_tts/train_delightful_tts.py b/recipes/vctk/delightful_tts/train_delightful_tts.py index e03ed2b7..eebf408b 100644 --- a/recipes/vctk/delightful_tts/train_delightful_tts.py +++ b/recipes/vctk/delightful_tts/train_delightful_tts.py @@ -5,7 +5,7 @@ from trainer import Trainer, TrainerArgs from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTTS, VocoderConfig +from TTS.tts.models.delightful_tts import DelightfulTTS, DelightfulTtsArgs, VocoderConfig from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio.processor import AudioProcessor @@ -14,7 +14,9 @@ data_path = "/raid/datasets/vctk_v092_48khz_removed_silence_silero_vad" output_path = os.path.dirname(os.path.abspath(__file__)) -dataset_config = BaseDatasetConfig(dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us") +dataset_config = BaseDatasetConfig( + dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us" +) audio_config = DelightfulTtsAudioConfig() @@ -73,9 +75,7 @@ speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speak config.model_args.num_speakers = speaker_manager.num_speakers -model = DelightfulTTS( - ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None -) +model = DelightfulTTS(ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None) trainer = Trainer( TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples diff --git a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py index e6d04747..8fc4ea7e 100644 --- a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py @@ -39,7 +39,9 @@ config = DelightfulTTSConfig( print_eval=True, binary_align_loss_alpha=0.0, use_attn_priors=False, - test_sentences=["Be a voice, not an echo."], + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], output_path=output_path, use_speaker_embedding=False, use_d_vector_file=True, diff --git a/tests/tts_tests2/test_delightful_tts_emb_spk.py b/tests/tts_tests2/test_delightful_tts_emb_spk.py index d72536d8..6fb70c5f 100644 --- a/tests/tts_tests2/test_delightful_tts_emb_spk.py +++ b/tests/tts_tests2/test_delightful_tts_emb_spk.py @@ -37,7 +37,9 @@ config = DelightfulTTSConfig( print_eval=True, binary_align_loss_alpha=0.0, use_attn_priors=False, - test_sentences=["Be a voice, not an echo."], + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech"], + ], output_path=output_path, num_speakers=4, use_speaker_embedding=True, diff --git a/tests/tts_tests2/test_delightful_tts_train.py b/tests/tts_tests2/test_delightful_tts_train.py index cef65745..a917d776 100644 --- a/tests/tts_tests2/test_delightful_tts_train.py +++ b/tests/tts_tests2/test_delightful_tts_train.py @@ -51,7 +51,7 @@ config = DelightfulTTSConfig( use_attn_priors=False, print_eval=True, test_sentences=[ - "Be a voice, not an echo.", + ["Be a voice, not an echo."], ], use_speaker_embedding=False, )