From 13334d507ca81d4dd02444b94349019b72e67d30 Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Mon, 23 Jan 2023 13:45:45 +0100 Subject: [PATCH 1/6] Load model from path --- TTS/api.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/TTS/api.py b/TTS/api.py index 99c3e522..da571414 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -7,7 +7,7 @@ from TTS.utils.synthesizer import Synthesizer class TTS: """TODO: Add voice conversion and Capacitron support.""" - def __init__(self, model_name: str = None, progress_bar: bool = True, gpu=False): + def __init__(self, model_name: str = None, model_path:str = None, config_path:str=None, progress_bar: bool = True, gpu=False): """🐸TTS python interface that allows to load and use the released models. Example with a multi-speaker model: @@ -20,6 +20,10 @@ class TTS: >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") + Example loading a model from a path: + >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False) + >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") + Args: model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. 
@@ -29,6 +33,8 @@ class TTS: self.synthesizer = None if model_name: self.load_model_by_name(model_name, gpu) + if model_path: + self.load_model_by_path(model_path, config_path, gpu) @property def models(self): @@ -90,6 +96,19 @@ class TTS: use_cuda=gpu, ) + def load_model_by_path(self, model_path: str, config_path: str, gpu: bool = False): + self.synthesizer = Synthesizer( + tts_checkpoint=model_path, + tts_config_path=config_path, + tts_speakers_file=None, + tts_languages_file=None, + vocoder_checkpoint=None, + vocoder_config=None, + encoder_checkpoint=None, + encoder_config=None, + use_cuda=gpu, + ) + def _check_arguments(self, speaker: str = None, language: str = None): if self.is_multi_speaker and speaker is None: raise ValueError("Model is multi-speaker but no speaker is provided.") From cf076345e7ddb44584f15127f8d4c595a1428e04 Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Mon, 23 Jan 2023 13:49:51 +0100 Subject: [PATCH 2/6] Make style --- TTS/api.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/TTS/api.py b/TTS/api.py index da571414..ed7e6e6b 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -7,7 +7,14 @@ from TTS.utils.synthesizer import Synthesizer class TTS: """TODO: Add voice conversion and Capacitron support.""" - def __init__(self, model_name: str = None, model_path:str = None, config_path:str=None, progress_bar: bool = True, gpu=False): + def __init__( + self, + model_name: str = None, + model_path: str = None, + config_path: str = None, + progress_bar: bool = True, + gpu=False, + ): """🐸TTS python interface that allows to load and use the released models. 
Example with a multi-speaker model: From 335b8ed44e7b252f7c17069bb621a1dab592008a Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Mon, 30 Jan 2023 12:59:29 +0100 Subject: [PATCH 3/6] Add vocoder path --- TTS/api.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index ed7e6e6b..22b81ba4 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -12,6 +12,8 @@ class TTS: model_name: str = None, model_path: str = None, config_path: str = None, + vocoder_path: str = None, + vocoder_config_path: str = None, progress_bar: bool = True, gpu=False, ): @@ -33,6 +35,10 @@ class TTS: Args: model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. + model_path (str, optional): Path to the model checkpoint. Defaults to None. + config_path (str, optional): Path to the model config. Defaults to None. + vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. + vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ @@ -41,7 +47,9 @@ class TTS: if model_name: self.load_model_by_name(model_name, gpu) if model_path: - self.load_model_by_path(model_path, config_path, gpu) + self.load_model_by_path( + model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu + ) @property def models(self): @@ -89,6 +97,14 @@ class TTS: def load_model_by_name(self, model_name: str, gpu: bool = False): model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name) + """ Load one of 🐸TTS models by name. + + Args: + model_name (str): Model name to load. You can list models by ```tts.models```. + gpu (bool, optional): Enable/disable GPU. 
Some models might be too slow on CPU. Defaults to False. + + TODO: Add tests + """ # init synthesizer # None values are fetch from the model self.synthesizer = Synthesizer( @@ -103,14 +119,26 @@ class TTS: use_cuda=gpu, ) - def load_model_by_path(self, model_path: str, config_path: str, gpu: bool = False): + def load_model_by_path( + self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False + ): + """Load a model from a path. + + Args: + model_path (str): Path to the model checkpoint. + config_path (str): Path to the model config. + vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. + vocoder_config (str, optional): Path to the vocoder config. Defaults to None. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + """ + self.synthesizer = Synthesizer( tts_checkpoint=model_path, tts_config_path=config_path, tts_speakers_file=None, tts_languages_file=None, - vocoder_checkpoint=None, - vocoder_config=None, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config, encoder_checkpoint=None, encoder_config=None, use_cuda=gpu, From 7fddabc8ac9c84a9d05ba7928f454e45558c9422 Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Mon, 30 Jan 2023 13:35:48 +0100 Subject: [PATCH 4/6] Implement cloning in API --- TTS/api.py | 33 +++++++++++++++++++----- TTS/utils/synthesizer.py | 2 +- tests/inference_tests/test_python_api.py | 9 ++++++- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 22b81ba4..6fa8c606 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -33,6 +33,12 @@ class TTS: >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False) >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") + Example voice cloning with YourTTS in English, French and Portuguese: + >>> tts = 
TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) + >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav") + >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav") + >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav") + Args: model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. model_path (str, optional): Path to the model checkpoint. Defaults to None. @@ -144,8 +150,8 @@ class TTS: use_cuda=gpu, ) - def _check_arguments(self, speaker: str = None, language: str = None): - if self.is_multi_speaker and speaker is None: + def _check_arguments(self, speaker: str = None, language: str = None, speaker_wav: str = None): + if self.is_multi_speaker and (speaker is None and speaker_wav is None): raise ValueError("Model is multi-speaker but no speaker is provided.") if self.is_multi_lingual and language is None: raise ValueError("Model is multi-lingual but no language is provided.") @@ -154,7 +160,7 @@ class TTS: if not self.is_multi_lingual and language is not None: raise ValueError("Model is not multi-lingual but language is provided.") - def tts(self, text: str, speaker: str = None, language: str = None): + def tts(self, text: str, speaker: str = None, language: str = None, speaker_wav: str = None): """Convert text to speech. Args: @@ -166,14 +172,17 @@ class TTS: language (str, optional): Language code for multi-lingual models. You can check whether loaded model is multi-lingual `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None. + speaker_wav (str, optional): + Path to a reference wav file to use for voice cloning with supporting models like YourTTS. + Defaults to None. 
""" - self._check_arguments(speaker=speaker, language=language) + self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav) wav = self.synthesizer.tts( text=text, speaker_name=speaker, language_name=language, - speaker_wav=None, + speaker_wav=speaker_wav, reference_wav=None, style_wav=None, style_text=None, @@ -181,7 +190,14 @@ class TTS: ) return wav - def tts_to_file(self, text: str, speaker: str = None, language: str = None, file_path: str = "output.wav"): + def tts_to_file( + self, + text: str, + speaker: str = None, + language: str = None, + speaker_wav: str = None, + file_path: str = "output.wav", + ): """Convert text to speech. Args: @@ -193,8 +209,11 @@ class TTS: language (str, optional): Language code for multi-lingual models. You can check whether loaded model is multi-lingual `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None. + speaker_wav (str, optional): + Path to a reference wav file to use for voice cloning with supporting models like YourTTS. + Defaults to None. file_path (str, optional): Output file path. Defaults to "output.wav". """ - wav = self.tts(text=text, speaker=speaker, language=language) + wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav) self.synthesizer.save_wav(wav=wav, path=file_path) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 4a0ab038..498dc7ba 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -187,7 +187,7 @@ class Synthesizer(object): text (str): input text. speaker_name (str, optional): spekaer id for multi-speaker models. Defaults to "". language_name (str, optional): language id for multi-language models. Defaults to "". - speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. + speaker_wav (Union[str, List[str]], optional): path to the speaker wav for voice cloning. Defaults to None. style_wav ([type], optional): style waveform for GST. 
Defaults to None. style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None. reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py index fdd7e1cb..b306b5ea 100644 --- a/tests/inference_tests/test_python_api.py +++ b/tests/inference_tests/test_python_api.py @@ -1,10 +1,12 @@ import os import unittest -from tests import get_tests_output_path +from tests import get_tests_data_path, get_tests_output_path + from TTS.api import TTS OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav") +cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav") class TTSTest(unittest.TestCase): @@ -34,3 +36,8 @@ class TTSTest(unittest.TestCase): self.assertTrue(tts.is_multi_lingual) self.assertGreater(len(tts.speakers), 1) self.assertGreater(len(tts.languages), 1) + + def test_voice_cloning(): + tts = TTS() + tts.load_model_by_name("tts_models/multilingual/multi-dataset/your_tts") + tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH) From 6ee94f8badb32bfbc0ed61c000fd976899ecb5d0 Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Mon, 30 Jan 2023 14:02:25 +0100 Subject: [PATCH 5/6] Fixup --- TTS/api.py | 4 +++- tests/inference_tests/test_python_api.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 6fa8c606..850f0681 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -102,7 +102,6 @@ class TTS: return model_path, config_path, vocoder_path, vocoder_config_path def load_model_by_name(self, model_name: str, gpu: bool = False): - model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name) """ Load one of 🐸TTS models by name. 
Args: @@ -111,6 +110,9 @@ class TTS: TODO: Add tests """ + + model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name) + # init synthesizer # None values are fetch from the model self.synthesizer = Synthesizer( diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py index b306b5ea..6114c803 100644 --- a/tests/inference_tests/test_python_api.py +++ b/tests/inference_tests/test_python_api.py @@ -37,7 +37,7 @@ class TTSTest(unittest.TestCase): self.assertGreater(len(tts.speakers), 1) self.assertGreater(len(tts.languages), 1) - def test_voice_cloning(): + def test_voice_cloning(self): tts = TTS() tts.load_model_by_name("tts_models/multilingual/multi-dataset/your_tts") tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH) From c496b1a986c9a507fbe90b01f02f870baa70cfa5 Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Mon, 6 Feb 2023 11:17:28 +0100 Subject: [PATCH 6/6] Linter fix --- tests/inference_tests/test_python_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py index 6114c803..71690440 100644 --- a/tests/inference_tests/test_python_api.py +++ b/tests/inference_tests/test_python_api.py @@ -37,6 +37,7 @@ class TTSTest(unittest.TestCase): self.assertGreater(len(tts.speakers), 1) self.assertGreater(len(tts.languages), 1) + @staticmethod def test_voice_cloning(self): tts = TTS() tts.load_model_by_name("tts_models/multilingual/multi-dataset/your_tts")