From 623ea41634a795c54d84dabb9ad6087f9619200f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 14 Sep 2023 15:21:48 +0200 Subject: [PATCH] Fix model tests (#2943) --- TTS/.models.json | 5 +- TTS/config/__init__.py | 2 +- TTS/utils/manage.py | 27 +++ TTS/utils/synthesizer.py | 2 +- .../multilingual/cml_yourtts/train_yourtts.py | 189 +++++------------- tests/zoo_tests/test_models.py | 42 ++-- 6 files changed, 106 insertions(+), 161 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 07ef3902..1eaaab71 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -11,8 +11,9 @@ ], "default_vocoder": null, "commit": "e9a1953e", - "license": "Coqui Community Model License", - "contact": "info@coqui.ai" + "license": "CPML", + "contact": "info@coqui.ai", + "tos_required": true }, "your_tts": { "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index b9200cd0..25b4baef 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -41,6 +41,7 @@ def register_config(model_name: str) -> Coqpit: # TODO: fix this if model_name == "xtts": from TTS.tts.configs.xtts_config import XttsConfig + config_class = XttsConfig paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"] for path in paths: @@ -96,7 +97,6 @@ def load_config(config_path: str) -> Coqpit: raise TypeError(f" [!] Unknown config file type {ext}") config_dict.update(data) model_name = _process_model_name(config_dict) - breakpoint config_class = register_config(model_name.lower()) config = config_class() config.from_dict(config_dict) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index ed48758f..79eed828 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -21,6 +21,7 @@ LICENSE_URLS = { "apache 2.0": "https://choosealicense.com/licenses/apache-2.0/", "apache2": "https://choosealicense.com/licenses/apache-2.0/", "cc-by-sa 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", + "cpml": "https://coqui.ai/cpml.txt" } @@ -295,6 +296,29 @@ class ModelManager(object): model_item = self.set_model_url(model_item) return model_item, model_full_name, model + def ask_tos(self, model_full_path): + """Ask the user to agree to the terms of service""" + tos_path = os.path.join(model_full_path, "tos_agreed.txt") + if not os.path.exists(tos_path): + print(" > You must agree to the terms of service to use this model.") + print(" | > Please see the terms of service at https://coqui.ai/cpml.txt") + print(' | > "I have read, understood and agreed the Terms and Conditions." - [y/n]') + answer = input(" | | > ") + if answer.lower() == "y": + with open(tos_path, "w") as f: + f.write("I have read, understood ad agree the Terms and Conditions.") + else: + raise Exception("You must agree to the terms of service to use this model.") + + def tos_agreed(self, model_item, model_full_path): + """Check if the user has agreed to the terms of service""" + if "tos_required" in model_item and model_item["tos_required"]: + tos_path = os.path.join(model_full_path, "tos_agreed.txt") + if os.path.exists(tos_path): + return True + return False + return True + def download_model(self, model_name): """Download model files given the full model name. Model name is in the format @@ -316,6 +340,9 @@ class ModelManager(object): print(f" > {model_name} is already downloaded.") else: os.makedirs(output_path, exist_ok=True) + # handle TOS + if not self.tos_agreed(model_item, output_path): + self.ask_tos(output_path) print(f" > Downloading model to {output_path}") try: if "fairseq" in model_name: diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index e6f35460..24a078f5 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -338,7 +338,7 @@ class Synthesizer(nn.Module): elif language_name and isinstance(language_name, str): try: - language_id = self.tts_model.language_manager.name_to_id[language_id] + language_id = self.tts_model.language_manager.name_to_id[language_name] except KeyError as e: raise ValueError( f" [!] Looks like you use a multi-lingual model. " diff --git a/recipes/multilingual/cml_yourtts/train_yourtts.py b/recipes/multilingual/cml_yourtts/train_yourtts.py index 8c4ec581..25a2fd0a 100644 --- a/recipes/multilingual/cml_yourtts/train_yourtts.py +++ b/recipes/multilingual/cml_yourtts/train_yourtts.py @@ -27,7 +27,7 @@ RUN_NAME = "YourTTS-CML-TTS" OUT_PATH = os.path.dirname(os.path.abspath(__file__)) # "/raid/coqui/Checkpoints/original-YourTTS/" # If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p -RESTORE_PATH = "/raid/edresson/CML_YourTTS/checkpoints_yourtts_cml_tts_dataset/best_model.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p +RESTORE_PATH = "/raid/edresson/CML_YourTTS/checkpoints_yourtts_cml_tts_dataset/best_model.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p # This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences SKIP_TRAIN_EPOCH = False @@ -47,7 +47,7 @@ MAX_AUDIO_LEN_IN_SECONDS = float("inf") CML_DATASET_PATH = "./datasets/CML-TTS-Dataset/" -### Download LibriTTS dataset +### Download LibriTTS dataset # it will automatic download the dataset, if you have problems you can comment it and manually donwload and extract it ! Download link: https://www.openslr.org/resources/60/train-clean-360.tar.gz LIBRITTS_DOWNLOAD_PATH = "./datasets/LibriTTS/" # Check if LibriTTS dataset is not already downloaded, if not download it @@ -62,7 +62,7 @@ libritts_config = BaseDatasetConfig( meta_file_train="", meta_file_val="", path=os.path.join(LIBRITTS_DOWNLOAD_PATH, "train-clean-360/"), - language="en" + language="en", ) # init CML-TTS configs @@ -71,8 +71,8 @@ pt_config = BaseDatasetConfig( dataset_name="cml_tts", meta_file_train="train.csv", meta_file_val="", - path=os.path.join(CML_DATASET_PATH,"cml_tts_dataset_portuguese_v0.1/"), - language="pt-br" + path=os.path.join(CML_DATASET_PATH, "cml_tts_dataset_portuguese_v0.1/"), + language="pt-br", ) pl_config = BaseDatasetConfig( @@ -80,8 +80,8 @@ pl_config = BaseDatasetConfig( dataset_name="cml_tts", meta_file_train="train.csv", meta_file_val="", - path=os.path.join(CML_DATASET_PATH,"cml_tts_dataset_polish_v0.1/"), - language="pl" + path=os.path.join(CML_DATASET_PATH, "cml_tts_dataset_polish_v0.1/"), + language="pl", ) it_config = BaseDatasetConfig( @@ -89,8 +89,8 @@ it_config = BaseDatasetConfig( dataset_name="cml_tts", meta_file_train="train.csv", meta_file_val="", - path=os.path.join(CML_DATASET_PATH,"cml_tts_dataset_italian_v0.1/"), - language="it" + path=os.path.join(CML_DATASET_PATH, "cml_tts_dataset_italian_v0.1/"), + language="it", ) fr_config = BaseDatasetConfig( @@ -98,8 +98,8 @@ fr_config = BaseDatasetConfig( dataset_name="cml_tts", meta_file_train="train.csv", meta_file_val="", - path=os.path.join(CML_DATASET_PATH,"cml_tts_dataset_french_v0.1/"), - language="fr" + path=os.path.join(CML_DATASET_PATH, "cml_tts_dataset_french_v0.1/"), + language="fr", ) du_config = BaseDatasetConfig( @@ -107,8 +107,8 @@ du_config = BaseDatasetConfig( dataset_name="cml_tts", meta_file_train="train.csv", meta_file_val="", - path=os.path.join(CML_DATASET_PATH,"cml_tts_dataset_dutch_v0.1/"), - language="du" + path=os.path.join(CML_DATASET_PATH, "cml_tts_dataset_dutch_v0.1/"), + language="du", ) ge_config = BaseDatasetConfig( @@ -116,8 +116,8 @@ ge_config = BaseDatasetConfig( dataset_name="cml_tts", meta_file_train="train.csv", meta_file_val="", - path=os.path.join(CML_DATASET_PATH,"cml_tts_dataset_german_v0.1/"), - language="ge" + path=os.path.join(CML_DATASET_PATH, "cml_tts_dataset_german_v0.1/"), + language="ge", ) sp_config = BaseDatasetConfig( @@ -125,8 +125,8 @@ sp_config = BaseDatasetConfig( dataset_name="cml_tts", meta_file_train="train.csv", meta_file_val="", - path=os.path.join(CML_DATASET_PATH,"cml_tts_dataset_spanish_v0.1/"), - language="sp" + path=os.path.join(CML_DATASET_PATH, "cml_tts_dataset_spanish_v0.1/"), + language="sp", ) # Add here all datasets configs Note: If you want to add new datasets, just add them here and it will automatically compute the speaker embeddings (d-vectors) for this new dataset :) @@ -247,150 +247,55 @@ config = VitsConfig( max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS, mixed_precision=False, test_sentences=[ - [ - "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", - "9351", - None, - "pt-br" - ], - [ - "Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", - "12249", - None, - "pt-br" - ], + ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "9351", None, "pt-br"], + ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "12249", None, "pt-br"], [ "S\u00e3o necess\u00e1rios muitos anos de trabalho para ter sucesso da noite para o dia.", "2961", None, - "pt-br" - ], - [ - "You'll have the view of the top of the mountain that you climb.", - "LTTS_6574", - None, - "en" - ], - [ - "When you don\u2019t take any risks, you risk everything.", - "LTTS_6206", - None, - "en" - ], - [ - "Are necessary too many years of work to succeed overnight.", - "LTTS_5717", - None, - "en" - ], - [ - "Je hebt uitzicht op de top van de berg die je beklimt.", - "960", - None, - "du" - ], - [ - "Als je geen risico neemt, riskeer je alles.", - "2450", - None, - "du" - ], - [ - "Zijn te veel jaren werk nodig om van de ene op de andere dag te slagen.", - "10984", - None, - "du" - ], - [ - "Vous aurez la vue sur le sommet de la montagne que vous gravirez.", - "6381", - None, - "fr" - ], - [ - "Quand tu ne prends aucun risque, tu risques tout.", - "2825", - None, - "fr" + "pt-br", ], + ["You'll have the view of the top of the mountain that you climb.", "LTTS_6574", None, "en"], + ["When you don\u2019t take any risks, you risk everything.", "LTTS_6206", None, "en"], + ["Are necessary too many years of work to succeed overnight.", "LTTS_5717", None, "en"], + ["Je hebt uitzicht op de top van de berg die je beklimt.", "960", None, "du"], + ["Als je geen risico neemt, riskeer je alles.", "2450", None, "du"], + ["Zijn te veel jaren werk nodig om van de ene op de andere dag te slagen.", "10984", None, "du"], + ["Vous aurez la vue sur le sommet de la montagne que vous gravirez.", "6381", None, "fr"], + ["Quand tu ne prends aucun risque, tu risques tout.", "2825", None, "fr"], [ "Sont n\u00e9cessaires trop d'ann\u00e9es de travail pour r\u00e9ussir du jour au lendemain.", "1844", None, - "fr" - ], - [ - "Sie haben die Aussicht auf die Spitze des Berges, den Sie erklimmen.", - "2314", - None, - "ge" - ], - [ - "Wer nichts riskiert, riskiert alles.", - "7483", - None, - "ge" - ], - [ - "Es sind zu viele Jahre Arbeit notwendig, um \u00fcber Nacht erfolgreich zu sein.", - "12461", - None, - "ge" - ], - [ - "Avrai la vista della cima della montagna che sali.", - "4998", - None, - "it" - ], - [ - "Quando non corri alcun rischio, rischi tutto.", - "6744", - None, - "it" - ], - [ - "Are necessary too many years of work to succeed overnight.", - "1157", - None, - "it" + "fr", ], + ["Sie haben die Aussicht auf die Spitze des Berges, den Sie erklimmen.", "2314", None, "ge"], + ["Wer nichts riskiert, riskiert alles.", "7483", None, "ge"], + ["Es sind zu viele Jahre Arbeit notwendig, um \u00fcber Nacht erfolgreich zu sein.", "12461", None, "ge"], + ["Avrai la vista della cima della montagna che sali.", "4998", None, "it"], + ["Quando non corri alcun rischio, rischi tutto.", "6744", None, "it"], + ["Are necessary too many years of work to succeed overnight.", "1157", None, "it"], [ "B\u0119dziesz mie\u0107 widok na szczyt g\u00f3ry, na kt\u00f3r\u0105 si\u0119 wspinasz.", "7014", None, - "pl" - ], - [ - "Kiedy nie podejmujesz \u017cadnego ryzyka, ryzykujesz wszystko.", - "3492", - None, - "pl" + "pl", ], + ["Kiedy nie podejmujesz \u017cadnego ryzyka, ryzykujesz wszystko.", "3492", None, "pl"], [ "Potrzebne s\u0105 zbyt wiele lat pracy, aby odnie\u015b\u0107 sukces z dnia na dzie\u0144.", "1890", None, - "pl" - ], - [ - "Tendr\u00e1s la vista de la cima de la monta\u00f1a que subes", - "101", - None, - "sp" - ], - [ - "Cuando no te arriesgas, lo arriesgas todo.", - "5922", - None, - "sp" + "pl", ], + ["Tendr\u00e1s la vista de la cima de la monta\u00f1a que subes", "101", None, "sp"], + ["Cuando no te arriesgas, lo arriesgas todo.", "5922", None, "sp"], [ "Son necesarios demasiados a\u00f1os de trabajo para triunfar de la noche a la ma\u00f1ana.", "10246", None, - "sp" - ] + "sp", + ], ], # Enable the weighted sampler use_weighted_sampler=True, @@ -399,10 +304,10 @@ config = VitsConfig( weighted_sampler_attrs={"language": 1.0}, weighted_sampler_multipliers={ # "speaker_name": { - # you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch. - # It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt. - # The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset. - # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106 + # you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch. + # It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt. + # The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset. + # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106 # } }, # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper @@ -414,7 +319,7 @@ train_samples, eval_samples = load_tts_samples( config.datasets, eval_split=True, eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size + eval_split_size=config.eval_split_size, ) # Init the model diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index d3a83980..e6ed18e1 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -10,12 +10,15 @@ from TTS.utils.generic_utils import get_user_data_dir from TTS.utils.manage import ModelManager +MODELS_WITH_SEP_TESTS = ["bark", "xtts"] + + def run_models(offset=0, step=1): """Check if all the models are downloadable and tts models run correctly.""" print(" > Run synthesizer with all the models.") output_path = os.path.join(get_tests_output_path(), "output.wav") manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False) - model_names = [name for name in manager.list_models() if "bark" not in name] + model_names = [name for name in manager.list_models() if name in MODELS_WITH_SEP_TESTS] for model_name in model_names[offset::step]: print(f"\n > Run - {model_name}") model_path, _, _ = manager.download_model(model_name) @@ -63,20 +66,15 @@ def run_models(offset=0, step=1): manager.download_model(model_name) print(f" | > OK: {model_name}") - # folders = glob.glob(os.path.join(manager.output_prefix, "*")) - # assert len(folders) == len(model_names) // step - -def test_models_offset_0_step_3(): - run_models(offset=0, step=3) - - -def test_models_offset_1_step_3(): - run_models(offset=1, step=3) - - -def test_models_offset_2_step_3(): - run_models(offset=2, step=3) +def test_xtts(): + output_path = os.path.join(get_tests_output_path(), "output.wav") + speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") + run_cli("yes | " + f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 " + f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True ' + f'--speaker_wav "{speaker_wav}" --language_idx "en"' + ) def test_bark(): @@ -84,7 +82,7 @@ def test_bark(): output_path = os.path.join(get_tests_output_path(), "output.wav") run_cli( f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False' + f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True' ) @@ -99,3 +97,17 @@ def test_voice_conversion(): f"tts --model_name {model_name}" f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --progress_bar False" ) + +""" +These are used to split tests into different actions on Github. +""" +def test_models_offset_0_step_3(): + run_models(offset=0, step=3) + + +def test_models_offset_1_step_3(): + run_models(offset=1, step=3) + + +def test_models_offset_2_step_3(): + run_models(offset=2, step=3) \ No newline at end of file