diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 976b74af..bdb4f6f6 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -57,7 +57,7 @@ def main(): # init the trainer and 🚀 trainer = Trainer( train_args, - config, + model.config, config.output_path, model=model, train_samples=train_samples, diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f0d85a44..1569c634 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -191,6 +191,7 @@ class TTSTokenizer: phonemizer = get_phonemizer_by_name( DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs ) + new_config.phonemizer = phonemizer.name() except KeyError as e: raise ValueError( f"""No phonemizer found for language {config.phoneme_language}. diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2ea23adb..3dd8be44 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -112,6 +112,9 @@ class Synthesizer(object): self.use_phonemes = self.tts_config.use_phonemes self.tts_model = setup_tts_model(config=self.tts_config) + if self.use_phonemes and self.tts_config["phonemizer"] is None: + raise ValueError("Phonemizer is not defined in the TTS config.") + if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() diff --git a/requirements.txt b/requirements.txt index f735c57a..db47c2cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,7 +25,7 @@ tensorboardX pyworld # coqui stack coqui-trainer -coqpit # config managemenr +coqpit # config management # chinese g2p deps jieba pypinyin diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 85dfbbcb..75c5643c 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -42,7 +43,7 @@ command_train = ( "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs -1" + "--coqpit.test_delay_epochs 0 " ) run_cli(command_train) @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py index 37faf449..9553d745 100644 --- a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -74,6 +75,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_fast_pitch_train.py b/tests/tts_tests/test_fast_pitch_train.py index d2d78af4..134cd4ba 100644 --- a/tests/tts_tests/test_fast_pitch_train.py +++ b/tests/tts_tests/test_fast_pitch_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -73,6 +74,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_d-vectors_train.py b/tests/tts_tests/test_glow_tts_d-vectors_train.py index 14f9e4d2..3a9c8fcc 100644 --- a/tests/tts_tests/test_glow_tts_d-vectors_train.py +++ b/tests/tts_tests/test_glow_tts_d-vectors_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -61,6 +62,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = config.d_vector_file +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_speaker_emb_train.py b/tests/tts_tests/test_glow_tts_speaker_emb_train.py index c327332e..322b506e 100644 --- a/tests/tts_tests/test_glow_tts_speaker_emb_train.py +++ b/tests/tts_tests/test_glow_tts_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -58,6 +59,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index b0acf004..cf9a04f4 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -55,6 +56,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 9a26d253..c4adcee3 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index 6b003f2c..0d02fa98 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -61,6 +62,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = config.d_vector_file +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index b9f4de0b..2e812d90 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -59,6 +60,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 8c30d9f9..d1941022 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_train_fsspec_path.py b/tests/tts_tests/test_tacotron2_train_fsspec_path.py deleted file mode 100644 index 5d14a983..00000000 --- a/tests/tts_tests/test_tacotron2_train_fsspec_path.py +++ /dev/null @@ -1,55 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path file://{config_path} " - f"--coqpit.output_path file://{output_path} " - "--coqpit.datasets.0.name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path file://{continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py index 0c7672d7..683bb0a7 100644 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -92,6 +93,14 @@ languae_id = "en" continue_speakers_path = os.path.join(continue_path, "speakers.json") continue_languages_path = os.path.join(continue_path, "language_ids.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index e12661a5..e4a82cdd 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -99,6 +100,14 @@ languae_id = "en" continue_speakers_path = config.d_vector_file continue_languages_path = os.path.join(continue_path, "language_ids.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index c928cee4..48597241 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -65,6 +66,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 003f99a8..64ff63f3 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command)