From 06cad27e31445331c3c27c32fde69c9819249153 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 18 Feb 2022 18:20:47 +0000 Subject: [PATCH 1/9] Add Glow-TTS multi-speaker unit test --- .../test_glow_tts_speaker_emb_train.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 tests/tts_tests/test_glow_tts_speaker_emb_train.py diff --git a/tests/tts_tests/test_glow_tts_speaker_emb_train.py b/tests/tts_tests/test_glow_tts_speaker_emb_train.py new file mode 100644 index 00000000..9a1a1910 --- /dev/null +++ b/tests/tts_tests/test_glow_tts_speaker_emb_train.py @@ -0,0 +1,57 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs.glow_tts_config import GlowTTSConfig +from TTS.utils.trainer_utils import get_last_checkpoint + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=True, +) +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From ba6e56e01c8fd1a42177fc5717e29fa0d6990e05 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 18 Feb 2022 19:25:29 +0000 Subject: [PATCH 2/9] Fix Glow-TTS multi-speaker inference --- TTS/tts/models/glow_tts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 7dbfdd09..8f3b3804 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -170,6 +170,8 @@ class GlowTTS(BaseTTS): if g is not None: if hasattr(self, "emb_g"): # use speaker embedding layer + if not g.size(): # if is a scalar + g = g.unsqueeze(0) # unsqueeze g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] else: # use d-vector From 759f9ac76a22af865391daa0d7c46d0670c422ee Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 18 Feb 2022 20:03:36 +0000 Subject: [PATCH 3/9] Add Glow-TTS d-vectors training unit test --- .../test_glow_tts_d-vectors_train.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 tests/tts_tests/test_glow_tts_d-vectors_train.py diff --git a/tests/tts_tests/test_glow_tts_d-vectors_train.py b/tests/tts_tests/test_glow_tts_d-vectors_train.py new file mode 100644 index 00000000..5b82eebb --- /dev/null +++ b/tests/tts_tests/test_glow_tts_d-vectors_train.py @@ -0,0 +1,60 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs.glow_tts_config import GlowTTSConfig +from TTS.utils.trainer_utils import get_last_checkpoint + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=False, + use_d_vector_file=True, + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, +) +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From 5cca4aa8aebe689cebd1dbda70ad648b42ee5407 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 18 Feb 2022 20:16:52 +0000 Subject: [PATCH 4/9] Add FastPitch Speaker embedding train unit test --- .../test_fast_pitch_speaker_emb_train.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tests/tts_tests/test_fast_pitch_speaker_emb_train.py diff --git a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py new file mode 100644 index 00000000..c526e33a --- /dev/null +++ b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py @@ -0,0 +1,69 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.config.shared_configs import BaseAudioConfig +from TTS.tts.configs.fast_pitch_config import FastPitchConfig + +config_path = os.path.join(get_tests_output_path(), "test_fast_pitch_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + +audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, +) + +config = FastPitchConfig( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + f0_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], +) +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From fc7081fc5e05fff2b4d52856588b947694bd199f Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 18 Feb 2022 21:06:08 +0000 Subject: [PATCH 5/9] Add Inference test using TTS API in all models unit tests --- tests/data/ljspeech/f0_cache/pitch_stats.npy | Bin 0 -> 424 bytes tests/tts_tests/test_align_tts_train.py | 9 +++++++++ .../test_fast_pitch_speaker_emb_train.py | 11 ++++++++++ tests/tts_tests/test_fast_pitch_train.py | 9 +++++++++ .../test_glow_tts_d-vectors_train.py | 10 +++++++++ .../test_glow_tts_speaker_emb_train.py | 10 +++++++++ tests/tts_tests/test_glow_tts_train.py | 9 +++++++++ tests/tts_tests/test_speedy_speech_train.py | 9 +++++++++ .../test_tacotron2_d-vectors_train.py | 11 ++++++++++ .../test_tacotron2_speaker_emb_train.py | 11 ++++++++++ tests/tts_tests/test_tacotron2_train.py | 9 +++++++++ tests/tts_tests/test_tacotron_train.py | 9 +++++++++ ...st_vits_multilingual_speaker_emb_train.py} | 15 +++++++++++++- .../test_vits_multilingual_train-d_vectors.py | 19 +++++++++++++++--- .../tts_tests/test_vits_speaker_emb_train.py | 15 ++++++++++++-- tests/tts_tests/test_vits_train.py | 9 +++++++++ 16 files changed, 159 insertions(+), 6 deletions(-) create mode 100644 tests/data/ljspeech/f0_cache/pitch_stats.npy rename tests/tts_tests/{test_vits_multilingual_train.py => test_vits_multilingual_speaker_emb_train.py} (74%) diff --git a/tests/data/ljspeech/f0_cache/pitch_stats.npy b/tests/data/ljspeech/f0_cache/pitch_stats.npy new file mode 100644 index 0000000000000000000000000000000000000000..aaa385c3c07d9eb8739ab504b8bdb7e34f0002d5 GIT binary patch literal 424 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1Jt-^H=`z0QA$Z=K`K`vTLcpW1B1UsA$w;> zdm%?qA*Y5na|9z$tfr95&(F{6KM;TkZ~Kx$?xfDxLY~?}UX2JAppx9w#Joa29BwO4 zPRvOx;wt3NfY^~{Q78biLoldN2xf;(p)jf)3?+pkNzNQkzxYgW;mLI<6m2UM3n~ Date: Sat, 19 Feb 2022 12:15:03 +0000 Subject: [PATCH 6/9] Fix unit tests issue --- tests/tts_tests/test_fast_pitch_speaker_emb_train.py | 4 +++- tests/tts_tests/test_fast_pitch_train.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py index 1b777803..59e90e0a 100644 --- a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py @@ -7,7 +7,7 @@ from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig from TTS.utils.trainer_utils import get_last_checkpoint -config_path = os.path.join(get_tests_output_path(), "test_fast_pitch_config.json") +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") audio_config = BaseAudioConfig( @@ -45,6 +45,8 @@ config = FastPitchConfig( ], ) config.audio.do_trim_silence = True +config.use_speaker_embedding = True +config.model_args.use_speaker_embedding = True config.audio.trim_db = 60 config.save_json(config_path) diff --git a/tests/tts_tests/test_fast_pitch_train.py b/tests/tts_tests/test_fast_pitch_train.py index 9aae5bbd..bbfbb823 100644 --- a/tests/tts_tests/test_fast_pitch_train.py +++ b/tests/tts_tests/test_fast_pitch_train.py @@ -7,7 +7,7 @@ from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig from TTS.utils.trainer_utils import get_last_checkpoint -config_path = os.path.join(get_tests_output_path(), "test_fast_pitch_config.json") +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") audio_config = BaseAudioConfig( @@ -42,8 +42,11 @@ config = FastPitchConfig( test_sentences=[ "Be a voice, not an echo.", ], + use_speaker_embedding=False, ) config.audio.do_trim_silence = True +config.use_speaker_embedding = False +config.model_args.use_speaker_embedding = False config.audio.trim_db = 60 config.save_json(config_path) @@ -58,6 +61,7 @@ command_train = ( "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " "--coqpit.test_delay_epochs 0" ) + run_cli(command_train) # Find latest folder From 531821545e40e2a8fba7d351be52b58bfd61458f Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Sat, 19 Feb 2022 12:21:32 +0000 Subject: [PATCH 7/9] Fix inference test issue --- tests/inference_tests/test_synthesizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index 5972dc90..97878574 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -6,7 +6,7 @@ from TTS.tts.models import setup_model from TTS.utils.io import save_checkpoint from TTS.utils.synthesizer import Synthesizer -from .. import get_tests_output_path +from tests import get_tests_output_path class SynthesizerTest(unittest.TestCase): From 05fffb0ebc549a75f77b37fb71f7f42002f4d1c2 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Sat, 19 Feb 2022 14:42:24 +0000 Subject: [PATCH 8/9] Add inference unit test on GitHub workflow --- .github/workflows/inference_tests.yml | 46 +++++++++++++++++++++++++++ Makefile | 3 ++ 2 files changed, 49 insertions(+) create mode 100644 .github/workflows/inference_tests.yml diff --git a/.github/workflows/inference_tests.yml b/.github/workflows/inference_tests.yml new file mode 100644 index 00000000..3f08b904 --- /dev/null +++ b/.github/workflows/inference_tests.yml @@ -0,0 +1,46 @@ +name: inference_tests + +on: + push: + branches: + - main + pull_request: + types: [opened, synchronize, reopened] +jobs: + check_skip: + runs-on: ubuntu-latest + if: "! contains(github.event.head_commit.message, '[ci skip]')" + steps: + - run: echo "${{ github.event.head_commit.message }}" + + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + experimental: [false] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: coqui-ai/setup-python@pip-cache-key-py-ver + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + cache: 'pip' + cache-dependency-path: 'requirements*' + - name: check OS + run: cat /etc/os-release + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install/upgrade Python setup deps + run: python3 -m pip install --upgrade pip setuptools wheel + - name: Install TTS + run: | + python3 -m pip install .[all] + python3 setup.py egg_info + - name: Unit tests + run: make inference_tests diff --git a/Makefile b/Makefile index 2632dbab..bb849981 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,9 @@ test_aux: ## run aux tests. test_zoo: ## run zoo tests. nosetests tests.zoo_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.zoo_tests --nologcapture --with-id +inference_tests: ## run inference tests. + nosetests tests.inference_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.inference_tests --nologcapture --with-id + test_failed: ## only run tests failed the last time. nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --failed From bc5db13d067332baecb932f54fe4abe5398be016 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Sat, 19 Feb 2022 19:24:00 +0000 Subject: [PATCH 9/9] Fix the bug in extract tts spectrogram script --- TTS/bin/extract_tts_spectrograms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 7b489fd6..386cf332 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -138,7 +138,7 @@ def inference( aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}, ) model_output = outputs["model_outputs"] - model_output = model_output.transpose(1, 2).detach().cpu().numpy() + model_output = model_output.detach().cpu().numpy() elif "tacotron" in model_name: aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}