Add XTTS v2.0 inference unit tests

2023-11-03 14:05:07 -03:00 · 2023-11-03 14:05:07 -03:00 · b621ab17d1
parent 8133b10540
commit b621ab17d1
1 changed file with 52 additions and 0 deletions
--- a/tests/zoo_tests/test_models.py
+++ b/tests/zoo_tests/test_models.py
@ -126,6 +126,58 @@ def test_xtts_streaming():
    assert len(wav_chuncks) > 1


+def test_xtts_v2():
+    """Run the XTTS v2 model end to end through the ``tts`` command line tool.
+
+    XTTS is too big to run on github actions, so this is meant to be run locally.
+    The reference voice is an LJSpeech clip from the tests data path, and the
+    ``--use_cuda True`` flag is passed only when torch reports CUDA as available.
+    """
+    wav_target = os.path.join(get_tests_output_path(), "output.wav")
+    reference_clip = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
+    # The GPU and CPU invocations differ only by this one optional flag.
+    cuda_flag = "--use_cuda True " if torch.cuda.is_available() else ""
+    command = (
+        "yes | "
+        "tts --model_name  tts_models/multilingual/multi-dataset/xtts_v2 "
+        f'--text "This is an example." --out_path "{wav_target}" --progress_bar False '
+        f"{cuda_flag}"
+        f'--speaker_wav "{reference_clip}" --language_idx "en"'
+    )
+    run_cli(command)
+
+
+def test_xtts_v2_streaming():
+    """Stream XTTS v2 audio via ``inference_stream`` and check it arrives in several chunks."""
+    from TTS.tts.configs.xtts_config import XttsConfig
+    from TTS.tts.models.xtts import Xtts
+
+    reference_clip = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
+    # Config and checkpoint are read from the user data dir; nothing is downloaded here.
+    model_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
+    xtts_config = XttsConfig()
+    xtts_config.load_json(os.path.join(model_dir, "config.json"))
+    xtts = Xtts.init_from_config(xtts_config)
+    xtts.load_checkpoint(xtts_config, checkpoint_dir=model_dir)
+    device_name = "cuda" if torch.cuda.is_available() else "cpu"
+    xtts.to(torch.device(device_name))
+
+    print("Computing speaker latents...")
+    latents = xtts.get_conditioning_latents(audio_path=reference_clip)
+    gpt_latent, _, speaker_emb = latents
+
+    print("Inference...")
+    text = "It took me quite a long time to develop a voice and now that I have it I am not going to be silent."
+    stream = xtts.inference_stream(text, "en", gpt_latent, speaker_emb)
+    received = []
+    for chunk in stream:
+        if not received:
+            # Only the first streamed chunk has its last-dimension size checked.
+            assert chunk.shape[-1] > 5000
+        received.append(chunk)
+    assert len(received) > 1
+
+
 def test_tortoise():
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    use_gpu = torch.cuda.is_available()