From 5dd217a759d6303b599a75985f3664cff1a36d8a Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 1 Dec 2023 09:47:09 -0300
Subject: [PATCH] Update XTTS finetuner docs

---
 TTS/demos/xtts_ft_demo/xtts_demo.py |  4 ++--
 docs/source/models/xtts.md          | 18 +++++++++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
index 8e9a88eb..ebb11f29 100644
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -202,7 +202,7 @@ if __name__ == "__main__":
             )
             demo.load(read_logs, None, logs, every=1)
 
-            prompt_compute_btn = gr.Button(value="Step 1 - Create dataset.")
+            prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
 
         def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
             clear_gpu_cache()
@@ -315,7 +315,7 @@ if __name__ == "__main__":
                 progress_load = gr.Label(
                     label="Progress:"
                 )
-                load_btn = gr.Button(value="Step 3 - Load Fine tuned XTTS model")
+                load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
 
             with gr.Column() as col2:
                 speaker_reference_audio = gr.Textbox(
diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index f42e8d8f..92a981d7 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -182,7 +182,7 @@ To make `XTTS_v2` GPT encoder training easier for beginner users we did a gradio
 - Train the XTTS GPT encoder with the processed data
 - Inference support using the fine-tuned model
 
-The user can run this gradio demos locally or remotely using a Colab Notebook.
+The user can run this gradio demo locally or remotely using a Colab Notebook.
 
 ##### Run demo on Colab
 To make the `XTTS_v2` fine-tuning more accessible for users that do not have good GPUs available we did a Google Colab Notebook.
@@ -191,6 +191,15 @@ The Colab Notebook is available [here](https://colab.research.google.com/drive/1
 
 To learn how to use this Colab Notebook please check the [XTTS fine-tuning video]().
 
+If you are not able to access the video, follow these steps:
+
+1. Open the Colab notebook and start the demo by running the first two cells (ignore the pip install errors in the first one).
+2. Click the "Running on public URL:" link in the output of the second cell.
+3. On the first tab (1 - Data processing), select the audio file or files, wait for the upload to finish, then click the "Step 1 - Create dataset" button and wait until the dataset processing is done.
+4. As soon as the dataset processing is done, go to the second tab (2 - Fine-tuning XTTS Encoder), press the "Step 2 - Run the training" button, and wait until the training finishes. Note that it can take up to 40 minutes.
+5. As soon as the training is done, go to the third tab (3 - Inference), click the "Step 3 - Load Fine-tuned XTTS model" button, and wait until the fine-tuned model is loaded. You can then run inference with the model by clicking the "Step 4 - Inference" button.
+
+
 ##### Run demo locally
 
 To run the demo locally you need to do the following steps:
@@ -199,6 +208,13 @@ To run the demo locally you need to do the following steps:
 3. Run the gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py`
 4. Follow the steps presented on the [XTTS fine-tuning video]() to be able to fine-tune and use the fine-tuned model.
 
+
+If you are not able to access the video, follow these steps:
+
+1. On the first tab (1 - Data processing), select the audio file or files, wait for the upload to finish, then click the "Step 1 - Create dataset" button and wait until the dataset processing is done.
+2. As soon as the dataset processing is done, go to the second tab (2 - Fine-tuning XTTS Encoder), press the "Step 2 - Run the training" button, and wait until the training finishes. Note that it can take up to 40 minutes.
+3. As soon as the training is done, go to the third tab (3 - Inference), click the "Step 3 - Load Fine-tuned XTTS model" button, and wait until the fine-tuned model is loaded. You can then run inference with the model by clicking the "Step 4 - Inference" button.
+
 #### Advanced training
 
 A recipe for `XTTS_v2` GPT encoder training using `LJSpeech` dataset is available at https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech/xtts_v1/train_gpt_xtts.py
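
For readers following the `xtts_demo.py` hunks above, the sketch below shows the Gradio Blocks/Button wiring pattern that the renamed buttons belong to. It is a minimal illustration under stated assumptions, not the demo's actual code: `create_dataset` here is a hypothetical stand-in for the demo's real `preprocess_dataset` handler.

```python
# Minimal sketch of the Blocks/Button pattern used by xtts_demo.py.
# `create_dataset` is a hypothetical stand-in for the demo's real
# preprocessing handler; only the wiring pattern is the point here.
import gradio as gr

def create_dataset(audio_path: str) -> str:
    # Placeholder: the real demo transcribes the audio and builds a dataset.
    return f"Done: dataset created from {audio_path}"

with gr.Blocks() as demo:
    audio_path = gr.Textbox(label="Path to the audio file(s)")
    progress = gr.Label(label="Progress:")
    # Note the label style the patch standardizes: no trailing period.
    prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
    # Clicking the button runs the handler and writes its return value
    # into the `progress` component.
    prompt_compute_btn.click(create_dataset, inputs=audio_path, outputs=progress)

if __name__ == "__main__":
    demo.launch()
```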
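Likewise, for what "Step 3 - Load Fine-tuned XTTS model" and "Step 4 - Inference" do behind the buttons, the sketch below loads a checkpoint and synthesizes speech with the Coqui TTS XTTS API, following the manual-inference pattern documented elsewhere in `docs/source/models/xtts.md`. The paths are placeholders, assuming the fine-tuning run wrote `config.json`, the model checkpoint, and the vocab file into a single output directory.

```python
# Sketch: using a fine-tuned XTTS checkpoint outside the demo, following the
# manual-inference pattern in docs/source/models/xtts.md. The paths below are
# placeholders for the fine-tuning run's output directory.
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/path/to/run/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/run/", eval=True)
if torch.cuda.is_available():
    model.cuda()

# Synthesize with a reference clip of the fine-tuned speaker.
outputs = model.synthesize(
    "It took me quite a long time to develop a voice.",
    config,
    speaker_wav="/path/to/speaker_reference.wav",
    gpt_cond_len=3,
    language="en",
)
```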