mirror of https://github.com/coqui-ai/TTS.git

Updating XTTS docs

parent b75e90ba85
commit 18b7d746cb

You can also mail us at info@coqui.ai.

#### 🐸TTS API

##### Single reference

Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio.
You can optionally disable sentence splitting for better coherence, at the cost of more VRAM and possibly hitting the model's context length limit.

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                file_path="output.wav",
                speaker_wav=["/path/to/target/speaker.wav"],
                language="en",
                split_sentences=True)
```
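
To disable sentence splitting, pass `split_sentences=False`. A minimal variant of the call above (everything else unchanged):

```python
# generate the whole text in one pass: better coherence, but more VRAM,
# and long inputs may hit the model's context length limit
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                file_path="output.wav",
                speaker_wav=["/path/to/target/speaker.wav"],
                language="en",
                split_sentences=False)
```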

##### Multiple references

You can pass multiple audio files to the `speaker_wav` argument for better voice cloning.

```python
from TTS.api import TTS

# using the default version set in 🐸TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# using a specific version
# 👀 see the branch names for versions on https://huggingface.co/coqui/XTTS-v2/tree/main
# ❗some versions might be incompatible with the API
tts = TTS("xtts_v2.0.2", gpu=True)

# getting the latest XTTS_v2
tts = TTS("xtts", gpu=True)

# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                file_path="output.wav",
                speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav"],  # pass several reference clips (paths illustrative)
                language="en")
```

##### Streaming inference

XTTS supports streaming inference. This is useful for real-time applications.

```python
import time

import torch
import torchaudio

from TTS.api import TTS

print("Loading model...")
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
model = tts.synthesizer.tts_model

print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

print("Inference...")
t0 = time.time()
stream_generator = model.inference_stream(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
)

wav_chunks = []
for i, chunk in enumerate(stream_generator):
    if i == 0:
        print(f"Time to first chunk: {time.time() - t0}")
    print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
    wav_chunks.append(chunk)
wav = torch.cat(wav_chunks, dim=0)
torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
```

#### 🐸TTS Command line

##### Single reference

...

or for all wav files in a directory you can use:

```console
...
    --use_cuda true
```
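
Both commands are truncated in this diff. As a sketch of the usual shape, assuming the stock `tts` entry point with its `--model_name`, `--text`, `--speaker_wav`, and `--language_idx` flags (the text, language, and paths here are illustrative):

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --text "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." \
    --speaker_wav /path/to/target/speaker.wav \
    --language_idx en \
    --use_cuda true
```

For the directory variant, point `--speaker_wav` at the directory of reference wav files instead of a single file.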

#### 🐸TTS Model API

To use the model API, you need to download the model files and pass the config and model file paths manually.

##### Calling manually

If you want to be able to run with `use_deepspeed=True` and **enjoy the speedup**, you need to install DeepSpeed first.

```console
pip install deepspeed==0.10.3
```
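
The hunk elides the body of this example; only its final line, the `torchaudio.save("xtts.wav", ...)` call, survives as context. A sketch of the elided manual loading and inference, assuming the `XttsConfig` and `Xtts` classes of this 🐸TTS version (checkpoint paths are illustrative):

```python
import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# load the downloaded config and checkpoint manually (paths illustrative)
config = XttsConfig()
config.load_json("/path/to/xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=True)
model.cuda()

# compute conditioning latents from a reference clip
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

# run inference; the save call below is the line preserved in the diff
out = model.inference(
    "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
)
torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```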

##### Streaming manually

Here the goal is to stream the audio as it is being generated. This is useful for real-time applications.
Streaming inference is typically slower than regular inference, but it lets you get the first chunk of audio sooner.
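
The accompanying code is cut off in this diff. A sketch that reuses `model`, `gpt_cond_latent`, and `speaker_embedding` from the manual-loading sketch above and mirrors the `inference_stream` call from the API section:

```python
import time

import torch
import torchaudio

t0 = time.time()
chunk_generator = model.inference_stream(
    "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
)

# consume chunks as they are produced; a real-time app would send each
# chunk to the audio device instead of collecting them
wav_chunks = []
for i, chunk in enumerate(chunk_generator):
    if i == 0:
        print(f"Time to first chunk: {time.time() - t0}")
    wav_chunks.append(chunk)
wav = torch.cat(wav_chunks, dim=0)
torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
```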