Update XTTS docs
commit 00294ffdf6 (parent 459ad70dc8)
@@ -336,7 +336,7 @@ class Xtts(BaseTTS):
         """Compute the conditioning latents for the GPT model from the given audio.
 
         Args:
-            audio_path (str): Path to the audio file.
+            audio (tensor): audio tensor.
             sr (int): Sample rate of the audio.
             length (int): Length of the audio in seconds. Defaults to 3.
         """
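Note: the docstring above belongs to an internal helper that now receives a decoded audio tensor; the public entry point, `get_conditioning_latents`, takes file paths and, per the doc updates below, now expects a list of them. A minimal sketch of the updated public call, with placeholder checkpoint and reference paths:

```python
# Sketch of the list-based conditioning API after this change.
# Config/checkpoint locations and reference wavs are placeholders.
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/path/to/xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/")
model.cuda()

# audio_path is now a list; pass a single-item list for one reference
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(
    audio_path=["reference_1.wav", "reference_2.wav"]
)
```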
@@ -444,7 +444,7 @@ class Xtts(BaseTTS):
         Args:
             text (str): Input text.
             config (XttsConfig): Config with inference parameters.
-            speaker_wav (str): Path to the speaker audio file for cloning.
+            speaker_wav (list): List of paths to the speaker audio files to be used for cloning.
             language (str): Language ID of the speaker.
             **kwargs: Inference settings. See `inference()`.
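These Args match the model's high-level synthesis entry point (the `synthesize()` method, inferred from the argument list rather than named in this hunk), so callers that previously passed a single `speaker_wav` path now pass a list. Continuing from the sketch above:

```python
# Continues the previous sketch: `model` and `config` come from there.
# The method name `synthesize` is inferred from the Args documented above.
outputs = model.synthesize(
    "It took me quite a long time to develop a voice.",
    config,
    speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav"],
    language="en",
)
```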
@@ -39,6 +39,7 @@ You can also mail us at info@coqui.ai.
 ### Inference
 #### 🐸TTS API
 
+##### Single reference
 ```python
 from TTS.api import TTS
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
@@ -46,12 +47,25 @@ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
 # generate speech by cloning a voice using default settings
 tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                 file_path="output.wav",
-                speaker_wav="/path/to/target/speaker.wav",
+                speaker_wav=["/path/to/target/speaker.wav"],
+                language="en")
+```
+
+##### Multiple references
+```python
+from TTS.api import TTS
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
+
+# generate speech by cloning a voice using default settings
+tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+                file_path="output.wav",
+                speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav", "/path/to/target/speaker_3.wav"],
                 language="en")
 ```
 
 #### 🐸TTS Command line
 
+##### Single reference
 ```console
 tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
      --text "Bugün okula gitmek istemiyorum." \
@@ -60,6 +74,25 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t
      --use_cuda true
 ```
 
+##### Multiple references
+```console
+tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
+     --text "Bugün okula gitmek istemiyorum." \
+     --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
+     --language_idx tr \
+     --use_cuda true
+```
+or for all wav files in a directory you can use:
+
+```console
+tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
+     --text "Bugün okula gitmek istemiyorum." \
+     --speaker_wav /path/to/target/*.wav \
+     --language_idx tr \
+     --use_cuda true
+```
+
 #### model directly
 
 If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
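Since `use_deepspeed=True` only works once the deepspeed package is installed (`pip install deepspeed`), a small guard like the following keeps the snippets below runnable either way; this guard is an assumption for illustration, not part of this commit:

```python
# Optional-dependency guard (an assumption, not part of this commit):
# enable DeepSpeed only if the package is importable.
import importlib.util

use_deepspeed = importlib.util.find_spec("deepspeed") is not None
# then: model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=use_deepspeed)
```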
@@ -83,7 +116,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
+gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 out = model.inference(
@@ -120,7 +153,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
+gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 t0 = time.time()
@@ -177,7 +210,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=SPEAKER_REFERENCE)
+gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
 
 print("Inference...")
 out = model.inference(
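Taken together, the fragments above amount to the following end-to-end flow. This is a consolidated sketch: paths are placeholders, and the `inference()` argument order plus the 24 kHz output rate are assumptions based on the surrounding XTTS docs rather than lines shown in this diff:

```python
# Consolidated sketch of the "model directly" flow after this change.
# Paths are placeholders; inference() arguments and the 24 kHz save rate
# are assumptions from the docs of this period, not from this diff.
import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/path/to/xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/")
model.cuda()

print("Computing speaker latents...")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(
    audio_path=["reference.wav"]  # one or more reference wavs
)

print("Inference...")
out = model.inference(
    "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    diffusion_conditioning,
)
torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```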