Merge pull request #3081 from coqui-ai/dev

v0.17.9
Eren Gölge 2023-10-19 11:23:55 +02:00 committed by GitHub
commit f0faed962d
17 changed files with 290 additions and 186 deletions


@@ -146,7 +146,7 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea
 You can also help us implement more models.
 
 ## Installation
 
-🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11.**.
+🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12.**.
 
 If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
@@ -198,17 +198,18 @@ from TTS.api import TTS
 # Get device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# List available 🐸TTS models and choose the first one
-model_name = TTS().list_models()[0]
+# List available 🐸TTS models
+print(TTS().list_models())
 
 # Init TTS
-tts = TTS(model_name).to(device)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
 
 # Run TTS
-# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
-# Text to speech with a numpy output
-wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
+# Text to speech list of amplitude values as output
+wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en")
 
 # Text to speech to a file
-tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 ```
#### Running a single speaker model #### Running a single speaker model
@@ -347,6 +348,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
   $ tts --text "Text for TTS" --out_path output/path/speech.wav
   ```
 
+- Run TTS and pipe out the generated TTS wav file data:
+
+  ```
+  $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+  ```
+
+- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
+
+  ```
+  $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
+  ```
+
 - Run a TTS model with its default vocoder model:
 
   ```


@@ -5,9 +5,9 @@
                 "xtts_v1": {
                     "description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.",
                     "hf_url": [
-                        "https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/model.pth",
-                        "https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/config.json",
-                        "https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/vocab.json"
+                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/model.pth",
+                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/config.json",
+                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/vocab.json",
                     ],
                     "default_vocoder": null,
                     "commit": "e5140314",


@@ -1 +1 @@
-0.17.8
+0.17.9


@@ -112,7 +112,6 @@ class TTS(nn.Module):
             return self.synthesizer.tts_model.language_manager.num_languages > 1
         return False
 
-
     @property
     def speakers(self):
         if not self.is_multi_speaker:
@@ -265,6 +264,7 @@ class TTS(nn.Module):
         language: str = None,
         emotion: str = None,
         speed: float = 1.0,
+        pipe_out = None,
         file_path: str = None,
     ) -> Union[np.ndarray, str]:
         """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
@@ -281,6 +281,8 @@ class TTS(nn.Module):
                 with "V1" model. Defaults to None.
             speed (float, optional):
                 Speed of the speech. Defaults to 1.0.
+            pipe_out (BytesIO, optional):
+                Flag to stdout the generated TTS wav file for shell pipe.
             file_path (str, optional):
                 Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
@@ -294,6 +296,7 @@ class TTS(nn.Module):
             speaker_name=speaker_name,
             language=language,
             speed=speed,
+            pipe_out=pipe_out,
             emotion=emotion,
             file_path=file_path,
         )[0]
@@ -356,6 +359,7 @@ class TTS(nn.Module):
         speaker_wav: str = None,
         emotion: str = None,
         speed: float = 1.0,
+        pipe_out = None,
         file_path: str = "output.wav",
         **kwargs,
     ):
@@ -377,6 +381,8 @@ class TTS(nn.Module):
                 Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
             speed (float, optional):
                 Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
+            pipe_out (BytesIO, optional):
+                Flag to stdout the generated TTS wav file for shell pipe.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
             kwargs (dict, optional):
@@ -386,10 +392,16 @@ class TTS(nn.Module):
         if self.csapi is not None:
             return self.tts_coqui_studio(
-                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
+                text=text,
+                speaker_name=speaker,
+                language=language,
+                emotion=emotion,
+                speed=speed,
+                file_path=file_path,
+                pipe_out=pipe_out,
             )
         wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
-        self.synthesizer.save_wav(wav=wav, path=file_path)
+        self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
         return file_path
 
     def voice_conversion(

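The net effect of the `TTS.api` changes is that Python callers can reproduce the CLI's `--pipe_out ... | aplay` workflow. A minimal sketch, assuming a released single-speaker model (the model name below is only an example); any object with a writable `.buffer`, such as `sys.stdout`, works as `pipe_out`:

```python
import sys

from TTS.api import TTS

# Example model name; any released 🐸TTS model works here.
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")

# Write output.wav and, at the same time, stream the encoded wav bytes to
# stdout so the script can be piped into a player, e.g. `python demo.py | aplay`.
tts.tts_to_file(
    text="Hello world!",
    file_path="output.wav",
    pipe_out=sys.stdout,  # save_wav() writes the wav bytes to sys.stdout.buffer
)
```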

@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import argparse
+import contextlib
 import sys
 
 from argparse import RawTextHelpFormatter
@@ -59,6 +60,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
   $ tts --text "Text for TTS" --out_path output/path/speech.wav
   ```
 
+- Run TTS and pipe out the generated TTS wav file data:
+
+  ```
+  $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+  ```
+
+- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
+
+  ```
+  $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
+  ```
+
 - Run a TTS model with its default vocoder model:
 
   ```
@@ -228,6 +241,20 @@ def main():
         help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
         default=None,
     )
+    parser.add_argument(
+        "--pipe_out",
+        help="stdout the generated TTS wav file for shell pipe.",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+    )
+    parser.add_argument(
+        "--speed",
+        type=float,
+        help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
+        default=None,
+    )
     # args for multi-speaker synthesis
     parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
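The `--pipe_out` option uses the optional-boolean argparse pattern already used elsewhere in this script: with `nargs="?"` and `const=True` it works both as a bare switch and with an explicit value. A standalone sketch of that behaviour, with a simplified stand-in for the `str2bool` helper the script already uses:

```python
import argparse


def str2bool(v: str) -> bool:
    # Simplified stand-in: accept common truthy strings, everything else is False.
    return str(v).lower() in ("yes", "true", "t", "y", "1")


parser = argparse.ArgumentParser()
parser.add_argument("--pipe_out", type=str2bool, nargs="?", const=True, default=False)

print(parser.parse_args([]).pipe_out)                       # False (flag absent -> default)
print(parser.parse_args(["--pipe_out"]).pipe_out)           # True  (bare flag -> const)
print(parser.parse_args(["--pipe_out", "false"]).pipe_out)  # False (explicit value -> str2bool)
```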
@@ -335,167 +362,177 @@ def main():
     if not any(check_args):
         parser.parse_args(["-h"])
 
+    pipe_out = sys.stdout if args.pipe_out else None
+
+    with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
         # Late-import to make things load faster
         from TTS.api import TTS
         from TTS.utils.manage import ModelManager
         from TTS.utils.synthesizer import Synthesizer
 
         # load model manager
         path = Path(__file__).parent / "../.models.json"
         manager = ModelManager(path, progress_bar=args.progress_bar)
         api = TTS()
 
         tts_path = None
         tts_config_path = None
         speakers_file_path = None
         language_ids_file_path = None
         vocoder_path = None
         vocoder_config_path = None
         encoder_path = None
         encoder_config_path = None
         vc_path = None
         vc_config_path = None
         model_dir = None
 
         # CASE1 #list : list pre-trained TTS models
         if args.list_models:
             manager.add_cs_api_models(api.list_models())
             manager.list_models()
             sys.exit()
 
         # CASE2 #info : model info for pre-trained TTS models
         if args.model_info_by_idx:
             model_query = args.model_info_by_idx
             manager.model_info_by_idx(model_query)
             sys.exit()
 
         if args.model_info_by_name:
             model_query_full_name = args.model_info_by_name
             manager.model_info_by_full_name(model_query_full_name)
             sys.exit()
 
         # CASE3: TTS with coqui studio models
         if "coqui_studio" in args.model_name:
             print(" > Using 🐸Coqui Studio model: ", args.model_name)
             api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
-            api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path, language=args.language)
+            api.tts_to_file(
+                text=args.text,
+                emotion=args.emotion,
+                file_path=args.out_path,
+                language=args.language,
+                speed=args.speed,
+                pipe_out=pipe_out,
+            )
             print(" > Saving output to ", args.out_path)
             return
 
         # CASE4: load pre-trained model paths
         if args.model_name is not None and not args.model_path:
             model_path, config_path, model_item = manager.download_model(args.model_name)
             # tts model
             if model_item["model_type"] == "tts_models":
                 tts_path = model_path
                 tts_config_path = config_path
                 if "default_vocoder" in model_item:
                     args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
 
             # voice conversion model
             if model_item["model_type"] == "voice_conversion_models":
                 vc_path = model_path
                 vc_config_path = config_path
 
             # tts model with multiple files to be loaded from the directory path
             if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
                 model_dir = model_path
                 tts_path = None
                 tts_config_path = None
                 args.vocoder_name = None
 
         # load vocoder
         if args.vocoder_name is not None and not args.vocoder_path:
             vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
 
         # CASE5: set custom model paths
         if args.model_path is not None:
             tts_path = args.model_path
             tts_config_path = args.config_path
             speakers_file_path = args.speakers_file_path
             language_ids_file_path = args.language_ids_file_path
 
         if args.vocoder_path is not None:
             vocoder_path = args.vocoder_path
             vocoder_config_path = args.vocoder_config_path
 
         if args.encoder_path is not None:
             encoder_path = args.encoder_path
             encoder_config_path = args.encoder_config_path
 
         device = args.device
         if args.use_cuda:
             device = "cuda"
 
         # load models
         synthesizer = Synthesizer(
             tts_path,
             tts_config_path,
             speakers_file_path,
             language_ids_file_path,
             vocoder_path,
             vocoder_config_path,
             encoder_path,
             encoder_config_path,
             vc_path,
             vc_config_path,
             model_dir,
             args.voice_dir,
         ).to(device)
 
         # query speaker ids of a multi-speaker model.
         if args.list_speaker_idxs:
             print(
                 " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
             )
             print(synthesizer.tts_model.speaker_manager.name_to_id)
             return
 
         # query langauge ids of a multi-lingual model.
         if args.list_language_idxs:
             print(
                 " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
             )
             print(synthesizer.tts_model.language_manager.name_to_id)
             return
 
         # check the arguments against a multi-speaker model.
         if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
             print(
                 " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
                 "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
             )
             return
 
         # RUN THE SYNTHESIS
         if args.text:
             print(" > Text: {}".format(args.text))
 
         # kick it
         if tts_path is not None:
             wav = synthesizer.tts(
                 args.text,
                 speaker_name=args.speaker_idx,
                 language_name=args.language_idx,
                 speaker_wav=args.speaker_wav,
                 reference_wav=args.reference_wav,
                 style_wav=args.capacitron_style_wav,
                 style_text=args.capacitron_style_text,
                 reference_speaker_name=args.reference_speaker_idx,
             )
         elif vc_path is not None:
             wav = synthesizer.voice_conversion(
                 source_wav=args.source_wav,
                 target_wav=args.target_wav,
             )
         elif model_dir is not None:
             wav = synthesizer.tts(
                 args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
             )
 
         # save the results
         print(" > Saving output to {}".format(args.out_path))
-        synthesizer.save_wav(wav, args.out_path)
+        synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
 
 
 if __name__ == "__main__":

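The whole of `main()` now runs under `contextlib.redirect_stdout` because, when `--pipe_out` is set, any progress message printed to stdout would be interleaved with the raw wav bytes written to the same descriptor. A small self-contained sketch of that pattern (an illustration of the idea, not the project's code; the wav bytes are a placeholder):

```python
import contextlib
import sys


def noisy_synthesis() -> bytes:
    print(" > loading model...")  # progress chatter that must not reach the pipe
    return b"RIFF....WAVEfmt "    # stand-in for real wav file bytes


pipe = sys.stdout  # keep a reference to the real stdout for the wav data

# While piping, silence regular prints; with sys.stdout set to None, print() is a no-op.
with contextlib.redirect_stdout(None):
    wav_bytes = noisy_synthesis()

pipe.buffer.write(wav_bytes)  # only the wav data reaches `| aplay`
```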

@@ -9,6 +9,8 @@ import numpy as np
 import requests
 from scipy.io import wavfile
 
+from TTS.utils.audio.numpy_transforms import save_wav
+
 
 class Speaker(object):
     """Convert dict to object."""
@@ -288,6 +290,7 @@ class CS_API:
         speaker_id=None,
         emotion=None,
         speed=1.0,
+        pipe_out=None,
         language=None,
         file_path: str = None,
     ) -> str:
@@ -300,6 +303,7 @@ class CS_API:
             speaker_id (str): Speaker ID. If None, the speaker name is used.
             emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
             speed (float): Speed of the speech. 1.0 is normal speed.
+            pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                 supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
             file_path (str): Path to save the file. If None, a temporary file is created.
@@ -307,7 +311,7 @@ class CS_API:
         if file_path is None:
             file_path = tempfile.mktemp(".wav")
         wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
-        wavfile.write(file_path, sr, wav)
+        save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out)
         return file_path


@@ -396,6 +396,7 @@ class ForwardTTS(BaseTTS):
             - g: :math:`(B, C)`
         """
         if hasattr(self, "emb_g"):
+            g = g.type(torch.LongTensor)
             g = self.emb_g(g)  # [B, C, 1]
         if g is not None:
             g = g.unsqueeze(-1)
@@ -683,9 +684,10 @@ class ForwardTTS(BaseTTS):
         # encoder pass
         o_en, x_mask, g, _ = self._forward_encoder(x, x_mask, g)
         # duration predictor pass
-        o_dr_log = self.duration_predictor(o_en, x_mask)
+        o_dr_log = self.duration_predictor(o_en.squeeze(), x_mask)
         o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
         y_lengths = o_dr.sum(1)
         # pitch predictor pass
         o_pitch = None
         if self.args.use_pitch:

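The added `g = g.type(torch.LongTensor)` cast matters because `nn.Embedding` only accepts integer (long) indices, so a speaker id arriving as a float tensor would otherwise raise. A tiny standalone illustration of the constraint (not the model code; sizes are arbitrary):

```python
import torch
import torch.nn as nn

emb_g = nn.Embedding(num_embeddings=4, embedding_dim=8)  # e.g. 4 speakers

g = torch.tensor([1.0])        # speaker id that arrived as a float tensor
# emb_g(g) would raise: embedding indices must be Long/Int, not Float
g = g.type(torch.LongTensor)   # same cast the diff adds before self.emb_g(g)
print(emb_g(g).shape)          # torch.Size([1, 8])
```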

@@ -1,3 +1,4 @@
+from io import BytesIO
 from typing import Tuple
 
 import librosa
@@ -427,16 +428,24 @@ def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False,
     return x
 
 
-def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, **kwargs) -> None:
+def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out = None, **kwargs) -> None:
     """Save float waveform to a file using Scipy.
 
     Args:
         wav (np.ndarray): Waveform with float values in range [-1, 1] to save.
         path (str): Path to a output file.
        sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
+        pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
     """
     wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
-    scipy.io.wavfile.write(path, sample_rate, wav_norm.astype(np.int16))
+
+    wav_norm = wav_norm.astype(np.int16)
+    if pipe_out:
+        wav_buffer = BytesIO()
+        scipy.io.wavfile.write(wav_buffer, sample_rate, wav_norm)
+        wav_buffer.seek(0)
+        pipe_out.buffer.write(wav_buffer.read())
+    scipy.io.wavfile.write(path, sample_rate, wav_norm)
 
 
 def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray:

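With this change a caller can both persist the wav and stream it, by passing any object whose `.buffer` accepts bytes (the CLI passes `sys.stdout`). A minimal usage sketch with a synthetic signal:

```python
import sys

import numpy as np

from TTS.utils.audio.numpy_transforms import save_wav

# One second of a 440 Hz sine wave with amplitude in [-1, 1].
sr = 22050
t = np.linspace(0, 1, sr, endpoint=False)
wav = 0.5 * np.sin(2 * np.pi * 440 * t)

# Writes output.wav and also streams the encoded wav to stdout for shell piping.
save_wav(wav=wav, path="output.wav", sample_rate=sr, pipe_out=sys.stdout)
```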

@@ -1,3 +1,4 @@
+from io import BytesIO
 from typing import Dict, Tuple
 
 import librosa
@@ -693,20 +694,27 @@ class AudioProcessor(object):
             x = self.rms_volume_norm(x, self.db_level)
         return x
 
-    def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
+    def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out = None) -> None:
         """Save a waveform to a file using Scipy.
 
         Args:
             wav (np.ndarray): Waveform to save.
             path (str): Path to a output file.
             sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
+            pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
         """
         if self.do_rms_norm:
             wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767
         else:
             wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
-        scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
+
+        wav_norm = wav_norm.astype(np.int16)
+        if pipe_out:
+            wav_buffer = BytesIO()
+            scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm)
+            wav_buffer.seek(0)
+            pipe_out.buffer.write(wav_buffer.read())
+        scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm)
 
     def get_duration(self, filename: str) -> float:
         """Get the duration of a wav file using Librosa.


@@ -235,19 +235,20 @@ class Synthesizer(nn.Module):
         """
         return self.seg.segment(text)
 
-    def save_wav(self, wav: List[int], path: str) -> None:
+    def save_wav(self, wav: List[int], path: str, pipe_out = None) -> None:
         """Save the waveform as a file.
 
         Args:
             wav (List[int]): waveform as a list of values.
             path (str): output path to save the waveform.
+            pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
         """
         # if tensor convert to numpy
         if torch.is_tensor(wav):
             wav = wav.cpu().numpy()
         if isinstance(wav, list):
             wav = np.array(wav)
-        save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate)
+        save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out)
 
     def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]:
         output_wav = self.vc_model.voice_conversion(source_wav, target_wav)
@@ -299,11 +300,7 @@ class Synthesizer(nn.Module):
             speaker_embedding = None
             speaker_id = None
             if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
-                # handle Neon models with single speaker.
-                if len(self.tts_model.speaker_manager.name_to_id) == 1:
-                    speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
-                elif speaker_name and isinstance(speaker_name, str):
+                if speaker_name and isinstance(speaker_name, str):
                     if self.tts_config.use_d_vector_file:
                         # get the average speaker embedding from the saved d_vectors.
                         speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
@@ -313,7 +310,9 @@ class Synthesizer(nn.Module):
                     else:
                         # get speaker idx from the speaker name
                         speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name]
+                # handle Neon models with single speaker.
+                elif len(self.tts_model.speaker_manager.name_to_id) == 1:
+                    speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
                 elif not speaker_name and not speaker_wav:
                     raise ValueError(
                         " [!] Looks like you are using a multi-speaker model. "

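The reordered branches change the precedence: an explicitly requested `speaker_name` is now honoured first, and the single-speaker (e.g. Neon) shortcut only applies when no name was given. An illustrative, simplified version of that decision order (not the actual `Synthesizer` method):

```python
def resolve_speaker(name_to_id: dict, speaker_name: str = None):
    # 1. An explicitly requested speaker always wins.
    if speaker_name and isinstance(speaker_name, str):
        return name_to_id[speaker_name]
    # 2. Single-speaker models (e.g. Neon) fall back to their only entry.
    if len(name_to_id) == 1:
        return next(iter(name_to_id.values()))
    # 3. Otherwise the caller must pick a speaker.
    raise ValueError("Multi-speaker model: pass a speaker name or a speaker wav.")


speakers = {"p225": 0, "p226": 1}
print(resolve_speaker(speakers, "p226"))  # 1
```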

@@ -17,19 +17,20 @@ Let's assume you created the audio clips and their transcription. You can collec
     ...
     ```
 
-You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each line must be delimitered by a special character separating the audio file name from the transcription. And make sure that the delimiter is not used in the transcription text.
+You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each column must be delimited by a special character separating the audio file name, the transcription and the normalized transcription. Make sure that the delimiter is not used in the transcription text.
 
 We recommend the following format delimited by `|`. In the following example, `audio1`, `audio2` refer to files `audio1.wav`, `audio2.wav` etc.
 
 ```
 # metadata.txt
 
-audio1|This is my sentence.
-audio2|This is maybe my sentence.
-audio3|This is certainly my sentence.
-audio4|Let this be your sentence.
+audio1|This is my sentence.|This is my sentence.
+audio2|1469 and 1470|fourteen sixty-nine and fourteen seventy
+audio3|It'll be $16 sir.|It'll be sixteen dollars sir.
 ...
 ```
 
+*If you don't have normalized transcriptions, you can use the same transcription for both columns. In that case, we recommend applying normalization later in the pipeline, either in the text cleaner or in the phonemizer.*
+
 In the end, we have the following folder structure
 
 ```

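For readers wiring this up themselves, here is a minimal sketch of loading such a `|`-delimited, three-column `metadata.txt`. It only illustrates the format above and assumes the usual `wavs/` subfolder layout; it is not one of 🐸TTS's built-in formatters:

```python
from pathlib import Path


def load_metadata(root: str):
    """Yield (wav_path, transcription, normalized_transcription) tuples."""
    root = Path(root)
    with open(root / "metadata.txt", encoding="utf-8") as f:
        for line in f:
            if not line.strip() or line.startswith("#"):
                continue  # skip blanks and the "# metadata.txt" header
            audio_id, text, normalized_text = line.rstrip("\n").split("|")
            yield root / "wavs" / f"{audio_id}.wav", text, normalized_text


for wav_path, text, norm in load_metadata("my_dataset"):
    print(wav_path, "->", norm)
```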

@@ -41,7 +41,7 @@
 6. Optionally, define `MyModelArgs`.
 
     `MyModelArgs` is a 👨Coqpit class that sets all the class arguments of the `MyModel`. `MyModelArgs` must have
-    all the fields neccessary to instantiate the `MyModel`. However, for training, you need to pass `MyModelConfig` to
+    all the fields necessary to instantiate the `MyModel`. However, for training, you need to pass `MyModelConfig` to
     the model.
 
 7. Test `MyModel`.

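For step 6, a sketch of what such an args class typically looks like, assuming the usual Coqpit dataclass pattern used across the codebase; all names and fields are placeholders:

```python
from dataclasses import dataclass, field

from coqpit import Coqpit


@dataclass
class MyModelArgs(Coqpit):
    """All constructor arguments needed to instantiate MyModel."""

    num_chars: int = 100
    hidden_channels: int = 256
    num_speakers: int = 0
    use_speaker_embedding: bool = False
    # nested settings are plain dataclass fields with factories
    encoder_params: dict = field(default_factory=lambda: {"num_layers": 6})
```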

@@ -114,18 +114,24 @@ tts-server --model_name "<type>/<language>/<dataset>/<model_name>" \
 You can run a multi-speaker and multi-lingual model in Python as
 
 ```python
+import torch
 from TTS.api import TTS
 
-# List available 🐸TTS models and choose the first one
-model_name = TTS().list_models()[0]
+# Get device
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# List available 🐸TTS models
+print(TTS().list_models())
 
 # Init TTS
-tts = TTS(model_name)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
 
 # Run TTS
-# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
-# Text to speech with a numpy output
-wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
+# Text to speech list of amplitude values as output
+wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en")
 
 # Text to speech to a file
-tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 ```
 
 #### Here is an example for a single speaker model.


@@ -1,3 +1,3 @@
 # Trainer API
 
-We made the trainer a seprate project on https://github.com/coqui-ai/Trainer
+We made the trainer a separate project on https://github.com/coqui-ai/Trainer


@@ -12,7 +12,7 @@ Currently we provide the following pre-configured architectures:
 - **FastPitch:**
 
-    It uses the same FastSpeech architecture that is conditioned on fundemental frequency (f0) contours with the
+    It uses the same FastSpeech architecture that is conditioned on fundamental frequency (f0) contours with the
     promise of more expressive speech.
 
 - **SpeedySpeech:**


@@ -100,7 +100,7 @@
     "        wav_file = item[\"audio_file\"].strip()\n",
     "        wav_files.append(wav_file)\n",
     "        if not os.path.exists(wav_file):\n",
-    "            print(waf_path)"
+    "            print(wav_file)"
    ]
   },
   {


@@ -13,3 +13,16 @@ def test_synthesize():
         '--text "This is it" '
         f'--out_path "{output_path}"'
     )
+
+    # 🐸 Coqui studio model with speed arg.
+    run_cli(
+        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
+        '--text "This is it but slow" --speed 0.1 '
+        f'--out_path "{output_path}"'
+    )
+
+    # test pipe_out command
+    run_cli(
+        'tts --text "test." --pipe_out '
+        f'--out_path "{output_path}" | aplay'
+    )