mirror of https://github.com/coqui-ai/TTS.git
Add stdout option (#3027)
* add cli options for play and speed --play argument uses simpleaudio to play the tts wav --speed <float 0.0-2.0> passes speed argument to Coqui Studio models * remove simpleaudio not referenced in file * fix simpleaudio dependency version * add ALSA headers for simpleaudio compilation * Dockerfile ALSA headers for simpleaudio * base changes to use stdout instead of play audio Considering conversion to pipe wav data for audio playback with other program like aplay. This is incomplete code. Using to get feedback before proceeding with implementation. * remove play for pipe_out arg that suppresses stdout removed play and simpleaudio dependency in place of pipe functionality to allow passing wav file data to a program dedicated to playing audio. * scipy.io.wavfile.write fails with /dev/null target * Streaming inference for XTTS 🚀 (#3035) * v0.17.7 * Redownload XTTS if the local and remote config do not match * Remove unused method * Print a message when it is already downloaded * Try-except to present error when the user doesn't have a connection * Fix style * 0.17.8 * v0.17.8 --------- Co-authored-by: Julian Weber <julian.weber@hotmail.fr> Co-authored-by: Eren Gölge <erogol@hotmail.com> Co-authored-by: Edresson Casanova <edresson1@gmail.com> Co-authored-by: ggoknar <ggoknar@coqui.ai>
This commit is contained in:
parent
cae185fd16
commit
a151d70242
12
README.md
12
README.md
|
@ -347,6 +347,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
||||||
$ tts --text "Text for TTS" --out_path output/path/speech.wav
|
$ tts --text "Text for TTS" --out_path output/path/speech.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
|
- Run TTS and pipe out the generated TTS wav file data:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
|
||||||
|
```
|
||||||
|
|
||||||
|
- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
|
||||||
|
```
|
||||||
|
|
||||||
- Run a TTS model with its default vocoder model:
|
- Run a TTS model with its default vocoder model:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
18
TTS/api.py
18
TTS/api.py
|
@ -112,7 +112,6 @@ class TTS(nn.Module):
|
||||||
return self.synthesizer.tts_model.language_manager.num_languages > 1
|
return self.synthesizer.tts_model.language_manager.num_languages > 1
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def speakers(self):
|
def speakers(self):
|
||||||
if not self.is_multi_speaker:
|
if not self.is_multi_speaker:
|
||||||
|
@ -265,6 +264,7 @@ class TTS(nn.Module):
|
||||||
language: str = None,
|
language: str = None,
|
||||||
emotion: str = None,
|
emotion: str = None,
|
||||||
speed: float = 1.0,
|
speed: float = 1.0,
|
||||||
|
pipe_out = None,
|
||||||
file_path: str = None,
|
file_path: str = None,
|
||||||
) -> Union[np.ndarray, str]:
|
) -> Union[np.ndarray, str]:
|
||||||
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
|
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
|
||||||
|
@ -281,6 +281,8 @@ class TTS(nn.Module):
|
||||||
with "V1" model. Defaults to None.
|
with "V1" model. Defaults to None.
|
||||||
speed (float, optional):
|
speed (float, optional):
|
||||||
Speed of the speech. Defaults to 1.0.
|
Speed of the speech. Defaults to 1.0.
|
||||||
|
pipe_out (TextIO, optional):
|
||||||
|
Text stream such as `sys.stdout`; the generated WAV data is written to its binary `.buffer` so it can be piped to another program.
|
||||||
file_path (str, optional):
|
file_path (str, optional):
|
||||||
Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
|
Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
|
||||||
|
|
||||||
|
@ -294,6 +296,7 @@ class TTS(nn.Module):
|
||||||
speaker_name=speaker_name,
|
speaker_name=speaker_name,
|
||||||
language=language,
|
language=language,
|
||||||
speed=speed,
|
speed=speed,
|
||||||
|
pipe_out=pipe_out,
|
||||||
emotion=emotion,
|
emotion=emotion,
|
||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
)[0]
|
)[0]
|
||||||
|
@ -356,6 +359,7 @@ class TTS(nn.Module):
|
||||||
speaker_wav: str = None,
|
speaker_wav: str = None,
|
||||||
emotion: str = None,
|
emotion: str = None,
|
||||||
speed: float = 1.0,
|
speed: float = 1.0,
|
||||||
|
pipe_out = None,
|
||||||
file_path: str = "output.wav",
|
file_path: str = "output.wav",
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
|
@ -377,6 +381,8 @@ class TTS(nn.Module):
|
||||||
Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
|
Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
|
||||||
speed (float, optional):
|
speed (float, optional):
|
||||||
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
|
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
|
||||||
|
pipe_out (TextIO, optional):
|
||||||
|
Text stream such as `sys.stdout`; the generated WAV data is written to its binary `.buffer` so it can be piped to another program.
|
||||||
file_path (str, optional):
|
file_path (str, optional):
|
||||||
Output file path. Defaults to "output.wav".
|
Output file path. Defaults to "output.wav".
|
||||||
kwargs (dict, optional):
|
kwargs (dict, optional):
|
||||||
|
@ -386,10 +392,16 @@ class TTS(nn.Module):
|
||||||
|
|
||||||
if self.csapi is not None:
|
if self.csapi is not None:
|
||||||
return self.tts_coqui_studio(
|
return self.tts_coqui_studio(
|
||||||
text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
|
text=text,
|
||||||
|
speaker_name=speaker,
|
||||||
|
language=language,
|
||||||
|
emotion=emotion,
|
||||||
|
speed=speed,
|
||||||
|
file_path=file_path,
|
||||||
|
pipe_out=pipe_out,
|
||||||
)
|
)
|
||||||
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
|
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
|
||||||
self.synthesizer.save_wav(wav=wav, path=file_path)
|
self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
|
||||||
return file_path
|
return file_path
|
||||||
|
|
||||||
def voice_conversion(
|
def voice_conversion(
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import contextlib
|
||||||
import sys
|
import sys
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
|
@ -59,6 +60,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
||||||
$ tts --text "Text for TTS" --out_path output/path/speech.wav
|
$ tts --text "Text for TTS" --out_path output/path/speech.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
|
- Run TTS and pipe out the generated TTS wav file data:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
|
||||||
|
```
|
||||||
|
|
||||||
|
- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
|
||||||
|
```
|
||||||
|
|
||||||
- Run a TTS model with its default vocoder model:
|
- Run a TTS model with its default vocoder model:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
@ -228,6 +241,20 @@ def main():
|
||||||
help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
|
help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--pipe_out",
|
||||||
|
help="stdout the generated TTS wav file for shell pipe.",
|
||||||
|
type=str2bool,
|
||||||
|
nargs="?",
|
||||||
|
const=True,
|
||||||
|
default=False,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--speed",
|
||||||
|
type=float,
|
||||||
|
help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
|
||||||
# args for multi-speaker synthesis
|
# args for multi-speaker synthesis
|
||||||
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
|
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
|
||||||
|
@ -335,6 +362,9 @@ def main():
|
||||||
if not any(check_args):
|
if not any(check_args):
|
||||||
parser.parse_args(["-h"])
|
parser.parse_args(["-h"])
|
||||||
|
|
||||||
|
pipe_out = sys.stdout if args.pipe_out else None
|
||||||
|
|
||||||
|
with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
|
||||||
# Late-import to make things load faster
|
# Late-import to make things load faster
|
||||||
from TTS.api import TTS
|
from TTS.api import TTS
|
||||||
from TTS.utils.manage import ModelManager
|
from TTS.utils.manage import ModelManager
|
||||||
|
@ -378,7 +408,14 @@ def main():
|
||||||
if "coqui_studio" in args.model_name:
|
if "coqui_studio" in args.model_name:
|
||||||
print(" > Using 🐸Coqui Studio model: ", args.model_name)
|
print(" > Using 🐸Coqui Studio model: ", args.model_name)
|
||||||
api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
|
api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
|
||||||
api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path, language=args.language)
|
api.tts_to_file(
|
||||||
|
text=args.text,
|
||||||
|
emotion=args.emotion,
|
||||||
|
file_path=args.out_path,
|
||||||
|
language=args.language,
|
||||||
|
speed=args.speed,
|
||||||
|
pipe_out=pipe_out,
|
||||||
|
)
|
||||||
print(" > Saving output to ", args.out_path)
|
print(" > Saving output to ", args.out_path)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -495,7 +532,7 @@ def main():
|
||||||
|
|
||||||
# save the results
|
# save the results
|
||||||
print(" > Saving output to {}".format(args.out_path))
|
print(" > Saving output to {}".format(args.out_path))
|
||||||
synthesizer.save_wav(wav, args.out_path)
|
synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -9,6 +9,8 @@ import numpy as np
|
||||||
import requests
|
import requests
|
||||||
from scipy.io import wavfile
|
from scipy.io import wavfile
|
||||||
|
|
||||||
|
from TTS.utils.audio.numpy_transforms import save_wav
|
||||||
|
|
||||||
|
|
||||||
class Speaker(object):
|
class Speaker(object):
|
||||||
"""Convert dict to object."""
|
"""Convert dict to object."""
|
||||||
|
@ -288,6 +290,7 @@ class CS_API:
|
||||||
speaker_id=None,
|
speaker_id=None,
|
||||||
emotion=None,
|
emotion=None,
|
||||||
speed=1.0,
|
speed=1.0,
|
||||||
|
pipe_out=None,
|
||||||
language=None,
|
language=None,
|
||||||
file_path: str = None,
|
file_path: str = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
|
@ -300,6 +303,7 @@ class CS_API:
|
||||||
speaker_id (str): Speaker ID. If None, the speaker name is used.
|
speaker_id (str): Speaker ID. If None, the speaker name is used.
|
||||||
emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
|
emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
|
||||||
speed (float): Speed of the speech. 1.0 is normal speed.
|
speed (float): Speed of the speech. 1.0 is normal speed.
|
||||||
|
pipe_out (TextIO, optional): Text stream such as `sys.stdout`; the generated WAV data is written to its binary `.buffer` so it can be piped to another program.
|
||||||
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
|
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
|
||||||
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
|
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
|
||||||
file_path (str): Path to save the file. If None, a temporary file is created.
|
file_path (str): Path to save the file. If None, a temporary file is created.
|
||||||
|
@ -307,7 +311,7 @@ class CS_API:
|
||||||
if file_path is None:
|
if file_path is None:
|
||||||
file_path = tempfile.mktemp(".wav")
|
file_path = tempfile.mktemp(".wav")
|
||||||
wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
|
wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
|
||||||
wavfile.write(file_path, sr, wav)
|
save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out)
|
||||||
return file_path
|
return file_path
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from io import BytesIO
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
import librosa
|
import librosa
|
||||||
|
@ -427,16 +428,24 @@ def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False,
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, **kwargs) -> None:
|
def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out = None, **kwargs) -> None:
|
||||||
"""Save float waveform to a file using Scipy.
|
"""Save float waveform to a file using Scipy.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
wav (np.ndarray): Waveform with float values in range [-1, 1] to save.
|
wav (np.ndarray): Waveform with float values in range [-1, 1] to save.
|
||||||
path (str): Path to a output file.
|
path (str): Path to a output file.
|
||||||
sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
|
sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
|
||||||
|
pipe_out (TextIO, optional): Text stream such as `sys.stdout`; the generated WAV data is written to its binary `.buffer` so it can be piped to another program.
|
||||||
"""
|
"""
|
||||||
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
||||||
scipy.io.wavfile.write(path, sample_rate, wav_norm.astype(np.int16))
|
|
||||||
|
wav_norm = wav_norm.astype(np.int16)
|
||||||
|
if pipe_out:
|
||||||
|
wav_buffer = BytesIO()
|
||||||
|
scipy.io.wavfile.write(wav_buffer, sample_rate, wav_norm)
|
||||||
|
wav_buffer.seek(0)
|
||||||
|
pipe_out.buffer.write(wav_buffer.read())
|
||||||
|
scipy.io.wavfile.write(path, sample_rate, wav_norm)
|
||||||
|
|
||||||
|
|
||||||
def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray:
|
def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray:
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from io import BytesIO
|
||||||
from typing import Dict, Tuple
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
import librosa
|
import librosa
|
||||||
|
@ -693,20 +694,27 @@ class AudioProcessor(object):
|
||||||
x = self.rms_volume_norm(x, self.db_level)
|
x = self.rms_volume_norm(x, self.db_level)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
|
def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out = None) -> None:
|
||||||
"""Save a waveform to a file using Scipy.
|
"""Save a waveform to a file using Scipy.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
wav (np.ndarray): Waveform to save.
|
wav (np.ndarray): Waveform to save.
|
||||||
path (str): Path to a output file.
|
path (str): Path to a output file.
|
||||||
sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
|
sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
|
||||||
|
pipe_out (TextIO, optional): Text stream such as `sys.stdout`; the generated WAV data is written to its binary `.buffer` so it can be piped to another program.
|
||||||
"""
|
"""
|
||||||
if self.do_rms_norm:
|
if self.do_rms_norm:
|
||||||
wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767
|
wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767
|
||||||
else:
|
else:
|
||||||
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
||||||
|
|
||||||
scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
|
wav_norm = wav_norm.astype(np.int16)
|
||||||
|
if pipe_out:
|
||||||
|
wav_buffer = BytesIO()
|
||||||
|
scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm)
|
||||||
|
wav_buffer.seek(0)
|
||||||
|
pipe_out.buffer.write(wav_buffer.read())
|
||||||
|
scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm)
|
||||||
|
|
||||||
def get_duration(self, filename: str) -> float:
|
def get_duration(self, filename: str) -> float:
|
||||||
"""Get the duration of a wav file using Librosa.
|
"""Get the duration of a wav file using Librosa.
|
||||||
|
|
|
@ -235,19 +235,20 @@ class Synthesizer(nn.Module):
|
||||||
"""
|
"""
|
||||||
return self.seg.segment(text)
|
return self.seg.segment(text)
|
||||||
|
|
||||||
def save_wav(self, wav: List[int], path: str) -> None:
|
def save_wav(self, wav: List[int], path: str, pipe_out = None) -> None:
|
||||||
"""Save the waveform as a file.
|
"""Save the waveform as a file.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
wav (List[int]): waveform as a list of values.
|
wav (List[int]): waveform as a list of values.
|
||||||
path (str): output path to save the waveform.
|
path (str): output path to save the waveform.
|
||||||
|
pipe_out (TextIO, optional): Text stream such as `sys.stdout`; the generated WAV data is written to its binary `.buffer` so it can be piped to another program.
|
||||||
"""
|
"""
|
||||||
# if tensor convert to numpy
|
# if tensor convert to numpy
|
||||||
if torch.is_tensor(wav):
|
if torch.is_tensor(wav):
|
||||||
wav = wav.cpu().numpy()
|
wav = wav.cpu().numpy()
|
||||||
if isinstance(wav, list):
|
if isinstance(wav, list):
|
||||||
wav = np.array(wav)
|
wav = np.array(wav)
|
||||||
save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate)
|
save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out)
|
||||||
|
|
||||||
def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]:
|
def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]:
|
||||||
output_wav = self.vc_model.voice_conversion(source_wav, target_wav)
|
output_wav = self.vc_model.voice_conversion(source_wav, target_wav)
|
||||||
|
|
|
@ -13,3 +13,16 @@ def test_synthesize():
|
||||||
'--text "This is it" '
|
'--text "This is it" '
|
||||||
f'--out_path "{output_path}"'
|
f'--out_path "{output_path}"'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 🐸 Coqui studio model with speed arg.
|
||||||
|
run_cli(
|
||||||
|
'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
|
||||||
|
'--text "This is it but slow" --speed 0.1'
|
||||||
|
f'--out_path "{output_path}"'
|
||||||
|
)
|
||||||
|
|
||||||
|
# test pipe_out command
|
||||||
|
run_cli(
|
||||||
|
'tts --text "test." --pipe_out '
|
||||||
|
f'--out_path "{output_path}" | aplay'
|
||||||
|
)
|
||||||
|
|
Loading…
Reference in New Issue