From a151d70242ff4d9f2874584681284aab2414e729 Mon Sep 17 00:00:00 2001 From: David Garvey Date: Mon, 16 Oct 2023 05:07:21 -0500 Subject: [PATCH] Add stdout option (#3027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add cli options for play and speed --play argument uses simpleaudio to play the tts wav --speed passes speed argument to Coqui Studio models * remove simpleaudio not referenced in file * fix simpleaudio dependency version * add ALSA headers for simpleaudio compilation * Dockerfile ALSA headers for simpleaudio * base changes to use stdout instead of play audio Considering conversion to pipe wav data for audio playback with another program like aplay. This is incomplete code. Using to get feedback before proceeding with implementation. * remove play for pipe_out arg that suppresses stdout removed play and simpleaudio dependency in place of pipe functionality to allow passing wav file data to a program dedicated to playing audio. * scipy.io.wavfile.write fails with /dev/null target * Streaming inference for XTTS 🚀 (#3035) * v0.17.7 * Redownload XTTS when the local and remote config do not match * Remove unused method * Print a message when it is already downloaded * Try-except to present error when the user doesn't have connection * Fix style * 0.17.8 * v0.17.8 --------- Co-authored-by: Julian Weber Co-authored-by: Eren Gölge Co-authored-by: Edresson Casanova Co-authored-by: ggoknar --- README.md | 12 + TTS/api.py | 18 +- TTS/bin/synthesize.py | 317 ++++++++++++++----------- TTS/cs_api.py | 6 +- TTS/utils/audio/numpy_transforms.py | 13 +- TTS/utils/audio/processor.py | 12 +- TTS/utils/synthesizer.py | 5 +- tests/api_tests/test_synthesize_api.py | 13 + 8 files changed, 246 insertions(+), 150 deletions(-) diff --git a/README.md b/README.md index ad4a90b9..4f386ecd 100644 --- a/README.md +++ b/README.md @@ -347,6 +347,18 @@ If you don't specify any models, then it uses LJSpeech based English model. 
$ tts --text "Text for TTS" --out_path output/path/speech.wav ``` +- Run TTS and pipe out the generated TTS wav file data: + + ``` + $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ``` + +- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0: + + ``` + $ tts --text "Text for TTS" --model_name "coqui_studio///" --speed 1.2 --out_path output/path/speech.wav + ``` + - Run a TTS model with its default vocoder model: ``` diff --git a/TTS/api.py b/TTS/api.py index e1d167a9..dd5820f8 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -112,7 +112,6 @@ class TTS(nn.Module): return self.synthesizer.tts_model.language_manager.num_languages > 1 return False - @property def speakers(self): if not self.is_multi_speaker: @@ -265,6 +264,7 @@ class TTS(nn.Module): language: str = None, emotion: str = None, speed: float = 1.0, + pipe_out = None, file_path: str = None, ) -> Union[np.ndarray, str]: """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API. @@ -281,6 +281,8 @@ class TTS(nn.Module): with "V1" model. Defaults to None. speed (float, optional): Speed of the speech. Defaults to 1.0. + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. file_path (str, optional): Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None. @@ -294,6 +296,7 @@ class TTS(nn.Module): speaker_name=speaker_name, language=language, speed=speed, + pipe_out=pipe_out, emotion=emotion, file_path=file_path, )[0] @@ -356,6 +359,7 @@ class TTS(nn.Module): speaker_wav: str = None, emotion: str = None, speed: float = 1.0, + pipe_out = None, file_path: str = "output.wav", **kwargs, ): @@ -377,6 +381,8 @@ class TTS(nn.Module): Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral". speed (float, optional): Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None. 
+ pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. file_path (str, optional): Output file path. Defaults to "output.wav". kwargs (dict, optional): @@ -386,10 +392,16 @@ class TTS(nn.Module): if self.csapi is not None: return self.tts_coqui_studio( - text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path + text=text, + speaker_name=speaker, + language=language, + emotion=emotion, + speed=speed, + file_path=file_path, + pipe_out=pipe_out, ) wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs) - self.synthesizer.save_wav(wav=wav, path=file_path) + self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) return file_path def voice_conversion( diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 5ff1181f..78a20c25 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import argparse +import contextlib import sys from argparse import RawTextHelpFormatter @@ -59,6 +60,18 @@ If you don't specify any models, then it uses LJSpeech based English model. $ tts --text "Text for TTS" --out_path output/path/speech.wav ``` +- Run TTS and pipe out the generated TTS wav file data: + + ``` + $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ``` + +- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0: + + ``` + $ tts --text "Text for TTS" --model_name "coqui_studio///" --speed 1.2 --out_path output/path/speech.wav + ``` + - Run a TTS model with its default vocoder model: ``` @@ -228,6 +241,20 @@ def main(): help="Language to condition the model with. 
Only available for 🐸Coqui Studio `XTTS-multilingual` model.", default=None, ) + parser.add_argument( + "--pipe_out", + help="stdout the generated TTS wav file for shell pipe.", + type=str2bool, + nargs="?", + const=True, + default=False, + ) + parser.add_argument( + "--speed", + type=float, + help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.", + default=None, + ) # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) @@ -335,167 +362,177 @@ def main(): if not any(check_args): parser.parse_args(["-h"]) - # Late-import to make things load faster - from TTS.api import TTS - from TTS.utils.manage import ModelManager - from TTS.utils.synthesizer import Synthesizer + pipe_out = sys.stdout if args.pipe_out else None - # load model manager - path = Path(__file__).parent / "../.models.json" - manager = ModelManager(path, progress_bar=args.progress_bar) - api = TTS() + with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout): + # Late-import to make things load faster + from TTS.api import TTS + from TTS.utils.manage import ModelManager + from TTS.utils.synthesizer import Synthesizer - tts_path = None - tts_config_path = None - speakers_file_path = None - language_ids_file_path = None - vocoder_path = None - vocoder_config_path = None - encoder_path = None - encoder_config_path = None - vc_path = None - vc_config_path = None - model_dir = None + # load model manager + path = Path(__file__).parent / "../.models.json" + manager = ModelManager(path, progress_bar=args.progress_bar) + api = TTS() - # CASE1 #list : list pre-trained TTS models - if args.list_models: - manager.add_cs_api_models(api.list_models()) - manager.list_models() - sys.exit() + tts_path = None + tts_config_path = None + speakers_file_path = None + language_ids_file_path = None + vocoder_path = None + vocoder_config_path = None + encoder_path = None + encoder_config_path = None + 
vc_path = None + vc_config_path = None + model_dir = None - # CASE2 #info : model info for pre-trained TTS models - if args.model_info_by_idx: - model_query = args.model_info_by_idx - manager.model_info_by_idx(model_query) - sys.exit() + # CASE1 #list : list pre-trained TTS models + if args.list_models: + manager.add_cs_api_models(api.list_models()) + manager.list_models() + sys.exit() - if args.model_info_by_name: - model_query_full_name = args.model_info_by_name - manager.model_info_by_full_name(model_query_full_name) - sys.exit() + # CASE2 #info : model info for pre-trained TTS models + if args.model_info_by_idx: + model_query = args.model_info_by_idx + manager.model_info_by_idx(model_query) + sys.exit() - # CASE3: TTS with coqui studio models - if "coqui_studio" in args.model_name: - print(" > Using 🐸Coqui Studio model: ", args.model_name) - api = TTS(model_name=args.model_name, cs_api_model=args.cs_model) - api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path, language=args.language) - print(" > Saving output to ", args.out_path) - return + if args.model_info_by_name: + model_query_full_name = args.model_info_by_name + manager.model_info_by_full_name(model_query_full_name) + sys.exit() - # CASE4: load pre-trained model paths - if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - # tts model - if model_item["model_type"] == "tts_models": - tts_path = model_path - tts_config_path = config_path - if "default_vocoder" in model_item: - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name + # CASE3: TTS with coqui studio models + if "coqui_studio" in args.model_name: + print(" > Using 🐸Coqui Studio model: ", args.model_name) + api = TTS(model_name=args.model_name, cs_api_model=args.cs_model) + api.tts_to_file( + text=args.text, + emotion=args.emotion, + file_path=args.out_path, + language=args.language, + 
speed=args.speed, + pipe_out=pipe_out, + ) + print(" > Saving output to ", args.out_path) + return - # voice conversion model - if model_item["model_type"] == "voice_conversion_models": - vc_path = model_path - vc_config_path = config_path + # CASE4: load pre-trained model paths + if args.model_name is not None and not args.model_path: + model_path, config_path, model_item = manager.download_model(args.model_name) + # tts model + if model_item["model_type"] == "tts_models": + tts_path = model_path + tts_config_path = config_path + if "default_vocoder" in model_item: + args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - # tts model with multiple files to be loaded from the directory path - if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): - model_dir = model_path - tts_path = None - tts_config_path = None - args.vocoder_name = None + # voice conversion model + if model_item["model_type"] == "voice_conversion_models": + vc_path = model_path + vc_config_path = config_path - # load vocoder - if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) + # tts model with multiple files to be loaded from the directory path + if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): + model_dir = model_path + tts_path = None + tts_config_path = None + args.vocoder_name = None - # CASE5: set custom model paths - if args.model_path is not None: - tts_path = args.model_path - tts_config_path = args.config_path - speakers_file_path = args.speakers_file_path - language_ids_file_path = args.language_ids_file_path + # load vocoder + if args.vocoder_name is not None and not args.vocoder_path: + vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = 
args.vocoder_config_path + # CASE5: set custom model paths + if args.model_path is not None: + tts_path = args.model_path + tts_config_path = args.config_path + speakers_file_path = args.speakers_file_path + language_ids_file_path = args.language_ids_file_path - if args.encoder_path is not None: - encoder_path = args.encoder_path - encoder_config_path = args.encoder_config_path + if args.vocoder_path is not None: + vocoder_path = args.vocoder_path + vocoder_config_path = args.vocoder_config_path - device = args.device - if args.use_cuda: - device = "cuda" + if args.encoder_path is not None: + encoder_path = args.encoder_path + encoder_config_path = args.encoder_config_path - # load models - synthesizer = Synthesizer( - tts_path, - tts_config_path, - speakers_file_path, - language_ids_file_path, - vocoder_path, - vocoder_config_path, - encoder_path, - encoder_config_path, - vc_path, - vc_config_path, - model_dir, - args.voice_dir, - ).to(device) + device = args.device + if args.use_cuda: + device = "cuda" - # query speaker ids of a multi-speaker model. - if args.list_speaker_idxs: - print( - " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." - ) - print(synthesizer.tts_model.speaker_manager.name_to_id) - return + # load models + synthesizer = Synthesizer( + tts_path, + tts_config_path, + speakers_file_path, + language_ids_file_path, + vocoder_path, + vocoder_config_path, + encoder_path, + encoder_config_path, + vc_path, + vc_config_path, + model_dir, + args.voice_dir, + ).to(device) - # query langauge ids of a multi-lingual model. - if args.list_language_idxs: - print( - " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." - ) - print(synthesizer.tts_model.language_manager.name_to_id) - return + # query speaker ids of a multi-speaker model. 
+ if args.list_speaker_idxs: + print( + " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." + ) + print(synthesizer.tts_model.speaker_manager.name_to_id) + return - # check the arguments against a multi-speaker model. - if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): - print( - " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to " - "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." - ) - return + # query langauge ids of a multi-lingual model. + if args.list_language_idxs: + print( + " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." + ) + print(synthesizer.tts_model.language_manager.name_to_id) + return - # RUN THE SYNTHESIS - if args.text: - print(" > Text: {}".format(args.text)) + # check the arguments against a multi-speaker model. + if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): + print( + " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to " + "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." 
+ ) + return - # kick it - if tts_path is not None: - wav = synthesizer.tts( - args.text, - speaker_name=args.speaker_idx, - language_name=args.language_idx, - speaker_wav=args.speaker_wav, - reference_wav=args.reference_wav, - style_wav=args.capacitron_style_wav, - style_text=args.capacitron_style_text, - reference_speaker_name=args.reference_speaker_idx, - ) - elif vc_path is not None: - wav = synthesizer.voice_conversion( - source_wav=args.source_wav, - target_wav=args.target_wav, - ) - elif model_dir is not None: - wav = synthesizer.tts( - args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav - ) + # RUN THE SYNTHESIS + if args.text: + print(" > Text: {}".format(args.text)) - # save the results - print(" > Saving output to {}".format(args.out_path)) - synthesizer.save_wav(wav, args.out_path) + # kick it + if tts_path is not None: + wav = synthesizer.tts( + args.text, + speaker_name=args.speaker_idx, + language_name=args.language_idx, + speaker_wav=args.speaker_wav, + reference_wav=args.reference_wav, + style_wav=args.capacitron_style_wav, + style_text=args.capacitron_style_text, + reference_speaker_name=args.reference_speaker_idx, + ) + elif vc_path is not None: + wav = synthesizer.voice_conversion( + source_wav=args.source_wav, + target_wav=args.target_wav, + ) + elif model_dir is not None: + wav = synthesizer.tts( + args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav + ) + + # save the results + print(" > Saving output to {}".format(args.out_path)) + synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out) if __name__ == "__main__": diff --git a/TTS/cs_api.py b/TTS/cs_api.py index a36452ab..4a44b535 100644 --- a/TTS/cs_api.py +++ b/TTS/cs_api.py @@ -9,6 +9,8 @@ import numpy as np import requests from scipy.io import wavfile +from TTS.utils.audio.numpy_transforms import save_wav + class Speaker(object): """Convert dict to object.""" @@ -288,6 +290,7 @@ 
class CS_API: speaker_id=None, emotion=None, speed=1.0, + pipe_out=None, language=None, file_path: str = None, ) -> str: @@ -300,6 +303,7 @@ class CS_API: speaker_id (str): Speaker ID. If None, the speaker name is used. emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". speed (float): Speed of the speech. 1.0 is normal speed. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. language (str): Language of the text. If None, the default language of the speaker is used. Language is only supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". file_path (str): Path to save the file. If None, a temporary file is created. @@ -307,7 +311,7 @@ class CS_API: if file_path is None: file_path = tempfile.mktemp(".wav") wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language) - wavfile.write(file_path, sr, wav) + save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out) return file_path diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index ae44472f..e2b71fb2 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,3 +1,4 @@ +from io import BytesIO from typing import Tuple import librosa @@ -427,16 +428,24 @@ def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False, return x -def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, **kwargs) -> None: +def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out = None, **kwargs) -> None: """Save float waveform to a file using Scipy. Args: wav (np.ndarray): Waveform with float values in range [-1, 1] to save. path (str): Path to a output file. sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. 
""" wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) - scipy.io.wavfile.write(path, sample_rate, wav_norm.astype(np.int16)) + + wav_norm = wav_norm.astype(np.int16) + if pipe_out: + wav_buffer = BytesIO() + scipy.io.wavfile.write(wav_buffer, sample_rate, wav_norm) + wav_buffer.seek(0) + pipe_out.buffer.write(wav_buffer.read()) + scipy.io.wavfile.write(path, sample_rate, wav_norm) def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray: diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index b0920dc9..248e15b8 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,3 +1,4 @@ +from io import BytesIO from typing import Dict, Tuple import librosa @@ -693,20 +694,27 @@ class AudioProcessor(object): x = self.rms_volume_norm(x, self.db_level) return x - def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: + def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out = None) -> None: """Save a waveform to a file using Scipy. Args: wav (np.ndarray): Waveform to save. path (str): Path to a output file. sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. """ if self.do_rms_norm: wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767 else: wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) - scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16)) + wav_norm = wav_norm.astype(np.int16) + if pipe_out: + wav_buffer = BytesIO() + scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm) + wav_buffer.seek(0) + pipe_out.buffer.write(wav_buffer.read()) + scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm) def get_duration(self, filename: str) -> float: """Get the duration of a wav file using Librosa. 
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2e2e40e2..a7370cd2 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -235,19 +235,20 @@ class Synthesizer(nn.Module): """ return self.seg.segment(text) - def save_wav(self, wav: List[int], path: str) -> None: + def save_wav(self, wav: List[int], path: str, pipe_out = None) -> None: """Save the waveform as a file. Args: wav (List[int]): waveform as a list of values. path (str): output path to save the waveform. + pipe_out (TextIO, optional): Text stream exposing a binary `.buffer` (e.g. `sys.stdout`); when given, the WAV bytes are also written to it for shell piping. """ # if tensor convert to numpy if torch.is_tensor(wav): wav = wav.cpu().numpy() if isinstance(wav, list): wav = np.array(wav) - save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate) + save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out) def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]: output_wav = self.vc_model.voice_conversion(source_wav, target_wav) diff --git a/tests/api_tests/test_synthesize_api.py b/tests/api_tests/test_synthesize_api.py index a96c8bea..084f81d4 100644 --- a/tests/api_tests/test_synthesize_api.py +++ b/tests/api_tests/test_synthesize_api.py @@ -13,3 +13,16 @@ def test_synthesize(): '--text "This is it" ' f'--out_path "{output_path}"' ) + + # 🐸 Coqui studio model with speed arg. + run_cli( + 'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" ' + '--text "This is it but slow" --speed 0.1 ' + f'--out_path "{output_path}"' + ) + + # test pipe_out command + run_cli( + 'tts --text "test." --pipe_out ' + f'--out_path "{output_path}" | aplay' + )