From 77722cb0dd0c43becfb245051d1bb0629ada8f48 Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 30 May 2024 11:12:10 +0200
Subject: [PATCH] fix(bin.synthesize): correctly handle boolean arguments

Previously, e.g. `--use_cuda false` would actually set use_cuda=True,
because argparse's `type=bool` just calls bool() on the raw argument
string and any non-empty string (including "false") is truthy in
Python: https://github.com/coqui-ai/TTS/discussions/3762

Boolean flags are therefore now declared either with
action="store_true" (for plain on/off switches that default to False)
or with argparse.BooleanOptionalAction (which also generates a
--no-<flag> switch, so that flags defaulting to True can be disabled),
and the obsolete str2bool helper is removed.
---
 TTS/bin/compute_attention_masks.py            |  4 +-
 TTS/bin/compute_embeddings.py                 |  2 +-
 TTS/bin/eval_encoder.py                       |  4 +-
 TTS/bin/extract_tts_spectrograms.py           |  2 +-
 TTS/bin/remove_silence_using_vad.py           | 10 ++--
 TTS/bin/synthesize.py                         | 57 +++++++------------
 TTS/encoder/README.md                         |  2 +-
 TTS/server/README.md                          |  2 +-
 docs/source/docker_images.md                  |  4 +-
 docs/source/models/bark.md                    |  6 +-
 docs/source/models/tortoise.md                |  6 +-
 docs/source/models/xtts.md                    |  8 +--
 .../ljspeech/fast_pitch/train_fast_pitch.py   |  2 +-
 .../ljspeech/fast_speech/train_fast_speech.py |  2 +-
 .../ljspeech/fastspeech2/train_fastspeech2.py |  2 +-
 tests/zoo_tests/test_models.py                | 24 ++++----
 16 files changed, 57 insertions(+), 80 deletions(-)

diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py
index 207b17e9..be275baa 100644
--- a/TTS/bin/compute_attention_masks.py
+++ b/TTS/bin/compute_attention_masks.py
@@ -35,7 +35,7 @@ Example run:
         --data_path /root/LJSpeech-1.1/
         --batch_size 32
         --dataset ljspeech
-        --use_cuda True
+        --use_cuda
 """,
         formatter_class=RawTextHelpFormatter,
     )
@@ -62,7 +62,7 @@ Example run:
         help="Dataset metafile inclusing file paths with transcripts.",
     )
     parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
-    parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
+    parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.")
 
     parser.add_argument(
         "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index 6795241a..1bdb8d73 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -150,7 +150,7 @@ if __name__ == "__main__":
         default=False,
         action="store_true",
     )
-    parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
+    parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False)
     parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
     parser.add_argument(
         "--formatter_name",
diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py
index 8327851c..711c8221 100644
--- a/TTS/bin/eval_encoder.py
+++ b/TTS/bin/eval_encoder.py
@@ -75,8 +75,8 @@ if __name__ == "__main__":
         type=str,
         help="Path to dataset config file.",
     )
-    parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
-    parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+    parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True)
+    parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
 
     args = parser.parse_args()
 
diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index 83f2ca21..86a4dce1 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -282,7 +282,7 @@ if __name__ == "__main__":
     parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
     parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
     parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
-    parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+    parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
     args = parser.parse_args()
 
     c = load_config(args.config_path)
diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py
index f6d09d6b..edab882d 100755
--- a/TTS/bin/remove_silence_using_vad.py
+++ b/TTS/bin/remove_silence_using_vad.py
@@ -80,7 +80,7 @@ if __name__ == "__main__":
     setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
 
     parser = argparse.ArgumentParser(
-        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
+        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end"
     )
     parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
     parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
@@ -95,20 +95,20 @@ if __name__ == "__main__":
     parser.add_argument(
         "-t",
         "--trim_just_beginning_and_end",
-        type=bool,
+        action=argparse.BooleanOptionalAction,
         default=True,
-        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
+        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed.",
     )
     parser.add_argument(
         "-c",
         "--use_cuda",
-        type=bool,
+        action=argparse.BooleanOptionalAction,
         default=False,
         help="If True use cuda",
     )
     parser.add_argument(
         "--use_onnx",
-        type=bool,
+        action=argparse.BooleanOptionalAction,
         default=False,
         help="If True use onnx",
     )
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index 0464cb29..bc01ffd5 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
+
+"""Command line interface."""
 
 import argparse
 import contextlib
@@ -136,19 +137,8 @@ $ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"
 """
 
 
-def str2bool(v):
-    if isinstance(v, bool):
-        return v
-    if v.lower() in ("yes", "true", "t", "y", "1"):
-        return True
-    if v.lower() in ("no", "false", "f", "n", "0"):
-        return False
-    raise argparse.ArgumentTypeError("Boolean value expected.")
-
-
-def main():
-    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
-
+def parse_args() -> argparse.Namespace:
+    """Parse arguments."""
     parser = argparse.ArgumentParser(
         description=description.replace("    ```\n", ""),
         formatter_class=RawTextHelpFormatter,
@@ -156,10 +146,7 @@ def main():
 
     parser.add_argument(
         "--list_models",
-        type=str2bool,
-        nargs="?",
-        const=True,
-        default=False,
+        action="store_true",
         help="list available pre-trained TTS and vocoder models.",
     )
 
@@ -207,7 +194,7 @@ def main():
         default="tts_output.wav",
         help="Output wav file path.",
     )
-    parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
+    parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.")
     parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
     parser.add_argument(
         "--vocoder_path",
@@ -226,10 +213,7 @@ def main():
     parser.add_argument(
         "--pipe_out",
         help="stdout the generated TTS wav file for shell pipe.",
-        type=str2bool,
-        nargs="?",
-        const=True,
-        default=False,
+        action="store_true",
     )
 
     # args for multi-speaker synthesis
@@ -261,25 +245,18 @@ def main():
     parser.add_argument(
         "--list_speaker_idxs",
         help="List available speaker ids for the defined multi-speaker model.",
-        type=str2bool,
-        nargs="?",
-        const=True,
-        default=False,
+        action="store_true",
     )
     parser.add_argument(
         "--list_language_idxs",
         help="List available language ids for the defined multi-lingual model.",
-        type=str2bool,
-        nargs="?",
-        const=True,
-        default=False,
+        action="store_true",
     )
     # aux args
     parser.add_argument(
         "--save_spectogram",
-        type=bool,
-        help="If true save raw spectogram for further (vocoder) processing in out_path.",
-        default=False,
+        action="store_true",
+        help="Save raw spectogram for further (vocoder) processing in out_path.",
     )
     parser.add_argument(
         "--reference_wav",
@@ -295,8 +272,8 @@ def main():
     )
     parser.add_argument(
         "--progress_bar",
-        type=str2bool,
-        help="If true shows a progress bar for the model download. Defaults to True",
+        action=argparse.BooleanOptionalAction,
+        help="Show a progress bar for the model download.",
         default=True,
     )
 
@@ -337,19 +314,23 @@ def main():
     ]
     if not any(check_args):
         parser.parse_args(["-h"])
+    return args
+
+
+def main():
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    args = parse_args()
 
     pipe_out = sys.stdout if args.pipe_out else None
 
     with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
         # Late-import to make things load faster
-        from TTS.api import TTS
         from TTS.utils.manage import ModelManager
         from TTS.utils.synthesizer import Synthesizer
 
         # load model manager
         path = Path(__file__).parent / "../.models.json"
         manager = ModelManager(path, progress_bar=args.progress_bar)
-        api = TTS()
 
         tts_path = None
         tts_config_path = None
diff --git a/TTS/encoder/README.md b/TTS/encoder/README.md
index b38b2005..9f829c9e 100644
--- a/TTS/encoder/README.md
+++ b/TTS/encoder/README.md
@@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS.
 
 - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
 - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
-- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
+- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
 - Watch training on Tensorboard as in TTS
diff --git a/TTS/server/README.md b/TTS/server/README.md
index 3b27575a..ae8e38a4 100644
--- a/TTS/server/README.md
+++ b/TTS/server/README.md
@@ -15,7 +15,7 @@ Run the server with the official models.
 ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
 
 Run the server with the official models on a GPU.
-```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
+```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda```
 
 Run the server with a custom models.
 ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
diff --git a/docs/source/docker_images.md b/docs/source/docker_images.md
index 8df51855..58d96120 100644
--- a/docs/source/docker_images.md
+++ b/docs/source/docker_images.md
@@ -32,7 +32,7 @@ For the GPU version, you need to have the latest NVIDIA drivers installed.
 With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8
 
 ```bash
-docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda true
+docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda
 ```
 
 ## Start a server
@@ -50,7 +50,7 @@ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits
 ```bash
 docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts
 python3 TTS/server/server.py --list_models #To get the list of available models
-python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda true
+python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda
 ```
 
 Click [there](http://[::1]:5002/) and have fun with the server!
diff --git a/docs/source/models/bark.md b/docs/source/models/bark.md
index c328ae61..a180afbb 100644
--- a/docs/source/models/bark.md
+++ b/docs/source/models/bark.md
@@ -69,14 +69,12 @@ tts --model_name tts_models/multilingual/multi-dataset/bark \
 --text "This is an example." \
 --out_path "output.wav" \
 --voice_dir bark_voices/ \
---speaker_idx "ljspeech" \
---progress_bar True
+--speaker_idx "ljspeech"
 
 # Random voice generation
 tts --model_name tts_models/multilingual/multi-dataset/bark \
 --text "This is an example." \
---out_path "output.wav" \
---progress_bar True
+--out_path "output.wav"
 ```
diff --git a/docs/source/models/tortoise.md b/docs/source/models/tortoise.md
index 1a8e9ca8..30afd135 100644
--- a/docs/source/models/tortoise.md
+++ b/docs/source/models/tortoise.md
@@ -57,14 +57,12 @@ tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
 --text "This is an example." \
 --out_path "output.wav" \
 --voice_dir path/to/tortoise/voices/dir/ \
---speaker_idx "lj" \
---progress_bar True
+--speaker_idx "lj"
 
 # Random voice generation
 tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
 --text "This is an example." \
---out_path "output.wav" \
---progress_bar True
+--out_path "output.wav"
 ```
diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index cc7c36b7..c07d879f 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -72,7 +72,7 @@ You can do inference using one of the available speakers using the following command:
      --text "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." \
      --speaker_idx "Ana Florence" \
      --language_idx en \
-     --use_cuda true
+     --use_cuda
 ```
 
 ##### Clone a voice
@@ -85,7 +85,7 @@ You can clone a speaker voice using a single or multiple references:
      --text "Bugün okula gitmek istemiyorum." \
      --speaker_wav /path/to/target/speaker.wav \
      --language_idx tr \
-     --use_cuda true
+     --use_cuda
 ```
 
 ###### Multiple references
@@ -94,7 +94,7 @@ You can clone a speaker voice using a single or multiple references:
      --text "Bugün okula gitmek istemiyorum." \
      --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
      --language_idx tr \
-     --use_cuda true
+     --use_cuda
 ```
 
 or for all wav files in a directory you can use:
@@ -103,7 +103,7 @@ or for all wav files in a directory you can use:
      --text "Bugün okula gitmek istemiyorum." \
      --speaker_wav /path/to/target/*.wav \
      --language_idx tr \
-     --use_cuda true
+     --use_cuda
 ```
 
 #### 🐸TTS API
diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py
index 055526b1..64fd737b 100644
--- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py
+++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py
@@ -65,7 +65,7 @@ if not config.model_args.use_aligner:
     model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
     # TODO: make compute_attention python callable
     os.system(
-        f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
+        f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda"
     )
 
 # INITIALIZE THE AUDIO PROCESSOR
diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py
index 8c9a272e..9839fcb3 100644
--- a/recipes/ljspeech/fast_speech/train_fast_speech.py
+++ b/recipes/ljspeech/fast_speech/train_fast_speech.py
@@ -64,7 +64,7 @@ if not config.model_args.use_aligner:
     model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
     # TODO: make compute_attention python callable
     os.system(
-        f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
+        f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda"
    )
 
 # INITIALIZE THE AUDIO PROCESSOR
diff --git a/recipes/ljspeech/fastspeech2/train_fastspeech2.py b/recipes/ljspeech/fastspeech2/train_fastspeech2.py
index 93737dba..0a7a1756 100644
--- a/recipes/ljspeech/fastspeech2/train_fastspeech2.py
+++ b/recipes/ljspeech/fastspeech2/train_fastspeech2.py
@@ -67,7 +67,7 @@ if not config.model_args.use_aligner:
     model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
     # TODO: make compute_attention python callable
     os.system(
-        f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
+        f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda"
     )
 
 # INITIALIZE THE AUDIO PROCESSOR
diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py
index 8fa56e28..1c28e860 100644
--- a/tests/zoo_tests/test_models.py
+++ b/tests/zoo_tests/test_models.py
@@ -50,13 +50,13 @@ def run_models(offset=0, step=1):
                 speaker_id = list(speaker_manager.name_to_id.keys())[0]
                 run_cli(
                     f"tts --model_name {model_name} "
-                    f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" --progress_bar False'
+                    f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" --no-progress_bar'
                 )
             else:
                 # single-speaker model
                 run_cli(
                     f"tts --model_name {model_name} "
-                    f'--text "This is an example." --out_path "{output_path}" --progress_bar False'
+                    f'--text "This is an example." --out_path "{output_path}" --no-progress_bar'
                 )
             # remove downloaded models
             shutil.rmtree(local_download_dir)
@@ -66,7 +66,7 @@ def run_models(offset=0, step=1):
             reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
             run_cli(
                 f"tts --model_name {model_name} "
-                f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --progress_bar False'
+                f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --no-progress_bar'
             )
         else:
             # only download the model
@@ -83,14 +83,14 @@ def test_xtts():
         run_cli(
             "yes | "
             f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
-            f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
+            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda '
             f'--speaker_wav "{speaker_wav}" --language_idx "en"'
         )
     else:
         run_cli(
             "yes | "
             f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
-            f'--text "This is an example." --out_path "{output_path}" --progress_bar False '
+            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar '
             f'--speaker_wav "{speaker_wav}" --language_idx "en"'
         )
 
@@ -138,14 +138,14 @@ def test_xtts_v2():
         run_cli(
             "yes | "
             f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
-            f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
+            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda '
             f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
         )
     else:
         run_cli(
             "yes | "
             f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
-            f'--text "This is an example." --out_path "{output_path}" --progress_bar False '
+            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar '
            f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
         )
 
@@ -215,12 +215,12 @@ def test_tortoise():
     if use_gpu:
         run_cli(
             f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 "
-            f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True'
+            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda'
         )
     else:
         run_cli(
             f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 "
-            f'--text "This is an example." --out_path "{output_path}" --progress_bar False'
+            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar'
         )
 
@@ -231,12 +231,12 @@ def test_bark():
     if use_gpu:
         run_cli(
             f" tts --model_name tts_models/multilingual/multi-dataset/bark "
-            f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True'
+            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda'
         )
     else:
         run_cli(
             f" tts --model_name tts_models/multilingual/multi-dataset/bark "
-            f'--text "This is an example." --out_path "{output_path}" --progress_bar False'
+            f'--text "This is an example." --out_path "{output_path}" --no-progress_bar'
         )
 
@@ -249,7 +249,7 @@ def test_voice_conversion():
     output_path = os.path.join(get_tests_output_path(), "output.wav")
     run_cli(
         f"tts --model_name {model_name}"
-        f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --progress_bar False"
+        f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --no-progress_bar"
     )