From fdf0c8b10a00404b51bdd62bf62231c0dbf4e50f Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 16 Nov 2023 23:40:21 +0100
Subject: [PATCH 01/17] chore(encoder): remove unused code

---
 TTS/encoder/utils/generic_utils.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py
index 1da02961..bbce6a8a 100644
--- a/TTS/encoder/utils/generic_utils.py
+++ b/TTS/encoder/utils/generic_utils.py
@@ -2,7 +2,6 @@ import datetime
 import glob
 import os
 import random
-import re
 
 import numpy as np
 from scipy import signal
@@ -118,11 +117,6 @@ class AugmentWAV(object):
         return self.additive_noise(noise_type, audio)
 
 
-def to_camel(text):
-    text = text.capitalize()
-    return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
-
-
 def setup_encoder_model(config: "Coqpit"):
     if config.model_params["model_name"].lower() == "lstm":
         model = LSTMSpeakerEncoder(

From 39fe38bda4d6937336255d32e542d4f84dd0fe15 Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 16 Nov 2023 23:46:26 +0100
Subject: [PATCH 02/17] refactor: use save_fsspec() from Trainer

---
 TTS/encoder/utils/generic_utils.py | 2 +-
 TTS/encoder/utils/io.py | 2 +-
 TTS/utils/io.py | 13 +------------
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py
index bbce6a8a..2b003ac8 100644
--- a/TTS/encoder/utils/generic_utils.py
+++ b/TTS/encoder/utils/generic_utils.py
@@ -5,10 +5,10 @@ import random
 
 import numpy as np
 from scipy import signal
+from trainer.io import save_fsspec
 
 from TTS.encoder.models.lstm import LSTMSpeakerEncoder
 from TTS.encoder.models.resnet import ResNetSpeakerEncoder
-from TTS.utils.io import save_fsspec
 
 
 class AugmentWAV(object):

diff --git a/TTS/encoder/utils/io.py b/TTS/encoder/utils/io.py
index d1dad3e2..a8359be1 100644
--- a/TTS/encoder/utils/io.py
+++ b/TTS/encoder/utils/io.py
@@ -1,7 +1,7 @@
 import datetime
 import os
 
-from TTS.utils.io import save_fsspec
+from trainer.io import save_fsspec
 
 
 def save_checkpoint(model, optimizer, model_loss, out_path, current_step):

diff --git a/TTS/utils/io.py b/TTS/utils/io.py
index e9bdf3e6..9ab1075c 100644
--- a/TTS/utils/io.py
+++ b/TTS/utils/io.py
@@ -8,6 +8,7 @@ from typing import Any, Callable, Dict, Union
 import fsspec
 import torch
 from coqpit import Coqpit
+from trainer.io import save_fsspec
 
 from TTS.utils.generic_utils import get_user_data_dir
 
@@ -102,18 +103,6 @@ def load_checkpoint(
     return model, state
 
 
-def save_fsspec(state: Any, path: str, **kwargs):
-    """Like torch.save but can save to other locations (e.g. s3:// , gs://).
-
-    Args:
-        state: State object to save
-        path: Any path or url supported by fsspec.
-        **kwargs: Keyword arguments forwarded to torch.save.
-    """
-    with fsspec.open(path, "wb") as f:
-        torch.save(state, f, **kwargs)
-
-
 def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs):
     if hasattr(model, "module"):
         model_state = model.module.state_dict()

From 5119e651a1dbccdc4e5fdb47dc386d33f378e621 Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 16 Nov 2023 23:52:28 +0100
Subject: [PATCH 03/17] chore(utils.io): remove unused code

These are all available in Trainer.
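For reference, a minimal sketch of the Trainer-side replacement (this assumes
the coqui-ai Trainer package is installed; the stand-in config, model and
temporary output folder are illustrative only):

```python
import tempfile

import torch
from coqpit import Coqpit
from trainer.io import save_checkpoint

# Stand-in objects; a real training run passes its own config, model and optimizer.
config = Coqpit()
model = torch.nn.Linear(4, 4)
optimizer = torch.optim.Adam(model.parameters())

out_path = tempfile.mkdtemp()
# Same call shape as the helper removed from TTS/utils/io.py in this series:
# save_checkpoint(config, model, optimizer, scaler, current_step, epoch, output_folder, **kwargs)
save_checkpoint(config, model, optimizer, None, 0, 0, out_path)
```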
--- TTS/utils/io.py | 104 ------------------------------------------------ 1 file changed, 104 deletions(-) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 9ab1075c..7aaedbe2 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -1,4 +1,3 @@ -import datetime import json import os import pickle as pickle_tts @@ -8,7 +7,6 @@ from typing import Any, Callable, Dict, Union import fsspec import torch from coqpit import Coqpit -from trainer.io import save_fsspec from TTS.utils.generic_utils import get_user_data_dir @@ -101,105 +99,3 @@ def load_checkpoint( if eval: model.eval() return model, state - - -def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs): - if hasattr(model, "module"): - model_state = model.module.state_dict() - else: - model_state = model.state_dict() - if isinstance(optimizer, list): - optimizer_state = [optim.state_dict() for optim in optimizer] - elif optimizer.__class__.__name__ == "CapacitronOptimizer": - optimizer_state = [optimizer.primary_optimizer.state_dict(), optimizer.secondary_optimizer.state_dict()] - else: - optimizer_state = optimizer.state_dict() if optimizer is not None else None - - if isinstance(scaler, list): - scaler_state = [s.state_dict() for s in scaler] - else: - scaler_state = scaler.state_dict() if scaler is not None else None - - if isinstance(config, Coqpit): - config = config.to_dict() - - state = { - "config": config, - "model": model_state, - "optimizer": optimizer_state, - "scaler": scaler_state, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - state.update(kwargs) - save_fsspec(state, output_path) - - -def save_checkpoint( - config, - model, - optimizer, - scaler, - current_step, - epoch, - output_folder, - **kwargs, -): - file_name = "checkpoint_{}.pth".format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print("\n > CHECKPOINT : {}".format(checkpoint_path)) - save_model( - config, - model, - optimizer, - scaler, - current_step, - epoch, - checkpoint_path, - **kwargs, - ) - - -def save_best_model( - current_loss, - best_loss, - config, - model, - optimizer, - scaler, - current_step, - epoch, - out_path, - keep_all_best=False, - keep_after=10000, - **kwargs, -): - if current_loss < best_loss: - best_model_name = f"best_model_{current_step}.pth" - checkpoint_path = os.path.join(out_path, best_model_name) - print(" > BEST MODEL : {}".format(checkpoint_path)) - save_model( - config, - model, - optimizer, - scaler, - current_step, - epoch, - checkpoint_path, - model_loss=current_loss, - **kwargs, - ) - fs = fsspec.get_mapper(out_path).fs - # only delete previous if current is saved successfully - if not keep_all_best or (current_step < keep_after): - model_names = fs.glob(os.path.join(out_path, "best_model*.pth")) - for model_name in model_names: - if os.path.basename(model_name) != best_model_name: - fs.rm(model_name) - # create a shortcut which always points to the currently best model - shortcut_name = "best_model.pth" - shortcut_path = os.path.join(out_path, shortcut_name) - fs.copy(checkpoint_path, shortcut_path) - best_loss = current_loss - return best_loss From 96678c7ba227871d0929f2366d083219ccfa9262 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 17 Nov 2023 00:12:09 +0100 Subject: [PATCH 04/17] refactor: use copy_model_files() from Trainer --- TTS/bin/train_encoder.py | 4 ++-- TTS/encoder/utils/training.py | 2 +- TTS/utils/io.py | 31 ------------------------------- 3 files changed, 3 insertions(+), 34 
deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index f2e7779c..c4fb920f 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -8,6 +8,7 @@ import traceback import torch from torch.utils.data import DataLoader +from trainer.io import copy_model_files from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer @@ -18,7 +19,6 @@ from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder -from TTS.utils.io import copy_model_files from TTS.utils.samplers import PerfectBatchSampler from TTS.utils.training import check_update @@ -276,7 +276,7 @@ def main(args): # pylint: disable=redefined-outer-name if c.loss == "softmaxproto" and c.model != "speaker_encoder": c.map_classid_to_classname = map_classid_to_classname - copy_model_files(c, OUT_PATH) + copy_model_files(c, OUT_PATH, new_fields={}) if args.restore_path: criterion, args.restore_step = model.load_checkpoint( diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py index 7c58a232..ff8f271d 100644 --- a/TTS/encoder/utils/training.py +++ b/TTS/encoder/utils/training.py @@ -3,13 +3,13 @@ from dataclasses import dataclass, field from coqpit import Coqpit from trainer import TrainerArgs, get_last_checkpoint +from trainer.io import copy_model_files from trainer.logging import logger_factory from trainer.logging.console_logger import ConsoleLogger from TTS.config import load_config, register_config from TTS.tts.utils.text.characters import parse_symbols from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch -from TTS.utils.io import copy_model_files @dataclass diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 7aaedbe2..3107ba66 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -1,12 +1,9 @@ -import json import os import pickle as pickle_tts -import shutil from typing import Any, Callable, Dict, Union import fsspec import torch -from coqpit import Coqpit from TTS.utils.generic_utils import get_user_data_dir @@ -27,34 +24,6 @@ class AttrDict(dict): self.__dict__ = self -def copy_model_files(config: Coqpit, out_path, new_fields=None): - """Copy config.json and other model files to training folder and add - new fields. - - Args: - config (Coqpit): Coqpit config defining the training run. - out_path (str): output path to copy the file. - new_fields (dict): new fileds to be added or edited - in the config file. - """ - copy_config_path = os.path.join(out_path, "config.json") - # add extra information fields - if new_fields: - config.update(new_fields, allow_new=True) - # TODO: Revert to config.save_json() once Coqpit supports arbitrary paths. 
- with fsspec.open(copy_config_path, "w", encoding="utf8") as f: - json.dump(config.to_dict(), f, indent=4) - - # copy model stats file if available - if config.audio.stats_path is not None: - copy_stats_path = os.path.join(out_path, "scale_stats.npy") - filesystem = fsspec.get_mapper(copy_stats_path).fs - if not filesystem.exists(copy_stats_path): - with fsspec.open(config.audio.stats_path, "rb") as source_file: - with fsspec.open(copy_stats_path, "wb") as target_file: - shutil.copyfileobj(source_file, target_file) - - def load_fsspec( path: str, map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, From 0fb0d67de7bd05ef4afd80f05e242217e9800c80 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 17 Nov 2023 00:39:11 +0100 Subject: [PATCH 05/17] refactor: use save_checkpoint()/save_best_model() from Trainer --- TTS/bin/train_encoder.py | 21 +++++++++--- TTS/encoder/utils/generic_utils.py | 40 ----------------------- TTS/encoder/utils/io.py | 38 --------------------- tests/aux_tests/test_embedding_manager.py | 4 +-- tests/aux_tests/test_speaker_manager.py | 4 +-- tests/inference_tests/test_synthesizer.py | 3 +- 6 files changed, 23 insertions(+), 87 deletions(-) delete mode 100644 TTS/encoder/utils/io.py diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index c4fb920f..448fefc7 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -8,12 +8,12 @@ import traceback import torch from torch.utils.data import DataLoader -from trainer.io import copy_model_files +from trainer.io import copy_model_files, save_best_model, save_checkpoint from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer from TTS.encoder.dataset import EncoderDataset -from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model +from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.encoder.utils.training import init_training from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples @@ -222,7 +222,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, if global_step % c.save_step == 0: # save model - save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch) + save_checkpoint( + c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict() + ) end_time = time.time() @@ -245,7 +247,18 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, flush=True, ) # save the best checkpoint - best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch) + best_loss = save_best_model( + eval_loss, + best_loss, + c, + model, + optimizer, + None, + global_step, + epoch, + OUT_PATH, + criterion=criterion.state_dict(), + ) model.train() return best_loss, global_step diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 2b003ac8..236d6fe9 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -1,11 +1,9 @@ -import datetime import glob import os import random import numpy as np from scipy import signal -from trainer.io import save_fsspec from TTS.encoder.models.lstm import LSTMSpeakerEncoder from TTS.encoder.models.resnet import ResNetSpeakerEncoder @@ -136,41 +134,3 @@ def setup_encoder_model(config: "Coqpit"): audio_config=config.audio, ) return model - - -def save_checkpoint(model, optimizer, 
criterion, model_loss, out_path, current_step, epoch): - checkpoint_path = "checkpoint_{}.pth".format(current_step) - checkpoint_path = os.path.join(out_path, checkpoint_path) - print(" | | > Checkpoint saving : {}".format(checkpoint_path)) - - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict() if optimizer is not None else None, - "criterion": criterion.state_dict(), - "step": current_step, - "epoch": epoch, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - save_fsspec(state, checkpoint_path) - - -def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch): - if model_loss < best_loss: - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict(), - "criterion": criterion.state_dict(), - "step": current_step, - "epoch": epoch, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - best_loss = model_loss - bestmodel_path = "best_model.pth" - bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) - save_fsspec(state, bestmodel_path) - return best_loss diff --git a/TTS/encoder/utils/io.py b/TTS/encoder/utils/io.py deleted file mode 100644 index a8359be1..00000000 --- a/TTS/encoder/utils/io.py +++ /dev/null @@ -1,38 +0,0 @@ -import datetime -import os - -from trainer.io import save_fsspec - - -def save_checkpoint(model, optimizer, model_loss, out_path, current_step): - checkpoint_path = "checkpoint_{}.pth".format(current_step) - checkpoint_path = os.path.join(out_path, checkpoint_path) - print(" | | > Checkpoint saving : {}".format(checkpoint_path)) - - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict() if optimizer is not None else None, - "step": current_step, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - save_fsspec(state, checkpoint_path) - - -def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step): - if model_loss < best_loss: - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict(), - "step": current_step, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - best_loss = model_loss - bestmodel_path = "best_model.pth" - bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) - save_fsspec(state, bestmodel_path) - return best_loss diff --git a/tests/aux_tests/test_embedding_manager.py b/tests/aux_tests/test_embedding_manager.py index 73921501..e3acd62b 100644 --- a/tests/aux_tests/test_embedding_manager.py +++ b/tests/aux_tests/test_embedding_manager.py @@ -3,11 +3,11 @@ import unittest import numpy as np import torch +from trainer.io import save_checkpoint from tests import get_tests_input_path from TTS.config import load_config from TTS.encoder.utils.generic_utils import setup_encoder_model -from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.managers import EmbeddingManager from TTS.utils.audio import AudioProcessor @@ -31,7 +31,7 @@ class EmbeddingManagerTest(unittest.TestCase): # create a dummy speaker encoder model = setup_encoder_model(config) - save_checkpoint(model, None, None, get_tests_input_path(), 0) + save_checkpoint(config, model, None, None, 0, 0, get_tests_input_path()) # 
load audio processor and speaker encoder manager = EmbeddingManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path) diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index 397f9c81..402fbca4 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -3,11 +3,11 @@ import unittest import numpy as np import torch +from trainer.io import save_checkpoint from tests import get_tests_input_path from TTS.config import load_config from TTS.encoder.utils.generic_utils import setup_encoder_model -from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -30,7 +30,7 @@ class SpeakerManagerTest(unittest.TestCase): # create a dummy speaker encoder model = setup_encoder_model(config) - save_checkpoint(model, None, None, get_tests_input_path(), 0) + save_checkpoint(config, model, None, None, 0, 0, get_tests_input_path()) # load audio processor and speaker encoder ap = AudioProcessor(**config.audio) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index 40e83017..ce4fc751 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -1,10 +1,11 @@ import os import unittest +from trainer.io import save_checkpoint + from tests import get_tests_input_path from TTS.config import load_config from TTS.tts.models import setup_model -from TTS.utils.io import save_checkpoint from TTS.utils.synthesizer import Synthesizer From 64f391b583c1f2814a0e613df3a7b2074397fe2a Mon Sep 17 00:00:00 2001 From: Tessa Painter Date: Fri, 24 Nov 2023 05:23:59 -0600 Subject: [PATCH 06/17] Made the tqdm `progress_bar` objects of static download methods a static class variable (#3297) --- TTS/utils/manage.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 1cd437e6..d3eb8104 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -26,7 +26,9 @@ LICENSE_URLS = { } + class ModelManager(object): + tqdm_progress = None """Manage TTS models defined in .models.json. 
It provides an interface to list and download models defines in '.model.json' @@ -525,12 +527,12 @@ class ModelManager(object): total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 # 1 Kibibyte if progress_bar: - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) with open(temp_zip_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: - progress_bar.update(len(data)) + ModelManager.tqdm_progress.update(len(data)) file.write(data) with zipfile.ZipFile(temp_zip_name) as z: z.extractall(output_folder) @@ -560,12 +562,12 @@ class ModelManager(object): total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 # 1 Kibibyte if progress_bar: - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1]) with open(temp_tar_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: - progress_bar.update(len(data)) + ModelManager.tqdm_progress.update(len(data)) file.write(data) with tarfile.open(temp_tar_name) as t: t.extractall(output_folder) @@ -596,10 +598,10 @@ class ModelManager(object): block_size = 1024 # 1 Kibibyte with open(temp_zip_name, "wb") as file: if progress_bar: - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) for data in r.iter_content(block_size): if progress_bar: - progress_bar.update(len(data)) + ModelManager.tqdm_progress.update(len(data)) file.write(data) @staticmethod From 4a2684be341f26e273249e2b58a17b92dfc68d84 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 24 Nov 2023 12:24:42 +0100 Subject: [PATCH 07/17] fix(bin.synthesize): more informative error for wrong --language argument (#3294) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In multilingual models, the target language is specified via the `--language_idx` argument. However, the `tts` CLI also accepts a `--language` argument for use with Coqui Studio, so it is easy to choose the wrong one, resulting in the following confusing error at synthesis time: ``` AssertionError: ❗ Language None is not supported. Supported languages are ['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'hu', 'ko', 'ja'] ``` This commit adds a better error message when `--language` is passed for a non-studio model. Fixes #3270, fixes #3291 --- TTS/bin/synthesize.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index ddfe35d2..d9ec3063 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -419,6 +419,13 @@ def main(): print(" > Saving output to ", args.out_path) return + if args.language_idx is None and args.language is not None: + msg = ( + "--language is only supported for Coqui Studio models. " + "Use --language_idx to specify the target language for multilingual models." 
+ ) + raise ValueError(msg) + # CASE4: load pre-trained model paths if args.model_name is not None and not args.model_path: model_path, config_path, model_item = manager.download_model(args.model_name) From 2af02209960f8f2c93329689af0100d1cc591080 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 24 Nov 2023 12:25:37 +0100 Subject: [PATCH 08/17] fix: don't pass quotes to espeak (#3286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the text was wrapped in an additional set of quotes that was passed to Espeak. This could result in different phonemization in certain edges and caused the insertion of an initial separator "_" that had to be removed. Compare: $ espeak-ng -q -b 1 -v en-us --ipa=1 '"A"' _ˈɐ $ espeak-ng -q -b 1 -v en-us --ipa=1 'A' ˈeɪ Fixes #2619 --- TTS/tts/utils/text/phonemizers/espeak_wrapper.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index 8982a893..328e52f3 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -185,20 +185,16 @@ class ESpeak(BasePhonemizer): if tie: args.append("--tie=%s" % tie) - args.append('"' + text + '"') + args.append(text) # compute phonemes phonemes = "" for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True): logging.debug("line: %s", repr(line)) ph_decoded = line.decode("utf8").strip() - # espeak need to skip first two characters of the retuned text: - # version 1.48.03: "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" + # espeak: # version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" - # espeak-ng need to skip the first character of the retuned text: - # "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" - - # dealing with the conditions descrived above - ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:] + # espeak-ng: + # "p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" # espeak-ng backend can add language flags that need to be removed: # "sɛʁtˈɛ̃ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ." From 8c5227ed8489ba1ae528371a6df46de77a144333 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 24 Nov 2023 12:26:37 +0100 Subject: [PATCH 09/17] Fix tts_with_vc (#3275) * Revert "fix for issue 3067" This reverts commit 041b4b6723a1c07a540059c5d2854a8698579de4. Fixes #3143. The original issue (#3067) was people trying to use tts.tts_with_vc_to_file() with XTTS and was "fixed" in #3109. But XTTS has integrated VC and you can just do tts.tts_to_file(..., speaker_wav="..."), there is no point in passing it through FreeVC afterwards. So, reverting this commit because it breaks tts.tts_with_vc_to_file() for any model that doesn't have integrated VC, i.e. all models this method is meant for. * fix: support multi-speaker models in tts_with_vc/tts_with_vc_to_file * fix: only compute spk embeddings for models that support it Fixes #1440. Passing a `speaker_wav` argument to regular Vits models failed because they don't support voice cloning. Now that argument is simply ignored. 
--- TTS/api.py | 19 +++++++++++++++---- TTS/utils/synthesizer.py | 6 +++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index c8600dcd..fdf97d10 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -440,7 +440,7 @@ class TTS(nn.Module): save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) return file_path - def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None): + def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None): """Convert text to speech with voice conversion. It combines tts with voice conversion to fake voice cloning. @@ -457,17 +457,25 @@ class TTS(nn.Module): speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. """ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: # Lazy code... save it to a temp file to resample it while reading it for VC - self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav) + self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name) if self.voice_converter is None: self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24") wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav) return wav def tts_with_vc_to_file( - self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav" + self, + text: str, + language: str = None, + speaker_wav: str = None, + file_path: str = "output.wav", + speaker: str = None, ): """Convert text to speech with voice conversion and save to file. @@ -484,6 +492,9 @@ class TTS(nn.Module): Defaults to None. file_path (str, optional): Output file path. Defaults to "output.wav". + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. """ - wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav) + wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker) save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 8efe608b..0d0eb78a 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -358,7 +358,11 @@ class Synthesizer(nn.Module): ) # compute a new d_vector from the given clip. 
- if speaker_wav is not None and self.tts_model.speaker_manager is not None: + if ( + speaker_wav is not None + and self.tts_model.speaker_manager is not None + and self.tts_model.speaker_manager.encoder_ap is not None + ): speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) vocoder_device = "cpu" From 4d0f53d2ee572210c20401657aa1606c7c32189c Mon Sep 17 00:00:00 2001 From: TITC <35098797+TITC@users.noreply.github.com> Date: Fri, 24 Nov 2023 19:28:31 +0800 Subject: [PATCH 10/17] Misjudgment of `is_multi_lingual` When Loading Multilingual Model via `model_path` (#3273) * load multilingual model by path * use config to assert multi lingual or not --- TTS/api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index fdf97d10..3331f30e 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -10,7 +10,7 @@ from TTS.cs_api import CS_API from TTS.utils.audio.numpy_transforms import save_wav from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer - +from TTS.config import load_config class TTS(nn.Module): """TODO: Add voice conversion and Capacitron support.""" @@ -66,13 +66,12 @@ class TTS(nn.Module): """ super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False) - + self.config = load_config(config_path) if config_path else None self.synthesizer = None self.voice_converter = None self.csapi = None self.cs_api_model = cs_api_model self.model_name = "" - if gpu: warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.") @@ -106,7 +105,8 @@ class TTS(nn.Module): @property def is_multi_lingual(self): # Not sure what sets this to None, but applied a fix to prevent crashing. - if isinstance(self.model_name, str) and "xtts" in self.model_name: + if (isinstance(self.model_name, str) and "xtts" in self.model_name or + self.config and ("xtts" in self.config.model or len(self.config.languages) > 1)): return True if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager: return self.synthesizer.tts_model.language_manager.num_languages > 1 From 1bf59261967e7545480b23d9dcb34bc80374d284 Mon Sep 17 00:00:00 2001 From: Kaszanas <34846245+Kaszanas@users.noreply.github.com> Date: Fri, 24 Nov 2023 12:30:15 +0100 Subject: [PATCH 11/17] Introducing Development Dockerfile (#3263) * Moved Dockerfile, COPY at the end This change should prevent re-installation of the dependencies upon every change of the repository's contents. Typically if Docker detects that something changed in a layer, all downstream layers are invalidated and rebuilt. * Moved Dockerfile back to main directory Main dockerfile in a separate directory can cause issues with the current CI/CD setup. This can be a good change for later. * Introduced Dockerfile.dev, updated CONTRIBUTING Dockerfile.dev can be used as a separate development environment for anyone that does not wish to install the dependencies locally. --- CONTRIBUTING.md | 26 ++++++++++++++++++++++ Dockerfile | 10 +++++++-- dockerfiles/Dockerfile.dev | 44 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 dockerfiles/Dockerfile.dev diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ade35507..cae35993 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -128,6 +128,32 @@ The following steps are tested on an Ubuntu system. 14. 
Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version. +## Development in Docker container + +If you prefer working within a Docker container as your development environment, you can do the following: + +1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page. + +2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```. + + ```bash + $ git clone git@github.com:/TTS.git + $ cd TTS + $ git remote add upstream https://github.com/coqui-ai/TTS.git + ``` + +3. Build the Docker Image as your development environment (it installs all of the dependencies for you): + + ``` + docker build --tag=tts-dev:latest -f .\dockerfiles\Dockerfile.dev . + ``` + +4. Run the container with GPU support: + + ``` + docker run -it --gpus all tts-dev:latest /bin/bash + ``` + Feel free to ping us at any step you need help using our communication channels. If you are new to Github or open-source contribution, These are good resources. diff --git a/Dockerfile b/Dockerfile index 30dfb23d..9fb3005e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,19 @@ ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04 FROM ${BASE} + RUN apt-get update && apt-get upgrade -y RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/* RUN pip3 install llvmlite --ignore-installed -WORKDIR /root -COPY . /root +# Install Dependencies: RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 RUN rm -rf /root/.cache/pip + +# Copy TTS repository contents: +WORKDIR /root +COPY . /root + RUN make install + ENTRYPOINT ["tts"] CMD ["--help"] diff --git a/dockerfiles/Dockerfile.dev b/dockerfiles/Dockerfile.dev new file mode 100644 index 00000000..58baee53 --- /dev/null +++ b/dockerfiles/Dockerfile.dev @@ -0,0 +1,44 @@ +ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04 +FROM ${BASE} + +# Install OS dependencies: +RUN apt-get update && apt-get upgrade -y +RUN apt-get install -y --no-install-recommends \ + gcc g++ \ + make \ + python3 python3-dev python3-pip python3-venv python3-wheel \ + espeak-ng libsndfile1-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Major Python Dependencies: +RUN pip3 install llvmlite --ignore-installed +RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 +RUN rm -rf /root/.cache/pip + +WORKDIR /root + +# Copy Dependency Lock Files: +COPY \ + Makefile \ + pyproject.toml \ + setup.py \ + requirements.dev.txt \ + requirements.ja.txt \ + requirements.notebooks.txt \ + requirements.txt \ + /root/ + +# Install Project Dependencies +# Separate stage to limit re-downloading: +RUN pip install \ + -r requirements.txt \ + -r requirements.dev.txt \ + -r requirements.ja.txt \ + -r requirements.notebooks.txt + +# Copy TTS repository contents: +COPY . 
/root
+
+# Installing the TTS package itself:
+RUN make install
+

From a55755c8dfc74c9d9abd3eeef61dcb13d632765e Mon Sep 17 00:00:00 2001
From: Julian Weber
Date: Fri, 24 Nov 2023 12:35:49 +0100
Subject: [PATCH 12/17] update deepspeed version (#3281)

---
 docs/source/models/xtts.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index 03e44af1..43f27540 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -97,7 +97,7 @@ or for all wav files in a directory you can use:
 If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
 
 ```console
-pip install deepspeed==0.8.3
+pip install deepspeed==0.10.3
 ```
 
 ```python

From 6dd43b0ce2fe92a719cea26577c15f61a676fca8 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 24 Nov 2023 14:36:04 +0100
Subject: [PATCH 13/17] Update to XTTS v2.0.3

---
 TTS/.models.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/.models.json b/TTS/.models.json
index 5f4008fb..1957d78a 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -10,7 +10,7 @@
                         "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
                         "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
                     ],
-                    "model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
+                    "model_hash": "10f92b55c512af7a8d39d650547a15a7",
                     "default_vocoder": null,
                     "commit": "480a6cdf7",
                     "license": "CPML",

From 1542a50c3ac9c7486ecc0be160f9f2c359181d6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 24 Nov 2023 14:37:05 +0100
Subject: [PATCH 14/17] Update to v0.21.0

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 752e6303..88541566 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.20.6
+0.21.0

From 32065139e713b3e44aa88e72c4d35012bb888238 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 24 Nov 2023 15:14:34 +0100
Subject: [PATCH 15/17] Simple text cleaner for "hi"

---
 TTS/tts/layers/xtts/tokenizer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 52848743..1a3cc47a 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -636,6 +636,9 @@ class VoiceBpeTokenizer:
             txt = korean_transliterate(txt)
         elif lang == "ja":
             txt = japanese_cleaners(txt, self.katsu)
+        elif lang == "hi":
+            # @manmay will implement this
+            txt = basic_cleaners(txt)
         else:
             raise NotImplementedError(f"Language '{lang}' is not supported.")
         return txt

From 00a870c26abdc06429ffef3e2814b1a1d5b40fff Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 24 Nov 2023 15:15:44 +0100
Subject: [PATCH 16/17] Update to v0.21.1

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 88541566..a67cebaf 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.21.0
+0.21.1

From 11ec9f7471620ebaa57db7ff5705254829ffe516 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 24 Nov 2023 15:38:36 +0100
Subject: [PATCH 17/17] Add hi in config defaults

---
 TTS/tts/configs/xtts_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py
index e8ab07da..bbf048e1 100644
--- a/TTS/tts/configs/xtts_config.py
+++ b/TTS/tts/configs/xtts_config.py
@@ -88,6 +88,7 @@ class XttsConfig(BaseTTSConfig):
             "hu",
             "ko",
             "ja",
+            "hi",
         ]
     )
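With "hi" wired into the tokenizer (PATCH 15) and the config defaults above, a
short usage sketch (the model name and file paths are examples; Hindi text at
this point only goes through the basic cleaner added earlier in this series):

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts.tts_to_file(
    text="नमस्ते, आप कैसे हैं?",
    speaker_wav="reference_voice.wav",
    language="hi",
    file_path="hindi_out.wav",
)
```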