From fdf0c8b10a00404b51bdd62bf62231c0dbf4e50f Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 16 Nov 2023 23:40:21 +0100
Subject: [PATCH 01/17] chore(encoder): remove unused code

---
 TTS/encoder/utils/generic_utils.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py
index 1da02961..bbce6a8a 100644
--- a/TTS/encoder/utils/generic_utils.py
+++ b/TTS/encoder/utils/generic_utils.py
@@ -2,7 +2,6 @@ import datetime
 import glob
 import os
 import random
-import re
 
 import numpy as np
 from scipy import signal
@@ -118,11 +117,6 @@ class AugmentWAV(object):
         return self.additive_noise(noise_type, audio)
 
 
-def to_camel(text):
-    text = text.capitalize()
-    return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
-
-
 def setup_encoder_model(config: "Coqpit"):
     if config.model_params["model_name"].lower() == "lstm":
         model = LSTMSpeakerEncoder(

From 39fe38bda4d6937336255d32e542d4f84dd0fe15 Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 16 Nov 2023 23:46:26 +0100
Subject: [PATCH 02/17] refactor: use save_fsspec() from Trainer

---
 TTS/encoder/utils/generic_utils.py | 2 +-
 TTS/encoder/utils/io.py | 2 +-
 TTS/utils/io.py | 13 +------------
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py
index bbce6a8a..2b003ac8 100644
--- a/TTS/encoder/utils/generic_utils.py
+++ b/TTS/encoder/utils/generic_utils.py
@@ -5,10 +5,10 @@ import random
 
 import numpy as np
 from scipy import signal
+from trainer.io import save_fsspec
 
 from TTS.encoder.models.lstm import LSTMSpeakerEncoder
 from TTS.encoder.models.resnet import ResNetSpeakerEncoder
-from TTS.utils.io import save_fsspec
 
 
 class AugmentWAV(object):

diff --git a/TTS/encoder/utils/io.py b/TTS/encoder/utils/io.py
index d1dad3e2..a8359be1 100644
--- a/TTS/encoder/utils/io.py
+++ b/TTS/encoder/utils/io.py
@@ -1,7 +1,7 @@
 import datetime
 import os
 
-from TTS.utils.io import save_fsspec
+from trainer.io import save_fsspec
 
 
 def save_checkpoint(model, optimizer, model_loss, out_path, current_step):

diff --git a/TTS/utils/io.py b/TTS/utils/io.py
index e9bdf3e6..9ab1075c 100644
--- a/TTS/utils/io.py
+++ b/TTS/utils/io.py
@@ -8,6 +8,7 @@ from typing import Any, Callable, Dict, Union
 import fsspec
 import torch
 from coqpit import Coqpit
+from trainer.io import save_fsspec
 
 from TTS.utils.generic_utils import get_user_data_dir
 
@@ -102,18 +103,6 @@ def load_checkpoint(
     return model, state
 
 
-def save_fsspec(state: Any, path: str, **kwargs):
-    """Like torch.save but can save to other locations (e.g. s3:// , gs://).
-
-    Args:
-        state: State object to save
-        path: Any path or url supported by fsspec.
-        **kwargs: Keyword arguments forwarded to torch.save.
-    """
-    with fsspec.open(path, "wb") as f:
-        torch.save(state, f, **kwargs)
-
-
 def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs):
     if hasattr(model, "module"):
         model_state = model.module.state_dict()

From 5119e651a1dbccdc4e5fdb47dc386d33f378e621 Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 16 Nov 2023 23:52:28 +0100
Subject: [PATCH 03/17] chore(utils.io): remove unused code

These are all available in Trainer.
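For reference, a minimal sketch of the Trainer-side replacement (this assumes
the coqui-ai Trainer package is installed; the stand-in config, model and
temporary output folder are illustrative only):

```python
import tempfile

import torch
from coqpit import Coqpit
from trainer.io import save_checkpoint

# Stand-in objects; a real training run passes its own config, model and optimizer.
config = Coqpit()
model = torch.nn.Linear(4, 4)
optimizer = torch.optim.Adam(model.parameters())

out_path = tempfile.mkdtemp()
# Same call shape as the helper removed from TTS/utils/io.py in this series:
# save_checkpoint(config, model, optimizer, scaler, current_step, epoch, output_folder, **kwargs)
save_checkpoint(config, model, optimizer, None, 0, 0, out_path)
```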
--- TTS/utils/io.py | 104 ------------------------------------------------ 1 file changed, 104 deletions(-) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 9ab1075c..7aaedbe2 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -1,4 +1,3 @@ -import datetime import json import os import pickle as pickle_tts @@ -8,7 +7,6 @@ from typing import Any, Callable, Dict, Union import fsspec import torch from coqpit import Coqpit -from trainer.io import save_fsspec from TTS.utils.generic_utils import get_user_data_dir @@ -101,105 +99,3 @@ def load_checkpoint( if eval: model.eval() return model, state - - -def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs): - if hasattr(model, "module"): - model_state = model.module.state_dict() - else: - model_state = model.state_dict() - if isinstance(optimizer, list): - optimizer_state = [optim.state_dict() for optim in optimizer] - elif optimizer.__class__.__name__ == "CapacitronOptimizer": - optimizer_state = [optimizer.primary_optimizer.state_dict(), optimizer.secondary_optimizer.state_dict()] - else: - optimizer_state = optimizer.state_dict() if optimizer is not None else None - - if isinstance(scaler, list): - scaler_state = [s.state_dict() for s in scaler] - else: - scaler_state = scaler.state_dict() if scaler is not None else None - - if isinstance(config, Coqpit): - config = config.to_dict() - - state = { - "config": config, - "model": model_state, - "optimizer": optimizer_state, - "scaler": scaler_state, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - state.update(kwargs) - save_fsspec(state, output_path) - - -def save_checkpoint( - config, - model, - optimizer, - scaler, - current_step, - epoch, - output_folder, - **kwargs, -): - file_name = "checkpoint_{}.pth".format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print("\n > CHECKPOINT : {}".format(checkpoint_path)) - save_model( - config, - model, - optimizer, - scaler, - current_step, - epoch, - checkpoint_path, - **kwargs, - ) - - -def save_best_model( - current_loss, - best_loss, - config, - model, - optimizer, - scaler, - current_step, - epoch, - out_path, - keep_all_best=False, - keep_after=10000, - **kwargs, -): - if current_loss < best_loss: - best_model_name = f"best_model_{current_step}.pth" - checkpoint_path = os.path.join(out_path, best_model_name) - print(" > BEST MODEL : {}".format(checkpoint_path)) - save_model( - config, - model, - optimizer, - scaler, - current_step, - epoch, - checkpoint_path, - model_loss=current_loss, - **kwargs, - ) - fs = fsspec.get_mapper(out_path).fs - # only delete previous if current is saved successfully - if not keep_all_best or (current_step < keep_after): - model_names = fs.glob(os.path.join(out_path, "best_model*.pth")) - for model_name in model_names: - if os.path.basename(model_name) != best_model_name: - fs.rm(model_name) - # create a shortcut which always points to the currently best model - shortcut_name = "best_model.pth" - shortcut_path = os.path.join(out_path, shortcut_name) - fs.copy(checkpoint_path, shortcut_path) - best_loss = current_loss - return best_loss From 96678c7ba227871d0929f2366d083219ccfa9262 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 17 Nov 2023 00:12:09 +0100 Subject: [PATCH 04/17] refactor: use copy_model_files() from Trainer --- TTS/bin/train_encoder.py | 4 ++-- TTS/encoder/utils/training.py | 2 +- TTS/utils/io.py | 31 ------------------------------- 3 files changed, 3 insertions(+), 34 
deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index f2e7779c..c4fb920f 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -8,6 +8,7 @@ import traceback import torch from torch.utils.data import DataLoader +from trainer.io import copy_model_files from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer @@ -18,7 +19,6 @@ from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder -from TTS.utils.io import copy_model_files from TTS.utils.samplers import PerfectBatchSampler from TTS.utils.training import check_update @@ -276,7 +276,7 @@ def main(args): # pylint: disable=redefined-outer-name if c.loss == "softmaxproto" and c.model != "speaker_encoder": c.map_classid_to_classname = map_classid_to_classname - copy_model_files(c, OUT_PATH) + copy_model_files(c, OUT_PATH, new_fields={}) if args.restore_path: criterion, args.restore_step = model.load_checkpoint( diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py index 7c58a232..ff8f271d 100644 --- a/TTS/encoder/utils/training.py +++ b/TTS/encoder/utils/training.py @@ -3,13 +3,13 @@ from dataclasses import dataclass, field from coqpit import Coqpit from trainer import TrainerArgs, get_last_checkpoint +from trainer.io import copy_model_files from trainer.logging import logger_factory from trainer.logging.console_logger import ConsoleLogger from TTS.config import load_config, register_config from TTS.tts.utils.text.characters import parse_symbols from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch -from TTS.utils.io import copy_model_files @dataclass diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 7aaedbe2..3107ba66 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -1,12 +1,9 @@ -import json import os import pickle as pickle_tts -import shutil from typing import Any, Callable, Dict, Union import fsspec import torch -from coqpit import Coqpit from TTS.utils.generic_utils import get_user_data_dir @@ -27,34 +24,6 @@ class AttrDict(dict): self.__dict__ = self -def copy_model_files(config: Coqpit, out_path, new_fields=None): - """Copy config.json and other model files to training folder and add - new fields. - - Args: - config (Coqpit): Coqpit config defining the training run. - out_path (str): output path to copy the file. - new_fields (dict): new fileds to be added or edited - in the config file. - """ - copy_config_path = os.path.join(out_path, "config.json") - # add extra information fields - if new_fields: - config.update(new_fields, allow_new=True) - # TODO: Revert to config.save_json() once Coqpit supports arbitrary paths. 
- with fsspec.open(copy_config_path, "w", encoding="utf8") as f: - json.dump(config.to_dict(), f, indent=4) - - # copy model stats file if available - if config.audio.stats_path is not None: - copy_stats_path = os.path.join(out_path, "scale_stats.npy") - filesystem = fsspec.get_mapper(copy_stats_path).fs - if not filesystem.exists(copy_stats_path): - with fsspec.open(config.audio.stats_path, "rb") as source_file: - with fsspec.open(copy_stats_path, "wb") as target_file: - shutil.copyfileobj(source_file, target_file) - - def load_fsspec( path: str, map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, From 0fb0d67de7bd05ef4afd80f05e242217e9800c80 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 17 Nov 2023 00:39:11 +0100 Subject: [PATCH 05/17] refactor: use save_checkpoint()/save_best_model() from Trainer --- TTS/bin/train_encoder.py | 21 +++++++++--- TTS/encoder/utils/generic_utils.py | 40 ----------------------- TTS/encoder/utils/io.py | 38 --------------------- tests/aux_tests/test_embedding_manager.py | 4 +-- tests/aux_tests/test_speaker_manager.py | 4 +-- tests/inference_tests/test_synthesizer.py | 3 +- 6 files changed, 23 insertions(+), 87 deletions(-) delete mode 100644 TTS/encoder/utils/io.py diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index c4fb920f..448fefc7 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -8,12 +8,12 @@ import traceback import torch from torch.utils.data import DataLoader -from trainer.io import copy_model_files +from trainer.io import copy_model_files, save_best_model, save_checkpoint from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer from TTS.encoder.dataset import EncoderDataset -from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model +from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.encoder.utils.training import init_training from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples @@ -222,7 +222,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, if global_step % c.save_step == 0: # save model - save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch) + save_checkpoint( + c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict() + ) end_time = time.time() @@ -245,7 +247,18 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, flush=True, ) # save the best checkpoint - best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch) + best_loss = save_best_model( + eval_loss, + best_loss, + c, + model, + optimizer, + None, + global_step, + epoch, + OUT_PATH, + criterion=criterion.state_dict(), + ) model.train() return best_loss, global_step diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 2b003ac8..236d6fe9 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -1,11 +1,9 @@ -import datetime import glob import os import random import numpy as np from scipy import signal -from trainer.io import save_fsspec from TTS.encoder.models.lstm import LSTMSpeakerEncoder from TTS.encoder.models.resnet import ResNetSpeakerEncoder @@ -136,41 +134,3 @@ def setup_encoder_model(config: "Coqpit"): audio_config=config.audio, ) return model - - -def save_checkpoint(model, optimizer, 
criterion, model_loss, out_path, current_step, epoch): - checkpoint_path = "checkpoint_{}.pth".format(current_step) - checkpoint_path = os.path.join(out_path, checkpoint_path) - print(" | | > Checkpoint saving : {}".format(checkpoint_path)) - - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict() if optimizer is not None else None, - "criterion": criterion.state_dict(), - "step": current_step, - "epoch": epoch, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - save_fsspec(state, checkpoint_path) - - -def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch): - if model_loss < best_loss: - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict(), - "criterion": criterion.state_dict(), - "step": current_step, - "epoch": epoch, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - best_loss = model_loss - bestmodel_path = "best_model.pth" - bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) - save_fsspec(state, bestmodel_path) - return best_loss diff --git a/TTS/encoder/utils/io.py b/TTS/encoder/utils/io.py deleted file mode 100644 index a8359be1..00000000 --- a/TTS/encoder/utils/io.py +++ /dev/null @@ -1,38 +0,0 @@ -import datetime -import os - -from trainer.io import save_fsspec - - -def save_checkpoint(model, optimizer, model_loss, out_path, current_step): - checkpoint_path = "checkpoint_{}.pth".format(current_step) - checkpoint_path = os.path.join(out_path, checkpoint_path) - print(" | | > Checkpoint saving : {}".format(checkpoint_path)) - - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict() if optimizer is not None else None, - "step": current_step, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - save_fsspec(state, checkpoint_path) - - -def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step): - if model_loss < best_loss: - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict(), - "step": current_step, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - best_loss = model_loss - bestmodel_path = "best_model.pth" - bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) - save_fsspec(state, bestmodel_path) - return best_loss diff --git a/tests/aux_tests/test_embedding_manager.py b/tests/aux_tests/test_embedding_manager.py index 73921501..e3acd62b 100644 --- a/tests/aux_tests/test_embedding_manager.py +++ b/tests/aux_tests/test_embedding_manager.py @@ -3,11 +3,11 @@ import unittest import numpy as np import torch +from trainer.io import save_checkpoint from tests import get_tests_input_path from TTS.config import load_config from TTS.encoder.utils.generic_utils import setup_encoder_model -from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.managers import EmbeddingManager from TTS.utils.audio import AudioProcessor @@ -31,7 +31,7 @@ class EmbeddingManagerTest(unittest.TestCase): # create a dummy speaker encoder model = setup_encoder_model(config) - save_checkpoint(model, None, None, get_tests_input_path(), 0) + save_checkpoint(config, model, None, None, 0, 0, get_tests_input_path()) # 
load audio processor and speaker encoder manager = EmbeddingManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path) diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index 397f9c81..402fbca4 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -3,11 +3,11 @@ import unittest import numpy as np import torch +from trainer.io import save_checkpoint from tests import get_tests_input_path from TTS.config import load_config from TTS.encoder.utils.generic_utils import setup_encoder_model -from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -30,7 +30,7 @@ class SpeakerManagerTest(unittest.TestCase): # create a dummy speaker encoder model = setup_encoder_model(config) - save_checkpoint(model, None, None, get_tests_input_path(), 0) + save_checkpoint(config, model, None, None, 0, 0, get_tests_input_path()) # load audio processor and speaker encoder ap = AudioProcessor(**config.audio) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index 40e83017..ce4fc751 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -1,10 +1,11 @@ import os import unittest +from trainer.io import save_checkpoint + from tests import get_tests_input_path from TTS.config import load_config from TTS.tts.models import setup_model -from TTS.utils.io import save_checkpoint from TTS.utils.synthesizer import Synthesizer From 64f391b583c1f2814a0e613df3a7b2074397fe2a Mon Sep 17 00:00:00 2001 From: Tessa Painter Date: Fri, 24 Nov 2023 05:23:59 -0600 Subject: [PATCH 06/17] Made the tqdm `progress_bar` objects of static download methods a static class variable (#3297) --- TTS/utils/manage.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 1cd437e6..d3eb8104 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -26,7 +26,9 @@ LICENSE_URLS = { } + class ModelManager(object): + tqdm_progress = None """Manage TTS models defined in .models.json. 
It provides an interface to list and download models defines in '.model.json' @@ -525,12 +527,12 @@ class ModelManager(object): total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 # 1 Kibibyte if progress_bar: - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) with open(temp_zip_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: - progress_bar.update(len(data)) + ModelManager.tqdm_progress.update(len(data)) file.write(data) with zipfile.ZipFile(temp_zip_name) as z: z.extractall(output_folder) @@ -560,12 +562,12 @@ class ModelManager(object): total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 # 1 Kibibyte if progress_bar: - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1]) with open(temp_tar_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: - progress_bar.update(len(data)) + ModelManager.tqdm_progress.update(len(data)) file.write(data) with tarfile.open(temp_tar_name) as t: t.extractall(output_folder) @@ -596,10 +598,10 @@ class ModelManager(object): block_size = 1024 # 1 Kibibyte with open(temp_zip_name, "wb") as file: if progress_bar: - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) for data in r.iter_content(block_size): if progress_bar: - progress_bar.update(len(data)) + ModelManager.tqdm_progress.update(len(data)) file.write(data) @staticmethod From 4a2684be341f26e273249e2b58a17b92dfc68d84 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 24 Nov 2023 12:24:42 +0100 Subject: [PATCH 07/17] fix(bin.synthesize): more informative error for wrong --language argument (#3294) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In multilingual models, the target language is specified via the `--language_idx` argument. However, the `tts` CLI also accepts a `--language` argument for use with Coqui Studio, so it is easy to choose the wrong one, resulting in the following confusing error at synthesis time: ``` AssertionError: ❗ Language None is not supported. Supported languages are ['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'hu', 'ko', 'ja'] ``` This commit adds a better error message when `--language` is passed for a non-studio model. Fixes #3270, fixes #3291 --- TTS/bin/synthesize.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index ddfe35d2..d9ec3063 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -419,6 +419,13 @@ def main(): print(" > Saving output to ", args.out_path) return + if args.language_idx is None and args.language is not None: + msg = ( + "--language is only supported for Coqui Studio models. " + "Use --language_idx to specify the target language for multilingual models." 
+ ) + raise ValueError(msg) + # CASE4: load pre-trained model paths if args.model_name is not None and not args.model_path: model_path, config_path, model_item = manager.download_model(args.model_name) From 2af02209960f8f2c93329689af0100d1cc591080 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 24 Nov 2023 12:25:37 +0100 Subject: [PATCH 08/17] fix: don't pass quotes to espeak (#3286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the text was wrapped in an additional set of quotes that was passed to Espeak. This could result in different phonemization in certain edges and caused the insertion of an initial separator "_" that had to be removed. Compare: $ espeak-ng -q -b 1 -v en-us --ipa=1 '"A"' _ˈɐ $ espeak-ng -q -b 1 -v en-us --ipa=1 'A' ˈeɪ Fixes #2619 --- TTS/tts/utils/text/phonemizers/espeak_wrapper.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index 8982a893..328e52f3 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -185,20 +185,16 @@ class ESpeak(BasePhonemizer): if tie: args.append("--tie=%s" % tie) - args.append('"' + text + '"') + args.append(text) # compute phonemes phonemes = "" for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True): logging.debug("line: %s", repr(line)) ph_decoded = line.decode("utf8").strip() - # espeak need to skip first two characters of the retuned text: - # version 1.48.03: "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" + # espeak: # version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" - # espeak-ng need to skip the first character of the retuned text: - # "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" - - # dealing with the conditions descrived above - ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:] + # espeak-ng: + # "p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" # espeak-ng backend can add language flags that need to be removed: # "sɛʁtˈɛ̃ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ." From 8c5227ed8489ba1ae528371a6df46de77a144333 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 24 Nov 2023 12:26:37 +0100 Subject: [PATCH 09/17] Fix tts_with_vc (#3275) * Revert "fix for issue 3067" This reverts commit 041b4b6723a1c07a540059c5d2854a8698579de4. Fixes #3143. The original issue (#3067) was people trying to use tts.tts_with_vc_to_file() with XTTS and was "fixed" in #3109. But XTTS has integrated VC and you can just do tts.tts_to_file(..., speaker_wav="..."), there is no point in passing it through FreeVC afterwards. So, reverting this commit because it breaks tts.tts_with_vc_to_file() for any model that doesn't have integrated VC, i.e. all models this method is meant for. * fix: support multi-speaker models in tts_with_vc/tts_with_vc_to_file * fix: only compute spk embeddings for models that support it Fixes #1440. Passing a `speaker_wav` argument to regular Vits models failed because they don't support voice cloning. Now that argument is simply ignored. 
--- TTS/api.py | 19 +++++++++++++++---- TTS/utils/synthesizer.py | 6 +++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index c8600dcd..fdf97d10 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -440,7 +440,7 @@ class TTS(nn.Module): save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) return file_path - def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None): + def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None): """Convert text to speech with voice conversion. It combines tts with voice conversion to fake voice cloning. @@ -457,17 +457,25 @@ class TTS(nn.Module): speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. """ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: # Lazy code... save it to a temp file to resample it while reading it for VC - self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav) + self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name) if self.voice_converter is None: self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24") wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav) return wav def tts_with_vc_to_file( - self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav" + self, + text: str, + language: str = None, + speaker_wav: str = None, + file_path: str = "output.wav", + speaker: str = None, ): """Convert text to speech with voice conversion and save to file. @@ -484,6 +492,9 @@ class TTS(nn.Module): Defaults to None. file_path (str, optional): Output file path. Defaults to "output.wav". + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. """ - wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav) + wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker) save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 8efe608b..0d0eb78a 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -358,7 +358,11 @@ class Synthesizer(nn.Module): ) # compute a new d_vector from the given clip. 
- if speaker_wav is not None and self.tts_model.speaker_manager is not None: + if ( + speaker_wav is not None + and self.tts_model.speaker_manager is not None + and self.tts_model.speaker_manager.encoder_ap is not None + ): speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) vocoder_device = "cpu" From 4d0f53d2ee572210c20401657aa1606c7c32189c Mon Sep 17 00:00:00 2001 From: TITC <35098797+TITC@users.noreply.github.com> Date: Fri, 24 Nov 2023 19:28:31 +0800 Subject: [PATCH 10/17] Misjudgment of `is_multi_lingual` When Loading Multilingual Model via `model_path` (#3273) * load multilingual model by path * use config to assert multi lingual or not --- TTS/api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index fdf97d10..3331f30e 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -10,7 +10,7 @@ from TTS.cs_api import CS_API from TTS.utils.audio.numpy_transforms import save_wav from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer - +from TTS.config import load_config class TTS(nn.Module): """TODO: Add voice conversion and Capacitron support.""" @@ -66,13 +66,12 @@ class TTS(nn.Module): """ super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False) - + self.config = load_config(config_path) if config_path else None self.synthesizer = None self.voice_converter = None self.csapi = None self.cs_api_model = cs_api_model self.model_name = "" - if gpu: warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.") @@ -106,7 +105,8 @@ class TTS(nn.Module): @property def is_multi_lingual(self): # Not sure what sets this to None, but applied a fix to prevent crashing. - if isinstance(self.model_name, str) and "xtts" in self.model_name: + if (isinstance(self.model_name, str) and "xtts" in self.model_name or + self.config and ("xtts" in self.config.model or len(self.config.languages) > 1)): return True if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager: return self.synthesizer.tts_model.language_manager.num_languages > 1 From 1bf59261967e7545480b23d9dcb34bc80374d284 Mon Sep 17 00:00:00 2001 From: Kaszanas <34846245+Kaszanas@users.noreply.github.com> Date: Fri, 24 Nov 2023 12:30:15 +0100 Subject: [PATCH 11/17] Introducing Development Dockerfile (#3263) * Moved Dockerfile, COPY at the end This change should prevent re-installation of the dependencies upon every change of the repository's contents. Typically if Docker detects that something changed in a layer, all downstream layers are invalidated and rebuilt. * Moved Dockerfile back to main directory Main dockerfile in a separate directory can cause issues with the current CI/CD setup. This can be a good change for later. * Introduced Dockerfile.dev, updated CONTRIBUTING Dockerfile.dev can be used as a separate development environment for anyone that does not wish to install the dependencies locally. --- CONTRIBUTING.md | 26 ++++++++++++++++++++++ Dockerfile | 10 +++++++-- dockerfiles/Dockerfile.dev | 44 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 dockerfiles/Dockerfile.dev diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ade35507..cae35993 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -128,6 +128,32 @@ The following steps are tested on an Ubuntu system. 14. 
Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version. +## Development in Docker container + +If you prefer working within a Docker container as your development environment, you can do the following: + +1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page. + +2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```. + + ```bash + $ git clone git@github.com:/TTS.git + $ cd TTS + $ git remote add upstream https://github.com/coqui-ai/TTS.git + ``` + +3. Build the Docker Image as your development environment (it installs all of the dependencies for you): + + ``` + docker build --tag=tts-dev:latest -f .\dockerfiles\Dockerfile.dev . + ``` + +4. Run the container with GPU support: + + ``` + docker run -it --gpus all tts-dev:latest /bin/bash + ``` + Feel free to ping us at any step you need help using our communication channels. If you are new to Github or open-source contribution, These are good resources. diff --git a/Dockerfile b/Dockerfile index 30dfb23d..9fb3005e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,19 @@ ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04 FROM ${BASE} + RUN apt-get update && apt-get upgrade -y RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/* RUN pip3 install llvmlite --ignore-installed -WORKDIR /root -COPY . /root +# Install Dependencies: RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 RUN rm -rf /root/.cache/pip + +# Copy TTS repository contents: +WORKDIR /root +COPY . /root + RUN make install + ENTRYPOINT ["tts"] CMD ["--help"] diff --git a/dockerfiles/Dockerfile.dev b/dockerfiles/Dockerfile.dev new file mode 100644 index 00000000..58baee53 --- /dev/null +++ b/dockerfiles/Dockerfile.dev @@ -0,0 +1,44 @@ +ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04 +FROM ${BASE} + +# Install OS dependencies: +RUN apt-get update && apt-get upgrade -y +RUN apt-get install -y --no-install-recommends \ + gcc g++ \ + make \ + python3 python3-dev python3-pip python3-venv python3-wheel \ + espeak-ng libsndfile1-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Major Python Dependencies: +RUN pip3 install llvmlite --ignore-installed +RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 +RUN rm -rf /root/.cache/pip + +WORKDIR /root + +# Copy Dependency Lock Files: +COPY \ + Makefile \ + pyproject.toml \ + setup.py \ + requirements.dev.txt \ + requirements.ja.txt \ + requirements.notebooks.txt \ + requirements.txt \ + /root/ + +# Install Project Dependencies +# Separate stage to limit re-downloading: +RUN pip install \ + -r requirements.txt \ + -r requirements.dev.txt \ + -r requirements.ja.txt \ + -r requirements.notebooks.txt + +# Copy TTS repository contents: +COPY . 
/root
+
+# Installing the TTS package itself:
+RUN make install
+

From a55755c8dfc74c9d9abd3eeef61dcb13d632765e Mon Sep 17 00:00:00 2001
From: Julian Weber
Date: Fri, 24 Nov 2023 12:35:49 +0100
Subject: [PATCH 12/17] update deepspeed version (#3281)

---
 docs/source/models/xtts.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index 03e44af1..43f27540 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -97,7 +97,7 @@ or for all wav files in a directory you can use:
 If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
 
 ```console
-pip install deepspeed==0.8.3
+pip install deepspeed==0.10.3
 ```
 
 ```python

From 6dd43b0ce2fe92a719cea26577c15f61a676fca8 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 24 Nov 2023 14:36:04 +0100
Subject: [PATCH 13/17] Update to XTTS v2.0.3

---
 TTS/.models.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/.models.json b/TTS/.models.json
index 5f4008fb..1957d78a 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -10,7 +10,7 @@
                         "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
                         "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
                     ],
-                    "model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
+                    "model_hash": "10f92b55c512af7a8d39d650547a15a7",
                     "default_vocoder": null,
                     "commit": "480a6cdf7",
                     "license": "CPML",

From 1542a50c3ac9c7486ecc0be160f9f2c359181d6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 24 Nov 2023 14:37:05 +0100
Subject: [PATCH 14/17] Update to v0.21.0

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 752e6303..88541566 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.20.6
+0.21.0

From 32065139e713b3e44aa88e72c4d35012bb888238 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 24 Nov 2023 15:14:34 +0100
Subject: [PATCH 15/17] Simple text cleaner for "hi"

---
 TTS/tts/layers/xtts/tokenizer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 52848743..1a3cc47a 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -636,6 +636,9 @@ class VoiceBpeTokenizer:
             txt = korean_transliterate(txt)
         elif lang == "ja":
             txt = japanese_cleaners(txt, self.katsu)
+        elif lang == "hi":
+            # @manmay will implement this
+            txt = basic_cleaners(txt)
         else:
             raise NotImplementedError(f"Language '{lang}' is not supported.")
         return txt

From 00a870c26abdc06429ffef3e2814b1a1d5b40fff Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 24 Nov 2023 15:15:44 +0100
Subject: [PATCH 16/17] Update to v0.21.1

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 88541566..a67cebaf 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.21.0
+0.21.1

From 11ec9f7471620ebaa57db7ff5705254829ffe516 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 24 Nov 2023 15:38:36 +0100
Subject: [PATCH 17/17] Add hi in config defaults

---
 TTS/tts/configs/xtts_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py
index e8ab07da..bbf048e1 100644
--- a/TTS/tts/configs/xtts_config.py
+++ b/TTS/tts/configs/xtts_config.py
@@ -88,6 +88,7 @@ class XttsConfig(BaseTTSConfig):
             "hu",
             "ko",
             "ja",
+            "hi",
         ]
     )
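With "hi" wired into the tokenizer (PATCH 15) and the config defaults above, a
short usage sketch (the model name and file paths are examples; Hindi text at
this point only goes through the basic cleaner added earlier in this series):

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts.tts_to_file(
    text="नमस्ते, आप कैसे हैं?",
    speaker_wav="reference_voice.wav",
    language="hi",
    file_path="hindi_out.wav",
)
```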