mirror of https://github.com/coqui-ai/TTS.git
commit 6189e2f4fc
CONTRIBUTING.md
@@ -128,6 +128,32 @@ The following steps are tested on an Ubuntu system.

 14. Once things look perfect, we merge it to the ```dev``` branch and make it ready for the next version.

+## Development in Docker container
+
+If you prefer working within a Docker container as your development environment, you can do the following:
+
+1. Fork [🐸TTS](https://github.com/coqui-ai/TTS) by clicking the fork button at the top right corner of the project page.
+
+2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
+
+    ```bash
+    $ git clone git@github.com:<your Github name>/TTS.git
+    $ cd TTS
+    $ git remote add upstream https://github.com/coqui-ai/TTS.git
+    ```
+
+3. Build the Docker image as your development environment (it installs all of the dependencies for you):
+
+    ```bash
+    docker build --tag=tts-dev:latest -f ./dockerfiles/Dockerfile.dev .
+    ```
+
+4. Run the container with GPU support:
+
+    ```bash
+    docker run -it --gpus all tts-dev:latest /bin/bash
+    ```
+
 Feel free to ping us on our communication channels at any step if you need help.

 If you are new to GitHub or open-source contribution, these are good resources.
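Once inside the container (step 4 above), a quick sanity check that the package is importable; this snippet is illustrative and not part of the diff, though `TTS.__version__` does mirror the `TTS/VERSION` file bumped later in this commit:

```python
# Sanity check inside the dev container (illustrative, not part of the diff).
import TTS

print(TTS.__version__)  # expected to print 0.21.1 after this commit
```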
Dockerfile
@@ -1,13 +1,19 @@
 ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
 FROM ${BASE}

 RUN apt-get update && apt-get upgrade -y
 RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
 RUN pip3 install llvmlite --ignore-installed

-WORKDIR /root
-COPY . /root
+# Install Dependencies:
 RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 RUN rm -rf /root/.cache/pip
+
+# Copy TTS repository contents:
+WORKDIR /root
+COPY . /root

 RUN make install

 ENTRYPOINT ["tts"]
 CMD ["--help"]
TTS/.models.json
@@ -10,7 +10,7 @@
                 "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
                 "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
             ],
-            "model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
+            "model_hash": "10f92b55c512af7a8d39d650547a15a7",
             "default_vocoder": null,
             "commit": "480a6cdf7",
             "license": "CPML",
TTS/VERSION
@@ -1 +1 @@
-0.20.6
+0.21.1
TTS/api.py
@@ -10,7 +10,7 @@ from TTS.cs_api import CS_API
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
+from TTS.config import load_config


 class TTS(nn.Module):
     """TODO: Add voice conversion and Capacitron support."""
@@ -66,13 +66,12 @@ class TTS(nn.Module):
         """
         super().__init__()
         self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
+        self.config = load_config(config_path) if config_path else None
         self.synthesizer = None
         self.voice_converter = None
         self.csapi = None
         self.cs_api_model = cs_api_model
         self.model_name = ""

         if gpu:
             warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
@@ -106,7 +105,8 @@ class TTS(nn.Module):
     @property
     def is_multi_lingual(self):
         # Not sure what sets this to None, but applied a fix to prevent crashing.
-        if isinstance(self.model_name, str) and "xtts" in self.model_name:
+        if (isinstance(self.model_name, str) and "xtts" in self.model_name or
+                self.config and ("xtts" in self.config.model or len(self.config.languages) > 1)):
             return True
         if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
             return self.synthesizer.tts_model.language_manager.num_languages > 1
@@ -440,7 +440,7 @@ class TTS(nn.Module):
         save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
         return file_path

-    def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
+    def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None):
         """Convert text to speech with voice conversion.

         It combines tts with voice conversion to fake voice cloning.
@@ -457,17 +457,25 @@ class TTS(nn.Module):
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
         """
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
             # Lazy code... save it to a temp file to resample it while reading it for VC
-            self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav)
+            self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name)
         if self.voice_converter is None:
             self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
         wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
         return wav

     def tts_with_vc_to_file(
-        self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
+        self,
+        text: str,
+        language: str = None,
+        speaker_wav: str = None,
+        file_path: str = "output.wav",
+        speaker: str = None,
     ):
         """Convert text to speech with voice conversion and save to file.

@@ -484,6 +492,9 @@ class TTS(nn.Module):
                 Defaults to None.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
         """
-        wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
+        wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker)
         save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
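For orientation, a minimal usage sketch of the extended voice-conversion API. The model name is a real Coqui model ID, but the speaker name and file paths are illustrative and not part of this diff:

```python
# Sketch of the new `speaker` parameter (illustrative values; assumes the
# TTS package with this commit and a multi-speaker model such as YourTTS).
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/your_tts")

# The TTS pass now uses a named speaker of the multi-speaker model, while
# `speaker_wav` is only the voice-conversion target (see tts_with_vc above).
tts.tts_with_vc_to_file(
    text="Hello world!",
    language="en",
    speaker="female-en-5",           # hypothetical name; list with tts.speakers
    speaker_wav="target_voice.wav",  # reference clip to clone
    file_path="output.wav",
)
```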
TTS/bin/synthesize.py
@@ -419,6 +419,13 @@ def main():
         print(" > Saving output to ", args.out_path)
         return

+    if args.language_idx is None and args.language is not None:
+        msg = (
+            "--language is only supported for Coqui Studio models. "
+            "Use --language_idx to specify the target language for multilingual models."
+        )
+        raise ValueError(msg)
+
     # CASE4: load pre-trained model paths
     if args.model_name is not None and not args.model_path:
         model_path, config_path, model_item = manager.download_model(args.model_name)
TTS/bin/train_encoder.py
@@ -8,17 +8,17 @@ import traceback

 import torch
 from torch.utils.data import DataLoader
+from trainer.io import copy_model_files, save_best_model, save_checkpoint
 from trainer.torch import NoamLR
 from trainer.trainer_utils import get_optimizer

 from TTS.encoder.dataset import EncoderDataset
-from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
+from TTS.encoder.utils.generic_utils import setup_encoder_model
 from TTS.encoder.utils.training import init_training
 from TTS.encoder.utils.visual import plot_embeddings
 from TTS.tts.datasets import load_tts_samples
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
-from TTS.utils.io import copy_model_files
 from TTS.utils.samplers import PerfectBatchSampler
 from TTS.utils.training import check_update

@@ -222,7 +222,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,

         if global_step % c.save_step == 0:
             # save model
-            save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
+            save_checkpoint(
+                c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
+            )

         end_time = time.time()

@@ -245,7 +247,18 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
                 flush=True,
             )
             # save the best checkpoint
-            best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
+            best_loss = save_best_model(
+                eval_loss,
+                best_loss,
+                c,
+                model,
+                optimizer,
+                None,
+                global_step,
+                epoch,
+                OUT_PATH,
+                criterion=criterion.state_dict(),
+            )
             model.train()

     return best_loss, global_step

@@ -276,7 +289,7 @@ def main(args):  # pylint: disable=redefined-outer-name

     if c.loss == "softmaxproto" and c.model != "speaker_encoder":
         c.map_classid_to_classname = map_classid_to_classname
-    copy_model_files(c, OUT_PATH)
+    copy_model_files(c, OUT_PATH, new_fields={})

     if args.restore_path:
         criterion, args.restore_step = model.load_checkpoint(
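The checkpoint helpers now come from coqui's `trainer` package with a config-first signature. A minimal sketch of the new calling convention, following the argument order shown in the diff; the stand-in model, config dict, and output path are illustrative:

```python
# Sketch of the new trainer.io checkpoint call (stand-in objects; assumes
# the coqui-ai `trainer` package is installed).
import torch
from trainer.io import save_checkpoint

model = torch.nn.Linear(4, 2)                            # stand-in for the encoder
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()

save_checkpoint(
    {"run_name": "encoder"},           # config comes first now (illustrative dict)
    model,
    optimizer,
    None,                              # scaler slot; no AMP scaler here
    100,                               # global step
    0,                                 # epoch
    "/tmp/encoder_run",                # output folder (illustrative)
    criterion=criterion.state_dict(),  # extra fields are stored in the checkpoint
)
```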
TTS/encoder/utils/generic_utils.py
@@ -1,15 +1,12 @@
-import datetime
 import glob
 import os
 import random
-import re

 import numpy as np
 from scipy import signal

 from TTS.encoder.models.lstm import LSTMSpeakerEncoder
 from TTS.encoder.models.resnet import ResNetSpeakerEncoder
-from TTS.utils.io import save_fsspec


 class AugmentWAV(object):
@@ -118,11 +115,6 @@ class AugmentWAV(object):
         return self.additive_noise(noise_type, audio)


-def to_camel(text):
-    text = text.capitalize()
-    return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
-
-
 def setup_encoder_model(config: "Coqpit"):
     if config.model_params["model_name"].lower() == "lstm":
         model = LSTMSpeakerEncoder(
@@ -142,41 +134,3 @@ def setup_encoder_model(config: "Coqpit"):
             audio_config=config.audio,
         )
     return model
-
-
-def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
-    checkpoint_path = "checkpoint_{}.pth".format(current_step)
-    checkpoint_path = os.path.join(out_path, checkpoint_path)
-    print(" | | > Checkpoint saving : {}".format(checkpoint_path))
-
-    new_state_dict = model.state_dict()
-    state = {
-        "model": new_state_dict,
-        "optimizer": optimizer.state_dict() if optimizer is not None else None,
-        "criterion": criterion.state_dict(),
-        "step": current_step,
-        "epoch": epoch,
-        "loss": model_loss,
-        "date": datetime.date.today().strftime("%B %d, %Y"),
-    }
-    save_fsspec(state, checkpoint_path)
-
-
-def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
-    if model_loss < best_loss:
-        new_state_dict = model.state_dict()
-        state = {
-            "model": new_state_dict,
-            "optimizer": optimizer.state_dict(),
-            "criterion": criterion.state_dict(),
-            "step": current_step,
-            "epoch": epoch,
-            "loss": model_loss,
-            "date": datetime.date.today().strftime("%B %d, %Y"),
-        }
-        best_loss = model_loss
-        bestmodel_path = "best_model.pth"
-        bestmodel_path = os.path.join(out_path, bestmodel_path)
-        print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
-        save_fsspec(state, bestmodel_path)
-    return best_loss
TTS/encoder/utils/io.py (deleted)
@@ -1,38 +0,0 @@
-import datetime
-import os
-
-from TTS.utils.io import save_fsspec
-
-
-def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
-    checkpoint_path = "checkpoint_{}.pth".format(current_step)
-    checkpoint_path = os.path.join(out_path, checkpoint_path)
-    print(" | | > Checkpoint saving : {}".format(checkpoint_path))
-
-    new_state_dict = model.state_dict()
-    state = {
-        "model": new_state_dict,
-        "optimizer": optimizer.state_dict() if optimizer is not None else None,
-        "step": current_step,
-        "loss": model_loss,
-        "date": datetime.date.today().strftime("%B %d, %Y"),
-    }
-    save_fsspec(state, checkpoint_path)
-
-
-def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
-    if model_loss < best_loss:
-        new_state_dict = model.state_dict()
-        state = {
-            "model": new_state_dict,
-            "optimizer": optimizer.state_dict(),
-            "step": current_step,
-            "loss": model_loss,
-            "date": datetime.date.today().strftime("%B %d, %Y"),
-        }
-        best_loss = model_loss
-        bestmodel_path = "best_model.pth"
-        bestmodel_path = os.path.join(out_path, bestmodel_path)
-        print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
-        save_fsspec(state, bestmodel_path)
-    return best_loss
TTS/encoder/utils/training.py
@@ -3,13 +3,13 @@ from dataclasses import dataclass, field

 from coqpit import Coqpit
 from trainer import TrainerArgs, get_last_checkpoint
+from trainer.io import copy_model_files
 from trainer.logging import logger_factory
 from trainer.logging.console_logger import ConsoleLogger

 from TTS.config import load_config, register_config
 from TTS.tts.utils.text.characters import parse_symbols
 from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
-from TTS.utils.io import copy_model_files


 @dataclass
TTS/tts/configs/xtts_config.py
@@ -88,6 +88,7 @@ class XttsConfig(BaseTTSConfig):
             "hu",
             "ko",
             "ja",
+            "hi",
         ]
     )
TTS/tts/layers/xtts/tokenizer.py
@@ -636,6 +636,9 @@ class VoiceBpeTokenizer:
             txt = korean_transliterate(txt)
         elif lang == "ja":
             txt = japanese_cleaners(txt, self.katsu)
+        elif lang == "hi":
+            # @manmay will implement this
+            txt = basic_cleaners(txt)
         else:
             raise NotImplementedError(f"Language '{lang}' is not supported.")
         return txt
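Since `basic_cleaners` is only a normalization fallback, the new `hi` branch amounts to lowercasing and whitespace collapsing until dedicated Hindi cleaning lands. A tiny sketch of what it does (assumes the TTS package; the sample string is illustrative):

```python
# basic_cleaners only lowercases and collapses whitespace, which is safe as a
# placeholder for Hindi text (Devanagari has no case distinction).
from TTS.tts.utils.text.cleaners import basic_cleaners

print(basic_cleaners("नमस्ते   दुनिया"))  # -> "नमस्ते दुनिया"
```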
TTS/tts/utils/text/phonemizers/espeak_wrapper.py
@@ -185,20 +185,16 @@ class ESpeak(BasePhonemizer):
         if tie:
             args.append("--tie=%s" % tie)

-        args.append('"' + text + '"')
+        args.append(text)
         # compute phonemes
         phonemes = ""
         for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
             logging.debug("line: %s", repr(line))
             ph_decoded = line.decode("utf8").strip()
-            # espeak need to skip first two characters of the retuned text:
-            # version 1.48.03: "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
+            # espeak:
             # version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
-            # espeak-ng need to skip the first character of the retuned text:
-            # "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
-
-            # dealing with the conditions descrived above
-            ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:]
+            # espeak-ng:
+            # "p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"

             # espeak-ng backend can add language flags that need to be removed:
             # "sɛʁtˈɛ̃ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ."
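For reference, the removed normalization compensated for phoneme strings from older espeak builds that emitted a stray leading underscore; the example string below comes from the diff's own comments:

```python
# What the removed line did: strip a leading "_" that old espeak versions
# prepended to the returned phoneme string.
ph_decoded = "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ"
ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:]
print(ph_decoded)  # -> "p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ"
```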
TTS/utils/io.py
@@ -1,13 +1,9 @@
-import datetime
-import json
 import os
 import pickle as pickle_tts
-import shutil
 from typing import Any, Callable, Dict, Union

 import fsspec
 import torch
-from coqpit import Coqpit

 from TTS.utils.generic_utils import get_user_data_dir

@@ -28,34 +24,6 @@ class AttrDict(dict):
         self.__dict__ = self


-def copy_model_files(config: Coqpit, out_path, new_fields=None):
-    """Copy config.json and other model files to training folder and add
-    new fields.
-
-    Args:
-        config (Coqpit): Coqpit config defining the training run.
-        out_path (str): output path to copy the file.
-        new_fields (dict): new fileds to be added or edited
-            in the config file.
-    """
-    copy_config_path = os.path.join(out_path, "config.json")
-    # add extra information fields
-    if new_fields:
-        config.update(new_fields, allow_new=True)
-    # TODO: Revert to config.save_json() once Coqpit supports arbitrary paths.
-    with fsspec.open(copy_config_path, "w", encoding="utf8") as f:
-        json.dump(config.to_dict(), f, indent=4)
-
-    # copy model stats file if available
-    if config.audio.stats_path is not None:
-        copy_stats_path = os.path.join(out_path, "scale_stats.npy")
-        filesystem = fsspec.get_mapper(copy_stats_path).fs
-        if not filesystem.exists(copy_stats_path):
-            with fsspec.open(config.audio.stats_path, "rb") as source_file:
-                with fsspec.open(copy_stats_path, "wb") as target_file:
-                    shutil.copyfileobj(source_file, target_file)
-
-
 def load_fsspec(
     path: str,
     map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None,
@@ -100,117 +68,3 @@ def load_checkpoint(
     if eval:
         model.eval()
     return model, state
-
-
-def save_fsspec(state: Any, path: str, **kwargs):
-    """Like torch.save but can save to other locations (e.g. s3:// , gs://).
-
-    Args:
-        state: State object to save
-        path: Any path or url supported by fsspec.
-        **kwargs: Keyword arguments forwarded to torch.save.
-    """
-    with fsspec.open(path, "wb") as f:
-        torch.save(state, f, **kwargs)
-
-
-def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs):
-    if hasattr(model, "module"):
-        model_state = model.module.state_dict()
-    else:
-        model_state = model.state_dict()
-    if isinstance(optimizer, list):
-        optimizer_state = [optim.state_dict() for optim in optimizer]
-    elif optimizer.__class__.__name__ == "CapacitronOptimizer":
-        optimizer_state = [optimizer.primary_optimizer.state_dict(), optimizer.secondary_optimizer.state_dict()]
-    else:
-        optimizer_state = optimizer.state_dict() if optimizer is not None else None
-
-    if isinstance(scaler, list):
-        scaler_state = [s.state_dict() for s in scaler]
-    else:
-        scaler_state = scaler.state_dict() if scaler is not None else None
-
-    if isinstance(config, Coqpit):
-        config = config.to_dict()
-
-    state = {
-        "config": config,
-        "model": model_state,
-        "optimizer": optimizer_state,
-        "scaler": scaler_state,
-        "step": current_step,
-        "epoch": epoch,
-        "date": datetime.date.today().strftime("%B %d, %Y"),
-    }
-    state.update(kwargs)
-    save_fsspec(state, output_path)
-
-
-def save_checkpoint(
-    config,
-    model,
-    optimizer,
-    scaler,
-    current_step,
-    epoch,
-    output_folder,
-    **kwargs,
-):
-    file_name = "checkpoint_{}.pth".format(current_step)
-    checkpoint_path = os.path.join(output_folder, file_name)
-    print("\n > CHECKPOINT : {}".format(checkpoint_path))
-    save_model(
-        config,
-        model,
-        optimizer,
-        scaler,
-        current_step,
-        epoch,
-        checkpoint_path,
-        **kwargs,
-    )
-
-
-def save_best_model(
-    current_loss,
-    best_loss,
-    config,
-    model,
-    optimizer,
-    scaler,
-    current_step,
-    epoch,
-    out_path,
-    keep_all_best=False,
-    keep_after=10000,
-    **kwargs,
-):
-    if current_loss < best_loss:
-        best_model_name = f"best_model_{current_step}.pth"
-        checkpoint_path = os.path.join(out_path, best_model_name)
-        print(" > BEST MODEL : {}".format(checkpoint_path))
-        save_model(
-            config,
-            model,
-            optimizer,
-            scaler,
-            current_step,
-            epoch,
-            checkpoint_path,
-            model_loss=current_loss,
-            **kwargs,
-        )
-        fs = fsspec.get_mapper(out_path).fs
-        # only delete previous if current is saved successfully
-        if not keep_all_best or (current_step < keep_after):
-            model_names = fs.glob(os.path.join(out_path, "best_model*.pth"))
-            for model_name in model_names:
-                if os.path.basename(model_name) != best_model_name:
-                    fs.rm(model_name)
-        # create a shortcut which always points to the currently best model
-        shortcut_name = "best_model.pth"
-        shortcut_path = os.path.join(out_path, shortcut_name)
-        fs.copy(checkpoint_path, shortcut_path)
-        best_loss = current_loss
-    return best_loss
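None of these helpers is gone from the project; the rest of this diff rewires callers to their equivalents in coqui's `trainer` package. A one-line migration sketch, using exactly the names imported elsewhere in this commit:

```python
# Before (removed in this commit):
#   from TTS.utils.io import copy_model_files, save_checkpoint, save_best_model
# After, as TTS/bin/train_encoder.py and the tests now do:
from trainer.io import copy_model_files, save_best_model, save_checkpoint
```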
TTS/utils/manage.py
@@ -26,7 +26,9 @@ LICENSE_URLS = {
 }


 class ModelManager(object):
+    tqdm_progress = None
     """Manage TTS models defined in .models.json.
     It provides an interface to list and download
     models defines in '.model.json'
@@ -525,12 +527,12 @@ class ModelManager(object):
         total_size_in_bytes = int(r.headers.get("content-length", 0))
         block_size = 1024  # 1 Kibibyte
         if progress_bar:
-            progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+            ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
         temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1])
         with open(temp_zip_name, "wb") as file:
             for data in r.iter_content(block_size):
                 if progress_bar:
-                    progress_bar.update(len(data))
+                    ModelManager.tqdm_progress.update(len(data))
                 file.write(data)
         with zipfile.ZipFile(temp_zip_name) as z:
             z.extractall(output_folder)
@@ -560,12 +562,12 @@ class ModelManager(object):
         total_size_in_bytes = int(r.headers.get("content-length", 0))
         block_size = 1024  # 1 Kibibyte
         if progress_bar:
-            progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+            ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
         temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1])
         with open(temp_tar_name, "wb") as file:
             for data in r.iter_content(block_size):
                 if progress_bar:
-                    progress_bar.update(len(data))
+                    ModelManager.tqdm_progress.update(len(data))
                 file.write(data)
         with tarfile.open(temp_tar_name) as t:
             t.extractall(output_folder)
@@ -596,10 +598,10 @@ class ModelManager(object):
         block_size = 1024  # 1 Kibibyte
         with open(temp_zip_name, "wb") as file:
             if progress_bar:
-                progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+                ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
             for data in r.iter_content(block_size):
                 if progress_bar:
-                    progress_bar.update(len(data))
+                    ModelManager.tqdm_progress.update(len(data))
                 file.write(data)

     @staticmethod
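Storing the bar on the class instead of rebinding the local `progress_bar` flag keeps the boolean intact and gives callers a handle to the live bar. A minimal sketch of the pattern (generic names; only `tqdm_progress` comes from the diff):

```python
# Class-attribute progress pattern, simplified from ModelManager above.
from tqdm import tqdm

class Downloader:
    tqdm_progress = None  # shared handle, like ModelManager.tqdm_progress

    def fetch(self, chunks, total, progress_bar=True):
        if progress_bar:
            Downloader.tqdm_progress = tqdm(total=total, unit="iB", unit_scale=True)
        for data in chunks:
            if progress_bar:
                Downloader.tqdm_progress.update(len(data))

Downloader().fetch([b"ab", b"cd"], total=4)  # toy usage
```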
TTS/utils/synthesizer.py
@@ -358,7 +358,11 @@ class Synthesizer(nn.Module):
             )

             # compute a new d_vector from the given clip.
-            if speaker_wav is not None and self.tts_model.speaker_manager is not None:
+            if (
+                speaker_wav is not None
+                and self.tts_model.speaker_manager is not None
+                and self.tts_model.speaker_manager.encoder_ap is not None
+            ):
                 speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)

         vocoder_device = "cpu"
dockerfiles/Dockerfile.dev (new file)
@@ -0,0 +1,44 @@
+ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
+FROM ${BASE}
+
+# Install OS dependencies:
+RUN apt-get update && apt-get upgrade -y
+RUN apt-get install -y --no-install-recommends \
+    gcc g++ \
+    make \
+    python3 python3-dev python3-pip python3-venv python3-wheel \
+    espeak-ng libsndfile1-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Major Python Dependencies:
+RUN pip3 install llvmlite --ignore-installed
+RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
+RUN rm -rf /root/.cache/pip
+
+WORKDIR /root
+
+# Copy Dependency Lock Files:
+COPY \
+    Makefile \
+    pyproject.toml \
+    setup.py \
+    requirements.dev.txt \
+    requirements.ja.txt \
+    requirements.notebooks.txt \
+    requirements.txt \
+    /root/
+
+# Install Project Dependencies
+# Separate stage to limit re-downloading:
+RUN pip install \
+    -r requirements.txt \
+    -r requirements.dev.txt \
+    -r requirements.ja.txt \
+    -r requirements.notebooks.txt
+
+# Copy TTS repository contents:
+COPY . /root
+
+# Installing the TTS package itself:
+RUN make install
docs/source/models/xtts.md
@@ -97,7 +97,7 @@ or for all wav files in a directory you can use:
 If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.

 ```console
-pip install deepspeed==0.8.3
+pip install deepspeed==0.10.3
 ```

 ```python
tests/aux_tests/test_embedding_manager.py
@@ -3,11 +3,11 @@ import unittest

 import numpy as np
 import torch
+from trainer.io import save_checkpoint

 from tests import get_tests_input_path
 from TTS.config import load_config
 from TTS.encoder.utils.generic_utils import setup_encoder_model
-from TTS.encoder.utils.io import save_checkpoint
 from TTS.tts.utils.managers import EmbeddingManager
 from TTS.utils.audio import AudioProcessor

@@ -31,7 +31,7 @@ class EmbeddingManagerTest(unittest.TestCase):

         # create a dummy speaker encoder
         model = setup_encoder_model(config)
-        save_checkpoint(model, None, None, get_tests_input_path(), 0)
+        save_checkpoint(config, model, None, None, 0, 0, get_tests_input_path())

         # load audio processor and speaker encoder
         manager = EmbeddingManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
tests/aux_tests/test_speaker_manager.py
@@ -3,11 +3,11 @@ import unittest

 import numpy as np
 import torch
+from trainer.io import save_checkpoint

 from tests import get_tests_input_path
 from TTS.config import load_config
 from TTS.encoder.utils.generic_utils import setup_encoder_model
-from TTS.encoder.utils.io import save_checkpoint
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.utils.audio import AudioProcessor

@@ -30,7 +30,7 @@ class SpeakerManagerTest(unittest.TestCase):

         # create a dummy speaker encoder
         model = setup_encoder_model(config)
-        save_checkpoint(model, None, None, get_tests_input_path(), 0)
+        save_checkpoint(config, model, None, None, 0, 0, get_tests_input_path())

         # load audio processor and speaker encoder
         ap = AudioProcessor(**config.audio)
tests/inference_tests/test_synthesizer.py
@@ -1,10 +1,11 @@
 import os
 import unittest

+from trainer.io import save_checkpoint
+
 from tests import get_tests_input_path
 from TTS.config import load_config
 from TTS.tts.models import setup_model
-from TTS.utils.io import save_checkpoint
 from TTS.utils.synthesizer import Synthesizer