Merge branch 'dev-managers' into dev-emotion

Edresson Casanova 2022-03-30 16:25:47 -03:00 committed by GitHub
commit aebbdfc62b
74 changed files with 566 additions and 516 deletions
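The bulk of this merge renames model checkpoints from `*.pth.tar` to plain `*.pth` and reflows code to satisfy the `black`/`isort` checks wired into the Makefile; the rest merges the manager/encoder refactor into the emotion branch. A minimal, hypothetical sketch for bringing already-saved local checkpoints in line with the new extension (the `models/` directory is illustrative only, not part of this commit):

```python
from pathlib import Path

# Hypothetical migration helper: strip the trailing ".tar" so that
# "checkpoint_100000.pth.tar" becomes "checkpoint_100000.pth".
# "models" is a placeholder for wherever your checkpoints live.
for old in Path("models").rglob("*.pth.tar"):
    new = old.with_suffix("")  # drops only the final ".tar" suffix
    print(f"{old} -> {new}")
    old.rename(new)
```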


@ -1,4 +1,4 @@
name: tts-tests
name: text-tests
on:
push:

.gitignore

@ -115,6 +115,7 @@ venv.bak/
*.swo
# pytorch models
*.pth
*.pth.tar
result/

CITATION.cff (new file)

@ -0,0 +1,20 @@
cff-version: 1.2.0
message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)"
title: "Coqui TTS"
abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production"
date-released: 2021-01-01
authors:
- family-names: "Eren"
given-names: "Gölge"
- name: "The Coqui TTS Team"
version: 1.4
doi: 10.5281/zenodo.6334862
license: "MPL-2.0"
url: "https://www.coqui.ai"
repository-code: "https://github.com/coqui-ai/TTS"
keywords:
- machine learning
- deep learning
- artificial intelligence
- text to speech
- TTS


@ -1,6 +1,7 @@
include README.md
include LICENSE.txt
include requirements.*.txt
include *.cff
include requirements.txt
include TTS/VERSION
recursive-include TTS *.json


@ -44,6 +44,8 @@ style: ## update code style.
lint: ## run pylint linter.
pylint ${target_dirs}
black ${target_dirs} --check
isort ${target_dirs} --check-only
system-deps: ## install linux system deps
sudo apt-get install -y libsndfile1-dev


@ -159,13 +159,13 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
### Multi-speaker Models
@ -185,7 +185,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
## Directory Structure


@ -25,7 +25,7 @@ These masks can be used for different purposes including training a TTS model wi
"""
Example run:
CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
--model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar
--model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
--config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
--dataset_metafile metadata.csv
--data_path /root/LJSpeech-1.1/


@ -13,7 +13,7 @@ parser = argparse.ArgumentParser(
description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/
""",
formatter_class=RawTextHelpFormatter,
)


@ -1,55 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pathlib
import subprocess
import time
import torch
from trainer import TrainerArgs
def main():
"""
Call train.py as a new process and pass command arguments
"""
parser = TrainerArgs().init_argparse(arg_prefix="")
parser.add_argument("--script", type=str, help="Target training script to distibute.")
args, unargs = parser.parse_known_args()
num_gpus = torch.cuda.device_count()
group_id = time.strftime("%Y_%m_%d-%H%M%S")
# set arguments for train.py
folder_path = pathlib.Path(__file__).parent.absolute()
if os.path.exists(os.path.join(folder_path, args.script)):
command = [os.path.join(folder_path, args.script)]
else:
command = [args.script]
command.append("--continue_path={}".format(args.continue_path))
command.append("--restore_path={}".format(args.restore_path))
command.append("--config_path={}".format(args.config_path))
command.append("--group_id=group_{}".format(group_id))
command.append("--use_ddp=true")
command += unargs
command.append("")
# run processes
processes = []
for i in range(num_gpus):
my_env = os.environ.copy()
my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
command[-1] = "--rank={}".format(i)
# prevent stdout for processes with rank != 0
stdout = None
p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with
processes.append(p)
print(command)
for p in processes:
p.wait()
if __name__ == "__main__":
main()


@ -1,18 +1,18 @@
import argparse
import torch
from argparse import RawTextHelpFormatter
import torch
from tqdm import tqdm
from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.speakers import SpeakerManager
def compute_encoder_accuracy(dataset_items, encoder_manager):
class_name_key = encoder_manager.encoder_config.class_name_key
map_classid_to_classname = getattr(encoder_manager.encoder_config, 'map_classid_to_classname', None)
class_acc_dict = {}
# compute embeddings for all wav_files
@ -43,11 +43,11 @@ def compute_encoder_accuracy(dataset_items, encoder_manager):
acc_avg = 0
for key, values in class_acc_dict.items():
acc = sum(values)/len(values)
acc = sum(values) / len(values)
print("Class", key, "Accuracy:", acc)
acc_avg += acc
print("Average Accuracy:", acc_avg/len(class_acc_dict))
print("Average Accuracy:", acc_avg / len(class_acc_dict))
if __name__ == "__main__":
@ -55,7 +55,7 @@ if __name__ == "__main__":
description="""Compute the accuracy of the encoder.\n\n"""
"""
Example runs:
python TTS/bin/eval_encoder.py emotion_encoder_model.pth.tar emotion_encoder_config.json dataset_config.json
python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
""",
formatter_class=RawTextHelpFormatter,
)


@ -60,13 +60,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
### Multi-speaker Models
@ -86,7 +86,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
"""
# We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
@ -217,7 +217,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
args = parser.parse_args()
# print the description if either text or list_models is not set
if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav:
if (
not args.text
and not args.list_models
and not args.list_speaker_idxs
and not args.list_language_idxs
and not args.reference_wav
):
parser.parse_args(["-h"])
# load model manager
@ -306,7 +312,15 @@ If you don't specify any models, then it uses LJSpeech based English model.
print(" > Text: {}".format(args.text))
# kick it
wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx, emotion_name=args.emotion_idx)
wav = synthesizer.tts(
args.text,
args.speaker_idx,
args.language_idx,
args.speaker_wav,
reference_wav=args.reference_wav,
reference_speaker_name=args.reference_speaker_idx,
emotion_name=args.emotion_idx
)
# save the results
print(" > Saving output to {}".format(args.out_path))
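The reformatted `synthesizer.tts()` call above now threads `emotion_name` through from the CLI, so a full invocation on this branch would look roughly like the following (flag names are inferred from the `args.*` destinations used above; the placeholders are illustrative):

```bash
tts --text "Text for TTS" \
    --model_path path/to/model.pth \
    --config_path path/to/config.json \
    --speakers_file_path path/to/speaker.json \
    --speaker_idx <speaker_id> \
    --emotion_idx <emotion_id> \
    --out_path output/path/speech.wav
```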


@ -9,6 +9,7 @@ import traceback
import torch
from torch.utils.data import DataLoader
from trainer.torch import NoamLR
from trainer.trainer_utils import get_optimizer
from TTS.encoder.dataset import EncoderDataset
from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
@ -19,7 +20,6 @@ from TTS.tts.datasets import load_tts_samples
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
from TTS.utils.io import copy_model_files
from trainer.trainer_utils import get_optimizer
from TTS.utils.training import check_update
torch.backends.cudnn.enabled = True
@ -52,16 +52,21 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
sampler = PerfectBatchSampler(
dataset.items,
classes,
batch_size=num_classes_in_batch*num_utter_per_class, # total batch size
batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
num_classes_in_batch=num_classes_in_batch,
num_gpus=1,
shuffle=not is_val,
drop_last=True)
drop_last=True,
)
if len(classes) < num_classes_in_batch:
if is_val:
raise RuntimeError(f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !")
raise RuntimeError(f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !")
raise RuntimeError(
f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
)
raise RuntimeError(
f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
)
# set the classes to avoid get wrong class_id when the number of training and eval classes are not equal
if is_val:
@ -76,6 +81,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
return loader, classes, dataset.get_map_classid_to_classname()
def evaluation(model, criterion, data_loader, global_step):
eval_loss = 0
for _, data in enumerate(data_loader):
@ -84,8 +90,12 @@ def evaluation(model, criterion, data_loader, global_step):
inputs, labels = data
# agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
labels = torch.transpose(labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1).reshape(labels.shape)
inputs = torch.transpose(inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
labels = torch.transpose(
labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
).reshape(labels.shape)
inputs = torch.transpose(
inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
).reshape(inputs.shape)
# dispatch data to GPU
if use_cuda:
@ -96,20 +106,23 @@ def evaluation(model, criterion, data_loader, global_step):
outputs = model(inputs)
# loss computation
loss = criterion(outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels)
loss = criterion(
outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
)
eval_loss += loss.item()
eval_avg_loss = eval_loss/len(data_loader)
eval_avg_loss = eval_loss / len(data_loader)
# save stats
dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
# plot the last batch in the evaluation
figures = {
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
}
dashboard_logger.eval_figures(global_step, figures)
return eval_avg_loss
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
model.train()
best_loss = float("inf")
@ -124,8 +137,12 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
# setup input data
inputs, labels = data
# agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
labels.shape
)
inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
inputs.shape
)
# ToDo: move it to a unit test
# labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
# inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
@ -157,7 +174,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
outputs = model(inputs)
# loss computation
loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels)
loss = criterion(
outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
)
loss.backward()
grad_norm, _ = check_update(model, c.grad_clip)
optimizer.step()
@ -211,7 +230,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
print(
">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
"EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
epoch, tot_loss/len(data_loader), grad_norm, epoch_time, avg_loader_time
epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
),
flush=True,
)
@ -222,10 +241,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
print("\n\n")
print("--> EVAL PERFORMANCE")
print(
" | > Epoch:{} AvgLoss: {:.5f} ".format(
epoch, eval_loss
),
flush=True,
" | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
flush=True,
)
# save the best checkpoint
best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
@ -262,7 +279,9 @@ def main(args): # pylint: disable=redefined-outer-name
copy_model_files(c, OUT_PATH)
if args.restore_path:
criterion, args.restore_step = model.load_checkpoint(c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion)
criterion, args.restore_step = model.load_checkpoint(
c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
)
print(" > Model restored from step %d" % args.restore_step, flush=True)
else:
args.restore_step = 0


@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS.
- Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
- Watch training on Tensorboard as in TTS


@ -33,10 +33,7 @@ class BaseEncoderConfig(BaseTrainingConfig):
grad_clip: float = 3.0
lr: float = 0.0001
optimizer: str = "radam"
optimizer_params: Dict = field(default_factory=lambda: {
"betas": [0.9, 0.999],
"weight_decay": 0
})
optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
lr_decay: bool = False
warmup_steps: int = 4000


@ -5,6 +5,7 @@ from torch.utils.data import Dataset
from TTS.encoder.utils.generic_utils import AugmentWAV
class EncoderDataset(Dataset):
def __init__(
self,
@ -57,7 +58,6 @@ class EncoderDataset(Dataset):
print(f" | > Num Classes: {len(self.classes)}")
print(f" | > Classes: {self.classes}")
def load_wav(self, filename):
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
return audio
@ -75,9 +75,7 @@ class EncoderDataset(Dataset):
]
# skip classes with number of samples >= self.num_utter_per_class
class_to_utters = {
k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class
}
class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
classes = list(class_to_utters.keys())
classes.sort()
@ -105,11 +103,11 @@ class EncoderDataset(Dataset):
def get_class_list(self):
return self.classes
def set_classes(self, classes):
self.classes = classes
self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
def get_map_classid_to_classname(self):
return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())


@ -195,6 +195,7 @@ class SoftmaxLoss(nn.Module):
class_id = torch.argmax(activations)
return class_id
class SoftmaxAngleProtoLoss(nn.Module):
"""
Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153


@ -1,12 +1,13 @@
import numpy as np
import torch
import torchaudio
import numpy as np
from coqpit import Coqpit
from torch import nn
from TTS.utils.io import load_fsspec
from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
from TTS.utils.generic_utils import set_init_dict
from coqpit import Coqpit
from TTS.utils.io import load_fsspec
class PreEmphasis(nn.Module):
def __init__(self, coefficient=0.97):
@ -20,6 +21,7 @@ class PreEmphasis(nn.Module):
x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
class BaseEncoder(nn.Module):
"""Base `encoder` class. Every new `encoder` model must inherit this.
@ -32,31 +34,31 @@ class BaseEncoder(nn.Module):
def get_torch_mel_spectrogram_class(self, audio_config):
return torch.nn.Sequential(
PreEmphasis(audio_config["preemphasis"]),
# TorchSTFT(
# n_fft=audio_config["fft_size"],
# hop_length=audio_config["hop_length"],
# win_length=audio_config["win_length"],
# sample_rate=audio_config["sample_rate"],
# window="hamming_window",
# mel_fmin=0.0,
# mel_fmax=None,
# use_htk=True,
# do_amp_to_db=False,
# n_mels=audio_config["num_mels"],
# power=2.0,
# use_mel=True,
# mel_norm=None,
# )
torchaudio.transforms.MelSpectrogram(
sample_rate=audio_config["sample_rate"],
n_fft=audio_config["fft_size"],
win_length=audio_config["win_length"],
hop_length=audio_config["hop_length"],
window_fn=torch.hamming_window,
n_mels=audio_config["num_mels"],
)
)
PreEmphasis(audio_config["preemphasis"]),
# TorchSTFT(
# n_fft=audio_config["fft_size"],
# hop_length=audio_config["hop_length"],
# win_length=audio_config["win_length"],
# sample_rate=audio_config["sample_rate"],
# window="hamming_window",
# mel_fmin=0.0,
# mel_fmax=None,
# use_htk=True,
# do_amp_to_db=False,
# n_mels=audio_config["num_mels"],
# power=2.0,
# use_mel=True,
# mel_norm=None,
# )
torchaudio.transforms.MelSpectrogram(
sample_rate=audio_config["sample_rate"],
n_fft=audio_config["fft_size"],
win_length=audio_config["win_length"],
hop_length=audio_config["hop_length"],
window_fn=torch.hamming_window,
n_mels=audio_config["num_mels"],
),
)
@torch.no_grad()
def inference(self, x, l2_norm=True):
@ -104,7 +106,9 @@ class BaseEncoder(nn.Module):
raise Exception("The %s not is a loss supported" % c.loss)
return criterion
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None):
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
):
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
try:
self.load_state_dict(state["model"])
@ -127,7 +131,12 @@ class BaseEncoder(nn.Module):
print(" > Criterion load ignored because of:", error)
# instance and load the criterion for the encoder classifier in inference time
if eval and criterion is None and "criterion" in state and getattr(config, 'map_classid_to_classname', None) is not None:
if (
eval
and criterion is None
and "criterion" in state
and getattr(config, "map_classid_to_classname", None) is not None
):
criterion = self.get_criterion(config, len(config.map_classid_to_classname))
criterion.load_state_dict(state["criterion"])

View File

@ -4,6 +4,7 @@ from torch import nn
# from TTS.utils.audio import TorchSTFT
from TTS.encoder.models.base_encoder import BaseEncoder
class SELayer(nn.Module):
def __init__(self, channel, reduction=8):
super(SELayer, self).__init__()


@ -147,7 +147,7 @@ def setup_encoder_model(config: "Coqpit"):
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
checkpoint_path = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
@ -177,7 +177,7 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path
"date": datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = "best_model.pth.tar"
bestmodel_path = "best_model.pth"
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
save_fsspec(state, bestmodel_path)


@ -5,7 +5,7 @@ from TTS.utils.io import save_fsspec
def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
checkpoint_path = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
@ -31,7 +31,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_s
"date": datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = "best_model.pth.tar"
bestmodel_path = "best_model.pth"
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
save_fsspec(state, bestmodel_path)


@ -1,4 +1,5 @@
import random
from torch.utils.data.sampler import Sampler, SubsetRandomSampler
@ -34,10 +35,21 @@ class PerfectBatchSampler(Sampler):
drop_last (bool): if True, drops last incomplete batch.
"""
def __init__(self, dataset_items, classes, batch_size, num_classes_in_batch, num_gpus=1, shuffle=True, drop_last=False, label_key="class_name"):
def __init__(
self,
dataset_items,
classes,
batch_size,
num_classes_in_batch,
num_gpus=1,
shuffle=True,
drop_last=False,
label_key="class_name",
):
super().__init__(dataset_items)
assert batch_size % (num_classes_in_batch * num_gpus) == 0, (
'Batch size must be divisible by number of classes times the number of data parallel devices (if enabled).')
assert (
batch_size % (num_classes_in_batch * num_gpus) == 0
), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."
label_indices = {}
for idx, item in enumerate(dataset_items):
@ -93,7 +105,7 @@ class PerfectBatchSampler(Sampler):
if groups % self._dp_devices == 0:
yield batch
else:
batch = batch[:(groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]
batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]
if len(batch) > 0:
yield batch
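For orientation, a hedged sketch of constructing the reworked `PerfectBatchSampler` and consuming the index batches it yields; the class names and counts are made up, the import path is an assumption, and only the constructor arguments visible in this hunk are taken as given:

```python
# Import path assumed; adjust to wherever PerfectBatchSampler lives in your checkout.
from TTS.encoder.utils.samplers import PerfectBatchSampler

# Toy dataset items carrying a "class_name" label, matching the sampler's default label_key.
items = [{"class_name": f"spk{i % 4}", "wav": f"clip_{i}.wav"} for i in range(64)]
classes = sorted({it["class_name"] for it in items})

sampler = PerfectBatchSampler(
    dataset_items=items,
    classes=classes,
    batch_size=8,  # must be divisible by num_classes_in_batch * num_gpus
    num_classes_in_batch=4,
    num_gpus=1,
    shuffle=True,
    drop_last=True,
)

# Each yielded item is a batch of dataset indices with every class equally represented.
for batch_indices in sampler:
    print(batch_indices)
    break
```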


@ -1,46 +1,34 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple
from abc import abstractmethod
from typing import Dict
import torch
from coqpit import Coqpit
from torch import nn
from trainer import TrainerModel
# pylint: skip-file
class BaseTrainerModel(ABC, nn.Module):
"""Abstract 🐸TTS class. Every new 🐸TTS model must inherit this."""
class BaseTrainerModel(TrainerModel):
"""BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
Every new 🐸TTS model must inherit it.
"""
@staticmethod
@abstractmethod
def init_from_config(config: Coqpit):
"""Init the model from given config.
"""Init the model and all its attributes from the given config.
Override this depending on your model.
"""
...
@abstractmethod
def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
"""Forward ... for the model mainly used in training.
You can be flexible here and use different number of arguments and argument names since it is intended to be
used by `train_step()` without exposing it out of the model.
Args:
input (torch.Tensor): Input tensor.
aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs.
Returns:
Dict: Model outputs. Main model output must be named as "model_outputs".
"""
outputs_dict = {"model_outputs": None}
...
return outputs_dict
@abstractmethod
def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
"""Forward ... for inference.
"""Forward pass for inference.
It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
is considered to be the main output and you can add any other auxiliary outputs as you want.
We don't use `*kwargs` since it is problematic with the TorchScript API.
@ -55,78 +43,9 @@ class BaseTrainerModel(ABC, nn.Module):
...
return outputs_dict
def format_batch(self, batch: Dict) -> Dict:
"""Format batch returned by the data loader before sending it to the model.
If not implemented, model uses the batch as is.
Can be used for data augmentation, feature ectraction, etc.
"""
return batch
def format_batch_on_device(self, batch: Dict) -> Dict:
"""Format batch on device before sending it to the model.
If not implemented, model uses the batch as is.
Can be used for data augmentation, feature ectraction, etc.
"""
return batch
@abstractmethod
def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
"""Perform a single training step. Run the model forward ... and compute losses.
Args:
batch (Dict): Input tensors.
criterion (nn.Module): Loss layer designed for the model.
Returns:
Tuple[Dict, Dict]: Model ouputs and computed losses.
"""
outputs_dict = {}
loss_dict = {} # this returns from the criterion
...
return outputs_dict, loss_dict
def train_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
"""Create visualizations and waveform examples for training.
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
be projected onto Tensorboard.
Args:
ap (AudioProcessor): audio processor used at training.
batch (Dict): Model inputs used at the previous training step.
outputs (Dict): Model outputs generated at the previoud training step.
Returns:
Tuple[Dict, np.ndarray]: training plots and output waveform.
"""
...
@abstractmethod
def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
"""Perform a single evaluation step. Run the model forward ... and compute losses. In most cases, you can
call `train_step()` with no changes.
Args:
batch (Dict): Input tensors.
criterion (nn.Module): Loss layer designed for the model.
Returns:
Tuple[Dict, Dict]: Model ouputs and computed losses.
"""
outputs_dict = {}
loss_dict = {} # this returns from the criterion
...
return outputs_dict, loss_dict
def eval_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
"""The same as `train_log()`"""
...
@abstractmethod
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
"""Load a checkpoint and get ready for training or inference.
"""Load a model checkpoint gile and get ready for training or inference.
Args:
config (Coqpit): Model configuration.
@ -135,36 +54,3 @@ class BaseTrainerModel(ABC, nn.Module):
strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
"""
...
@staticmethod
@abstractmethod
def init_from_config(config: Coqpit, samples: List[Dict] = None, verbose=False) -> "BaseTrainerModel":
"""Init the model from given config.
Override this depending on your model.
"""
...
@abstractmethod
def get_data_loader(
self, config: Coqpit, assets: Dict, is_eval: True, data_items: List, verbose: bool, num_gpus: int
):
...
# def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]:
# """Setup an return optimizer or optimizers."""
# ...
# def get_lr(self) -> Union[float, List[float]]:
# """Return learning rate(s).
# Returns:
# Union[float, List[float]]: Model's initial learning rates.
# """
# ...
# def get_scheduler(self, optimizer: torch.optim.Optimizer):
# ...
# def get_criterion(self):
# ...
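With the training-loop hooks now inherited from `trainer.TrainerModel`, a minimal sketch of what a 🐸TTS model subclass still has to provide, based only on the abstract methods kept above (`init_from_config`, `inference`, `load_checkpoint`); the class, layer sizes, and checkpoint format are hypothetical, and a real model must also implement the training hooks (`train_step`, `eval_step`, data loading) that `TrainerModel` expects:

```python
from typing import Dict

import torch
from coqpit import Coqpit

from TTS.model import BaseTrainerModel  # import path as used elsewhere in this diff


class ToyModel(BaseTrainerModel):
    """Hypothetical subclass illustrating the hooks BaseTrainerModel still requires."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(80, 80)

    @staticmethod
    def init_from_config(config: Coqpit):
        return ToyModel()

    def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
        return {"model_outputs": self.layer(input)}

    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
        return {"model_outputs": self.layer(input)}

    def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
        # plain torch.load for brevity; the codebase itself goes through load_fsspec
        state = torch.load(checkpoint_path, map_location="cpu")
        self.load_state_dict(state["model"], strict=strict)
        if eval:
            self.eval()
```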


@ -21,4 +21,4 @@ Run the server with the official models on a GPU.
```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
Run the server with custom models.
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json```
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```


@ -1,6 +1,6 @@
{
"tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
"tts_file":"best_model.pth.tar", // tts checkpoint file
"tts_file":"best_model.pth", // tts checkpoint file
"tts_config":"config.json", // tts config.json file
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
"vocoder_config":null,


@ -246,7 +246,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None):
continue
items.append({"text": text, "audio_file": wav_file, "speaker_name": f"LTTS_{speaker_name}"})
for item in items:
assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
assert os.path.exists(item["audio_file"]), f" [!] wav files don't exist - {item['audio_file']}"
return items


@ -7,15 +7,15 @@ import torch.distributed as dist
from coqpit import Coqpit
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.sampler import WeightedRandomSampler
from trainer.torch import DistributedSampler, DistributedSamplerWrapper
from TTS.model import BaseTrainerModel
from TTS.tts.datasets.dataset import TTSDataset
from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_balancer_weights
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from torch.utils.data.sampler import WeightedRandomSampler
# pylint: skip-file
@ -258,7 +258,7 @@ class BaseTTS(BaseTrainerModel):
# sampler for DDP
if sampler is None:
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
else: # If a sampler is already defined use this sampler and DDP sampler together
else: # If a sampler is already defined use this sampler and DDP sampler together
sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler
return sampler
@ -279,9 +279,7 @@ class BaseTTS(BaseTrainerModel):
# setup multi-speaker attributes
if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
if hasattr(config, "model_args"):
speaker_id_mapping = (
self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
)
speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
config.use_d_vector_file = config.model_args.use_d_vector_file
else:
@ -293,9 +291,7 @@ class BaseTTS(BaseTrainerModel):
# setup multi-lingual attributes
if hasattr(self, "language_manager") and self.language_manager is not None:
language_id_mapping = (
self.language_manager.ids if self.args.use_language_embedding else None
)
language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None
else:
language_id_mapping = None


@ -676,6 +676,7 @@ class Vits(BaseTTS):
raise RuntimeError(
" [!] To use the speaker consistency loss (SCL) you need to specify encoder_model_path and encoder_config_path !!"
)
# load encoder
self.speaker_manager.init_encoder(self.args.encoder_model_path, self.args.encoder_config_path)
self.speaker_manager.encoder.eval()
@ -1095,7 +1096,9 @@ class Vits(BaseTTS):
return outputs
@torch.no_grad()
def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None):
def inference_voice_conversion(
self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None
):
"""Inference for voice conversion
Args:
@ -1106,7 +1109,13 @@ class Vits(BaseTTS):
reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]`
"""
# compute spectrograms
y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2)
y = wav_to_spec(
reference_wav,
self.config.audio.fft_size,
self.config.audio.hop_length,
self.config.audio.win_length,
center=False,
).transpose(1, 2)
y_lengths = torch.tensor([y.size(-1)]).to(y.device)
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
@ -1346,6 +1355,7 @@ class Vits(BaseTTS):
else:
emotion_id = self.emotion_manager.ids[emotion_name]
return {
"text": text,
"speaker_id": speaker_id,
@ -1419,12 +1429,8 @@ class Vits(BaseTTS):
d_vectors = torch.FloatTensor(d_vectors)
# get language ids from language names
if (
self.language_manager is not None
and self.language_manager.ids
and self.args.use_language_embedding
):
language_ids = [self.language_manager.ids[ln] for ln in batch["f"]]
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding:
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]]
if language_ids is not None:
language_ids = torch.LongTensor(language_ids)


@ -1,5 +1,6 @@
import os
from typing import Dict, List, Any
from typing import Any, Dict, List
import fsspec
import numpy as np
@ -9,6 +10,7 @@ from coqpit import Coqpit
from TTS.config import check_config_and_model_args
from TTS.tts.utils.managers import BaseIDManager
class LanguageManager(BaseIDManager):
"""Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
in a way that can be queried by language.


@ -12,13 +12,11 @@ from TTS.utils.audio import AudioProcessor
class BaseIDManager:
""" Base `ID` Manager class. Every new `ID` manager must inherit this.
"""Base `ID` Manager class. Every new `ID` manager must inherit this.
It defines common `ID` manager specific functions.
"""
def __init__(
self,
id_file_path: str = ""
):
def __init__(self, id_file_path: str = ""):
self.ids = {}
if id_file_path:
@ -85,10 +83,12 @@ class BaseIDManager:
ids = {name: i for i, name in enumerate(classes)}
return ids
class EmbeddingManager(BaseIDManager):
""" Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
It defines common `Embedding` manager specific functions.
"""
def __init__(
self,
embedding_file_path: str = "",
@ -225,7 +225,10 @@ class EmbeddingManager(BaseIDManager):
"""
self.encoder_config = load_config(config_path)
self.encoder = setup_encoder_model(self.encoder_config)
self.encoder_criterion = self.encoder.load_checkpoint(self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda)
self.encoder_criterion = self.encoder.load_checkpoint(
self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda
)
self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list:


@ -10,6 +10,7 @@ from coqpit import Coqpit
from TTS.config import get_from_config_or_model_args_with_default
from TTS.tts.utils.managers import EmbeddingManager
class SpeakerManager(EmbeddingManager):
"""Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
in a way that can be queried by speaker or clip.
@ -67,6 +68,7 @@ class SpeakerManager(EmbeddingManager):
use_cuda=use_cuda
)
if data_items:
self.set_ids_from_data(data_items, parse_key="speaker_name")


@ -218,6 +218,7 @@ def synthesis(
}
return return_dict
def transfer_voice(
model,
CONFIG,
@ -281,12 +282,7 @@ def transfer_voice(
_func = model.module.inference_voice_conversion
else:
_func = model.inference_voice_conversion
model_outputs = _func(
reference_wav,
speaker_id,
d_vector,
reference_speaker_id,
reference_d_vector)
model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)
# convert outputs to numpy
# plot results


@ -12,16 +12,9 @@ GRUUT_LANGS = list(Gruut.supported_languages())
# Dict setting default phonemizers for each language
DEF_LANG_TO_PHONEMIZER = {
"ja-jp": JA_JP_Phonemizer.name(),
"zh-cn": ZH_CN_Phonemizer.name(),
}
# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
_new_dict = dict(list(zip(GRUUT_LANGS, _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
# Add ESpeak languages and override any existing ones
@ -29,7 +22,10 @@ _ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:


@ -371,7 +371,9 @@ class AudioProcessor(object):
self.hop_length = hop_length
self.win_length = win_length
assert min_level_db != 0.0, " [!] min_level_db is 0"
assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
assert (
self.win_length <= self.fft_size
), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}"
members = vars(self)
if verbose:
print(" > Setting up Audio Processor...")


@ -67,7 +67,7 @@ def get_experiment_folder_path(root_path, model_name):
def remove_experiment_folder(experiment_path):
"""Check folder if there is a checkpoint, otherwise remove the folder"""
fs = fsspec.get_mapper(experiment_path).fs
checkpoint_files = fs.glob(experiment_path + "/*.pth.tar")
checkpoint_files = fs.glob(experiment_path + "/*.pth")
if not checkpoint_files:
if fs.exists(experiment_path):
fs.rm(experiment_path, recursive=True)


@ -140,7 +140,7 @@ def save_checkpoint(
output_folder,
**kwargs,
):
file_name = "checkpoint_{}.pth.tar".format(current_step)
file_name = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(output_folder, file_name)
print("\n > CHECKPOINT : {}".format(checkpoint_path))
save_model(
@ -170,7 +170,7 @@ def save_best_model(
**kwargs,
):
if current_loss < best_loss:
best_model_name = f"best_model_{current_step}.pth.tar"
best_model_name = f"best_model_{current_step}.pth"
checkpoint_path = os.path.join(out_path, best_model_name)
print(" > BEST MODEL : {}".format(checkpoint_path))
save_model(
@ -187,12 +187,12 @@ def save_best_model(
fs = fsspec.get_mapper(out_path).fs
# only delete previous if current is saved successfully
if not keep_all_best or (current_step < keep_after):
model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar"))
model_names = fs.glob(os.path.join(out_path, "best_model*.pth"))
for model_name in model_names:
if os.path.basename(model_name) != best_model_name:
fs.rm(model_name)
# create a shortcut which always points to the currently best model
shortcut_name = "best_model.pth.tar"
shortcut_name = "best_model.pth"
shortcut_path = os.path.join(out_path, shortcut_name)
fs.copy(checkpoint_path, shortcut_path)
best_loss = current_loss
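Taken together, these helpers now leave a training output folder that looks roughly like the sketch below (run name and step numbers are illustrative; `config.json` comes from `copy_model_files`, as seen in the encoder training script above):

```
<run_folder>/
├── config.json
├── checkpoint_10000.pth
├── checkpoint_20000.pth
├── best_model_18500.pth
└── best_model.pth        # shortcut copy of the current best checkpoint
```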


@ -4,6 +4,7 @@ import os
import zipfile
from pathlib import Path
from shutil import copyfile, rmtree
from typing import Tuple
import requests
@ -114,7 +115,7 @@ class ModelManager(object):
e.g. 'tts_model/en/ljspeech/tacotron'
Every model must have the following files:
- *.pth.tar : pytorch model checkpoint file.
- *.pth : pytorch model checkpoint file.
- config.json : model config file.
- scale_stats.npy (if exist): scale values for preprocessing.
@ -127,9 +128,6 @@ class ModelManager(object):
model_item = self.models_dict[model_type][lang][dataset][model]
# set the model specific output path
output_path = os.path.join(self.output_prefix, model_full_name)
output_model_path = os.path.join(output_path, "model_file.pth.tar")
output_config_path = os.path.join(output_path, "config.json")
if os.path.exists(output_path):
print(f" > {model_name} is already downloaded.")
else:
@ -137,10 +135,51 @@ class ModelManager(object):
print(f" > Downloading model to {output_path}")
# download from github release
self._download_zip_file(model_item["github_rls_url"], output_path)
# find downloaded files
output_model_path, output_config_path = self._find_files(output_path)
# update paths in the config.json
self._update_paths(output_path, output_config_path)
return output_model_path, output_config_path, model_item
@staticmethod
def _find_files(output_path: str) -> Tuple[str, str]:
"""Find the model and config files in the output path
Args:
output_path (str): path to the model files
Returns:
Tuple[str, str]: path to the model file and config file
"""
model_file = None
config_file = None
for file_name in os.listdir(output_path):
if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]:
model_file = os.path.join(output_path, file_name)
elif file_name == "config.json":
config_file = os.path.join(output_path, file_name)
if model_file is None:
raise ValueError(" [!] Model file not found in the output path")
if config_file is None:
raise ValueError(" [!] Config file not found in the output path")
return model_file, config_file
@staticmethod
def _find_speaker_encoder(output_path: str) -> str:
"""Find the speaker encoder file in the output path
Args:
output_path (str): path to the model files
Returns:
str: path to the speaker encoder file
"""
speaker_encoder_file = None
for file_name in os.listdir(output_path):
if file_name in ["model_se.pth", "model_se.pth.tar"]:
speaker_encoder_file = os.path.join(output_path, file_name)
return speaker_encoder_file
def _update_paths(self, output_path: str, config_path: str) -> None:
"""Update paths for certain files in config.json after download.
@ -174,7 +213,7 @@ class ModelManager(object):
@staticmethod
def _update_path(field_name, new_path, config_path):
"""Update the path in the model config.json for the current environment after download"""
if os.path.exists(new_path):
if new_path and os.path.exists(new_path):
config = load_config(config_path)
field_names = field_name.split(".")
if len(field_names) > 1:


@ -214,8 +214,8 @@ class Synthesizer(object):
if not text and not reference_wav:
raise ValueError(
"You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
)
"You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
)
if text:
sens = self.split_into_sentences(text)
@ -228,8 +228,10 @@ class Synthesizer(object):
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
if speaker_name and isinstance(speaker_name, str):
if self.tts_config.use_d_vector_file:
# get the average speaker embedding from the saved embeddings.
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(speaker_name, num_samples=None, randomize=False)
# get the average speaker embedding from the saved d_vectors.
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
speaker_name, num_samples=None, randomize=False
)
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
else:
# get speaker idx from the speaker name
@ -354,26 +356,32 @@ class Synthesizer(object):
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
if reference_speaker_name and isinstance(reference_speaker_name, str):
if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved embeddings.
reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(reference_speaker_name)[0]
reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :] # [1 x embedding_dim]
# get the speaker embedding from the saved d_vectors.
reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(
reference_speaker_name
)[0]
reference_speaker_embedding = np.array(reference_speaker_embedding)[
None, :
] # [1 x embedding_dim]
else:
# get speaker idx from the speaker name
reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name]
else:
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(reference_wav)
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
reference_wav
)
outputs = transfer_voice(
model=self.tts_model,
CONFIG=self.tts_config,
use_cuda=self.use_cuda,
reference_wav=reference_wav,
speaker_id=speaker_id,
d_vector=speaker_embedding,
use_griffin_lim=use_gl,
reference_speaker_id=reference_speaker_id,
reference_d_vector=reference_speaker_embedding
)
model=self.tts_model,
CONFIG=self.tts_config,
use_cuda=self.use_cuda,
reference_wav=reference_wav,
speaker_id=speaker_id,
d_vector=speaker_embedding,
use_griffin_lim=use_gl,
reference_speaker_id=reference_speaker_id,
reference_d_vector=reference_speaker_embedding,
)
waveform = outputs
if not use_gl:
mel_postnet_spec = outputs[0].detach().cpu().numpy()


@ -29,7 +29,7 @@ You can continue a previous training run by the following command.
You can fine-tune a pre-trained model by the following command.
```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar```
```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth```
Restoring a model starts a new training in a different folder. It only restores model weights with the given checkpoint file. However, continuing a training starts from the same directory where the previous training run left off.
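To make the distinction concrete, a hedged pair of invocations (paths are placeholders; `--continue_path` is the trainer argument used for resuming, as seen in the removed `distribute.py` above):

```bash
# Restore: start a fresh run in a new output folder, initialised from the checkpoint's weights.
CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth

# Continue: pick the previous run up where it left off, in its original output folder.
CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --continue_path path/to/your/previous_run/
```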


@ -93,13 +93,13 @@ them and fine-tune it for your own dataset. This will help you in two main ways:
```bash
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
```
```bash
CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \
--config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
```
As stated above, you can also use command-line arguments to change the model configuration.
@ -107,7 +107,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways:
```bash
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
--coqpit.run_name "glow-tts-finetune" \
--coqpit.lr 0.00001
```


@ -44,7 +44,7 @@ Run your own TTS model (Using Griffin-Lim Vocoder)
```bash
tts --text "Text for TTS" \
--model_path path/to/model.pth.tar \
--model_path path/to/model.pth \
--config_path path/to/config.json \
--out_path folder/to/save/output.wav
```
@ -54,9 +54,9 @@ Run your own TTS and Vocoder models
```bash
tts --text "Text for TTS" \
--config_path path/to/config.json \
--model_path path/to/model.pth.tar \
--model_path path/to/model.pth \
--out_path folder/to/save/output.wav \
--vocoder_path path/to/vocoder.pth.tar \
--vocoder_path path/to/vocoder.pth \
--vocoder_config_path path/to/vocoder_config.json
```


@ -33,7 +33,7 @@
If you like to run a multi-gpu training using DDP back-end,
```bash
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script <path_to_your_script>/train_glowtts.py
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python -m trainer.distribute --script <path_to_your_script>/train_glowtts.py
```
The example above runs a multi-gpu training using GPUs `0, 1, 2`.
@ -122,7 +122,7 @@
```bash
$ tts --text "Text for TTS" \
--model_path path/to/checkpoint_x.pth.tar \
--model_path path/to/checkpoint_x.pth \
--config_path path/to/config.json \
--out_path folder/to/save/output.wav
```


@ -50,13 +50,13 @@ A breakdown of a simple script that trains a GlowTTS model on the LJspeech datas
- Fine-tune a model.
```bash
CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth.tar
CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth
```
- Run multi-gpu training.
```bash
CUDA_VISIBLE_DEVICES=0,1,2 python TTS/bin/distribute.py --script train.py
CUDA_VISIBLE_DEVICES=0,1,2 python -m trainer.distribute --script train.py
```
### CLI Way


@ -66,7 +66,7 @@
"DATASET = \"ljspeech\"\n",
"METADATA_FILE = \"metadata.csv\"\n",
"CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n",
"MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth.tar\"\n",
"MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n",
"BATCH_SIZE = 32\n",
"\n",
"QUANTIZED_WAV = False\n",


@ -66,7 +66,7 @@
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
"# My single speaker locations\n",


@ -73,7 +73,7 @@
"\n",
"# Set constants\n",
"ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n",
"MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
"MODEL_PATH = ROOT_PATH + '/best_model.pth'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = './hard_sentences/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",


@ -416,7 +416,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.9.5"
}
},
"nbformat": 4,


@ -3,6 +3,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
@ -12,21 +16,51 @@
"\n",
"import IPython.display as ipd\n",
"import glob"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"config_path = \"/home/erogol/gdrive/Projects/TTS/recipes/ljspeech/align_tts/config_transformer2.json\"\n",
"data_path = \"/home/erogol/gdrive/Datasets/LJSpeech-1.1/\"\n",
"\n",
"file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n",
"CONFIG = load_config(config_path)\n",
"from TTS.config.shared_configs import BaseAudioConfig\n",
"CONFIG = BaseAudioConfig()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ✍️ Set these values "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_path = \"/root/wav48_silence_trimmed/\"\n",
"file_ext = \".flac\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read audio files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file_paths = glob.glob(data_path + f\"/**/*{file_ext}\", recursive=True)\n",
"\n",
"# Change this to the index of the desired file listed below\n",
"sample_file_index = 10\n",
@ -35,44 +69,45 @@
"\n",
"print(\"File list, by index:\")\n",
"dict(enumerate(file_paths))"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Setup Audio Processor\n",
"## ✍️ Set Audio Processor\n",
"Play with the AP parameters until you find a good fit with the synthesis speech below.\n",
"\n",
"The default values are loaded from your config.json file, so you only need to\n",
"uncomment and modify values below that you'd like to tune."
],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"tune_params={\n",
"# 'audio_processor': 'audio',\n",
"# 'num_mels': 80, # In general, you don't need to change this. \n",
"# 'fft_size': 1024, # In general, you don't need to change this.\n",
"# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n",
"# 'hop_length': 256, # In general, you don't need to change this.\n",
"# 'win_length': 1024, # In general, you don't need to change this.\n",
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"# 'min_level_db': -100,\n",
"# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
"# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
"# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
"# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
"# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
"# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
" 'num_mels': 80, # In general, you don't need to change this. \n",
" 'fft_size': 2400, # In general, you don't need to change this.\n",
" 'frame_length_ms': 50, \n",
" 'frame_shift_ms': 12.5,\n",
" 'sample_rate': 48000, # This must match the sample rate of the dataset.\n",
" 'hop_length': None, # In general, you don't need to change this.\n",
" 'win_length': 1024, # In general, you don't need to change this.\n",
" 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
" 'min_level_db': -100,\n",
" 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
" 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
" 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
" 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
" 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
" 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
"}\n",
"\n",
"# These options have to be forced off in order to avoid errors about the \n",
@ -86,59 +121,57 @@
"}\n",
"\n",
"# Override select parts of loaded config with parameters above\n",
"tuned_config = CONFIG.audio.copy()\n",
"tuned_config = CONFIG.copy()\n",
"tuned_config.update(reset)\n",
"tuned_config.update(tune_params)\n",
"\n",
"AP = AudioProcessor(**tuned_config);"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"source": [
"### Check audio loading "
],
"metadata": {
"Collapsed": "false"
}
},
"source": [
"### Check audio loading "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"wav = AP.load_wav(SAMPLE_FILE_PATH)\n",
"ipd.Audio(data=wav, rate=AP.sample_rate) "
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"source": [
"### Generate Mel-Spectrogram and Re-synthesis with GL"
],
"metadata": {
"Collapsed": "false"
}
},
"source": [
"### Generate Mel-Spectrogram and Re-synthesis with GL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"AP.power = 1.5"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mel = AP.melspectrogram(wav)\n",
"print(\"Max:\", mel.max())\n",
@ -148,24 +181,24 @@
"\n",
"wav_gen = AP.inv_melspectrogram(mel)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"source": [
"### Generate Linear-Spectrogram and Re-synthesis with GL"
],
"metadata": {
"Collapsed": "false"
}
},
"source": [
"### Generate Linear-Spectrogram and Re-synthesis with GL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"spec = AP.spectrogram(wav)\n",
"print(\"Max:\", spec.max())\n",
@ -175,26 +208,26 @@
"\n",
"wav_gen = AP.inv_spectrogram(spec)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Compare values for a certain parameter\n",
"\n",
"Optimize your parameters by comparing different values per parameter at a time."
],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"from librosa import display\n",
"from matplotlib import pylab as plt\n",
@ -234,39 +267,39 @@
" val = values[idx]\n",
" print(\" > {} = {}\".format(attribute, val))\n",
" IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
},
"outputs": [],
"source": [
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
]
}
],
"metadata": {
"interpreter": {
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.5 64-bit ('torch': conda)"
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@ -278,12 +311,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"interpreter": {
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
"version": "3.9.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
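
A consolidated sketch of the tuning loop the notebook above walks through: build an audio config, override a few values, and listen to a Griffin-Lim re-synthesis. The overridden values mirror the 48 kHz settings in the notebook and the file path is a placeholder, not a fixed recommendation.

import IPython.display as ipd

from TTS.config.shared_configs import BaseAudioConfig
from TTS.utils.audio import AudioProcessor

# Start from the default audio config and override the values under test
# (48 kHz VCTK-style settings, as in the notebook above).
tuned_config = BaseAudioConfig().copy()
tuned_config.update({"sample_rate": 48000, "fft_size": 2400, "power": 1.5, "do_trim_silence": True})

AP = AudioProcessor(**tuned_config)
wav = AP.load_wav("/root/wav48_silence_trimmed/p225/p225_001_mic1.flac")  # placeholder path
mel = AP.melspectrogram(wav)             # forward transform with the tuned parameters
wav_gen = AP.inv_melspectrogram(mel)     # Griffin-Lim re-synthesis
ipd.Audio(wav_gen, rate=AP.sample_rate)  # listen, then adjust the values above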

View File

@ -6,7 +6,7 @@ max-line-length=120
[tool.black]
line-length = 120
target-version = ['py38']
target-version = ['py39']
exclude = '''
(

View File

@ -49,7 +49,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = AlignTTS(config, ap, tokenizer)
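
The comment above also mentions passing a custom formatter to `load_tts_samples`. Below is a minimal sketch of what that might look like for a `wav_path|transcript` metadata file; the file layout, the item keys, and the `formatter` keyword are assumptions for illustration, not part of this change.

import os

def my_formatter(root_path, meta_file, **kwargs):  # hypothetical formatter
    """Parse a `wav_path|transcript` metadata file into sample dicts."""
    items = []
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            wav_file, text = line.strip().split("|", maxsplit=1)
            items.append(
                {"text": text, "audio_file": os.path.join(root_path, wav_file), "speaker_name": "single_speaker"}
            )
    return items

train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    formatter=my_formatter,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)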

View File

@ -84,7 +84,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init the model
model = ForwardTTS(config, ap, tokenizer, speaker_manager=None)

View File

@ -83,7 +83,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init the model
model = ForwardTTS(config, ap, tokenizer)

View File

@ -60,7 +60,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input

View File

@ -41,11 +41,6 @@ model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -41,11 +41,6 @@ model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -67,7 +67,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = ForwardTTS(config, ap, tokenizer)

View File

@ -77,7 +77,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input

View File

@ -74,7 +74,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input
@ -84,11 +89,6 @@ model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -40,11 +40,6 @@ model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = Vits(config, ap, tokenizer, speaker_manager=None)

View File

@ -6,12 +6,11 @@ from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import CharactersConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsArgs
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
@ -110,7 +109,12 @@ config.from_dict(config.to_dict())
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
@ -131,11 +135,6 @@ model = Vits(config, ap, tokenizer, speaker_manager, language_manager)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -71,7 +71,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader

View File

@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader

View File

@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader

View File

@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader

View File

@ -72,7 +72,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it mainly handles the speaker-id to speaker-name mapping for the model and the data-loader

View File

@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it mainly handles the speaker-id to speaker-name mapping for the model and the data-loader

View File

@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it mainly handles the speaker-id to speaker-name mapping for the model and the data-loader

View File

@ -79,7 +79,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
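
As a plain-Python illustration of the mapping this comment refers to (not the manager's actual API): every speaker name found in the loaded samples is assigned a stable integer id that the model and the data-loader then use.

# Illustration only, not the SpeakerManager API.
speaker_names = sorted({item["speaker_name"] for item in train_samples + eval_samples})
speaker_name_to_id = {name: idx for idx, name in enumerate(speaker_names)}
num_speakers = len(speaker_name_to_id)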

View File

@ -33,6 +33,6 @@ pypinyin
mecab-python3==1.0.3
unidic-lite==1.0.8
# gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3
# others
webrtcvad # for VAD
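
Since the gruut pin jumps from the 2.0 series to 2.2.3, a quick way to sanity-check the upgrade is to run a phonemization call by hand. The `sentences()` iteration below is assumed from gruut's 2.x documentation, not from code in this repository.

from gruut import sentences  # assumed gruut 2.x API

for sent in sentences("Une phrase d'exemple.", lang="fr"):
    for word in sent:
        if word.phonemes:
            print(word.text, " ".join(word.phonemes))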

View File

@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_GlowTTS():
# set paths
config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config
c = load_config(config_path)
@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron2():
# set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config
c = load_config(config_path)
@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron():
# set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config
c = load_config(config_path)
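
These tests now look for checkpoints with a plain `.pth` suffix instead of `.pth.tar`. A minimal sketch of writing and reading such a file with standard PyTorch follows; the state-dict keys are placeholders, not the project's exact checkpoint schema.

import torch

model = torch.nn.Linear(4, 4)  # stand-in model for illustration
checkpoint_path = "checkpoint_test.pth"
torch.save({"model": model.state_dict(), "step": 0}, checkpoint_path)

state = torch.load(checkpoint_path, map_location="cpu")
model.load_state_dict(state["model"])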

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar")
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")

View File

@ -1,14 +1,13 @@
import functools
import unittest
import torch
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.encoder.utils.samplers import PerfectBatchSampler
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.languages import get_language_balancer_weights
from TTS.tts.utils.speakers import get_speaker_balancer_weights
from TTS.encoder.utils.samplers import PerfectBatchSampler
# Fixing random state to avoid random fails
torch.manual_seed(0)
@ -60,7 +59,9 @@ class TestSamplers(unittest.TestCase):
assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced"
def test_language_weighted_random_sampler(self): # pylint: disable=no-self-use
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_language_balancer_weights(train_samples), len(train_samples))
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
get_language_balancer_weights(train_samples), len(train_samples)
)
ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
en, pt = 0, 0
for index in ids:
@ -73,7 +74,9 @@ class TestSamplers(unittest.TestCase):
def test_speaker_weighted_random_sampler(self): # pylint: disable=no-self-use
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_speaker_balancer_weights(train_samples), len(train_samples))
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
get_speaker_balancer_weights(train_samples), len(train_samples)
)
ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
spk1, spk2 = 0, 0
for index in ids:
@ -92,11 +95,12 @@ class TestSamplers(unittest.TestCase):
sampler = PerfectBatchSampler(
train_samples,
classes,
batch_size=2 * 3, # total batch size
batch_size=2 * 3, # total batch size
num_classes_in_batch=2,
label_key="speaker_name",
shuffle=False,
drop_last=True)
drop_last=True,
)
batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)])
for batch in batchs:
spk1, spk2 = 0, 0
@ -116,11 +120,12 @@ class TestSamplers(unittest.TestCase):
sampler = PerfectBatchSampler(
train_samples,
classes,
batch_size=2 * 3, # total batch size
batch_size=2 * 3, # total batch size
num_classes_in_batch=2,
label_key="speaker_name",
shuffle=True,
drop_last=False)
drop_last=False,
)
batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)])
for batch in batchs:
spk1, spk2 = 0, 0
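
For context, the balancing samplers exercised above are meant to be handed to a PyTorch DataLoader. A small sketch, assuming `train_samples` is the list of sample dicts used in these tests:

from torch.utils.data import DataLoader, WeightedRandomSampler

from TTS.tts.utils.languages import get_language_balancer_weights

weights = get_language_balancer_weights(train_samples)
sampler = WeightedRandomSampler(weights, num_samples=len(train_samples))
# A Python list works as a map-style dataset; real training code would wrap
# the samples in a proper Dataset instead.
loader = DataLoader(train_samples, batch_size=8, sampler=sampler)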

View File

@ -20,7 +20,7 @@ class SynthesizerTest(unittest.TestCase):
def test_in_out(self):
self._create_random_model()
tts_root_path = get_tests_output_path()
tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar")
tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth")
tts_config = os.path.join(tts_root_path, "dummy_model_config.json")
synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
synthesizer.tts("Better this test works!!")
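
A short end-to-end sketch with the renamed `.pth` checkpoint, writing the result to disk; it assumes `Synthesizer.save_wav()` is available as used elsewhere in the codebase.

from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer("checkpoint_10.pth", "dummy_model_config.json", None, None)
wav = synthesizer.tts("Better this test works!!")
synthesizer.save_wav(wav, "out.wav")  # save_wav assumed available on Synthesizer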

View File

@ -1,5 +1,5 @@
{
"tts_checkpoint":"checkpoint_10.pth.tar", // tts checkpoint file
"tts_checkpoint":"checkpoint_10.pth", // tts checkpoint file
"tts_config":"dummy_model_config.json", // tts config.json file
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
"wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.