Mirror of https://github.com/coqui-ai/TTS.git
Commit aebbdfc62b: Merge branch 'dev-managers' into dev-emotion
@@ -1,4 +1,4 @@
name: tts-tests
name: text-tests

on:
  push:
@@ -115,6 +115,7 @@ venv.bak/
*.swo

# pytorch models
*.pth
*.pth.tar
result/
@@ -0,0 +1,20 @@
cff-version: 1.2.0
message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)"
title: "Coqui TTS"
abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production"
date-released: 2021-01-01
authors:
  - family-names: "Eren"
    given-names: "Gölge"
  - name: "The Coqui TTS Team"
version: 1.4
doi: 10.5281/zenodo.6334862
license: "MPL-2.0"
url: "https://www.coqui.ai"
repository-code: "https://github.com/coqui-ai/TTS"
keywords:
  - machine learning
  - deep learning
  - artificial intelligence
  - text to speech
  - TTS
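Since the new CITATION.cff above is plain YAML, its fields can be read programmatically. A minimal sketch, assuming PyYAML is installed and the file sits at the repository root:

```python
# Read the citation metadata shown above (CITATION.cff is plain YAML).
import yaml

with open("CITATION.cff", encoding="utf-8") as f:
    citation = yaml.safe_load(f)

# e.g. "Coqui TTS 1.4, doi: 10.5281/zenodo.6334862"
print(f"{citation['title']} {citation['version']}, doi: {citation['doi']}")
```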
@@ -1,6 +1,7 @@
include README.md
include LICENSE.txt
include requirements.*.txt
include *.cff
include requirements.txt
include TTS/VERSION
recursive-include TTS *.json
Makefile (2 changed lines)
@@ -44,6 +44,8 @@ style: ## update code style.

lint: ## run pylint linter.
	pylint ${target_dirs}
	black ${target_dirs} --check
	isort ${target_dirs} --check-only

system-deps: ## install linux system deps
	sudo apt-get install -y libsndfile1-dev
@@ -159,13 +159,13 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own TTS model (Using Griffin-Lim Vocoder):

```
$ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```

- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```

### Multi-speaker Models

@@ -185,7 +185,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own multi-speaker TTS model:

```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```

## Directory Structure
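The CLI calls in the README hunks above map onto the `TTS.utils.synthesizer.Synthesizer` class that the `tts` entry point uses internally. A rough Python equivalent is sketched below; the keyword names are assumptions based on `TTS/utils/synthesizer.py` at the time of this change and may differ between versions.

```python
# Hedged sketch: Python equivalent of the `tts` CLI usage above.
# Constructor keyword names are assumptions; check TTS/utils/synthesizer.py.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="path/to/model.pth",
    tts_config_path="path/to/config.json",
    vocoder_checkpoint="path/to/vocoder.pth",      # optional; Griffin-Lim is used if omitted
    vocoder_config="path/to/vocoder_config.json",  # optional
    use_cuda=False,
)
wav = synthesizer.tts("Text for TTS")
synthesizer.save_wav(wav, "output/path/speech.wav")
```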
@@ -25,7 +25,7 @@ These masks can be used for different purposes including training a TTS model wi
"""
Example run:
CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
    --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar
    --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
    --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
    --dataset_metafile metadata.csv
    --data_path /root/LJSpeech-1.1/
@@ -13,7 +13,7 @@ parser = argparse.ArgumentParser(
    description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
    """
    Example runs:
    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/
    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/
    """,
    formatter_class=RawTextHelpFormatter,
)
@@ -1,55 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pathlib
import subprocess
import time

import torch
from trainer import TrainerArgs


def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = TrainerArgs().init_argparse(arg_prefix="")
    parser.add_argument("--script", type=str, help="Target training script to distribute.")
    args, unargs = parser.parse_known_args()

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    folder_path = pathlib.Path(__file__).parent.absolute()
    if os.path.exists(os.path.join(folder_path, args.script)):
        command = [os.path.join(folder_path, args.script)]
    else:
        command = [args.script]
    command.append("--continue_path={}".format(args.continue_path))
    command.append("--restore_path={}".format(args.restore_path))
    command.append("--config_path={}".format(args.config_path))
    command.append("--group_id=group_{}".format(group_id))
    command.append("--use_ddp=true")
    command += unargs
    command.append("")

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[-1] = "--rank={}".format(i)
        # prevent stdout for processes with rank != 0
        stdout = None
        p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env)  # pylint: disable=consider-using-with
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()


if __name__ == "__main__":
    main()
@@ -1,18 +1,18 @@
import argparse
import torch
from argparse import RawTextHelpFormatter

import torch
from tqdm import tqdm

from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.speakers import SpeakerManager


def compute_encoder_accuracy(dataset_items, encoder_manager):

    class_name_key = encoder_manager.encoder_config.class_name_key
    map_classid_to_classname = getattr(encoder_manager.encoder_config, 'map_classid_to_classname', None)

    class_acc_dict = {}

    # compute embeddings for all wav_files
@@ -43,11 +43,11 @@ def compute_encoder_accuracy(dataset_items, encoder_manager):

    acc_avg = 0
    for key, values in class_acc_dict.items():
        acc = sum(values)/len(values)
        acc = sum(values) / len(values)
        print("Class", key, "Accuracy:", acc)
        acc_avg += acc

    print("Average Accuracy:", acc_avg/len(class_acc_dict))
    print("Average Accuracy:", acc_avg / len(class_acc_dict))


if __name__ == "__main__":
@@ -55,7 +55,7 @@ if __name__ == "__main__":
        description="""Compute the accuracy of the encoder.\n\n"""
        """
        Example runs:
        python TTS/bin/eval_encoder.py emotion_encoder_model.pth.tar emotion_encoder_config.json dataset_config.json
        python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
        """,
        formatter_class=RawTextHelpFormatter,
    )
@@ -60,13 +60,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run your own TTS model (Using Griffin-Lim Vocoder):

```
$ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```

- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```

### Multi-speaker Models

@@ -86,7 +86,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run your own multi-speaker TTS model:

```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
"""
# We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
@@ -217,7 +217,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
args = parser.parse_args()

# print the description if either text or list_models is not set
if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav:
if (
    not args.text
    and not args.list_models
    and not args.list_speaker_idxs
    and not args.list_language_idxs
    and not args.reference_wav
):
    parser.parse_args(["-h"])

# load model manager
@@ -306,7 +312,15 @@ If you don't specify any models, then it uses LJSpeech based English model.
print(" > Text: {}".format(args.text))

# kick it
wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx, emotion_name=args.emotion_idx)
wav = synthesizer.tts(
    args.text,
    args.speaker_idx,
    args.language_idx,
    args.speaker_wav,
    reference_wav=args.reference_wav,
    reference_speaker_name=args.reference_speaker_idx,
    emotion_name=args.emotion_idx
)

# save the results
print(" > Saving output to {}".format(args.out_path))
@ -9,6 +9,7 @@ import traceback
|
|||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from trainer.torch import NoamLR
|
||||
from trainer.trainer_utils import get_optimizer
|
||||
|
||||
from TTS.encoder.dataset import EncoderDataset
|
||||
from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
|
||||
|
@ -19,7 +20,6 @@ from TTS.tts.datasets import load_tts_samples
|
|||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
|
||||
from TTS.utils.io import copy_model_files
|
||||
from trainer.trainer_utils import get_optimizer
|
||||
from TTS.utils.training import check_update
|
||||
|
||||
torch.backends.cudnn.enabled = True
|
||||
|
@ -52,16 +52,21 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
|
|||
sampler = PerfectBatchSampler(
|
||||
dataset.items,
|
||||
classes,
|
||||
batch_size=num_classes_in_batch*num_utter_per_class, # total batch size
|
||||
batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
|
||||
num_classes_in_batch=num_classes_in_batch,
|
||||
num_gpus=1,
|
||||
shuffle=not is_val,
|
||||
drop_last=True)
|
||||
drop_last=True,
|
||||
)
|
||||
|
||||
if len(classes) < num_classes_in_batch:
|
||||
if is_val:
|
||||
raise RuntimeError(f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !")
|
||||
raise RuntimeError(f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !")
|
||||
raise RuntimeError(
|
||||
f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
|
||||
)
|
||||
raise RuntimeError(
|
||||
f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
|
||||
)
|
||||
|
||||
# set the classes to avoid get wrong class_id when the number of training and eval classes are not equal
|
||||
if is_val:
|
||||
|
@ -76,6 +81,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
|
|||
|
||||
return loader, classes, dataset.get_map_classid_to_classname()
|
||||
|
||||
|
||||
def evaluation(model, criterion, data_loader, global_step):
|
||||
eval_loss = 0
|
||||
for _, data in enumerate(data_loader):
|
||||
|
@@ -84,8 +90,12 @@ def evaluation(model, criterion, data_loader, global_step):
        inputs, labels = data

        # group samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
        labels = torch.transpose(labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1).reshape(labels.shape)
        inputs = torch.transpose(inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
        labels = torch.transpose(
            labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
        ).reshape(labels.shape)
        inputs = torch.transpose(
            inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
        ).reshape(inputs.shape)

        # dispatch data to GPU
        if use_cuda:
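The transpose/reshape in this hunk regroups a batch ordered class-wise round-robin (e.g. [3, 2, 1, 3, 2, 1]) into contiguous per-class blocks ([3, 3, 2, 2, 1, 1]). A standalone sketch of the same operation with toy sizes (2 utterances per class, 3 classes in the batch):

```python
# Demonstrates the regrouping used above: round-robin class order -> contiguous per-class blocks.
import torch

num_utter_per_class, num_classes_in_batch = 2, 3
labels = torch.tensor([3, 2, 1, 3, 2, 1])  # order produced by the perfect sampler

regrouped = torch.transpose(
    labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
).reshape(labels.shape)

print(regrouped.tolist())  # [3, 3, 2, 2, 1, 1]
```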
@ -96,20 +106,23 @@ def evaluation(model, criterion, data_loader, global_step):
|
|||
outputs = model(inputs)
|
||||
|
||||
# loss computation
|
||||
loss = criterion(outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels)
|
||||
loss = criterion(
|
||||
outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
|
||||
)
|
||||
|
||||
eval_loss += loss.item()
|
||||
|
||||
eval_avg_loss = eval_loss/len(data_loader)
|
||||
eval_avg_loss = eval_loss / len(data_loader)
|
||||
# save stats
|
||||
dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
|
||||
# plot the last batch in the evaluation
|
||||
figures = {
|
||||
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
|
||||
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
|
||||
}
|
||||
dashboard_logger.eval_figures(global_step, figures)
|
||||
return eval_avg_loss
|
||||
|
||||
|
||||
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
|
||||
model.train()
|
||||
best_loss = float("inf")
|
||||
|
@ -124,8 +137,12 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
|||
# setup input data
|
||||
inputs, labels = data
|
||||
# agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
|
||||
labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
|
||||
inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
|
||||
labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
|
||||
labels.shape
|
||||
)
|
||||
inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
|
||||
inputs.shape
|
||||
)
|
||||
# ToDo: move it to a unit test
|
||||
# labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
|
||||
# inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
|
||||
|
@ -157,7 +174,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
|||
outputs = model(inputs)
|
||||
|
||||
# loss computation
|
||||
loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels)
|
||||
loss = criterion(
|
||||
outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
|
||||
)
|
||||
loss.backward()
|
||||
grad_norm, _ = check_update(model, c.grad_clip)
|
||||
optimizer.step()
|
||||
|
@ -211,7 +230,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
|||
print(
|
||||
">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
|
||||
"EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
|
||||
epoch, tot_loss/len(data_loader), grad_norm, epoch_time, avg_loader_time
|
||||
epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
|
||||
),
|
||||
flush=True,
|
||||
)
|
||||
|
@ -222,10 +241,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
|||
print("\n\n")
|
||||
print("--> EVAL PERFORMANCE")
|
||||
print(
|
||||
" | > Epoch:{} AvgLoss: {:.5f} ".format(
|
||||
epoch, eval_loss
|
||||
),
|
||||
flush=True,
|
||||
" | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
|
||||
flush=True,
|
||||
)
|
||||
# save the best checkpoint
|
||||
best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
|
||||
|
@ -262,7 +279,9 @@ def main(args): # pylint: disable=redefined-outer-name
|
|||
copy_model_files(c, OUT_PATH)
|
||||
|
||||
if args.restore_path:
|
||||
criterion, args.restore_step = model.load_checkpoint(c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion)
|
||||
criterion, args.restore_step = model.load_checkpoint(
|
||||
c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
|
||||
)
|
||||
print(" > Model restored from step %d" % args.restore_step, flush=True)
|
||||
else:
|
||||
args.restore_step = 0
|
||||
|
|
|
@@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS.

- Define 'config.json' for your needs. Note that the audio parameters should match your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
- Watch training on Tensorboard as in TTS
@ -33,10 +33,7 @@ class BaseEncoderConfig(BaseTrainingConfig):
|
|||
grad_clip: float = 3.0
|
||||
lr: float = 0.0001
|
||||
optimizer: str = "radam"
|
||||
optimizer_params: Dict = field(default_factory=lambda: {
|
||||
"betas": [0.9, 0.999],
|
||||
"weight_decay": 0
|
||||
})
|
||||
optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
|
||||
lr_decay: bool = False
|
||||
warmup_steps: int = 4000
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ from torch.utils.data import Dataset
|
|||
|
||||
from TTS.encoder.utils.generic_utils import AugmentWAV
|
||||
|
||||
|
||||
class EncoderDataset(Dataset):
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -57,7 +58,6 @@ class EncoderDataset(Dataset):
|
|||
print(f" | > Num Classes: {len(self.classes)}")
|
||||
print(f" | > Classes: {self.classes}")
|
||||
|
||||
|
||||
def load_wav(self, filename):
|
||||
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
|
||||
return audio
|
||||
|
@ -75,9 +75,7 @@ class EncoderDataset(Dataset):
|
|||
]
|
||||
|
||||
# skip classes with number of samples >= self.num_utter_per_class
|
||||
class_to_utters = {
|
||||
k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class
|
||||
}
|
||||
class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
|
||||
|
||||
classes = list(class_to_utters.keys())
|
||||
classes.sort()
|
||||
|
@ -105,11 +103,11 @@ class EncoderDataset(Dataset):
|
|||
|
||||
def get_class_list(self):
|
||||
return self.classes
|
||||
|
||||
def set_classes(self, classes):
|
||||
self.classes = classes
|
||||
self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
|
||||
|
||||
|
||||
def get_map_classid_to_classname(self):
|
||||
return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
|
||||
|
||||
|
|
|
@ -195,6 +195,7 @@ class SoftmaxLoss(nn.Module):
|
|||
class_id = torch.argmax(activations)
|
||||
return class_id
|
||||
|
||||
|
||||
class SoftmaxAngleProtoLoss(nn.Module):
|
||||
"""
|
||||
Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
import torchaudio
|
||||
import numpy as np
|
||||
from coqpit import Coqpit
|
||||
from torch import nn
|
||||
|
||||
from TTS.utils.io import load_fsspec
|
||||
from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
|
||||
from TTS.utils.generic_utils import set_init_dict
|
||||
from coqpit import Coqpit
|
||||
from TTS.utils.io import load_fsspec
|
||||
|
||||
|
||||
class PreEmphasis(nn.Module):
|
||||
def __init__(self, coefficient=0.97):
|
||||
|
@ -20,6 +21,7 @@ class PreEmphasis(nn.Module):
|
|||
x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
|
||||
return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
|
||||
|
||||
|
||||
class BaseEncoder(nn.Module):
|
||||
"""Base `encoder` class. Every new `encoder` model must inherit this.
|
||||
|
||||
|
@ -32,31 +34,31 @@ class BaseEncoder(nn.Module):
|
|||
|
||||
def get_torch_mel_spectrogram_class(self, audio_config):
|
||||
return torch.nn.Sequential(
|
||||
PreEmphasis(audio_config["preemphasis"]),
|
||||
# TorchSTFT(
|
||||
# n_fft=audio_config["fft_size"],
|
||||
# hop_length=audio_config["hop_length"],
|
||||
# win_length=audio_config["win_length"],
|
||||
# sample_rate=audio_config["sample_rate"],
|
||||
# window="hamming_window",
|
||||
# mel_fmin=0.0,
|
||||
# mel_fmax=None,
|
||||
# use_htk=True,
|
||||
# do_amp_to_db=False,
|
||||
# n_mels=audio_config["num_mels"],
|
||||
# power=2.0,
|
||||
# use_mel=True,
|
||||
# mel_norm=None,
|
||||
# )
|
||||
torchaudio.transforms.MelSpectrogram(
|
||||
sample_rate=audio_config["sample_rate"],
|
||||
n_fft=audio_config["fft_size"],
|
||||
win_length=audio_config["win_length"],
|
||||
hop_length=audio_config["hop_length"],
|
||||
window_fn=torch.hamming_window,
|
||||
n_mels=audio_config["num_mels"],
|
||||
)
|
||||
)
|
||||
PreEmphasis(audio_config["preemphasis"]),
|
||||
# TorchSTFT(
|
||||
# n_fft=audio_config["fft_size"],
|
||||
# hop_length=audio_config["hop_length"],
|
||||
# win_length=audio_config["win_length"],
|
||||
# sample_rate=audio_config["sample_rate"],
|
||||
# window="hamming_window",
|
||||
# mel_fmin=0.0,
|
||||
# mel_fmax=None,
|
||||
# use_htk=True,
|
||||
# do_amp_to_db=False,
|
||||
# n_mels=audio_config["num_mels"],
|
||||
# power=2.0,
|
||||
# use_mel=True,
|
||||
# mel_norm=None,
|
||||
# )
|
||||
torchaudio.transforms.MelSpectrogram(
|
||||
sample_rate=audio_config["sample_rate"],
|
||||
n_fft=audio_config["fft_size"],
|
||||
win_length=audio_config["win_length"],
|
||||
hop_length=audio_config["hop_length"],
|
||||
window_fn=torch.hamming_window,
|
||||
n_mels=audio_config["num_mels"],
|
||||
),
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def inference(self, x, l2_norm=True):
|
||||
|
@ -104,7 +106,9 @@ class BaseEncoder(nn.Module):
|
|||
raise Exception("The %s not is a loss supported" % c.loss)
|
||||
return criterion
|
||||
|
||||
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None):
|
||||
def load_checkpoint(
|
||||
self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
|
||||
):
|
||||
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
|
||||
try:
|
||||
self.load_state_dict(state["model"])
|
||||
|
@ -127,7 +131,12 @@ class BaseEncoder(nn.Module):
|
|||
print(" > Criterion load ignored because of:", error)
|
||||
|
||||
# instance and load the criterion for the encoder classifier in inference time
|
||||
if eval and criterion is None and "criterion" in state and getattr(config, 'map_classid_to_classname', None) is not None:
|
||||
if (
|
||||
eval
|
||||
and criterion is None
|
||||
and "criterion" in state
|
||||
and getattr(config, "map_classid_to_classname", None) is not None
|
||||
):
|
||||
criterion = self.get_criterion(config, len(config.map_classid_to_classname))
|
||||
criterion.load_state_dict(state["criterion"])
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ from torch import nn
|
|||
# from TTS.utils.audio import TorchSTFT
|
||||
from TTS.encoder.models.base_encoder import BaseEncoder
|
||||
|
||||
|
||||
class SELayer(nn.Module):
|
||||
def __init__(self, channel, reduction=8):
|
||||
super(SELayer, self).__init__()
|
||||
|
|
|
@ -147,7 +147,7 @@ def setup_encoder_model(config: "Coqpit"):
|
|||
|
||||
|
||||
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
|
||||
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
|
||||
checkpoint_path = "checkpoint_{}.pth".format(current_step)
|
||||
checkpoint_path = os.path.join(out_path, checkpoint_path)
|
||||
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
|
||||
|
||||
|
@ -177,7 +177,7 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path
|
|||
"date": datetime.date.today().strftime("%B %d, %Y"),
|
||||
}
|
||||
best_loss = model_loss
|
||||
bestmodel_path = "best_model.pth.tar"
|
||||
bestmodel_path = "best_model.pth"
|
||||
bestmodel_path = os.path.join(out_path, bestmodel_path)
|
||||
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
|
||||
save_fsspec(state, bestmodel_path)
|
||||
|
|
|
@ -5,7 +5,7 @@ from TTS.utils.io import save_fsspec
|
|||
|
||||
|
||||
def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
|
||||
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
|
||||
checkpoint_path = "checkpoint_{}.pth".format(current_step)
|
||||
checkpoint_path = os.path.join(out_path, checkpoint_path)
|
||||
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
|
||||
|
||||
|
@ -31,7 +31,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_s
|
|||
"date": datetime.date.today().strftime("%B %d, %Y"),
|
||||
}
|
||||
best_loss = model_loss
|
||||
bestmodel_path = "best_model.pth.tar"
|
||||
bestmodel_path = "best_model.pth"
|
||||
bestmodel_path = os.path.join(out_path, bestmodel_path)
|
||||
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
|
||||
save_fsspec(state, bestmodel_path)
|
||||
|
|
|
@@ -1,4 +1,5 @@
import random

from torch.utils.data.sampler import Sampler, SubsetRandomSampler


@@ -34,10 +35,21 @@ class PerfectBatchSampler(Sampler):
        drop_last (bool): if True, drops last incomplete batch.
    """

    def __init__(self, dataset_items, classes, batch_size, num_classes_in_batch, num_gpus=1, shuffle=True, drop_last=False, label_key="class_name"):
    def __init__(
        self,
        dataset_items,
        classes,
        batch_size,
        num_classes_in_batch,
        num_gpus=1,
        shuffle=True,
        drop_last=False,
        label_key="class_name",
    ):
        super().__init__(dataset_items)
        assert batch_size % (num_classes_in_batch * num_gpus) == 0, (
            'Batch size must be divisible by number of classes times the number of data parallel devices (if enabled).')
        assert (
            batch_size % (num_classes_in_batch * num_gpus) == 0
        ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."

        label_indices = {}
        for idx, item in enumerate(dataset_items):
@@ -93,7 +105,7 @@ class PerfectBatchSampler(Sampler):
            if groups % self._dp_devices == 0:
                yield batch
            else:
                batch = batch[:(groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]
                batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]
                if len(batch) > 0:
                    yield batch
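The assertion above pins down how `batch_size` relates to the sampler's other arguments: it must be an exact multiple of `num_classes_in_batch * num_gpus`, so every device gets whole per-class groups. A small sketch with hypothetical numbers, sized the way `setup_loader()` earlier in this diff does (`batch_size = num_classes_in_batch * num_utter_per_class`):

```python
# Hypothetical sizing for the PerfectBatchSampler constraint shown above.
num_classes_in_batch = 8   # distinct classes (speakers/emotions) per batch
num_utter_per_class = 4    # utterances drawn from each class
num_gpus = 2

batch_size = num_classes_in_batch * num_utter_per_class  # 32, the "total batch size"
assert batch_size % (num_classes_in_batch * num_gpus) == 0  # 32 % 16 == 0, valid

print(batch_size)  # 32
```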
TTS/model.py (142 changed lines)
@@ -1,46 +1,34 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple
from abc import abstractmethod
from typing import Dict

import torch
from coqpit import Coqpit
from torch import nn
from trainer import TrainerModel

# pylint: skip-file


class BaseTrainerModel(ABC, nn.Module):
    """Abstract 🐸TTS class. Every new 🐸TTS model must inherit this."""
class BaseTrainerModel(TrainerModel):
    """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.

    Every new 🐸TTS model must inherit it.
    """

    @staticmethod
    @abstractmethod
    def init_from_config(config: Coqpit):
        """Init the model from given config.
        """Init the model and all its attributes from the given config.

        Override this depending on your model.
        """
        ...

    @abstractmethod
    def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
        """Forward ... for the model mainly used in training.

        You can be flexible here and use different number of arguments and argument names since it is intended to be
        used by `train_step()` without exposing it out of the model.

        Args:
            input (torch.Tensor): Input tensor.
            aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs.

        Returns:
            Dict: Model outputs. Main model output must be named as "model_outputs".
        """
        outputs_dict = {"model_outputs": None}
        ...
        return outputs_dict

    @abstractmethod
    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
        """Forward ... for inference.
        """Forward pass for inference.

        It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
        is considered to be the main output and you can add any other auxiliary outputs as you want.

        We don't use `*kwargs` since it is problematic with the TorchScript API.
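For orientation, a structural sketch of what a subclass of the refactored `BaseTrainerModel` looks like, implementing the abstract methods whose signatures appear in this file's hunks. The layer sizes and names are invented, and the remaining Trainer hooks (optimizers, schedulers, data loaders) are omitted, so this is not a complete trainable model:

```python
# Hypothetical minimal subclass of the BaseTrainerModel interface above.
from typing import Dict, Tuple

import torch
from coqpit import Coqpit
from torch import nn

from TTS.model import BaseTrainerModel


class ToyTTSModel(BaseTrainerModel):
    def __init__(self, in_dim: int = 80, out_dim: int = 80):
        super().__init__()
        self.net = nn.Linear(in_dim, out_dim)

    @staticmethod
    def init_from_config(config: Coqpit):
        # a real model reads its dimensions, losses, etc. from `config`
        return ToyTTSModel()

    def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
        return {"model_outputs": self.net(input)}

    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
        return {"model_outputs": self.net(input)}

    def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
        outputs = self.forward(batch["inputs"])
        loss = criterion(outputs["model_outputs"], batch["targets"])
        return outputs, {"loss": loss}

    def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
        return self.train_step(batch, criterion)

    def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
        state = torch.load(checkpoint_path, map_location="cpu")
        self.load_state_dict(state["model"], strict=strict)
```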
@ -55,78 +43,9 @@ class BaseTrainerModel(ABC, nn.Module):
|
|||
...
|
||||
return outputs_dict
|
||||
|
||||
def format_batch(self, batch: Dict) -> Dict:
|
||||
"""Format batch returned by the data loader before sending it to the model.
|
||||
|
||||
If not implemented, model uses the batch as is.
|
||||
Can be used for data augmentation, feature ectraction, etc.
|
||||
"""
|
||||
return batch
|
||||
|
||||
def format_batch_on_device(self, batch: Dict) -> Dict:
|
||||
"""Format batch on device before sending it to the model.
|
||||
|
||||
If not implemented, model uses the batch as is.
|
||||
Can be used for data augmentation, feature ectraction, etc.
|
||||
"""
|
||||
return batch
|
||||
|
||||
@abstractmethod
|
||||
def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
||||
"""Perform a single training step. Run the model forward ... and compute losses.
|
||||
|
||||
Args:
|
||||
batch (Dict): Input tensors.
|
||||
criterion (nn.Module): Loss layer designed for the model.
|
||||
|
||||
Returns:
|
||||
Tuple[Dict, Dict]: Model ouputs and computed losses.
|
||||
"""
|
||||
outputs_dict = {}
|
||||
loss_dict = {} # this returns from the criterion
|
||||
...
|
||||
return outputs_dict, loss_dict
|
||||
|
||||
def train_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
|
||||
"""Create visualizations and waveform examples for training.
|
||||
|
||||
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
|
||||
be projected onto Tensorboard.
|
||||
|
||||
Args:
|
||||
ap (AudioProcessor): audio processor used at training.
|
||||
batch (Dict): Model inputs used at the previous training step.
|
||||
outputs (Dict): Model outputs generated at the previoud training step.
|
||||
|
||||
Returns:
|
||||
Tuple[Dict, np.ndarray]: training plots and output waveform.
|
||||
"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
||||
"""Perform a single evaluation step. Run the model forward ... and compute losses. In most cases, you can
|
||||
call `train_step()` with no changes.
|
||||
|
||||
Args:
|
||||
batch (Dict): Input tensors.
|
||||
criterion (nn.Module): Loss layer designed for the model.
|
||||
|
||||
Returns:
|
||||
Tuple[Dict, Dict]: Model ouputs and computed losses.
|
||||
"""
|
||||
outputs_dict = {}
|
||||
loss_dict = {} # this returns from the criterion
|
||||
...
|
||||
return outputs_dict, loss_dict
|
||||
|
||||
def eval_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
|
||||
"""The same as `train_log()`"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
|
||||
"""Load a checkpoint and get ready for training or inference.
|
||||
"""Load a model checkpoint gile and get ready for training or inference.
|
||||
|
||||
Args:
|
||||
config (Coqpit): Model configuration.
|
||||
|
@ -135,36 +54,3 @@ class BaseTrainerModel(ABC, nn.Module):
|
|||
strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
|
||||
"""
|
||||
...
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def init_from_config(config: Coqpit, samples: List[Dict] = None, verbose=False) -> "BaseTrainerModel":
|
||||
"""Init the model from given config.
|
||||
|
||||
Override this depending on your model.
|
||||
"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def get_data_loader(
|
||||
self, config: Coqpit, assets: Dict, is_eval: True, data_items: List, verbose: bool, num_gpus: int
|
||||
):
|
||||
...
|
||||
|
||||
# def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]:
|
||||
# """Setup an return optimizer or optimizers."""
|
||||
# ...
|
||||
|
||||
# def get_lr(self) -> Union[float, List[float]]:
|
||||
# """Return learning rate(s).
|
||||
|
||||
# Returns:
|
||||
# Union[float, List[float]]: Model's initial learning rates.
|
||||
# """
|
||||
# ...
|
||||
|
||||
# def get_scheduler(self, optimizer: torch.optim.Optimizer):
|
||||
# ...
|
||||
|
||||
# def get_criterion(self):
|
||||
# ...
|
||||
|
|
|
@@ -21,4 +21,4 @@ Run the server with the official models on a GPU.
```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```

Run the server with custom models.
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json```
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
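Once running, the demo server also exposes a small HTTP API next to its web page. A hedged sketch of querying it from Python; the default port (5002) and the `/api/tts` route are assumptions about the server's defaults and should be checked against `TTS/server/server.py`:

```python
# Hypothetical client for the demo server started above.
# Assumes the default port 5002 and an /api/tts endpoint returning a WAV payload.
import requests

resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from the Coqui TTS server."},
    timeout=60,
)
resp.raise_for_status()
with open("server_output.wav", "wb") as f:
    f.write(resp.content)
```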
@ -1,6 +1,6 @@
|
|||
{
|
||||
"tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
|
||||
"tts_file":"best_model.pth.tar", // tts checkpoint file
|
||||
"tts_file":"best_model.pth", // tts checkpoint file
|
||||
"tts_config":"config.json", // tts config.json file
|
||||
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
|
||||
"vocoder_config":null,
|
||||
|
|
|
@ -246,7 +246,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None):
|
|||
continue
|
||||
items.append({"text": text, "audio_file": wav_file, "speaker_name": f"LTTS_{speaker_name}"})
|
||||
for item in items:
|
||||
assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
|
||||
assert os.path.exists(item["audio_file"]), f" [!] wav files don't exist - {item['audio_file']}"
|
||||
return items
|
||||
|
||||
|
||||
|
|
|
@ -7,15 +7,15 @@ import torch.distributed as dist
|
|||
from coqpit import Coqpit
|
||||
from torch import nn
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.data.sampler import WeightedRandomSampler
|
||||
from trainer.torch import DistributedSampler, DistributedSamplerWrapper
|
||||
|
||||
from TTS.model import BaseTrainerModel
|
||||
from TTS.tts.datasets.dataset import TTSDataset
|
||||
from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights
|
||||
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_balancer_weights
|
||||
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||
from torch.utils.data.sampler import WeightedRandomSampler
|
||||
|
||||
# pylint: skip-file
|
||||
|
||||
|
@ -258,7 +258,7 @@ class BaseTTS(BaseTrainerModel):
|
|||
# sampler for DDP
|
||||
if sampler is None:
|
||||
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||
else: # If a sampler is already defined use this sampler and DDP sampler together
|
||||
else: # If a sampler is already defined use this sampler and DDP sampler together
|
||||
sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler
|
||||
|
||||
return sampler
|
||||
|
@ -279,9 +279,7 @@ class BaseTTS(BaseTrainerModel):
|
|||
# setup multi-speaker attributes
|
||||
if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
|
||||
if hasattr(config, "model_args"):
|
||||
speaker_id_mapping = (
|
||||
self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
|
||||
)
|
||||
speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
|
||||
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
|
||||
config.use_d_vector_file = config.model_args.use_d_vector_file
|
||||
else:
|
||||
|
@ -293,9 +291,7 @@ class BaseTTS(BaseTrainerModel):
|
|||
|
||||
# setup multi-lingual attributes
|
||||
if hasattr(self, "language_manager") and self.language_manager is not None:
|
||||
language_id_mapping = (
|
||||
self.language_manager.ids if self.args.use_language_embedding else None
|
||||
)
|
||||
language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None
|
||||
else:
|
||||
language_id_mapping = None
|
||||
|
||||
|
|
|
@ -676,6 +676,7 @@ class Vits(BaseTTS):
|
|||
raise RuntimeError(
|
||||
" [!] To use the speaker consistency loss (SCL) you need to specify encoder_model_path and encoder_config_path !!"
|
||||
)
|
||||
|
||||
# load encoder
|
||||
self.speaker_manager.init_encoder(self.args.encoder_model_path, self.args.encoder_config_path)
|
||||
self.speaker_manager.encoder.eval()
|
||||
|
@@ -1095,7 +1096,9 @@
        return outputs

    @torch.no_grad()
    def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None):
    def inference_voice_conversion(
        self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None
    ):
        """Inference for voice conversion

        Args:
@@ -1106,7 +1109,13 @@
            reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]`
        """
        # compute spectrograms
        y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2)
        y = wav_to_spec(
            reference_wav,
            self.config.audio.fft_size,
            self.config.audio.hop_length,
            self.config.audio.win_length,
            center=False,
        ).transpose(1, 2)
        y_lengths = torch.tensor([y.size(-1)]).to(y.device)
        speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
        speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
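As the signature above shows, `inference_voice_conversion` takes a reference waveform plus speaker IDs or d-vectors for the source and target voices. A hedged sketch of driving it through the `Synthesizer` wrapper, the way `TTS/bin/synthesize.py` does elsewhere in this diff; the speaker keyword names are assumptions, while `reference_wav`/`reference_speaker_name` mirror the synthesize.py call shown earlier:

```python
# Hedged sketch of the voice-conversion path above via the Synthesizer wrapper.
# Paths and speaker names are placeholders; keyword names may differ between versions.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="path/to/vits_model.pth",
    tts_config_path="path/to/config.json",
    use_cuda=False,
)
wav = synthesizer.tts(
    text=None,                                  # no text: convert the reference speech instead
    speaker_name="target_speaker",
    reference_wav="path/to/source_speech.wav",
    reference_speaker_name="source_speaker",
)
synthesizer.save_wav(wav, "converted.wav")
```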
@ -1346,6 +1355,7 @@ class Vits(BaseTTS):
|
|||
else:
|
||||
emotion_id = self.emotion_manager.ids[emotion_name]
|
||||
|
||||
|
||||
return {
|
||||
"text": text,
|
||||
"speaker_id": speaker_id,
|
||||
|
@ -1419,12 +1429,8 @@ class Vits(BaseTTS):
|
|||
d_vectors = torch.FloatTensor(d_vectors)
|
||||
|
||||
# get language ids from language names
|
||||
if (
|
||||
self.language_manager is not None
|
||||
and self.language_manager.ids
|
||||
and self.args.use_language_embedding
|
||||
):
|
||||
language_ids = [self.language_manager.ids[ln] for ln in batch["f"]]
|
||||
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding:
|
||||
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]]
|
||||
|
||||
if language_ids is not None:
|
||||
language_ids = torch.LongTensor(language_ids)
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
from typing import Dict, List, Any
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
import fsspec
|
||||
import numpy as np
|
||||
|
@ -9,6 +10,7 @@ from coqpit import Coqpit
|
|||
from TTS.config import check_config_and_model_args
|
||||
from TTS.tts.utils.managers import BaseIDManager
|
||||
|
||||
|
||||
class LanguageManager(BaseIDManager):
|
||||
"""Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
|
||||
in a way that can be queried by language.
|
||||
|
|
|
@ -12,13 +12,11 @@ from TTS.utils.audio import AudioProcessor
|
|||
|
||||
|
||||
class BaseIDManager:
|
||||
""" Base `ID` Manager class. Every new `ID` manager must inherit this.
|
||||
"""Base `ID` Manager class. Every new `ID` manager must inherit this.
|
||||
It defines common `ID` manager specific functions.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
id_file_path: str = ""
|
||||
):
|
||||
|
||||
def __init__(self, id_file_path: str = ""):
|
||||
self.ids = {}
|
||||
|
||||
if id_file_path:
|
||||
|
@ -85,10 +83,12 @@ class BaseIDManager:
|
|||
ids = {name: i for i, name in enumerate(classes)}
|
||||
return ids
|
||||
|
||||
|
||||
class EmbeddingManager(BaseIDManager):
|
||||
""" Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
|
||||
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
|
||||
It defines common `Embedding` manager specific functions.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embedding_file_path: str = "",
|
||||
|
@ -225,7 +225,10 @@ class EmbeddingManager(BaseIDManager):
|
|||
"""
|
||||
self.encoder_config = load_config(config_path)
|
||||
self.encoder = setup_encoder_model(self.encoder_config)
|
||||
self.encoder_criterion = self.encoder.load_checkpoint(self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda)
|
||||
|
||||
self.encoder_criterion = self.encoder.load_checkpoint(
|
||||
self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda
|
||||
)
|
||||
self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
|
||||
|
||||
def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list:
|
||||
|
|
|
@ -10,6 +10,7 @@ from coqpit import Coqpit
|
|||
from TTS.config import get_from_config_or_model_args_with_default
|
||||
from TTS.tts.utils.managers import EmbeddingManager
|
||||
|
||||
|
||||
class SpeakerManager(EmbeddingManager):
|
||||
"""Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
|
||||
in a way that can be queried by speaker or clip.
|
||||
|
@ -67,6 +68,7 @@ class SpeakerManager(EmbeddingManager):
|
|||
use_cuda=use_cuda
|
||||
)
|
||||
|
||||
|
||||
if data_items:
|
||||
self.set_ids_from_data(data_items, parse_key="speaker_name")
|
||||
|
||||
|
|
|
@ -218,6 +218,7 @@ def synthesis(
|
|||
}
|
||||
return return_dict
|
||||
|
||||
|
||||
def transfer_voice(
|
||||
model,
|
||||
CONFIG,
|
||||
|
@ -281,12 +282,7 @@ def transfer_voice(
|
|||
_func = model.module.inference_voice_conversion
|
||||
else:
|
||||
_func = model.inference_voice_conversion
|
||||
model_outputs = _func(
|
||||
reference_wav,
|
||||
speaker_id,
|
||||
d_vector,
|
||||
reference_speaker_id,
|
||||
reference_d_vector)
|
||||
model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)
|
||||
|
||||
# convert outputs to numpy
|
||||
# plot results
|
||||
|
|
|
@@ -12,16 +12,9 @@ GRUUT_LANGS = list(Gruut.supported_languages())


# Dict setting default phonemizers for each language
DEF_LANG_TO_PHONEMIZER = {
    "ja-jp": JA_JP_Phonemizer.name(),
    "zh-cn": ZH_CN_Phonemizer.name(),
}


# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
_new_dict = dict(list(zip(GRUUT_LANGS, _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))


# Add ESpeak languages and override any existing ones
@@ -29,7 +22,10 @@ _ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)

# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()


def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
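This hunk builds the language-to-phonemizer map in layers: start from Gruut's supported languages, overlay the eSpeak entries, then force a few explicit defaults. A self-contained sketch of that layering with invented language lists and backend names:

```python
# Standalone illustration of the layered default map built above
# (language lists and backend names are made up for the example).
gruut_langs = ["en-us", "de", "fr-fr"]
espeak_langs = ["en-us", "tr"]

lang_to_phonemizer = dict(zip(gruut_langs, ["gruut"] * len(gruut_langs)))
lang_to_phonemizer.update(dict(zip(espeak_langs, ["espeak"] * len(espeak_langs))))

# force defaults for a few languages
lang_to_phonemizer["en"] = lang_to_phonemizer["en-us"]
lang_to_phonemizer["ja-jp"] = "ja_jp_phonemizer"

print(lang_to_phonemizer)
# {'en-us': 'espeak', 'de': 'gruut', 'fr-fr': 'gruut', 'tr': 'espeak', 'en': 'espeak', 'ja-jp': 'ja_jp_phonemizer'}
```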
@ -371,7 +371,9 @@ class AudioProcessor(object):
|
|||
self.hop_length = hop_length
|
||||
self.win_length = win_length
|
||||
assert min_level_db != 0.0, " [!] min_level_db is 0"
|
||||
assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
|
||||
assert (
|
||||
self.win_length <= self.fft_size
|
||||
), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}"
|
||||
members = vars(self)
|
||||
if verbose:
|
||||
print(" > Setting up Audio Processor...")
|
||||
|
|
|
@ -67,7 +67,7 @@ def get_experiment_folder_path(root_path, model_name):
|
|||
def remove_experiment_folder(experiment_path):
|
||||
"""Check folder if there is a checkpoint, otherwise remove the folder"""
|
||||
fs = fsspec.get_mapper(experiment_path).fs
|
||||
checkpoint_files = fs.glob(experiment_path + "/*.pth.tar")
|
||||
checkpoint_files = fs.glob(experiment_path + "/*.pth")
|
||||
if not checkpoint_files:
|
||||
if fs.exists(experiment_path):
|
||||
fs.rm(experiment_path, recursive=True)
|
||||
|
|
|
@ -140,7 +140,7 @@ def save_checkpoint(
|
|||
output_folder,
|
||||
**kwargs,
|
||||
):
|
||||
file_name = "checkpoint_{}.pth.tar".format(current_step)
|
||||
file_name = "checkpoint_{}.pth".format(current_step)
|
||||
checkpoint_path = os.path.join(output_folder, file_name)
|
||||
print("\n > CHECKPOINT : {}".format(checkpoint_path))
|
||||
save_model(
|
||||
|
@ -170,7 +170,7 @@ def save_best_model(
|
|||
**kwargs,
|
||||
):
|
||||
if current_loss < best_loss:
|
||||
best_model_name = f"best_model_{current_step}.pth.tar"
|
||||
best_model_name = f"best_model_{current_step}.pth"
|
||||
checkpoint_path = os.path.join(out_path, best_model_name)
|
||||
print(" > BEST MODEL : {}".format(checkpoint_path))
|
||||
save_model(
|
||||
|
@ -187,12 +187,12 @@ def save_best_model(
|
|||
fs = fsspec.get_mapper(out_path).fs
|
||||
# only delete previous if current is saved successfully
|
||||
if not keep_all_best or (current_step < keep_after):
|
||||
model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar"))
|
||||
model_names = fs.glob(os.path.join(out_path, "best_model*.pth"))
|
||||
for model_name in model_names:
|
||||
if os.path.basename(model_name) != best_model_name:
|
||||
fs.rm(model_name)
|
||||
# create a shortcut which always points to the currently best model
|
||||
shortcut_name = "best_model.pth.tar"
|
||||
shortcut_name = "best_model.pth"
|
||||
shortcut_path = os.path.join(out_path, shortcut_name)
|
||||
fs.copy(checkpoint_path, shortcut_path)
|
||||
best_loss = current_loss
|
||||
|
|
|
@ -4,6 +4,7 @@ import os
|
|||
import zipfile
|
||||
from pathlib import Path
|
||||
from shutil import copyfile, rmtree
|
||||
from typing import Tuple
|
||||
|
||||
import requests
|
||||
|
||||
|
@ -114,7 +115,7 @@ class ModelManager(object):
|
|||
e.g. 'tts_model/en/ljspeech/tacotron'
|
||||
|
||||
Every model must have the following files:
|
||||
- *.pth.tar : pytorch model checkpoint file.
|
||||
- *.pth : pytorch model checkpoint file.
|
||||
- config.json : model config file.
|
||||
- scale_stats.npy (if exist): scale values for preprocessing.
|
||||
|
||||
|
@ -127,9 +128,6 @@ class ModelManager(object):
|
|||
model_item = self.models_dict[model_type][lang][dataset][model]
|
||||
# set the model specific output path
|
||||
output_path = os.path.join(self.output_prefix, model_full_name)
|
||||
output_model_path = os.path.join(output_path, "model_file.pth.tar")
|
||||
output_config_path = os.path.join(output_path, "config.json")
|
||||
|
||||
if os.path.exists(output_path):
|
||||
print(f" > {model_name} is already downloaded.")
|
||||
else:
|
||||
|
@ -137,10 +135,51 @@ class ModelManager(object):
|
|||
print(f" > Downloading model to {output_path}")
|
||||
# download from github release
|
||||
self._download_zip_file(model_item["github_rls_url"], output_path)
|
||||
# find downloaded files
|
||||
output_model_path, output_config_path = self._find_files(output_path)
|
||||
# update paths in the config.json
|
||||
self._update_paths(output_path, output_config_path)
|
||||
return output_model_path, output_config_path, model_item
|
||||
|
||||
@staticmethod
|
||||
def _find_files(output_path: str) -> Tuple[str, str]:
|
||||
"""Find the model and config files in the output path
|
||||
|
||||
Args:
|
||||
output_path (str): path to the model files
|
||||
|
||||
Returns:
|
||||
Tuple[str, str]: path to the model file and config file
|
||||
"""
|
||||
model_file = None
|
||||
config_file = None
|
||||
for file_name in os.listdir(output_path):
|
||||
if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]:
|
||||
model_file = os.path.join(output_path, file_name)
|
||||
elif file_name == "config.json":
|
||||
config_file = os.path.join(output_path, file_name)
|
||||
if model_file is None:
|
||||
raise ValueError(" [!] Model file not found in the output path")
|
||||
if config_file is None:
|
||||
raise ValueError(" [!] Config file not found in the output path")
|
||||
return model_file, config_file
|
||||
|
||||
@staticmethod
|
||||
def _find_speaker_encoder(output_path: str) -> str:
|
||||
"""Find the speaker encoder file in the output path
|
||||
|
||||
Args:
|
||||
output_path (str): path to the model files
|
||||
|
||||
Returns:
|
||||
str: path to the speaker encoder file
|
||||
"""
|
||||
speaker_encoder_file = None
|
||||
for file_name in os.listdir(output_path):
|
||||
if file_name in ["model_se.pth", "model_se.pth.tar"]:
|
||||
speaker_encoder_file = os.path.join(output_path, file_name)
|
||||
return speaker_encoder_file
|
||||
|
||||
def _update_paths(self, output_path: str, config_path: str) -> None:
|
||||
"""Update paths for certain files in config.json after download.
|
||||
|
||||
|
@@ -174,7 +213,7 @@
    @staticmethod
    def _update_path(field_name, new_path, config_path):
        """Update the path in the model config.json for the current environment after download"""
        if os.path.exists(new_path):
        if new_path and os.path.exists(new_path):
            config = load_config(config_path)
            field_names = field_name.split(".")
            if len(field_names) > 1:
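The `ModelManager` hunks above change how a downloaded model's files are located (any of `model_file.pth`, `model_file.pth.tar`, or `model.pth`) before their paths are patched into `config.json`. A hedged sketch of the download flow from Python; the model name is only an example entry, and the constructor argument follows the `.models.json` lookup used by `TTS/bin/synthesize.py`:

```python
# Hedged sketch: resolving a released model through ModelManager, which
# internally calls _find_files() / _update_paths() as in the hunks above.
from pathlib import Path

import TTS
from TTS.utils.manage import ModelManager

manager = ModelManager(Path(TTS.__file__).parent / ".models.json")
model_path, config_path, model_item = manager.download_model("tts_models/en/ljspeech/glow-tts")
print(model_path)   # e.g. .../model_file.pth
print(config_path)  # .../config.json
```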
@ -214,8 +214,8 @@ class Synthesizer(object):
|
|||
|
||||
if not text and not reference_wav:
|
||||
raise ValueError(
|
||||
"You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
|
||||
)
|
||||
"You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
|
||||
)
|
||||
|
||||
if text:
|
||||
sens = self.split_into_sentences(text)
|
||||
|
@ -228,8 +228,10 @@ class Synthesizer(object):
|
|||
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
|
||||
if speaker_name and isinstance(speaker_name, str):
|
||||
if self.tts_config.use_d_vector_file:
|
||||
# get the average speaker embedding from the saved embeddings.
|
||||
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(speaker_name, num_samples=None, randomize=False)
|
||||
# get the average speaker embedding from the saved d_vectors.
|
||||
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
|
||||
speaker_name, num_samples=None, randomize=False
|
||||
)
|
||||
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
|
||||
else:
|
||||
# get speaker idx from the speaker name
|
||||
|
@ -354,26 +356,32 @@ class Synthesizer(object):
|
|||
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
|
||||
if reference_speaker_name and isinstance(reference_speaker_name, str):
|
||||
if self.tts_config.use_d_vector_file:
|
||||
# get the speaker embedding from the saved embeddings.
|
||||
reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(reference_speaker_name)[0]
|
||||
reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :] # [1 x embedding_dim]
|
||||
# get the speaker embedding from the saved d_vectors.
|
||||
reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(
|
||||
reference_speaker_name
|
||||
)[0]
|
||||
reference_speaker_embedding = np.array(reference_speaker_embedding)[
|
||||
None, :
|
||||
] # [1 x embedding_dim]
|
||||
else:
|
||||
# get speaker idx from the speaker name
|
||||
reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name]
|
||||
else:
|
||||
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(reference_wav)
|
||||
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
|
||||
reference_wav
|
||||
)
|
||||
|
||||
outputs = transfer_voice(
|
||||
model=self.tts_model,
|
||||
CONFIG=self.tts_config,
|
||||
use_cuda=self.use_cuda,
|
||||
reference_wav=reference_wav,
|
||||
speaker_id=speaker_id,
|
||||
d_vector=speaker_embedding,
|
||||
use_griffin_lim=use_gl,
|
||||
reference_speaker_id=reference_speaker_id,
|
||||
reference_d_vector=reference_speaker_embedding
|
||||
)
|
||||
model=self.tts_model,
|
||||
CONFIG=self.tts_config,
|
||||
use_cuda=self.use_cuda,
|
||||
reference_wav=reference_wav,
|
||||
speaker_id=speaker_id,
|
||||
d_vector=speaker_embedding,
|
||||
use_griffin_lim=use_gl,
|
||||
reference_speaker_id=reference_speaker_id,
|
||||
reference_d_vector=reference_speaker_embedding,
|
||||
)
|
||||
waveform = outputs
|
||||
if not use_gl:
|
||||
mel_postnet_spec = outputs[0].detach().cpu().numpy()
|
||||
|
|
|
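To summarize the reference-embedding branch that the reformatted hunk above touches: the stored d-vector is used when a reference speaker name is given, otherwise one is extracted from the reference clip. A hedged, helper-style sketch (the function name is made up; the two `SpeakerManager` calls are the ones visible in the diff):

```python
# Hypothetical helper mirroring the branch above; not part of the TTS API.
import numpy as np


def pick_reference_d_vector(speaker_manager, reference_speaker_name=None, reference_wav=None):
    if reference_speaker_name:
        # first stored embedding for that speaker, shaped [1 x embedding_dim]
        emb = speaker_manager.get_embeddings_by_name(reference_speaker_name)[0]
        return np.array(emb)[None, :]
    # no name given: compute an embedding directly from the reference audio clip
    return speaker_manager.compute_embedding_from_clip(reference_wav)
```

Whatever this branch produces is what ends up as `reference_d_vector` in the `transfer_voice(...)` call.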
@ -29,7 +29,7 @@ You can continue a previous training run by the following command.
|
|||
|
||||
You can fine-tune a pre-trained model by the following command.
|
||||
|
||||
```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar```
|
||||
```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth```
|
||||
|
||||
Restoring a model starts a new training run in a different folder; it only loads the model weights from the given checkpoint file. Continuing a training run, by contrast, resumes from the same directory where the previous run left off.
|
||||
|
||||
|
|
|
@ -93,13 +93,13 @@ them and fine-tune it for your own dataset. This will help you in two main ways:
|
|||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
|
||||
```
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \
|
||||
--config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
|
||||
```
|
||||
|
||||
As stated above, you can also use command-line arguments to change the model configuration.
|
||||
|
@ -107,7 +107,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways:
|
|||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth \
|
||||
--coqpit.run_name "glow-tts-finetune" \
|
||||
--coqpit.lr 0.00001
|
||||
```
|
||||
|
|
|
@ -44,7 +44,7 @@ Run your own TTS model (Using Griffin-Lim Vocoder)
|
|||
|
||||
```bash
|
||||
tts --text "Text for TTS" \
|
||||
--model_path path/to/model.pth.tar \
|
||||
--model_path path/to/model.pth \
|
||||
--config_path path/to/config.json \
|
||||
--out_path folder/to/save/output.wav
|
||||
```
|
||||
|
@ -54,9 +54,9 @@ Run your own TTS and Vocoder models
|
|||
```bash
|
||||
tts --text "Text for TTS" \
|
||||
--config_path path/to/config.json \
|
||||
--model_path path/to/model.pth.tar \
|
||||
--model_path path/to/model.pth \
|
||||
--out_path folder/to/save/output.wav \
|
||||
--vocoder_path path/to/vocoder.pth.tar \
|
||||
--vocoder_path path/to/vocoder.pth \
|
||||
--vocoder_config_path path/to/vocoder_config.json
|
||||
```
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
If you like to run a multi-gpu training using DDP back-end,
|
||||
|
||||
```bash
|
||||
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script <path_to_your_script>/train_glowtts.py
|
||||
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python -m trainer.distribute --script <path_to_your_script>/train_glowtts.py
|
||||
```
|
||||
|
||||
The example above runs a multi-gpu training using GPUs `0, 1, 2`.
|
||||
|
@ -122,7 +122,7 @@
|
|||
|
||||
```bash
|
||||
$ tts --text "Text for TTS" \
|
||||
--model_path path/to/checkpoint_x.pth.tar \
|
||||
--model_path path/to/checkpoint_x.pth \
|
||||
--config_path path/to/config.json \
|
||||
--out_path folder/to/save/output.wav
|
||||
```
|
||||
|
|
|
@ -50,13 +50,13 @@ A breakdown of a simple script that trains a GlowTTS model on the LJspeech datas
|
|||
- Fine-tune a model.
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth.tar
|
||||
CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth
|
||||
```
|
||||
|
||||
- Run multi-gpu training.
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0,1,2 python TTS/bin/distribute.py --script train.py
|
||||
CUDA_VISIBLE_DEVICES=0,1,2 python -m trainer.distribute --script train.py
|
||||
```
|
||||
|
||||
### CLI Way
|
||||
|
|
|
@ -66,7 +66,7 @@
|
|||
"DATASET = \"ljspeech\"\n",
|
||||
"METADATA_FILE = \"metadata.csv\"\n",
|
||||
"CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n",
|
||||
"MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth.tar\"\n",
|
||||
"MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n",
|
||||
"BATCH_SIZE = 32\n",
|
||||
"\n",
|
||||
"QUANTIZED_WAV = False\n",
|
||||
|
|
|
@ -66,7 +66,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
|
||||
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
|
||||
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n",
|
||||
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
|
||||
"\n",
|
||||
"# My single speaker locations\n",
|
||||
|
|
|
@ -73,7 +73,7 @@
|
|||
"\n",
|
||||
"# Set constants\n",
|
||||
"ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n",
|
||||
"MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
|
||||
"MODEL_PATH = ROOT_PATH + '/best_model.pth'\n",
|
||||
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
|
||||
"OUT_FOLDER = './hard_sentences/'\n",
|
||||
"CONFIG = load_config(CONFIG_PATH)\n",
|
||||
|
|
|
@ -416,7 +416,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.9.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -3,6 +3,10 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"\n",
|
||||
|
@ -12,21 +16,51 @@
|
|||
"\n",
|
||||
"import IPython.display as ipd\n",
|
||||
"import glob"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"config_path = \"/home/erogol/gdrive/Projects/TTS/recipes/ljspeech/align_tts/config_transformer2.json\"\n",
|
||||
"data_path = \"/home/erogol/gdrive/Datasets/LJSpeech-1.1/\"\n",
|
||||
"\n",
|
||||
"file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n",
|
||||
"CONFIG = load_config(config_path)\n",
|
||||
"from TTS.config.shared_configs import BaseAudioConfig\n",
|
||||
"CONFIG = BaseAudioConfig()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ✍️ Set these values "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = \"/root/wav48_silence_trimmed/\"\n",
|
||||
"file_ext = \".flac\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Read audio files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"file_paths = glob.glob(data_path + f\"/**/*{file_ext}\", recursive=True)\n",
|
||||
"\n",
|
||||
"# Change this to the index of the desired file listed below\n",
|
||||
"sample_file_index = 10\n",
|
||||
|
@ -35,44 +69,45 @@
|
|||
"\n",
|
||||
"print(\"File list, by index:\")\n",
|
||||
"dict(enumerate(file_paths))"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### Setup Audio Processor\n",
|
||||
"## ✍️ Set Audio Processor\n",
|
||||
"Play with the AP parameters until you find a good fit with the synthesis speech below.\n",
|
||||
"\n",
|
||||
"The default values are loaded from your config.json file, so you only need to\n",
|
||||
"uncomment and modify values below that you'd like to tune."
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tune_params={\n",
|
||||
"# 'audio_processor': 'audio',\n",
|
||||
"# 'num_mels': 80, # In general, you don't need to change this. \n",
|
||||
"# 'fft_size': 1024, # In general, you don't need to change this.\n",
|
||||
"# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n",
|
||||
"# 'hop_length': 256, # In general, you don't need to change this.\n",
|
||||
"# 'win_length': 1024, # In general, you don't need to change this.\n",
|
||||
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
|
||||
"# 'min_level_db': -100,\n",
|
||||
"# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
|
||||
"# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
|
||||
"# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
|
||||
"# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
|
||||
"# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
|
||||
"# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
|
||||
" 'num_mels': 80, # In general, you don't need to change this. \n",
|
||||
" 'fft_size': 2400, # In general, you don't need to change this.\n",
|
||||
" 'frame_length_ms': 50, \n",
|
||||
" 'frame_shift_ms': 12.5,\n",
|
||||
" 'sample_rate': 48000, # This must match the sample rate of the dataset.\n",
|
||||
" 'hop_length': None, # In general, you don't need to change this.\n",
|
||||
" 'win_length': 1024, # In general, you don't need to change this.\n",
|
||||
" 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
|
||||
" 'min_level_db': -100,\n",
|
||||
" 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
|
||||
" 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
|
||||
" 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
|
||||
" 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
|
||||
" 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
|
||||
" 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# These options have to be forced off in order to avoid errors about the \n",
|
||||
|
@ -86,59 +121,57 @@
|
|||
"}\n",
|
||||
"\n",
|
||||
"# Override select parts of loaded config with parameters above\n",
|
||||
"tuned_config = CONFIG.audio.copy()\n",
|
||||
"tuned_config = CONFIG.copy()\n",
|
||||
"tuned_config.update(reset)\n",
|
||||
"tuned_config.update(tune_params)\n",
|
||||
"\n",
|
||||
"AP = AudioProcessor(**tuned_config);"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Check audio loading "
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Check audio loading "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"wav = AP.load_wav(SAMPLE_FILE_PATH)\n",
|
||||
"ipd.Audio(data=wav, rate=AP.sample_rate) "
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Generate Mel-Spectrogram and Re-synthesis with GL"
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Generate Mel-Spectrogram and Re-synthesis with GL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"AP.power = 1.5"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mel = AP.melspectrogram(wav)\n",
|
||||
"print(\"Max:\", mel.max())\n",
|
||||
|
@ -148,24 +181,24 @@
|
|||
"\n",
|
||||
"wav_gen = AP.inv_melspectrogram(mel)\n",
|
||||
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Generate Linear-Spectrogram and Re-synthesis with GL"
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Generate Linear-Spectrogram and Re-synthesis with GL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"spec = AP.spectrogram(wav)\n",
|
||||
"print(\"Max:\", spec.max())\n",
|
||||
|
@ -175,26 +208,26 @@
|
|||
"\n",
|
||||
"wav_gen = AP.inv_spectrogram(spec)\n",
|
||||
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### Compare values for a certain parameter\n",
|
||||
"\n",
|
||||
"Optimize your parameters by comparing different values per parameter at a time."
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from librosa import display\n",
|
||||
"from matplotlib import pylab as plt\n",
|
||||
|
@ -234,39 +267,39 @@
|
|||
" val = values[idx]\n",
|
||||
" print(\" > {} = {}\".format(attribute, val))\n",
|
||||
" IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3.8.5 64-bit ('torch': conda)"
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -278,12 +311,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
},
|
||||
"interpreter": {
|
||||
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
|
||||
"version": "3.9.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@ max-line-length=120
|
|||
|
||||
[tool.black]
|
||||
line-length = 120
|
||||
target-version = ['py38']
|
||||
target-version = ['py39']
|
||||
exclude = '''
|
||||
|
||||
(
|
||||
|
|
|
@ -49,7 +49,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init model
|
||||
model = AlignTTS(config, ap, tokenizer)
|
||||
|
|
|
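The comment block above mentions passing a custom formatter to `load_tts_samples`. Here is a hedged sketch of what such a formatter might look like for a simple `wav_path|transcript` metadata file; the exact sample keys and the `formatter` keyword can differ between TTS versions, so treat this as an illustration rather than a drop-in implementation.

```python
# Hypothetical formatter for a "wav_path|transcript" metadata file.
import os


def pipe_separated_formatter(root_path, meta_file, **kwargs):
    samples = []
    with open(os.path.join(root_path, meta_file), encoding="utf-8") as f:
        for line in f:
            wav_file, text = line.strip().split("|", maxsplit=1)
            samples.append(
                {
                    "text": text,
                    "audio_file": os.path.join(root_path, wav_file),
                    "speaker_name": "speaker0",  # single-speaker dataset assumed
                }
            )
    return samples


# then, assuming the keyword is supported in your version:
# train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=pipe_separated_formatter)
```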
@ -84,7 +84,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init the model
|
||||
model = ForwardTTS(config, ap, tokenizer, speaker_manager=None)
|
||||
|
|
|
@ -83,7 +83,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init the model
|
||||
model = ForwardTTS(config, ap, tokenizer)
|
||||
|
|
|
@ -60,7 +60,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# INITIALIZE THE MODEL
|
||||
# Models take a config object and a speaker manager as input
|
||||
|
|
|
@ -41,11 +41,6 @@ model = GAN(config, ap)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -41,11 +41,6 @@ model = GAN(config, ap)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -67,7 +67,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init model
|
||||
model = ForwardTTS(config, ap, tokenizer)
|
||||
|
|
|
@ -77,7 +77,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# INITIALIZE THE MODEL
|
||||
# Models take a config object and a speaker manager as input
|
||||
|
|
|
@ -74,7 +74,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# INITIALIZE THE MODEL
|
||||
# Models take a config object and a speaker manager as input
|
||||
|
@ -84,11 +89,6 @@ model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -40,11 +40,6 @@ model = GAN(config, ap)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init model
|
||||
model = Vits(config, ap, tokenizer, speaker_manager=None)
|
||||
|
|
|
@ -6,12 +6,11 @@ from trainer import Trainer, TrainerArgs
|
|||
from TTS.config.shared_configs import BaseAudioConfig
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||
from TTS.tts.configs.vits_config import VitsConfig
|
||||
from TTS.tts.models.vits import CharactersConfig
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.vits import Vits, VitsArgs
|
||||
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs
|
||||
from TTS.tts.utils.languages import LanguageManager
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
@ -110,7 +109,12 @@ config.from_dict(config.to_dict())
|
|||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
# load training samples
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
@ -131,11 +135,6 @@ model = Vits(config, ap, tokenizer, speaker_manager, language_manager)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -71,7 +71,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -72,7 +72,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it mainly handles the speaker-id to speaker-name mapping for the model and the data-loader
|
||||
|
|
|
@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it mainly handles the speaker-id to speaker-name mapping for the model and the data-loader
|
||||
|
|
|
@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it mainly handles the speaker-id to speaker-name mapping for the model and the data-loader
|
||||
|
|
|
@ -79,7 +79,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -33,6 +33,6 @@ pypinyin
|
|||
mecab-python3==1.0.3
|
||||
unidic-lite==1.0.8
|
||||
# gruut+supported langs
|
||||
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0
|
||||
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3
|
||||
# others
|
||||
webrtcvad # for VAD
|
||||
|
|
|
@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
|
|||
def test_GlowTTS():
|
||||
# set paths
|
||||
config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
|
||||
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
|
||||
# load config
|
||||
c = load_config(config_path)
|
||||
|
@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
|
|||
def test_Tacotron2():
|
||||
# set paths
|
||||
config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
|
||||
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
|
||||
# load config
|
||||
c = load_config(config_path)
|
||||
|
@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
|
|||
def test_Tacotron():
|
||||
# set paths
|
||||
config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
|
||||
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
|
||||
# load config
|
||||
c = load_config(config_path)
|
||||
|
|
|
@ -12,7 +12,7 @@ from TTS.tts.utils.speakers import SpeakerManager
|
|||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
|
||||
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar")
|
||||
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
|
||||
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
|
||||
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
|
||||
d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
|
||||
|
|
|
@ -1,14 +1,13 @@
|
|||
import functools
|
||||
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from TTS.config.shared_configs import BaseDatasetConfig
|
||||
from TTS.encoder.utils.samplers import PerfectBatchSampler
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.utils.languages import get_language_balancer_weights
|
||||
from TTS.tts.utils.speakers import get_speaker_balancer_weights
|
||||
from TTS.encoder.utils.samplers import PerfectBatchSampler
|
||||
|
||||
# Fixing random state to avoid random fails
|
||||
torch.manual_seed(0)
|
||||
|
@ -60,7 +59,9 @@ class TestSamplers(unittest.TestCase):
|
|||
assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced"
|
||||
|
||||
def test_language_weighted_random_sampler(self): # pylint: disable=no-self-use
|
||||
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_language_balancer_weights(train_samples), len(train_samples))
|
||||
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
|
||||
get_language_balancer_weights(train_samples), len(train_samples)
|
||||
)
|
||||
ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
|
||||
en, pt = 0, 0
|
||||
for index in ids:
|
||||
|
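For context on how a balancing sampler like the one tested above is typically consumed, here is a toy, self-contained example that plugs a `WeightedRandomSampler` into a `DataLoader`; the dataset and weights are placeholders, not the real TTS samples.

```python
# Toy example: oversample the two high-weight items at the front of the dataset.
from torch.utils.data import DataLoader, WeightedRandomSampler

dataset = list(range(10))               # stand-in dataset
weights = [0.9] * 2 + [0.1] * 8         # higher weight => drawn more often
sampler = WeightedRandomSampler(weights, num_samples=len(dataset))
loader = DataLoader(dataset, batch_size=2, sampler=sampler)

for batch in loader:
    print(batch)                        # items 0 and 1 dominate the batches
```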
@ -73,7 +74,9 @@ class TestSamplers(unittest.TestCase):
|
|||
|
||||
def test_speaker_weighted_random_sampler(self): # pylint: disable=no-self-use
|
||||
|
||||
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_speaker_balancer_weights(train_samples), len(train_samples))
|
||||
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
|
||||
get_speaker_balancer_weights(train_samples), len(train_samples)
|
||||
)
|
||||
ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
|
||||
spk1, spk2 = 0, 0
|
||||
for index in ids:
|
||||
|
@ -92,11 +95,12 @@ class TestSamplers(unittest.TestCase):
|
|||
sampler = PerfectBatchSampler(
|
||||
train_samples,
|
||||
classes,
|
||||
batch_size=2 * 3, # total batch size
|
||||
batch_size=2 * 3, # total batch size
|
||||
num_classes_in_batch=2,
|
||||
label_key="speaker_name",
|
||||
shuffle=False,
|
||||
drop_last=True)
|
||||
drop_last=True,
|
||||
)
|
||||
batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)])
|
||||
for batch in batchs:
|
||||
spk1, spk2 = 0, 0
|
||||
|
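A rough, toy-data sketch of constructing the sampler configured above; the argument names follow the call in the test, and per that test each yielded batch mixes the two classes evenly. The item layout (dicts with a `speaker_name` key) is an assumption for illustration only.

```python
# Toy sketch; the real tests build train_samples via load_tts_samples().
from TTS.encoder.utils.samplers import PerfectBatchSampler

train_samples = [{"speaker_name": f"spk{i % 2}", "audio_file": f"{i}.wav"} for i in range(8)]
sampler = PerfectBatchSampler(
    train_samples,
    ["spk0", "spk1"],            # the classes to balance
    batch_size=4,                # total batch size, split evenly across classes
    num_classes_in_batch=2,
    label_key="speaker_name",
    shuffle=False,
    drop_last=True,
)
for batch in sampler:
    print(batch)                 # per the test above, one batch of sample indices
```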
@ -116,11 +120,12 @@ class TestSamplers(unittest.TestCase):
|
|||
sampler = PerfectBatchSampler(
|
||||
train_samples,
|
||||
classes,
|
||||
batch_size=2 * 3, # total batch size
|
||||
batch_size=2 * 3, # total batch size
|
||||
num_classes_in_batch=2,
|
||||
label_key="speaker_name",
|
||||
shuffle=True,
|
||||
drop_last=False)
|
||||
drop_last=False,
|
||||
)
|
||||
batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)])
|
||||
for batch in batchs:
|
||||
spk1, spk2 = 0, 0
|
||||
|
|
|
@ -20,7 +20,7 @@ class SynthesizerTest(unittest.TestCase):
|
|||
def test_in_out(self):
|
||||
self._create_random_model()
|
||||
tts_root_path = get_tests_output_path()
|
||||
tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar")
|
||||
tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth")
|
||||
tts_config = os.path.join(tts_root_path, "dummy_model_config.json")
|
||||
synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
|
||||
synthesizer.tts("Better this test works!!")
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"tts_checkpoint":"checkpoint_10.pth.tar", // tts checkpoint file
|
||||
"tts_checkpoint":"checkpoint_10.pth", // tts checkpoint file
|
||||
"tts_config":"dummy_model_config.json", // tts config.json file
|
||||
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
|
||||
"wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
|
||||
|
|