Merge pull request #674 from coqui-ai/dev

v0.1.3
Eren Gölge 2021-07-26 18:36:56 +02:00 committed by GitHub
commit d0292dd2d1
30 changed files with 238 additions and 653 deletions


@ -6,6 +6,7 @@ This repository is governed by [the Contributor Covenant Code of Conduct](https:
## Where to start.
We welcome everyone who likes to contribute to 🐸TTS.
You can contribute not only with code but with bug reports, comments, questions, answers, or just a simple tweet to spread the word.
If you'd like to contribute code and squash a bug but don't know where to start, here are some pointers.
@ -25,6 +26,16 @@ If you like to contribute code, squash a bug but if you don't know where to star
We list all the target improvements for the next version. You can pick one of them and start contributing.
- Also feel free to suggest new features, ideas and models. We're always open for new things.
##### Call for sharing language models
If possible, please consider sharing your pre-trained models in any language (if the licences allow you to do so). We will include them in our model catalogue for public use and give proper attribution, whether it be your name, company, website, or any other source you specify.
Models can be shared in two ways:
1. Share the model files with us, and we will serve them with the next 🐸TTS release.
2. Upload your models to GDrive and share the link.
Models are served through the `.models.json` file, and every listed model is available through the TTS CLI and server endpoints.
Whichever way you choose, please make sure you share the models [here](https://github.com/coqui-ai/TTS/issues/380).
## Sending a ✨**PR**✨
If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨.


@ -132,7 +132,7 @@
"thorsten":{ "thorsten":{
"tacotron2-DCA":{ "tacotron2-DCA":{
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip",
"default_vocoder": "vocoder_models/de/thorsten/wavegrad", "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
"author": "@thorstenMueller", "author": "@thorstenMueller",
"commit": "unknown" "commit": "unknown"
} }
@ -230,6 +230,11 @@
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
"author": "@thorstenMueller", "author": "@thorstenMueller",
"commit": "unknown" "commit": "unknown"
},
"fullband-melgan":{
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip",
"author": "@thorstenMueller",
"commit": "unknown"
} }
} }
} }
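As a usage note (outside the diff): with the new entry, `tts_models/de/thorsten/tacotron2-DCA` resolves `vocoder_models/de/thorsten/fullband-melgan` as its default vocoder. A minimal sketch, assuming both release zips have been unpacked to placeholder local paths; the constructor arguments follow the `Synthesizer` call used by the server script later in this diff, and the `tts()`/`save_wav()` helpers are assumed to behave as in the bundled server/CLI.

from TTS.utils.synthesizer import Synthesizer

# Placeholder paths; in practice they point at the unpacked release zips.
synthesizer = Synthesizer(
    "tts_models--de--thorsten--tacotron2-DCA/model_file.pth.tar",
    "tts_models--de--thorsten--tacotron2-DCA/config.json",
    None,  # no speakers file for a single-speaker model
    "vocoder_models--de--thorsten--fullband-melgan/model_file.pth.tar",
    "vocoder_models--de--thorsten--fullband-melgan/config.json",
    use_cuda=False,
)
wav = synthesizer.tts("Ein kurzer Testsatz.")
synthesizer.save_wav(wav, "output.wav")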


@ -1,80 +1,47 @@
import argparse import argparse
import glob
import os import os
from argparse import RawTextHelpFormatter
import torch
from tqdm import tqdm from tqdm import tqdm
from TTS.config import BaseDatasetConfig, load_config from TTS.config import load_config
from TTS.speaker_encoder.utils.generic_utils import setup_model
from TTS.tts.datasets import load_meta_data from TTS.tts.datasets import load_meta_data
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.' description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/
""",
formatter_class=RawTextHelpFormatter,
) )
parser.add_argument("model_path", type=str, help="Path to model outputs (checkpoint, tensorboard etc.).") parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
parser.add_argument( parser.add_argument(
"config_path", "config_path",
type=str, type=str,
help="Path to config file for training.", help="Path to model config file.",
) )
parser.add_argument("data_path", type=str, help="Data path for wav files - directory or CSV file")
parser.add_argument("output_path", type=str, help="path for output speakers.json.")
parser.add_argument( parser.add_argument(
"--target_dataset", "config_dataset_path",
type=str, type=str,
default="", help="Path to dataset config file.",
help="Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.",
) )
parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.")
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
parser.add_argument("--separator", type=str, help="Separator used in file if CSV is passed for data_path", default="|") parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
args = parser.parse_args() args = parser.parse_args()
c_dataset = load_config(args.config_dataset_path)
c = load_config(args.config_path) meta_data_train, meta_data_eval = load_meta_data(c_dataset.datasets, eval_split=args.eval)
ap = AudioProcessor(**c["audio"]) wav_files = meta_data_train + meta_data_eval
data_path = args.data_path speaker_manager = SpeakerManager(
split_ext = os.path.splitext(data_path) encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
sep = args.separator )
if args.target_dataset != "":
# if target dataset is defined
dataset_config = [
BaseDatasetConfig(name=args.target_dataset, path=args.data_path, meta_file_train=None, meta_file_val=None),
]
wav_files, _ = load_meta_data(dataset_config, eval_split=False)
else:
# if target dataset is not defined
if len(split_ext) > 0 and split_ext[1].lower() == ".csv":
# Parse CSV
print(f"CSV file: {data_path}")
with open(data_path) as f:
wav_path = os.path.join(os.path.dirname(data_path), "wavs")
wav_files = []
print(f"Separator is: {sep}")
for line in f:
components = line.split(sep)
if len(components) != 2:
print("Invalid line")
continue
wav_file = os.path.join(wav_path, components[0] + ".wav")
# print(f'wav_file: {wav_file}')
if os.path.exists(wav_file):
wav_files.append(wav_file)
print(f"Count of wavs imported: {len(wav_files)}")
else:
# Parse all wav files in data_path
wav_files = glob.glob(data_path + "/**/*.wav", recursive=True)
# define Encoder model
model = setup_model(c)
model.load_state_dict(torch.load(args.model_path)["model"])
model.eval()
if args.use_cuda:
model.cuda()
# compute speaker embeddings # compute speaker embeddings
speaker_mapping = {} speaker_mapping = {}
@ -85,18 +52,14 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
else: else:
speaker_name = None speaker_name = None
mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T # extract the embedding
mel_spec = torch.FloatTensor(mel_spec[None, :, :]) embedd = speaker_manager.compute_d_vector_from_clip(wav_file)
if args.use_cuda:
mel_spec = mel_spec.cuda()
embedd = model.compute_embedding(mel_spec)
embedd = embedd.detach().cpu().numpy()
# create speaker_mapping if target dataset is defined # create speaker_mapping if target dataset is defined
wav_file_name = os.path.basename(wav_file) wav_file_name = os.path.basename(wav_file)
speaker_mapping[wav_file_name] = {} speaker_mapping[wav_file_name] = {}
speaker_mapping[wav_file_name]["name"] = speaker_name speaker_mapping[wav_file_name]["name"] = speaker_name
speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist() speaker_mapping[wav_file_name]["embedding"] = embedd
if speaker_mapping: if speaker_mapping:
# save speaker_mapping if target dataset is defined # save speaker_mapping if target dataset is defined
@ -104,8 +67,9 @@ if speaker_mapping:
mapping_file_path = os.path.join(args.output_path, "speakers.json") mapping_file_path = os.path.join(args.output_path, "speakers.json")
else: else:
mapping_file_path = args.output_path mapping_file_path = args.output_path
os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True) os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
speaker_manager = SpeakerManager()
# pylint: disable=W0212 # pylint: disable=W0212
speaker_manager._save_json(mapping_file_path, speaker_mapping) speaker_manager._save_json(mapping_file_path, speaker_mapping)
print("Speaker embeddings saved at:", mapping_file_path) print("Speaker embeddings saved at:", mapping_file_path)


@ -227,7 +227,7 @@ def main(args): # pylint: disable=redefined-outer-name
ap = AudioProcessor(**c.audio)
# load data instances
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=args.eval)
# use eval and training partitions
meta_data = meta_data_train + meta_data_eval
@ -271,6 +271,7 @@ if __name__ == "__main__":
parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
args = parser.parse_args()
c = load_config(args.config_path)


@ -1,40 +1,41 @@
"""Find all the unique characters in a dataset""" """Find all the unique characters in a dataset"""
import argparse import argparse
import os
from argparse import RawTextHelpFormatter from argparse import RawTextHelpFormatter
from TTS.tts.datasets.formatters import get_preprocessor_by_name from TTS.config import load_config
from TTS.tts.datasets import load_meta_data
def main(): def main():
# pylint: disable=bad-option-value # pylint: disable=bad-option-value
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="""Find all the unique characters or phonemes in a dataset.\n\n""" description="""Find all the unique characters or phonemes in a dataset.\n\n"""
"""Target dataset must be defined in TTS.tts.datasets.formatters\n\n"""
""" """
Example runs: Example runs:
python TTS/bin/find_unique_chars.py --dataset ljspeech --meta_file /path/to/LJSpeech/metadata.csv python TTS/bin/find_unique_chars.py --config_path config.json
""", """,
formatter_class=RawTextHelpFormatter, formatter_class=RawTextHelpFormatter,
) )
parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
parser.add_argument(
"--dataset", type=str, default="", help="One of the target dataset names in TTS.tts.datasets.formatters."
)
parser.add_argument("--meta_file", type=str, default=None, help="Path to the transcriptions file of the dataset.")
args = parser.parse_args() args = parser.parse_args()
preprocessor = get_preprocessor_by_name(args.dataset) c = load_config(args.config_path)
items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file))
# load all datasets
train_items, eval_items = load_meta_data(c.datasets, eval_split=True)
items = train_items + eval_items
texts = "".join(item[0] for item in items) texts = "".join(item[0] for item in items)
chars = set(texts) chars = set(texts)
lower_chars = filter(lambda c: c.islower(), chars) lower_chars = filter(lambda c: c.islower(), chars)
chars_force_lower = [c.lower() for c in chars]
chars_force_lower = set(chars_force_lower)
print(f" > Number of unique characters: {len(chars)}") print(f" > Number of unique characters: {len(chars)}")
print(f" > Unique characters: {''.join(sorted(chars))}") print(f" > Unique characters: {''.join(sorted(chars))}")
print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
if __name__ == "__main__": if __name__ == "__main__":


@ -164,7 +164,7 @@ def main(args): # pylint: disable=redefined-outer-name
elif c.loss == "angleproto": elif c.loss == "angleproto":
criterion = AngleProtoLoss() criterion = AngleProtoLoss()
elif c.loss == "softmaxproto": elif c.loss == "softmaxproto":
criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers) criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_speakers)
else: else:
raise Exception("The %s not is a loss supported" % c.loss) raise Exception("The %s not is a loss supported" % c.loss)


@ -103,7 +103,8 @@ synthesizer = Synthesizer(
model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda
)
use_multi_speaker = synthesizer.speaker_manager is not None
use_multi_speaker = synthesizer.tts_model.speaker_manager is not None and synthesizer.tts_model.num_speakers > 1
speaker_manager = synthesizer.tts_model.speaker_manager if hasattr(synthesizer.tts_model, "speaker_manager") else None
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)
@ -134,7 +135,7 @@ def index():
"index.html",
show_details=args.show_details,
use_multi_speaker=use_multi_speaker,
speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
speaker_ids=speaker_manager.speaker_ids if speaker_manager is not None else None,
use_gst=use_gst,
)


@ -1,3 +1,4 @@
import numpy as np
import torch import torch
from torch import nn from torch import nn
@ -70,24 +71,32 @@ class LSTMSpeakerEncoder(nn.Module):
d = torch.nn.functional.normalize(d, p=2, dim=1) d = torch.nn.functional.normalize(d, p=2, dim=1)
return d return d
def compute_embedding(self, x, num_frames=160, overlap=0.5): def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True):
""" """
Generate embeddings for a batch of utterances Generate embeddings for a batch of utterances
x: 1xTxD x: 1xTxD
""" """
num_overlap = int(num_frames * overlap)
max_len = x.shape[1] max_len = x.shape[1]
embed = None
cur_iter = 0 if max_len < num_frames:
for offset in range(0, max_len, num_frames - num_overlap): num_frames = max_len
cur_iter += 1
end_offset = min(x.shape[1], offset + num_frames) offsets = np.linspace(0, max_len - num_frames, num=num_eval)
frames_batch = []
for offset in offsets:
offset = int(offset)
end_offset = int(offset + num_frames)
frames = x[:, offset:end_offset] frames = x[:, offset:end_offset]
if embed is None: frames_batch.append(frames)
embed = self.inference(frames)
else: frames_batch = torch.cat(frames_batch, dim=0)
embed += self.inference(frames) embeddings = self.inference(frames_batch)
return embed / cur_iter
if return_mean:
embeddings = torch.mean(embeddings, dim=0, keepdim=True)
return embeddings
def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5):
""" """
@ -110,9 +119,11 @@ class LSTMSpeakerEncoder(nn.Module):
return embed / num_iters return embed / num_iters
# pylint: disable=unused-argument, redefined-builtin # pylint: disable=unused-argument, redefined-builtin
def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False): def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if use_cuda:
self.cuda()
if eval: if eval:
self.eval() self.eval()
assert not self.training assert not self.training
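For context (outside the diff): `compute_embedding` now draws `num_eval` evenly spaced windows of `num_frames` frames, embeds them as one batch, and (optionally) mean-pools the results, instead of striding with 50% overlap. The offset arithmetic, with illustrative numbers:

import numpy as np

max_len, num_frames, num_eval = 1000, 250, 10
offsets = np.linspace(0, max_len - num_frames, num=num_eval)
# -> [0.0, 83.3, 166.7, ..., 750.0]; each window is x[:, int(offset):int(offset) + num_frames]
# The windows are concatenated into one batch, passed through inference(), and averaged
# over dim 0 when return_mean=True.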


@ -199,3 +199,12 @@ class ResNetSpeakerEncoder(nn.Module):
embeddings = torch.mean(embeddings, dim=0, keepdim=True)
return embeddings
def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"])
if use_cuda:
self.cuda()
if eval:
self.eval()
assert not self.training


@ -764,11 +764,11 @@ class Trainer:
"""Run test and log the results. Test run must be defined by the model. """Run test and log the results. Test run must be defined by the model.
Model must return figures and audios to be logged by the Tensorboard.""" Model must return figures and audios to be logged by the Tensorboard."""
if hasattr(self.model, "test_run"): if hasattr(self.model, "test_run"):
if hasattr(self.eval_loader.load_test_samples): if hasattr(self.eval_loader.dataset, "load_test_samples"):
samples = self.eval_loader.load_test_samples(1) samples = self.eval_loader.dataset.load_test_samples(1)
figures, audios = self.model.test_run(samples) figures, audios = self.model.test_run(self.ap, samples, None)
else: else:
figures, audios = self.model.test_run() figures, audios = self.model.test_run(self.ap)
self.tb_logger.tb_test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"]) self.tb_logger.tb_test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"])
self.tb_logger.tb_test_figures(self.total_steps_done, figures) self.tb_logger.tb_test_figures(self.total_steps_done, figures)
@ -790,7 +790,7 @@ class Trainer:
self.train_epoch() self.train_epoch()
if self.config.run_eval: if self.config.run_eval:
self.eval_epoch() self.eval_epoch()
if epoch >= self.config.test_delay_epochs and self.args.rank < 0: if epoch >= self.config.test_delay_epochs and self.args.rank <= 0:
self.test_run() self.test_run()
self.c_logger.print_epoch_end( self.c_logger.print_epoch_end(
epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values


@ -202,16 +202,20 @@ def libri_tts(root_path, meta_files=None):
items = [] items = []
if meta_files is None: if meta_files is None:
meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True) meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True)
else:
if isinstance(meta_files, str):
meta_files = [os.path.join(root_path, meta_files)]
for meta_file in meta_files: for meta_file in meta_files:
_meta_file = os.path.basename(meta_file).split(".")[0] _meta_file = os.path.basename(meta_file).split(".")[0]
speaker_name = _meta_file.split("_")[0]
chapter_id = _meta_file.split("_")[1]
_root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
with open(meta_file, "r") as ttf: with open(meta_file, "r") as ttf:
for line in ttf: for line in ttf:
cols = line.split("\t") cols = line.split("\t")
wav_file = os.path.join(_root_path, cols[0] + ".wav") file_name = cols[0]
text = cols[1] speaker_name, chapter_id, *_ = cols[0].split("_")
_root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
wav_file = os.path.join(_root_path, file_name + ".wav")
text = cols[2]
items.append([text, wav_file, "LTTS_" + speaker_name]) items.append([text, wav_file, "LTTS_" + speaker_name])
for item in items: for item in items:
assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
@ -288,6 +292,19 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48"):
return items return items
def mls(root_path, meta_files=None):
"""http://www.openslr.org/94/"""
items = []
with open(os.path.join(root_path, meta_files), "r") as meta:
for line in meta:
file, text = line.split("\t")
text = text[:-1]
speaker, book, *_ = file.split("_")
wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav")
items.append([text, wav_file, "MLS_" + speaker])
return items
# ======================================== VOX CELEB =========================================== # ======================================== VOX CELEB ===========================================
def voxceleb2(root_path, meta_file=None): def voxceleb2(root_path, meta_file=None):
""" """


@ -246,9 +246,9 @@ class Huber(nn.Module):
class TacotronLoss(torch.nn.Module): class TacotronLoss(torch.nn.Module):
"""Collection of Tacotron set-up based on provided config.""" """Collection of Tacotron set-up based on provided config."""
def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4): def __init__(self, c, ga_sigma=0.4):
super().__init__() super().__init__()
self.stopnet_pos_weight = stopnet_pos_weight self.stopnet_pos_weight = c.stopnet_pos_weight
self.ga_alpha = c.ga_alpha self.ga_alpha = c.ga_alpha
self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha
self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha
@ -274,7 +274,7 @@ class TacotronLoss(torch.nn.Module):
self.criterion_ssim = SSIMLoss() self.criterion_ssim = SSIMLoss()
# stopnet loss # stopnet loss
# pylint: disable=not-callable # pylint: disable=not-callable
self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None self.criterion_st = BCELossMasked(pos_weight=torch.tensor(self.stopnet_pos_weight)) if c.stopnet else None
def forward( def forward(
self, self,
@ -284,6 +284,7 @@ class TacotronLoss(torch.nn.Module):
linear_input, linear_input,
stopnet_output, stopnet_output,
stopnet_target, stopnet_target,
stop_target_length,
output_lens, output_lens,
decoder_b_output, decoder_b_output,
alignments, alignments,
@ -315,12 +316,12 @@ class TacotronLoss(torch.nn.Module):
return_dict["decoder_loss"] = decoder_loss return_dict["decoder_loss"] = decoder_loss
return_dict["postnet_loss"] = postnet_loss return_dict["postnet_loss"] = postnet_loss
# stopnet loss
stop_loss = ( stop_loss = (
self.criterion_st(stopnet_output, stopnet_target, output_lens) if self.config.stopnet else torch.zeros(1) self.criterion_st(stopnet_output, stopnet_target, stop_target_length)
if self.config.stopnet
else torch.zeros(1)
) )
if not self.config.separate_stopnet and self.config.stopnet: loss += stop_loss
loss += stop_loss
return_dict["stopnet_loss"] = stop_loss return_dict["stopnet_loss"] = stop_loss
# backward decoder loss (if enabled) # backward decoder loss (if enabled)


@ -70,7 +70,7 @@ class BaseTTS(BaseModel):
def get_aux_input(self, **kwargs) -> Dict: def get_aux_input(self, **kwargs) -> Dict:
"""Prepare and return `aux_input` used by `forward()`""" """Prepare and return `aux_input` used by `forward()`"""
pass return {"speaker_id": None, "style_wav": None, "d_vector": None}
def format_batch(self, batch: Dict) -> Dict: def format_batch(self, batch: Dict) -> Dict:
"""Generic batch formatting for `TTSDataset`. """Generic batch formatting for `TTSDataset`.
@ -119,9 +119,10 @@ class BaseTTS(BaseModel):
), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
durations[idx, : text_lengths[idx]] = dur durations[idx, : text_lengths[idx]] = dur
# set stop targets view, we predict a single stop token per iteration. # set stop targets wrt reduction factor
stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
stop_target_lengths = torch.divide(mel_lengths, self.config.r).ceil_()
return { return {
"text_input": text_input, "text_input": text_input,
@ -131,6 +132,7 @@ class BaseTTS(BaseModel):
"mel_lengths": mel_lengths, "mel_lengths": mel_lengths,
"linear_input": linear_input, "linear_input": linear_input,
"stop_targets": stop_targets, "stop_targets": stop_targets,
"stop_target_lengths": stop_target_lengths,
"attn_mask": attn_mask, "attn_mask": attn_mask,
"durations": durations, "durations": durations,
"speaker_ids": speaker_ids, "speaker_ids": speaker_ids,
@ -200,7 +202,7 @@ class BaseTTS(BaseModel):
) )
return loader return loader
def test_run(self) -> Tuple[Dict, Dict]: def test_run(self, ap) -> Tuple[Dict, Dict]:
"""Generic test run for `tts` models used by `Trainer`. """Generic test run for `tts` models used by `Trainer`.
You can override this for a different behaviour. You can override this for a different behaviour.
@ -212,14 +214,14 @@ class BaseTTS(BaseModel):
test_audios = {} test_audios = {}
test_figures = {} test_figures = {}
test_sentences = self.config.test_sentences test_sentences = self.config.test_sentences
aux_inputs = self._get_aux_inputs() aux_inputs = self.get_aux_input()
for idx, sen in enumerate(test_sentences): for idx, sen in enumerate(test_sentences):
wav, alignment, model_outputs, _ = synthesis( wav, alignment, model_outputs, _ = synthesis(
self.model, self,
sen, sen,
self.config, self.config,
self.use_cuda, "cuda" in str(next(self.parameters()).device),
self.ap, ap,
speaker_id=aux_inputs["speaker_id"], speaker_id=aux_inputs["speaker_id"],
d_vector=aux_inputs["d_vector"], d_vector=aux_inputs["d_vector"],
style_wav=aux_inputs["style_wav"], style_wav=aux_inputs["style_wav"],
@ -229,6 +231,6 @@ class BaseTTS(BaseModel):
).values() ).values()
test_audios["{}-audio".format(idx)] = wav test_audios["{}-audio".format(idx)] = wav
test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, ap, output_fig=False)
test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False)
return test_figures, test_audios return test_figures, test_audios
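A quick numeric check (outside the diff): `stop_target_lengths` is the number of decoder iterations, i.e. the mel length divided by the reduction factor `r` and rounded up.

import torch

mel_lengths = torch.tensor([87, 120])
r = 2  # illustrative reduction factor
stop_target_lengths = torch.divide(mel_lengths, r).ceil_()
# -> tensor([44., 60.]); these lengths now mask the stopnet BCE loss instead of output_lens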


@ -113,7 +113,7 @@ class GlowTTS(BaseTTS):
@staticmethod
def compute_outputs(attn, o_mean, o_log_scale, x_mask):
""" Compute and format the mode outputs with the given alignment map"""
"""Compute and format the mode outputs with the given alignment map"""
y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose(
1, 2
) # [b, t', t], [b, t, d] -> [b, d, t']


@ -219,6 +219,7 @@ class Tacotron(BaseTacotron):
mel_lengths = batch["mel_lengths"] mel_lengths = batch["mel_lengths"]
linear_input = batch["linear_input"] linear_input = batch["linear_input"]
stop_targets = batch["stop_targets"] stop_targets = batch["stop_targets"]
stop_target_lengths = batch["stop_target_lengths"]
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"]
@ -250,6 +251,7 @@ class Tacotron(BaseTacotron):
linear_input, linear_input,
outputs["stop_tokens"], outputs["stop_tokens"],
stop_targets, stop_targets,
stop_target_lengths,
mel_lengths, mel_lengths,
outputs["decoder_outputs_backward"], outputs["decoder_outputs_backward"],
outputs["alignments"], outputs["alignments"],


@ -224,6 +224,7 @@ class Tacotron2(BaseTacotron):
mel_lengths = batch["mel_lengths"] mel_lengths = batch["mel_lengths"]
linear_input = batch["linear_input"] linear_input = batch["linear_input"]
stop_targets = batch["stop_targets"] stop_targets = batch["stop_targets"]
stop_target_lengths = batch["stop_target_lengths"]
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"]
@ -255,6 +256,7 @@ class Tacotron2(BaseTacotron):
linear_input, linear_input,
outputs["stop_tokens"], outputs["stop_tokens"],
stop_targets, stop_targets,
stop_target_lengths,
mel_lengths, mel_lengths,
outputs["decoder_outputs_backward"], outputs["decoder_outputs_backward"],
outputs["alignments"], outputs["alignments"],


@ -27,10 +27,19 @@ def prepare_tensor(inputs, out_steps):
return np.stack([_pad_tensor(x, pad_len) for x in inputs])
def _pad_stop_target(x, length):
def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray:
_pad = 0.0
"""Pad stop target array.
Args:
x (np.ndarray): Stop target array.
length (int): Length after padding.
pad_val (int, optional): Padding value. Defaults to 1.
Returns:
np.ndarray: Padded stop target array.
"""
assert x.ndim == 1
return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad)
return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val)
def prepare_stop_target(inputs, out_steps):
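For context (outside the diff): padding the stop targets with `1` instead of `0.0` marks padded frames as "stop", which is also why the dataset test later in this diff now expects the padded region to sum to `stop_target.shape[1] - mel_lengths[1]`. A tiny example:

import numpy as np

x = np.array([0.0, 0.0, 1.0])  # stop target of a 3-frame utterance
padded = np.pad(x, (0, 5 - x.shape[0]), mode="constant", constant_values=1)
# -> array([0., 0., 1., 1., 1.])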


@ -59,6 +59,7 @@ class SpeakerManager:
speaker_id_file_path: str = "", speaker_id_file_path: str = "",
encoder_model_path: str = "", encoder_model_path: str = "",
encoder_config_path: str = "", encoder_config_path: str = "",
use_cuda: bool = False,
): ):
self.data_items = [] self.data_items = []
@ -67,6 +68,7 @@ class SpeakerManager:
self.clip_ids = [] self.clip_ids = []
self.speaker_encoder = None self.speaker_encoder = None
self.speaker_encoder_ap = None self.speaker_encoder_ap = None
self.use_cuda = use_cuda
if data_items: if data_items:
self.speaker_ids, self.speaker_names, _ = self.parse_speakers_from_data(self.data_items) self.speaker_ids, self.speaker_names, _ = self.parse_speakers_from_data(self.data_items)
@ -222,11 +224,11 @@ class SpeakerManager:
""" """
self.speaker_encoder_config = load_config(config_path) self.speaker_encoder_config = load_config(config_path)
self.speaker_encoder = setup_model(self.speaker_encoder_config) self.speaker_encoder = setup_model(self.speaker_encoder_config)
self.speaker_encoder.load_checkpoint(config_path, model_path, True) self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda)
self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
# normalize the input audio level and trim silences # normalize the input audio level and trim silences
self.speaker_encoder_ap.do_sound_norm = True # self.speaker_encoder_ap.do_sound_norm = True
self.speaker_encoder_ap.do_trim_silence = True # self.speaker_encoder_ap.do_trim_silence = True
def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list: def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list:
"""Compute a d_vector from a given audio file. """Compute a d_vector from a given audio file.
@ -242,6 +244,8 @@ class SpeakerManager:
waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
spec = self.speaker_encoder_ap.melspectrogram(waveform) spec = self.speaker_encoder_ap.melspectrogram(waveform)
spec = torch.from_numpy(spec.T) spec = torch.from_numpy(spec.T)
if self.use_cuda:
spec = spec.cuda()
spec = spec.unsqueeze(0) spec = spec.unsqueeze(0)
d_vector = self.speaker_encoder.compute_embedding(spec) d_vector = self.speaker_encoder.compute_embedding(spec)
return d_vector return d_vector
@ -272,6 +276,8 @@ class SpeakerManager:
feats = torch.from_numpy(feats) feats = torch.from_numpy(feats)
if feats.ndim == 2: if feats.ndim == 2:
feats = feats.unsqueeze(0) feats = feats.unsqueeze(0)
if self.use_cuda:
feats = feats.cuda()
return self.speaker_encoder.compute_embedding(feats) return self.speaker_encoder.compute_embedding(feats)
def run_umap(self): def run_umap(self):


@ -2,6 +2,7 @@ import glob
import os
import random
from multiprocessing import Manager
from typing import List, Tuple
import numpy as np
import torch
@ -67,7 +68,19 @@ class WaveGradDataset(Dataset):
item = self.load_item(idx)
return item
def load_test_samples(self, num_samples):
def load_test_samples(self, num_samples: int) -> List[Tuple]:
"""Return test samples.
Args:
num_samples (int): Number of samples to return.
Returns:
List[Tuple]: melspectorgram and audio.
Shapes:
- melspectrogram (Tensor): :math:`[C, T]`
- audio (Tensor): :math:`[T_audio]`
"""
samples = []
return_segments = self.return_segments
self.return_segments = False
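For context (outside the diff): test samples are now plain `(mel, audio)` tuples, which is what the updated `Wavegrad.test_run` below unpacks as `sample[0]` / `sample[1]`. A minimal consumption sketch, assuming `dataset` is an already constructed `WaveGradDataset` instance:

samples = dataset.load_test_samples(1)  # dataset: an existing WaveGradDataset instance (assumed)
mel, audio = samples[0]                 # mel: [C, T], audio: [T_audio]
mel = mel[None, :, :]                   # add a batch dimension before handing it to the vocoder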


@ -31,7 +31,7 @@ def setup_model(config: Coqpit):
def setup_generator(c):
""" TODO: use config object as arguments"""
"""TODO: use config object as arguments"""
print(" > Generator Model: {}".format(c.generator_model))
MyModel = importlib.import_module("TTS.vocoder.models." + c.generator_model.lower())
MyModel = getattr(MyModel, to_camel(c.generator_model))
@ -94,7 +94,7 @@ def setup_generator(c):
def setup_discriminator(c):
""" TODO: use config objekt as arguments"""
"""TODO: use config objekt as arguments"""
print(" > Discriminator Model: {}".format(c.discriminator_model))
if "parallel_wavegan" in c.discriminator_model:
MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator")


@ -124,11 +124,16 @@ class Wavegrad(BaseModel):
@torch.no_grad() @torch.no_grad()
def inference(self, x, y_n=None): def inference(self, x, y_n=None):
"""x: B x D X T""" """
Shapes:
x: :math:`[B, C , T]`
y_n: :math:`[B, 1, T]`
"""
if y_n is None: if y_n is None:
y_n = torch.randn(x.shape[0], 1, self.hop_len * x.shape[-1], dtype=torch.float32).to(x) y_n = torch.randn(x.shape[0], 1, self.hop_len * x.shape[-1])
else: else:
y_n = torch.FloatTensor(y_n).unsqueeze(0).unsqueeze(0).to(x) y_n = torch.FloatTensor(y_n).unsqueeze(0).unsqueeze(0)
y_n = y_n.type_as(x)
sqrt_alpha_hat = self.noise_level.to(x) sqrt_alpha_hat = self.noise_level.to(x)
for n in range(len(self.alpha) - 1, -1, -1): for n in range(len(self.alpha) - 1, -1, -1):
y_n = self.c1[n] * (y_n - self.c2[n] * self.forward(y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0]))) y_n = self.c1[n] * (y_n - self.c2[n] * self.forward(y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0])))
@ -267,8 +272,10 @@ class Wavegrad(BaseModel):
betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"])
self.compute_noise_level(betas) self.compute_noise_level(betas)
for sample in samples: for sample in samples:
x = sample["input"] x = sample[0]
y = sample["waveform"] x = x[None, :, :].to(next(self.parameters()).device)
y = sample[1]
y = y[None, :]
# compute voice # compute voice
y_pred = self.inference(x) y_pred = self.inference(x)
# compute spectrograms # compute spectrograms
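For context (outside the diff): the noise tensor is now created on CPU and moved with `type_as(x)`, so `inference()` follows whatever device the conditioning spectrogram lives on. The pattern in isolation, with an assumed hop length of 256:

import torch

x = torch.randn(1, 80, 10)  # conditioning mel: [B, C, T]
x = x.to("cuda" if torch.cuda.is_available() else "cpu")
y_n = torch.randn(x.shape[0], 1, 256 * x.shape[-1])  # hop_len assumed to be 256
y_n = y_n.type_as(x)  # match the dtype and device of x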


@ -322,7 +322,7 @@ class Wavernn(BaseVocoder):
with torch.no_grad(): with torch.no_grad():
if isinstance(mels, np.ndarray): if isinstance(mels, np.ndarray):
mels = torch.FloatTensor(mels).type_as(mels) mels = torch.FloatTensor(mels).to(str(next(self.parameters()).device))
if mels.ndim == 2: if mels.ndim == 2:
mels = mels.unsqueeze(0) mels = mels.unsqueeze(0)
@ -576,7 +576,8 @@ class Wavernn(BaseVocoder):
figures = {} figures = {}
audios = {} audios = {}
for idx, sample in enumerate(samples): for idx, sample in enumerate(samples):
x = sample["input"] x = torch.FloatTensor(sample[0])
x = x.to(next(self.parameters()).device)
y_hat = self.inference(x, self.config.batched, self.config.target_samples, self.config.overlap_samples) y_hat = self.inference(x, self.config.batched, self.config.target_samples, self.config.overlap_samples)
x_hat = ap.melspectrogram(y_hat) x_hat = ap.melspectrogram(y_hat)
figures.update( figures.update(
@ -585,7 +586,7 @@ class Wavernn(BaseVocoder):
f"test_{idx}/prediction": plot_spectrogram(x_hat.T), f"test_{idx}/prediction": plot_spectrogram(x_hat.T),
} }
) )
audios.update({f"test_{idx}/audio", y_hat}) audios.update({f"test_{idx}/audio": y_hat})
return figures, audios return figures, audios
@staticmethod @staticmethod


@ -11,6 +11,6 @@ Some of the known public datasets that we successfully applied 🐸TTS:
- [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01
- [German - Thorsten OGVD](https://github.com/thorstenMueller/deep-learning-german-tts)
- [Japanese - Kokoro](https://www.kaggle.com/kaiida/kokoro-speech-dataset-v11-small/version/1)
- [Chinese](https://www.data-baker.com/open_source.html)
- [Chinese](https://www.data-baker.com/data/index/source/)
Let us know if you use 🐸TTS on a different dataset.


@ -1,5 +1,5 @@
dependencies = [
'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite`
'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite'
]
import torch

File diff suppressed because one or more lines are too long


@ -50,7 +50,7 @@
"stopnet_pos_weight": 15.0, "stopnet_pos_weight": 15.0,
"run_eval": true, "run_eval": true,
"test_delay_epochs": 10, "test_delay_epochs": 10,
"max_decoder_steps": 50, "max_decoder_steps": 1000,
"noam_schedule": true, "noam_schedule": true,
"grad_clip": 0.05, "grad_clip": 0.05,
"epochs": 1000, "epochs": 1000,


@ -56,7 +56,7 @@
"run_eval": true, "run_eval": true,
"test_delay_epochs": 10, "test_delay_epochs": 10,
"test_sentences_file": null, "test_sentences_file": null,
"max_decoder_steps": 50, "max_decoder_steps": 1000,
"noam_schedule": true, "noam_schedule": true,
"grad_clip": 0.05, "grad_clip": 0.05,
"epochs": 1000, "epochs": 1000,


@ -207,7 +207,7 @@ class TestTTSDataset(unittest.TestCase):
assert linear_input[1 - idx, -1].sum() == 0
assert mel_input[1 - idx, -1].sum() == 0
assert stop_target[1, mel_lengths[1] - 1] == 1
assert stop_target[1, mel_lengths[1] :].sum() == 0
assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1]
assert len(mel_lengths.shape) == 1
# check batch zero-frame conditions (zero-frame disabled)


@ -35,7 +35,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase):
assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"
# compute d for a given batch
dummy_input = T.rand(1, 240, 80) # B x T x D
output = model.compute_embedding(dummy_input, num_frames=160, overlap=0.5)
output = model.compute_embedding(dummy_input, num_frames=160, num_eval=5)
assert output.shape[0] == 1
assert output.shape[1] == 256
assert len(output.shape) == 2


@ -6,7 +6,20 @@ from tests import get_device_id, get_tests_output_path, run_cli
from TTS.config.shared_configs import BaseAudioConfig from TTS.config.shared_configs import BaseAudioConfig
from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
def run_test_train():
command = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
f"--coqpit.output_path {output_path} "
"--coqpit.datasets.0.name ljspeech "
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
)
run_cli(command)
config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs") output_path = os.path.join(get_tests_output_path(), "train_outputs")
config = SpeakerEncoderConfig( config = SpeakerEncoderConfig(
@ -24,16 +37,9 @@ config.audio.do_trim_silence = True
config.audio.trim_db = 60 config.audio.trim_db = 60
config.save_json(config_path) config.save_json(config_path)
print(config)
# train the model for one epoch # train the model for one epoch
command_train = ( run_test_train()
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
f"--coqpit.output_path {output_path} "
"--coqpit.datasets.0.name ljspeech "
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
)
run_cli(command_train)
# Find latest folder # Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
@ -50,15 +56,7 @@ config.model_params["model_name"] = "resnet"
config.save_json(config_path) config.save_json(config_path)
# train the model for one epoch # train the model for one epoch
command_train = ( run_test_train()
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
f"--coqpit.output_path {output_path} "
"--coqpit.datasets.0.name ljspeech "
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
)
run_cli(command_train)
# Find latest folder # Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
@ -69,3 +67,18 @@ command_train = (
) )
run_cli(command_train) run_cli(command_train)
shutil.rmtree(continue_path) shutil.rmtree(continue_path)
# test model with ge2e loss function
config.loss = "ge2e"
config.save_json(config_path)
run_test_train()
# test model with angleproto loss function
config.loss = "angleproto"
config.save_json(config_path)
run_test_train()
# test model with softmaxproto loss function
config.loss = "softmaxproto"
config.save_json(config_path)
run_test_train()