From 4b7b88dd3d1c8377544648a1ce81d73e8d61a45e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 26 Jul 2021 14:56:05 +0200
Subject: [PATCH] Add fullband-melgan DE vocoder

---
 TTS/.models.json                     |  5 +++++
 TTS/bin/compute_embeddings.py        | 13 +++++++------
 TTS/bin/find_unique_chars.py         |  8 ++++----
 TTS/speaker_encoder/models/lstm.py   |  6 +++---
 TTS/speaker_encoder/models/resnet.py |  2 +-
 TTS/tts/datasets/formatters.py       |  8 +++++---
 6 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/TTS/.models.json b/TTS/.models.json
index 73204db6..d46237b9 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -230,6 +230,11 @@
                     "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
                     "author": "@thorstenMueller",
                     "commit": "unknown"
+                },
+                "fullband-melgan":{
+                    "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip",
+                    "author": "@thorstenMueller",
+                    "commit": "unknown"
                 }
             }
         }
diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index f485514a..8c4d275f 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -1,21 +1,20 @@
-
 import argparse
 import os
+from argparse import RawTextHelpFormatter
 
 from tqdm import tqdm
-from argparse import RawTextHelpFormatter
 
 from TTS.config import load_config
 from TTS.tts.datasets import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
 
 parser = argparse.ArgumentParser(
     description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
-    """
+    """
     Example runs:
     python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/
     """,
-    formatter_class=RawTextHelpFormatter,
+    formatter_class=RawTextHelpFormatter,
 )
 parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
 parser.add_argument(
@@ -40,7 +39,9 @@ c_dataset = load_config(args.config_dataset_path)
 meta_data_train, meta_data_eval = load_meta_data(c_dataset.datasets, eval_split=args.eval)
 wav_files = meta_data_train + meta_data_eval
 
-speaker_manager = SpeakerManager(encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda)
+speaker_manager = SpeakerManager(
+    encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
+)
 
 # compute speaker embeddings
 speaker_mapping = {}
@@ -62,7 +63,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
 
 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
-    if '.json' not in args.output_path:
+    if ".json" not in args.output_path:
         mapping_file_path = os.path.join(args.output_path, "speakers.json")
     else:
         mapping_file_path = args.output_path
diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
index c7c25d80..16768e43 100644
--- a/TTS/bin/find_unique_chars.py
+++ b/TTS/bin/find_unique_chars.py
@@ -1,8 +1,9 @@
 """Find all the unique characters in a dataset"""
 import argparse
 from argparse import RawTextHelpFormatter
-from TTS.tts.datasets import load_meta_data
+
 from TTS.config import load_config
+from TTS.tts.datasets import load_meta_data
 
 
 def main():
@@ -16,9 +17,7 @@ def main():
     """,
         formatter_class=RawTextHelpFormatter,
     )
-    parser.add_argument(
-        "--config_path", type=str, help="Path to dataset config file.", required=True
-    )
+    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
     args = parser.parse_args()
 
     c = load_config(args.config_path)
@@ -38,5 +37,6 @@ def main():
     print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
     print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
 
+
 if __name__ == "__main__":
     main()
diff --git a/TTS/speaker_encoder/models/lstm.py b/TTS/speaker_encoder/models/lstm.py
index 21439d6b..7e39087a 100644
--- a/TTS/speaker_encoder/models/lstm.py
+++ b/TTS/speaker_encoder/models/lstm.py
@@ -1,5 +1,5 @@
-import torch
 import numpy as np
+import torch
 from torch import nn
 
 
@@ -81,12 +81,12 @@ class LSTMSpeakerEncoder(nn.Module):
         if max_len < num_frames:
             num_frames = max_len
 
-        offsets = np.linspace(0, max_len-num_frames, num=num_eval)
+        offsets = np.linspace(0, max_len - num_frames, num=num_eval)
 
         frames_batch = []
         for offset in offsets:
             offset = int(offset)
-            end_offset = int(offset+num_frames)
+            end_offset = int(offset + num_frames)
             frames = x[:, offset:end_offset]
             frames_batch.append(frames)
 
diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py
index 29f3ae61..f52bb4d5 100644
--- a/TTS/speaker_encoder/models/resnet.py
+++ b/TTS/speaker_encoder/models/resnet.py
@@ -207,4 +207,4 @@ class ResNetSpeakerEncoder(nn.Module):
             self.cuda()
         if eval:
             self.eval()
-            assert not self.training
\ No newline at end of file
+            assert not self.training
diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index ef5299cb..c057c51e 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -291,18 +291,20 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48"):
 
     return items
 
+
 def mls(root_path, meta_files=None):
     """http://www.openslr.org/94/"""
     items = []
     with open(os.path.join(root_path, meta_files), "r") as meta:
         for line in meta:
-            file, text = line.split('\t')
+            file, text = line.split("\t")
             text = text[:-1]
-            speaker, book, *_ = file.split('_')
-            wav_file = os.path.join(root_path, os.path.dirname(meta_files), 'audio', speaker, book, file + ".wav")
+            speaker, book, *_ = file.split("_")
+            wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav")
             items.append([text, wav_file, "MLS_" + speaker])
     return items
 
+
 # ======================================== VOX CELEB ===========================================
 def voxceleb2(root_path, meta_file=None):
     """